Example #1
    def test_crawl(self):
        # Running the bfs crawler
        num_nodes_before_crash = 2
        checkpoint_freq = 2
        webnetwork = TestWebCaller("test", self.network_edges)
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=num_nodes_before_crash, recover=True,
                           checkpoint_frequency=checkpoint_freq)

        node_visit_order = webnetwork.get_visit_order()
        self.compare_values(node_visit_order, test_visit_order=self.correct_visit_order[:num_nodes_before_crash])
        self.crawler.close()

        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=10, recover=True, checkpoint_frequency=checkpoint_freq)

        node_visit_order = webnetwork.get_visit_order()
        print("Node visit order is", node_visit_order)
        self.compare_values(node_visit_order,
                            test_visit_order=self.correct_visit_order)
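
This example simulates a crash by capping the first crawl at two nodes, then rebuilds the crawler with recover=True so the second crawl resumes from the checkpointed state instead of starting over. A minimal sketch of that pattern outside the test harness, using only the calls shown above (the caller object is a placeholder for whatever web caller the crawler wraps, TestWebCaller in the tests; imports are omitted as in the examples):

# Sketch only: `caller` stands in for the web caller used above.
crawler = BFSNetworkCrawler(caller, store_type="sqlite")
crawler.crawl(seed_nodes=["11", "5"], max_nodes=2,   # stop early: simulated crash
              recover=True, checkpoint_frequency=2)
crawler.close()

# A fresh crawler with recover=True resumes from the last checkpoint.
crawler = BFSNetworkCrawler(caller, store_type="sqlite")
crawler.crawl(seed_nodes=["11", "5"], max_nodes=10,
              recover=True, checkpoint_frequency=2)
crawler.close()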
Example #3
class TestBFSNetworkCrawler(unittest.TestCase):
    def setUp(self):
        self.network_edges = {"11": ["1", "5", "8"], "1": ["11", "8"],
                              "8": ["1", "11", "5"], "5": ["11", "8"]}
        self.node_info_data = None
        self.node_connections_data = None
        self.correct_visit_order = ["11", "5", "8", "1"]
        self.crawler = None
        self.STORE_TYPE = "sqlite"

    def tearDown(self):
        """
        Function that deletes all files after the test is complete. Disable this for testing if required.
        """
        print("Deleting all data files generated as a part of test tearDown().")
        self.crawler.gbuffer.destroy_stores()
        self.crawler.pqueue.destroy_state()


    def test_crawl(self):
        # Running the bfs crawler
        num_nodes_before_crash = 2
        checkpoint_freq = 2
        webnetwork = TestWebCaller("test", self.network_edges)
        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=num_nodes_before_crash, recover=True,
                           checkpoint_frequency=checkpoint_freq)

        node_visit_order = webnetwork.get_visit_order()
        self.compare_values(node_visit_order, test_visit_order=self.correct_visit_order[:num_nodes_before_crash])
        self.crawler.close()

        self.crawler = BFSNetworkCrawler(webnetwork, store_type=self.STORE_TYPE)
        self.crawler.crawl(seed_nodes=["11", "5"], max_nodes=10, recover=True, checkpoint_frequency=checkpoint_freq)

        node_visit_order = webnetwork.get_visit_order()
        print("Node visit order is", node_visit_order)
        self.compare_values(node_visit_order,
                            test_visit_order=self.correct_visit_order)

        #self.crawler.close()

    def compare_values(self, real_visit_order, test_visit_order):
        # Accessing the results of calling the source code
        self.node_info_data = self.crawler.gbuffer.nodes_store
        print(self.node_info_data)
        self.node_connections_data = self.crawler.gbuffer.edges_store
        self.node_visit_order = list(OrderedDict.fromkeys(real_visit_order))

        # Testing the results with expected values
        test_node_info = {}
        test_node_edges_data = {}
        #test_visit_order = ["11", "5", "8", "1"]
        node_counter = 0
        edge_counter = 0
        test_webnetwork = TestWebCaller("test", self.network_edges)
        for i in test_visit_order:
            curr_node_info = test_webnetwork.get_node_info(i)
            curr_node_info['id'] = node_counter
            test_node_info.update({str(i): curr_node_info})
            curr_edges_info = test_webnetwork.get_edges_info(i)
            for edge_info in curr_edges_info:
                edge_info['id'] = edge_counter
                test_node_edges_data[str(edge_counter)] = edge_info
                edge_counter += 1
            node_counter += 1

        print(self.node_visit_order)
        pprint(self.node_info_data)
        pprint(test_node_info)
        pprint(self.node_connections_data)
        pprint(test_node_edges_data)
        #pprint(self.node_connections_data)
        self.assertListEqual(test_visit_order, self.node_visit_order,
                             "Problem in order of BFS crawl")
        self.assertDictEqual(test_node_info, dict(self.node_info_data),
                             "Nodes do not match: Problem in BFS crawl.")
        self.assertDictEqual(test_node_edges_data, dict(self.node_connections_data),
                             "Edges do not match: Problem in BFS crawl.")
    logging.basicConfig(filename="socintpy.log", level=numeric_loglevel)

    # Fetch api object using the settings from settings.py
    api_args = get_args(settings)
    api = get_api(api_args)
    """
    # If you want to pass a dictionary (say params_dict), use **params_dict
    result = api.get_data(user='******', method="user.getRecentTracks",
    max_results = 100)                
    print result
    """
    # NOTE: API error handling has changed. If one call fails, the crawler stops and does not
    # fetch the remaining API calls for that user. Test this the next time the crawler runs.
    # Also, checkpoint_frequency should not be changed; keep it at 1 until further notice.
    cmd_params = cmd_script.get_cmd_parameters(sys.argv)
    if cmd_params['mode'] == "fetch_data":
        # Set up the data crawl
        logging.info("STARTING FETCH_DATA CRAWL. STANDBY!")
        crawler = BFSNetworkCrawler(api, store_type="sqlite")

        #crawler = BFSNetworkCrawler(api, seed_nodes=None, store_type="basic_shelve", recover=True)
        # Start the data crawl
        crawler.crawl(seed_nodes=api.get_uniform_random_nodes(100), max_nodes=1000000, recover=True, checkpoint_frequency=1)

    elif cmd_params['mode'] == "retry_errors":
        nodes_with_error = cmd_script.get_nodes_with_error(logfile="socintpy_old.log")
        print(nodes_with_error, len(nodes_with_error))
        logging.info("STARTING RETRY_ERRORS CRAWL. STANDBY!")
        crawler = FixedNodesCrawler(api, store_type="sqlite")
        nodes_stored, edges_stored = cmd_script.recover_num_items_stored(logfile="socintpy_old.log")
        crawler.crawl(nodes_list=nodes_with_error, start_node_id=nodes_stored, start_edge_id=edges_stored)