def recover_num_items_stored(logfile):
    """Reopen a crawl's persisted queue state and report recovered counts.

    Reads the state-store path and checkpoint frequency back out of the
    crawl's log file, reopens the sqlite-backed PriorityQueue at that
    path, and runs its recovery-mode initialization with no seed nodes.
    Returns whatever PriorityQueue.initialize() returns.
    """
    store_path = read_param_from_log(logfile, param_name="STATE_STORE_PATH")
    # NOTE: ideally every recovery parameter would come from the log;
    # logs produced by earlier runs do not record all of them yet.
    ckpt_freq = read_param_from_log(logfile, param_name="CHECKPOINT_FREQUENCY")
    state_queue = PriorityQueue("sqlite", store_name=store_path)
    return state_queue.initialize(
        seed_nodes=[], do_recovery=True, checkpoint_freq=ckpt_freq)
def __init__(self, network, store_type="gml"):
    """Initialize a network crawler.

    Args:
        network: the network object to be crawled.
        store_type: the type of storage. Valid values are "gml",
            "basic_shelve" or "couchdb".

    Returns:
        An initialized crawler object.
    """
    self.network = network
    backend = GenericStore.get_store_class(store_type)
    # Buffered storage for the crawled graph data (nodes and edges).
    self.gbuffer = GraphBuffer(self.network.label, backend)
    # In-memory priority queue of nodes still to visit; its state is
    # persisted under "<network label>_state".
    self.pqueue = PriorityQueue(backend, store_name=self.network.label + "_state")
class BFSNetworkCrawler(NetworkCrawlerInterface):
    """BFS Crawler.

    Uses a weighted BFS implementation which gives more weight to nodes
    that are seen multiple times (their queue priority value is
    decremented each time they are re-encountered).
    """

    def __init__(self, network, store_type="gml"):
        """Function to initialize a network crawler.

        Args:
            network: the network object to be crawled
            store_type: the type of storage. Valid values are "gml",
                "basic_shelve" or "couchdb"

        Returns:
            An initialized crawler object
        """
        self.network = network
        store_class = GenericStore.get_store_class(store_type)
        # Buffer to store network data
        self.gbuffer = GraphBuffer(self.network.label, store_class)
        #self.recover = recover
        # Priority queue in memory that keeps track of nodes to visit;
        # its state is persisted under "<network label>_state".
        self.pqueue = PriorityQueue(store_class, store_name=self.network.label + "_state")

    """ def set_seed_nodes(self, seed_values): for seed in seed_values: self.add_to_crawl(seed) """

    def add_to_crawl(self, node, priority=0):
        # Schedule `node` for visiting with the given priority value.
        self.pqueue.push(node, priority)
        #self.visited[node] = False

    def __del__(self):
        # NOTE(review): close() is also invoked from crawl()'s interrupt
        # handler, so this can double-close the stores — assumes the store
        # close() implementations are idempotent; confirm.
        self.close()

    def close(self):
        # Flush/close both the graph buffer and the persisted queue state.
        self.gbuffer.close()
        self.pqueue.close()

    def crawl(self, seed_nodes, max_nodes=10, recover=True, checkpoint_frequency=100):
        """Function to crawl the network using BFS.

        Works in two modes: fresh or recover. If recover is False, then
        seed_nodes needs to be not None.

        Args:
            seed_nodes: starting nodes handed to the queue's initialize()
                (presumably ignored when recovering a non-empty queue —
                TODO confirm against PriorityQueue.initialize).
            max_nodes: maximum number of queue pops performed in this run.
            recover: when True, resume from the persisted queue state.
            checkpoint_frequency: checkpoint the queue state every this
                many iterations; must match the value used by the run
                being recovered (see WARNING below).
        """
        # if the crawl was stopped for some reason, recover parameter helps to
        # restart it from where it stopped.
        #if self.recover and not self.pqueue.is_empty():
            # Just trying to minimize conflict. May lead to duplicate storage.
        #    node_counter = itertools.count(self.gbuffer.nodes_store.get_maximum_id()+1)
        #    edge_counter = itertools.count(self.gbuffer.edges_store.get_maximum_id()+1)
        #    print("Node counter", node_counter)
        #else:
        #    self.set_seed_nodes(self.seed_nodes)
        # WARNING: The initialize function assumes that the checkpoint_freq
        # remains the same between current run and the previous run which is
        # being recovered.
        prior_num_nodes_visited, prior_num_edges_visited = self.pqueue.initialize(seed_nodes, do_recovery=recover, checkpoint_freq=checkpoint_frequency)
        # Continue the id sequences from where the previous run left off so
        # recovered crawls do not reuse node/edge ids.
        node_counter = itertools.count(prior_num_nodes_visited)
        edge_counter = itertools.count(prior_num_edges_visited)
        iterations = 0
        try:
            while (not self.pqueue.is_empty()) and iterations < max_nodes:
                iterations += 1
                new_node = self.pqueue.pop()
                logging.info("Popped %s from queue and now starting to process it..." % new_node)
                print "Popped %s from queue and now starting to process it..." % new_node
                # Ignore if new_node has already been visited
                if new_node in self.pqueue.visited:
                    continue
                # Get details about the current node
                new_node_info = self.network.get_node_info(new_node)
                if new_node_info is None:
                    error_msg = "Crawler: Error in fetching node info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError
                # Assume dict output for all stores.
                new_node_edges_info = self.network.get_edges_info(new_node)
                if new_node_edges_info is None:
                    error_msg = "Crawler: Error in fetching node edges info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError
                logging.info("Crawler: Got information about %s" % new_node)
                print "Got all data"
                for edge_info in new_node_edges_info:
                    node = edge_info['target']
                    if node not in self.pqueue.visited:
                        # Update priority if node already exists in the queue
                        # (decrementing the value gives repeatedly-seen nodes
                        # more weight, per the class docstring).
                        if node in self.pqueue.queue_dict:
                            node_priority = self.pqueue.mark_removed(node)
                            updated_priority = node_priority - 1
                            self.add_to_crawl(node, updated_priority)
                        # Otherwise just add the node to the queue
                        else:
                            self.add_to_crawl(node)
                # Now storing the data about the node and its edges
                print "Starting to store node info"
                new_node_info['id'] = next(node_counter)
                self.gbuffer.store_node(new_node, new_node_info)
                self.pqueue.mark_visited(new_node)
                for edge_info in new_node_edges_info:
                    edge_info['id'] = next(edge_counter)
                    self.gbuffer.store_edge(str(edge_info['id']), edge_info)
                logging.info("Processed %s \n" % new_node)
                print "Processed ", new_node
                # Periodically persist the queue state together with the
                # current store record counts so a crash can be recovered.
                if iterations % checkpoint_frequency == 0:
                    #curr_num_nodes_stored = self.gbuffer.nodes_store.get_maximum_id() + 1
                    curr_num_nodes_stored = self.gbuffer.nodes_store.get_count_records()
                    print("Making a checkpoint for pqueue state", iterations, curr_num_nodes_stored)
                    logging.info("Making a checkpoint for pqueue state: %d", curr_num_nodes_stored)
                    self.pqueue.save_checkpoint(curr_num_nodes_stored, self.gbuffer.edges_store.get_count_records(), checkpoint_frequency)
                    #print gc.collect()
        except (KeyboardInterrupt, SystemExit):
            # Persist state before bailing out so the crawl can be resumed
            # later with recover=True.
            self.pqueue.close()
            self.gbuffer.close()
            return
class PriorityQueueTestCase(unittest.TestCase):
    """Unit tests for PriorityQueue push/pop/mark_removed bookkeeping."""

    def setUp(self):
        # A fresh queue per test, backed by the module-level STORE_CLASS.
        self.pqueue = PriorityQueue(store_class=STORE_CLASS, store_name="testpq")
        self.test_data = [(10, "Phy"), (12, "Maths"), (30, "Chem"),
                          (25, "Bio"), (41, "English")]

    def tearDown(self):
        self.pqueue.close()

    def test_push(self):
        for item in self.test_data:
            self.pqueue.push(item)
        # Expected internal entry layout: [priority, insertion_order, payload].
        expected_entries = [[0, order, item]
                            for order, item in enumerate(self.test_data)]
        expected_lookup = {item: entry
                           for item, entry in zip(self.test_data, expected_entries)}
        self.assertListEqual(self.pqueue.queue, expected_entries,
                             "Problem with push to queue.")
        self.assertDictEqual(self.pqueue.queue_dict, expected_lookup,
                             "Problem with push to queue.")

    def test_remove(self):
        for item in self.test_data[:3]:
            self.pqueue.push(item)
        self.pqueue.mark_removed(self.test_data[1])
        self.pqueue.push(self.test_data[3])
        self.pqueue.mark_removed(self.test_data[0])
        expected_entries = [[0, order, item]
                            for order, item in enumerate(self.test_data[:4])]
        expected_lookup = {item: entry
                           for item, entry in zip(self.test_data[:4], expected_entries)}
        # mark_removed replaces the payload with the "REMOVD" sentinel in the
        # queue list and drops the item from the lookup dict.
        for removed in (0, 1):
            expected_entries[removed][2] = "REMOVD"
            del expected_lookup[self.test_data[removed]]
        #print expected_entries, expected_lookup
        #print self.pqueue.queue, self.pqueue.queue_dict
        self.assertListEqual(self.pqueue.queue, expected_entries,
                             "Problem with mark_removed on PriorityQueue.")
        self.assertDictEqual(self.pqueue.queue_dict, expected_lookup,
                             "Problem with mark_removed on PriorityQueue.")

    def test_pop(self):
        for item in self.test_data[:3]:
            self.pqueue.push(item)
        self.pqueue.mark_removed(self.test_data[1])
        self.pqueue.pop()
        self.pqueue.push(self.test_data[3])
        self.pqueue.pop()
        self.pqueue.pop()
        # The queue must be fully drained: pop skips removed entries.
        self.assertListEqual(self.pqueue.queue, [],
                             "Problem with pop on PriorityQueue.")
        self.assertDictEqual(self.pqueue.queue_dict, {},
                             "Problem with pop on PriorityQueue.")
def setUp(self):
    """Create a fresh PriorityQueue and fixture data before each test."""
    # (priority_value, subject) fixture pairs shared by the tests.
    self.test_data = [
        (10, "Phy"),
        (12, "Maths"),
        (30, "Chem"),
        (25, "Bio"),
        (41, "English"),
    ]
    self.pqueue = PriorityQueue(store_class=STORE_CLASS, store_name="testpq")
class BFSNetworkCrawler(NetworkCrawlerInterface):
    """BFS Crawler.

    Uses a weighted BFS implementation which gives more weight to nodes
    that are seen multiple times (re-encountered nodes have their queue
    priority value decremented).
    """

    def __init__(self, network, store_type="gml"):
        """Function to initialize a network crawler.

        Args:
            network: the network object to be crawled
            store_type: the type of storage. Valid values are "gml",
                "basic_shelve" or "couchdb"

        Returns:
            An initialized crawler object
        """
        self.network = network
        store_class = GenericStore.get_store_class(store_type)
        # Buffer to store network data
        self.gbuffer = GraphBuffer(self.network.label, store_class)
        #self.recover = recover
        # Priority queue in memory that keeps track of nodes to visit;
        # persisted under "<network label>_state".
        self.pqueue = PriorityQueue(store_class, store_name=self.network.label + "_state")

    """ def set_seed_nodes(self, seed_values): for seed in seed_values: self.add_to_crawl(seed) """

    def add_to_crawl(self, node, priority=0):
        # Enqueue `node` for visiting with the given priority value.
        self.pqueue.push(node, priority)
        #self.visited[node] = False

    def __del__(self):
        # NOTE(review): crawl()'s interrupt handler also calls the store
        # close methods, so close() may run twice — assumes closing is
        # idempotent; confirm with the store implementations.
        self.close()

    def close(self):
        # Close both the graph buffer and the persisted queue state.
        self.gbuffer.close()
        self.pqueue.close()

    def crawl(self, seed_nodes, max_nodes=10, recover=True, checkpoint_frequency=100):
        """Function to crawl the network using BFS.

        Works in two modes: fresh or recover. If recover is False, then
        seed_nodes needs to be not None.

        Args:
            seed_nodes: starting nodes handed to the queue's initialize()
                (presumably ignored when recovering a non-empty queue —
                TODO confirm against PriorityQueue.initialize).
            max_nodes: maximum number of queue pops performed in this run.
            recover: when True, resume from the persisted queue state.
            checkpoint_frequency: checkpoint the queue state every this
                many iterations; must match the value used by the run
                being recovered (see WARNING below).
        """
        # if the crawl was stopped for some reason, recover parameter helps to
        # restart it from where it stopped.
        #if self.recover and not self.pqueue.is_empty():
            # Just trying to minimize conflict. May lead to duplicate storage.
        #    node_counter = itertools.count(self.gbuffer.nodes_store.get_maximum_id()+1)
        #    edge_counter = itertools.count(self.gbuffer.edges_store.get_maximum_id()+1)
        #    print("Node counter", node_counter)
        #else:
        #    self.set_seed_nodes(self.seed_nodes)
        # WARNING: The initialize function assumes that the checkpoint_freq
        # remains the same between current run and the previous run which is
        # being recovered.
        prior_num_nodes_visited, prior_num_edges_visited = self.pqueue.initialize(
            seed_nodes, do_recovery=recover, checkpoint_freq=checkpoint_frequency)
        # Resume the id sequences from the prior run's counts so recovered
        # crawls do not reuse node/edge ids.
        node_counter = itertools.count(prior_num_nodes_visited)
        edge_counter = itertools.count(prior_num_edges_visited)
        iterations = 0
        try:
            while (not self.pqueue.is_empty()) and iterations < max_nodes:
                iterations += 1
                new_node = self.pqueue.pop()
                logging.info(
                    "Popped %s from queue and now starting to process it..." % new_node)
                print "Popped %s from queue and now starting to process it..." % new_node
                # Ignore if new_node has already been visited
                if new_node in self.pqueue.visited:
                    continue
                # Get details about the current node
                new_node_info = self.network.get_node_info(new_node)
                if new_node_info is None:
                    error_msg = "Crawler: Error in fetching node info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError
                # Assume dict output for all stores.
                new_node_edges_info = self.network.get_edges_info(new_node)
                if new_node_edges_info is None:
                    error_msg = "Crawler: Error in fetching node edges info. Skipping node %s" % new_node
                    logging.error(error_msg)
                    print error_msg
                    continue
                    #raise NameError
                logging.info("Crawler: Got information about %s" % new_node)
                print "Got all data"
                for edge_info in new_node_edges_info:
                    node = edge_info['target']
                    if node not in self.pqueue.visited:
                        # Update priority if node already exists in the queue
                        # (the lower value gives repeatedly-seen nodes more
                        # weight, per the class docstring).
                        if node in self.pqueue.queue_dict:
                            node_priority = self.pqueue.mark_removed(node)
                            updated_priority = node_priority - 1
                            self.add_to_crawl(node, updated_priority)
                        # Otherwise just add the node to the queue
                        else:
                            self.add_to_crawl(node)
                # Now storing the data about the node and its edges
                print "Starting to store node info"
                new_node_info['id'] = next(node_counter)
                self.gbuffer.store_node(new_node, new_node_info)
                self.pqueue.mark_visited(new_node)
                for edge_info in new_node_edges_info:
                    edge_info['id'] = next(edge_counter)
                    self.gbuffer.store_edge(str(edge_info['id']), edge_info)
                logging.info("Processed %s \n" % new_node)
                print "Processed ", new_node
                # Periodically persist queue state plus current store record
                # counts so an interrupted crawl can be recovered.
                if iterations % checkpoint_frequency == 0:
                    #curr_num_nodes_stored = self.gbuffer.nodes_store.get_maximum_id() + 1
                    curr_num_nodes_stored = self.gbuffer.nodes_store.get_count_records(
                    )
                    print("Making a checkpoint for pqueue state", iterations, curr_num_nodes_stored)
                    logging.info("Making a checkpoint for pqueue state: %d",
                                 curr_num_nodes_stored)
                    self.pqueue.save_checkpoint(
                        curr_num_nodes_stored, self.gbuffer.edges_store.get_count_records(),
                        checkpoint_frequency)
                    #print gc.collect()
        except (KeyboardInterrupt, SystemExit):
            # Persist state before exiting so the crawl can be resumed later
            # with recover=True.
            self.pqueue.close()
            self.gbuffer.close()
            return
def recover_num_items_stored(logfile):
    """Recover persisted queue state from a crawl log and return its counts.

    Looks up the state-store path and checkpoint frequency recorded in
    `logfile`, reopens the sqlite-backed PriorityQueue at that path, and
    performs a recovery-mode initialize() with an empty seed list,
    returning its result.
    """
    # NOTE: ideally all recovery parameters would be fetched from the log;
    # logs from earlier runs do not record everything yet.
    path = read_param_from_log(logfile, param_name="STATE_STORE_PATH")
    freq = read_param_from_log(logfile, param_name="CHECKPOINT_FREQUENCY")
    recovered_queue = PriorityQueue("sqlite", store_name=path)
    return recovered_queue.initialize(
        seed_nodes=[], do_recovery=True, checkpoint_freq=freq)