def crawl_heuristic_bfs(self, *args, **kwargs):
    """Crawl Wikipedia via best-first search, prioritizing low-divergence links.

    Repeatedly pops the lowest-divergence page from a priority queue,
    resolves the revision current at ``timestamp``, and enqueues the page's
    forward links scored by divergence from either the root article
    (``div_from_root=True``) or the current article. Edges are added to a
    graph that is appended to ``self.graphs``.

    Keyword Args:
        page_title (str): Starting article title (default ``self.page_title``).
        timestamp (datetime): Use the latest revision at or before this date
            (default ``self.timestamp``).
        node_count (int): Stop expanding once the graph exceeds this many
            nodes (default ``self.node_count``).
        div_from_root (bool): Score links against the root article instead of
            the article currently being expanded (default False).
        div_func (callable): Divergence function over two word distributions
            (default ``jsd2``, Jensen-Shannon divergence).
        debug (bool): Print progress information (default False).

    Returns:
        None. The constructed graph is stored in ``self.graphs``.
    """
    page_title = kwargs.get('page_title', self.page_title)
    timestamp = kwargs.get('timestamp', self.timestamp)
    node_count = kwargs.get('node_count', self.node_count)
    div_from_root = kwargs.get('div_from_root', False)
    # jsd2 already has the right signature; no lambda wrapper needed.
    div_func = kwargs.get('div_func', jsd2)
    debug = kwargs.get('debug', False)
    # NOTE(review): accepted for interface compatibility but currently unused.
    date_from_revision = kwargs.get('date_from_revision', False)

    graph = nx.Graph(name="{0}&&{1}".format(page_title, timestamp.date()),
                     starting_node=page_title,
                     starting_date=timestamp,
                     year=timestamp.year)
    self.graphs.append(graph)

    if debug:
        print("WikiCrawler.crawl_wiki starting")

    q = queue.PriorityQueue()
    visited = set()
    q.put((0, page_title))
    root_pdist = None
    root_title = None
    root_revid = None

    while not q.empty():
        _, page_title = q.get()
        if page_title in visited:
            continue
        if len(graph) > node_count:
            break
        visited.add(page_title)
        if debug:
            print("{0}".format(page_title))

        revid = self._get_revisionid_before_date(page_title, timestamp, debug)
        if not revid:
            if debug:
                print("{0} - no revid found".format(page_title))
            continue

        # Crawl by prioritizing article links with smaller JS divergence.
        # Capture the root article's word distribution once; every later
        # link score is measured against it when div_from_root is set.
        if not root_pdist and div_from_root:
            root_pdist = self._get_revision_word_dist(page_title, revid)
            root_title = page_title
            root_revid = revid
            if debug:
                print("{0}: root_pdist's top 3: {1}".format(
                    page_title, root_pdist.most_common(3)))

        forward_links = self._get_revision_forward_links(
            page_title, revid, debug)
        if debug:
            print("{0}: {1} nodes".format(page_title, len(graph)))
            print("{0}: {1} forward links ".format(page_title,
                                                   len(forward_links)))

        # Canonicalize titles and drop links that failed to resolve.
        forward_links = [self._get_canonical_page_title(link)
                         for link in forward_links]
        forward_links = [flink for flink in forward_links if flink]

        for forward_link in forward_links:
            flink_revid = self._get_revisionid_before_date(
                forward_link, timestamp)
            if not flink_revid:
                continue
            # The two cases were written as back-to-back `if` statements,
            # but they are mutually exclusive -- an if/else makes that clear.
            if div_from_root:
                div = self._get_div_score(root_title, root_revid,
                                          forward_link, flink_revid,
                                          div_func=div_func)
            else:
                div = self._get_div_score(page_title, revid,
                                          forward_link, flink_revid,
                                          div_func=div_func)
            q.put((div, forward_link))
            if debug:
                print("{0} -> {1} - divergence: {2}".format(
                    page_title, forward_link, div))
            graph.add_edge(page_title, forward_link,
                           div=div, revid=flink_revid)
    if debug:
        print()
    return
def crawl_heuristic_bfs(self, *args, **kwargs):
    """Heuristic best-first crawl of Wikipedia driven by word-distribution divergence.

    Expands pages in order of increasing divergence score: each dequeued page
    is resolved to its revision at ``timestamp``, its forward links are
    canonicalized, scored via ``_get_div_score``, enqueued, and recorded as
    graph edges. The graph is appended to ``self.graphs``.

    Keyword Args:
        page_title (str): Starting article title (default ``self.page_title``).
        timestamp (datetime): Use the latest revision at or before this date
            (default ``self.timestamp``).
        node_count (int): Stop expanding once the graph exceeds this many
            nodes (default ``self.node_count``).
        div_from_root (bool): Score links against the root article rather
            than the article being expanded (default False).
        div_func (callable): Divergence function over two word distributions
            (default ``jsd2``).
        debug (bool): Print progress information (default False).

    Returns:
        None. The constructed graph is stored in ``self.graphs``.
    """
    page_title = kwargs.get('page_title', self.page_title)
    timestamp = kwargs.get('timestamp', self.timestamp)
    node_count = kwargs.get('node_count', self.node_count)
    div_from_root = kwargs.get('div_from_root', False)
    # jsd2 already has the right signature; no lambda wrapper needed.
    div_func = kwargs.get('div_func', jsd2)
    debug = kwargs.get('debug', False)
    # NOTE(review): accepted for interface compatibility but currently unused.
    date_from_revision = kwargs.get('date_from_revision', False)

    graph = nx.Graph(name="{0}&&{1}".format(page_title, timestamp.date()),
                     starting_node=page_title,
                     starting_date=timestamp,
                     year=timestamp.year)
    self.graphs.append(graph)

    if debug:
        print("WikiCrawler.crawl_wiki starting")

    q = queue.PriorityQueue()
    visited = set()
    q.put((0, page_title))
    root_pdist = None
    root_title = None
    root_revid = None

    while not q.empty():
        _, page_title = q.get()
        if page_title in visited:
            continue
        if len(graph) > node_count:
            break
        visited.add(page_title)
        if debug:
            print("{0}".format(page_title))

        revid = self._get_revisionid_before_date(page_title, timestamp, debug)
        if not revid:
            if debug:
                print("{0} - no revid found".format(page_title))
            continue

        # Crawl by prioritizing article links with smaller JS divergence.
        # The root distribution is captured exactly once, on the first page
        # expanded, and reused for all scoring when div_from_root is set.
        if not root_pdist and div_from_root:
            root_pdist = self._get_revision_word_dist(page_title, revid)
            root_title = page_title
            root_revid = revid
            if debug:
                print("{0}: root_pdist's top 3: {1}".format(
                    page_title, root_pdist.most_common(3)))

        forward_links = self._get_revision_forward_links(
            page_title, revid, debug)
        if debug:
            print("{0}: {1} nodes".format(page_title, len(graph)))
            print("{0}: {1} forward links ".format(page_title,
                                                   len(forward_links)))

        # Canonicalize titles and drop links that failed to resolve.
        forward_links = [self._get_canonical_page_title(link)
                         for link in forward_links]
        forward_links = [flink for flink in forward_links if flink]

        for forward_link in forward_links:
            flink_revid = self._get_revisionid_before_date(
                forward_link, timestamp)
            if not flink_revid:
                continue
            # The original back-to-back `if div_from_root` / `if not
            # div_from_root` blocks are mutually exclusive: use if/else.
            if div_from_root:
                div = self._get_div_score(root_title, root_revid,
                                          forward_link, flink_revid,
                                          div_func=div_func)
            else:
                div = self._get_div_score(page_title, revid,
                                          forward_link, flink_revid,
                                          div_func=div_func)
            q.put((div, forward_link))
            if debug:
                print("{0} -> {1} - divergence: {2}".format(
                    page_title, forward_link, div))
            graph.add_edge(page_title, forward_link,
                           div=div, revid=flink_revid)
    if debug:
        print()
    return
def crawl_heuristic_bfs(self, *args, **kwargs):
    """Heuristic best-first crawl with memoized divergence scores.

    Like the single-threaded variant, but divergence scores are cached in
    ``self.ctitle_and_ctitle_to_div_scores`` keyed by the (order-independent)
    pair of canonical titles, so a pair reached from either direction reuses
    the same computed score.

    Keyword Args:
        page_title (str): Starting article title (default ``self.page_title``).
        timestamp (datetime): Use the latest revision at or before this date
            (default ``self.timestamp``).
        node_count (int): Stop expanding once the graph exceeds this many
            nodes (default ``self.node_count``).
        div_from_root (bool): Queue priority uses divergence from the root
            article instead of the current article (default False).
        div_func (callable): Divergence function over two word distributions
            (default ``jsd2``).
        debug (bool): Print progress information (default False).

    Returns:
        None. The constructed graph is stored in ``self.graphs``.
    """
    page_title = kwargs.get('page_title', self.page_title)
    timestamp = kwargs.get('timestamp', self.timestamp)
    node_count = kwargs.get('node_count', self.node_count)
    debug = kwargs.get('debug', False)
    div_from_root = kwargs.get('div_from_root', False)
    # jsd2 already has the right signature; no lambda wrapper needed.
    div_func = kwargs.get('div_func', jsd2)
    # NOTE(review): accepted for interface compatibility but currently unused.
    date_from_revision = kwargs.get('date_from_revision', False)

    graph = nx.Graph()
    self.graphs.append(graph)
    if debug:
        print("MultiThreadedWikiCrawler.crawl_wiki starting")

    q = queue.PriorityQueue()
    visited = set()
    q.put((0, page_title))
    root_pdist = None
    root_title = None

    def cached_div(base_title, base_pdist, link_title, link_revid):
        """Return the (memoized) divergence between two articles.

        BUG FIX: the cache membership test previously used the *sorted*
        title pair while reads and writes used the *unsorted* pair, so a
        hit on a reversed pair raised KeyError and reversed pairs were
        never found. The sorted pair is now used consistently everywhere.
        """
        key = tuple(sorted((base_title, link_title)))
        cache = self.ctitle_and_ctitle_to_div_scores
        if key in cache:
            return cache[key]
        link_pdist = self._get_revision_word_dist(link_title, link_revid)
        score = div_func(base_pdist, link_pdist)
        cache[key] = score
        return score

    # BUG FIX: the original `for score, page_title in iter(q.get, None):`
    # hangs forever once the queue drains -- PriorityQueue.get() blocks,
    # and a (score, title) tuple can never equal the None sentinel.
    while not q.empty():
        _, page_title = q.get()
        if page_title in visited:
            continue
        if len(graph) > node_count:
            break
        visited.add(page_title)
        if debug:
            print("{0}".format(page_title))

        revid = self._get_revisionid_before_date(page_title, timestamp, debug)
        if not revid:
            continue

        # Crawl by prioritizing article links with smaller JS divergence.
        # Capture the root distribution once, on the first expanded page.
        if not root_pdist and div_from_root:
            root_pdist = self._get_revision_word_dist(page_title, revid)
            root_title = page_title
            if debug:
                print("{0}: root_pdist's top 3: {1}".format(
                    page_title, root_pdist.most_common(3)))

        current_pdist = self._get_revision_word_dist(page_title, revid)
        forward_links = self._get_revision_forward_links(page_title, revid)
        if debug:
            print("{0}: {1} nodes".format(page_title, len(graph)))
            print("{0}: {1} forward links ".format(page_title,
                                                   len(forward_links)))

        forward_links = [self._get_canonical_page_title(link)
                         for link in forward_links]

        for forward_link in forward_links:
            # Canonicalization can fail and yield a falsy title; skip those.
            if not forward_link:
                continue
            flink_revid = self._get_revisionid_before_date(
                forward_link, timestamp)
            if not flink_revid:
                continue
            if div_from_root:
                root_div = cached_div(root_title, root_pdist,
                                      forward_link, flink_revid)
                q.put((root_div, forward_link))
            # The current-article divergence is always computed, matching the
            # original: it is the edge weight (and debug value) even when the
            # queue priority comes from the root-article divergence.
            div = cached_div(page_title, current_pdist,
                             forward_link, flink_revid)
            if not div_from_root:
                q.put((div, forward_link))
            if debug:
                print("{0} -> {1} - divergence: {2}".format(
                    page_title, forward_link, div))
            graph.add_edge(page_title, forward_link,
                           div=div, revid=flink_revid)
    if debug:
        print()
    return
def crawl_heuristic_bfs(self, *args, **kwargs):
    """Heuristic best-first crawl with memoized divergence scores.

    Divergence scores are cached in ``self.ctitle_and_ctitle_to_div_scores``
    under the order-independent pair of canonical titles, so a pair reached
    from either direction reuses the same computed score.

    Keyword Args:
        page_title (str): Starting article title (default ``self.page_title``).
        timestamp (datetime): Use the latest revision at or before this date
            (default ``self.timestamp``).
        node_count (int): Stop expanding once the graph exceeds this many
            nodes (default ``self.node_count``).
        div_from_root (bool): Queue priority uses divergence from the root
            article instead of the current article (default False).
        div_func (callable): Divergence function over two word distributions
            (default ``jsd2``).
        debug (bool): Print progress information (default False).

    Returns:
        None. The constructed graph is stored in ``self.graphs``.
    """
    page_title = kwargs.get("page_title", self.page_title)
    timestamp = kwargs.get("timestamp", self.timestamp)
    node_count = kwargs.get("node_count", self.node_count)
    debug = kwargs.get("debug", False)
    div_from_root = kwargs.get("div_from_root", False)
    # jsd2 already has the right signature; no lambda wrapper needed.
    div_func = kwargs.get("div_func", jsd2)
    # NOTE(review): accepted for interface compatibility but currently unused.
    date_from_revision = kwargs.get("date_from_revision", False)

    graph = nx.Graph()
    self.graphs.append(graph)
    if debug:
        print("MultiThreadedWikiCrawler.crawl_wiki starting")

    q = queue.PriorityQueue()
    visited = set()
    q.put((0, page_title))
    root_pdist = None
    root_title = None

    def cached_div(base_title, base_pdist, link_title, link_revid):
        """Return the (memoized) divergence between two articles.

        BUG FIX: the cache membership test previously used the *sorted*
        title pair while reads and writes used the *unsorted* pair, so a
        hit on a reversed pair raised KeyError and reversed pairs were
        never found. The sorted pair is now used consistently everywhere.
        """
        key = tuple(sorted((base_title, link_title)))
        cache = self.ctitle_and_ctitle_to_div_scores
        if key in cache:
            return cache[key]
        link_pdist = self._get_revision_word_dist(link_title, link_revid)
        score = div_func(base_pdist, link_pdist)
        cache[key] = score
        return score

    # BUG FIX: the original `for score, page_title in iter(q.get, None):`
    # hangs forever once the queue drains -- PriorityQueue.get() blocks,
    # and a (score, title) tuple can never equal the None sentinel.
    while not q.empty():
        _, page_title = q.get()
        if page_title in visited:
            continue
        if len(graph) > node_count:
            break
        visited.add(page_title)
        if debug:
            print("{0}".format(page_title))

        revid = self._get_revisionid_before_date(page_title, timestamp, debug)
        if not revid:
            continue

        # Crawl by prioritizing article links with smaller JS divergence.
        # Capture the root distribution once, on the first expanded page.
        if not root_pdist and div_from_root:
            root_pdist = self._get_revision_word_dist(page_title, revid)
            root_title = page_title
            if debug:
                print("{0}: root_pdist's top 3: {1}".format(
                    page_title, root_pdist.most_common(3)))

        current_pdist = self._get_revision_word_dist(page_title, revid)
        forward_links = self._get_revision_forward_links(page_title, revid)
        if debug:
            print("{0}: {1} nodes".format(page_title, len(graph)))
            print("{0}: {1} forward links ".format(page_title,
                                                   len(forward_links)))

        forward_links = [self._get_canonical_page_title(link)
                         for link in forward_links]

        for forward_link in forward_links:
            # Canonicalization can fail and yield a falsy title; skip those.
            if not forward_link:
                continue
            flink_revid = self._get_revisionid_before_date(
                forward_link, timestamp)
            if not flink_revid:
                continue
            if div_from_root:
                root_div = cached_div(root_title, root_pdist,
                                      forward_link, flink_revid)
                q.put((root_div, forward_link))
            # The current-article divergence is always computed, matching the
            # original: it is the edge weight (and debug value) even when the
            # queue priority comes from the root-article divergence.
            div = cached_div(page_title, current_pdist,
                             forward_link, flink_revid)
            if not div_from_root:
                q.put((div, forward_link))
            if debug:
                print("{0} -> {1} - divergence: {2}".format(
                    page_title, forward_link, div))
            graph.add_edge(page_title, forward_link,
                           div=div, revid=flink_revid)
    if debug:
        print()
    return