Example #1
0
    def crawl_heuristic_bfs(self, *args, **kwargs):
        """Heuristic best-first crawl of Wikipedia articles.

        Repeatedly pops the article with the smallest divergence score from a
        priority queue, fetches its forward links (as of ``timestamp``),
        scores each link with ``div_func``, re-queues them, and records each
        link as a graph edge until the graph exceeds ``node_count`` nodes.

        Keyword Args:
            page_title (str): root article title (default: self.page_title).
            timestamp (datetime): crawl revisions as of this date.
            node_count (int): stop once the graph exceeds this many nodes.
            div_from_root (bool): if True, prioritize links by divergence
                from the ROOT article instead of the current article.
            div_func (callable): two-argument divergence function
                (default: Jensen-Shannon divergence via jsd2).
            debug (bool): print progress information.
        """
        page_title = kwargs.get('page_title', self.page_title)
        timestamp = kwargs.get('timestamp', self.timestamp)
        node_count = kwargs.get('node_count', self.node_count)
        div_from_root = kwargs.get('div_from_root', False)
        div_func = kwargs.get('div_func', lambda x, y: jsd2(x, y))
        debug = kwargs.get('debug', False)

        graph = nx.Graph(name="{0}&&{1}".format(page_title, timestamp.date()),
                         starting_node=page_title,
                         starting_date=timestamp,
                         year=timestamp.year)

        self.graphs.append(graph)

        if debug: print("WikiCrawler.crawl_wiki starting")

        # Min-heap keyed on divergence: smaller divergence is crawled sooner.
        q = queue.PriorityQueue()
        visited = set()
        q.put((0, page_title))

        root_pdist = None
        root_title = None
        root_revid = None

        while not q.empty():
            _score, page_title = q.get()

            if page_title in visited:
                continue

            if len(graph) > node_count:
                break

            visited.add(page_title)

            if debug: print("{0}".format(page_title))

            revid = self._get_revisionid_before_date(page_title, timestamp,
                                                     debug)

            if not revid:
                if debug: print("{0} - no revid found".format(page_title))
                continue

            # Capture the root article on the first resolved page.  Compare
            # with `is None` so an empty (falsy) distribution is not redone.
            if root_pdist is None and div_from_root:
                root_pdist = self._get_revision_word_dist(page_title, revid)
                root_title = page_title
                root_revid = revid

                if debug:
                    print("{0}: root_pdist's top 3: {1}".format(
                        page_title, root_pdist.most_common(3)))

            forward_links = self._get_revision_forward_links(
                page_title, revid, debug)

            if debug:
                print("{0}: {1} nodes".format(page_title, len(graph)))
                print("{0}: {1} forward links ".format(page_title,
                                                       len(forward_links)))

            # Canonicalize titles and drop any that failed to resolve.
            forward_links = [
                self._get_canonical_page_title(link) for link in forward_links
            ]
            forward_links = [flink for flink in forward_links if flink]

            for forward_link in forward_links:
                flink_revid = self._get_revisionid_before_date(
                    forward_link, timestamp)

                if not flink_revid:
                    continue

                if div_from_root:
                    # Priority is divergence from the ROOT article.
                    div = self._get_div_score(root_title,
                                              root_revid,
                                              forward_link,
                                              flink_revid,
                                              div_func=div_func)
                else:
                    # Priority is divergence from the CURRENT article.
                    div = self._get_div_score(page_title,
                                              revid,
                                              forward_link,
                                              flink_revid,
                                              div_func=div_func)

                q.put((div, forward_link))

                if debug:
                    print("{0} -> {1} - divergence: {2}".format(
                        page_title, forward_link, div))

                graph.add_edge(page_title,
                               forward_link,
                               div=div,
                               revid=flink_revid)

            if debug: print()
        return
Example #2
0
    def crawl_heuristic_bfs(self, *args, **kwargs):
        """Heuristic best-first crawl of Wikipedia articles.

        Pops the lowest-divergence article off a priority queue, scores its
        forward links (as of ``timestamp``) with ``div_func``, re-queues
        them, and records each link as a graph edge until the graph exceeds
        ``node_count`` nodes.

        Keyword Args:
            page_title (str): root article title (default: self.page_title).
            timestamp (datetime): crawl revisions as of this date.
            node_count (int): stop once the graph exceeds this many nodes.
            div_from_root (bool): prioritize by divergence from the ROOT
                article rather than the current one.
            div_func (callable): two-argument divergence function
                (default: Jensen-Shannon divergence via jsd2).
            debug (bool): print progress information.
        """
        page_title = kwargs.get('page_title', self.page_title)
        timestamp = kwargs.get('timestamp', self.timestamp)
        node_count = kwargs.get('node_count', self.node_count)
        div_from_root = kwargs.get('div_from_root', False)
        div_func = kwargs.get('div_func', lambda x, y: jsd2(x, y))
        debug = kwargs.get('debug', False)

        graph = nx.Graph(name="{0}&&{1}".format(page_title, timestamp.date()),
                         starting_node=page_title, starting_date=timestamp, year=timestamp.year)

        self.graphs.append(graph)

        if debug: print("WikiCrawler.crawl_wiki starting")

        # Min-heap keyed on divergence: smaller divergence is crawled sooner.
        q = queue.PriorityQueue()
        visited = set()
        q.put((0, page_title))

        root_pdist = None
        root_title = None
        root_revid = None

        while not q.empty():
            _score, page_title = q.get()

            if page_title in visited:
                continue

            if len(graph) > node_count:
                break

            visited.add(page_title)

            if debug: print("{0}".format(page_title))

            revid = self._get_revisionid_before_date(page_title, timestamp, debug)

            if not revid:
                if debug: print("{0} - no revid found".format(page_title))
                continue

            # Capture the root article on the first resolved page.  Compare
            # with `is None` so an empty (falsy) distribution is not redone.
            if root_pdist is None and div_from_root:
                root_pdist = self._get_revision_word_dist(page_title, revid)
                root_title = page_title
                root_revid = revid

                if debug:
                    print("{0}: root_pdist's top 3: {1}".format(page_title,
                                                                root_pdist.most_common(3)))

            forward_links = self._get_revision_forward_links(page_title, revid, debug)

            if debug:
                print("{0}: {1} nodes".format(page_title, len(graph)))
                print("{0}: {1} forward links ".format(page_title, len(forward_links)))

            # Canonicalize titles and drop any that failed to resolve.
            forward_links = [self._get_canonical_page_title(link)
                             for link in forward_links]
            forward_links = [flink for flink in forward_links if flink]

            for forward_link in forward_links:
                flink_revid = self._get_revisionid_before_date(forward_link, timestamp)

                if not flink_revid:
                    continue

                if div_from_root:
                    # Priority is divergence from the ROOT article.
                    div = self._get_div_score(root_title, root_revid, forward_link, flink_revid, div_func=div_func)
                else:
                    # Priority is divergence from the CURRENT article.
                    div = self._get_div_score(page_title, revid, forward_link, flink_revid, div_func=div_func)

                q.put((div, forward_link))

                if debug:
                    print("{0} -> {1} - divergence: {2}".format(page_title, forward_link, div))

                graph.add_edge(page_title, forward_link, div=div, revid=flink_revid)

            if debug: print()
        return
    def crawl_heuristic_bfs(self, *args, **kwargs):
        """Heuristic best-first crawl (variant with a shared divergence cache).

        Pops the lowest-divergence article off a priority queue, scores its
        forward links (as of ``timestamp``) with ``div_func`` -- memoizing
        scores in ``self.ctitle_and_ctitle_to_div_scores`` -- re-queues them,
        and records each link as a graph edge until the graph exceeds
        ``node_count`` nodes.

        Keyword Args:
            page_title (str): root article title (default: self.page_title).
            timestamp (datetime): crawl revisions as of this date.
            node_count (int): stop once the graph exceeds this many nodes.
            div_from_root (bool): prioritize by divergence from the ROOT
                article rather than the current one.
            div_func (callable): two-argument divergence function (default:
                Jensen-Shannon divergence via jsd2; assumed symmetric since
                the cache key is an unordered pair -- confirm if a
                non-symmetric div_func is ever passed).
            debug (bool): print progress information.
        """
        page_title = kwargs.get('page_title', self.page_title)
        timestamp = kwargs.get('timestamp', self.timestamp)
        node_count = kwargs.get('node_count', self.node_count)
        debug = kwargs.get('debug', False)

        div_from_root = kwargs.get('div_from_root', False)
        div_func = kwargs.get('div_func', lambda x, y: jsd2(x, y))

        graph = nx.Graph()

        self.graphs.append(graph)

        if debug: print("MultiThreadedWikiCrawler.crawl_wiki starting")

        q = queue.PriorityQueue()

        visited = set()

        q.put((0, page_title))

        root_pdist = None
        root_title = None

        # BUGFIX: `for score, page_title in iter(q.get, None)` hung forever
        # once the queue drained -- q.get() blocks, and the (score, title)
        # tuple can never equal the None sentinel.  Drain explicitly instead.
        while not q.empty():
            score, page_title = q.get()

            if page_title in visited:
                continue

            if len(graph) > node_count:
                break

            visited.add(page_title)

            if debug: print("{0}".format(page_title))

            revid = self._get_revisionid_before_date(page_title, timestamp, debug)

            if not revid:
                continue

            # Capture the root article on the first resolved page.  Compare
            # with `is None` so an empty (falsy) distribution is not redone.
            if root_pdist is None and div_from_root:
                root_pdist = self._get_revision_word_dist(page_title, revid)

                root_title = page_title

                if debug:
                    print("{0}: root_pdist's top 3: {1}".format(page_title,
                                                                root_pdist.most_common(3)))

            current_pdist = self._get_revision_word_dist(page_title, revid)

            forward_links = self._get_revision_forward_links(page_title, revid)

            if debug:
                print("{0}: {1} nodes".format(page_title, len(graph)))
                print("{0}: {1} forward links ".format(page_title, len(forward_links)))

            forward_links = [self._get_canonical_page_title(link)
                             for link in forward_links]

            for forward_link in forward_links:
                if forward_link:
                    flink_revid = self._get_revisionid_before_date(forward_link,
                                                                   timestamp)

                    if not flink_revid:
                        continue

                    if div_from_root:
                        # BUGFIX: the cache was probed with the *sorted* pair
                        # but read/written with the *unsorted* pair, so a hit
                        # on a reversed key raised KeyError and the cache
                        # rarely hit.  Use one symmetric key throughout.
                        root_key = tuple(sorted((root_title, forward_link)))
                        if root_key in self.ctitle_and_ctitle_to_div_scores:
                            div = self.ctitle_and_ctitle_to_div_scores[root_key]
                        else:
                            flink_pdist = self._get_revision_word_dist(forward_link, flink_revid)
                            div = div_func(root_pdist, flink_pdist)
                            self.ctitle_and_ctitle_to_div_scores[root_key] = div

                        q.put((div, forward_link))

                    # Edge weights always use the CURRENT article's divergence
                    # (the root divergence above only drives queue priority).
                    cur_key = tuple(sorted((page_title, forward_link)))
                    if cur_key in self.ctitle_and_ctitle_to_div_scores:
                        div = self.ctitle_and_ctitle_to_div_scores[cur_key]
                    else:
                        flink_pdist = self._get_revision_word_dist(forward_link, flink_revid)
                        div = div_func(current_pdist, flink_pdist)
                        self.ctitle_and_ctitle_to_div_scores[cur_key] = div

                    if not div_from_root:
                        q.put((div, forward_link))

                    if debug:
                        print("{0} -> {1} - divergence: {2}".format(page_title, forward_link, div))

                    graph.add_edge(page_title, forward_link, div=div, revid=flink_revid)

            if debug: print()

        return
    def crawl_heuristic_bfs(self, *args, **kwargs):
        """Heuristic best-first crawl (variant with a shared divergence cache).

        Pops the lowest-divergence article off a priority queue, scores its
        forward links (as of ``timestamp``) with ``div_func`` -- memoizing
        scores in ``self.ctitle_and_ctitle_to_div_scores`` -- re-queues them,
        and records each link as a graph edge until the graph exceeds
        ``node_count`` nodes.

        Keyword Args:
            page_title (str): root article title (default: self.page_title).
            timestamp (datetime): crawl revisions as of this date.
            node_count (int): stop once the graph exceeds this many nodes.
            div_from_root (bool): prioritize by divergence from the ROOT
                article rather than the current one.
            div_func (callable): two-argument divergence function (default:
                Jensen-Shannon divergence via jsd2; assumed symmetric since
                the cache key is an unordered pair -- confirm if a
                non-symmetric div_func is ever passed).
            debug (bool): print progress information.
        """
        page_title = kwargs.get("page_title", self.page_title)
        timestamp = kwargs.get("timestamp", self.timestamp)
        node_count = kwargs.get("node_count", self.node_count)
        debug = kwargs.get("debug", False)

        div_from_root = kwargs.get("div_from_root", False)
        div_func = kwargs.get("div_func", lambda x, y: jsd2(x, y))

        graph = nx.Graph()

        self.graphs.append(graph)

        if debug:
            print("MultiThreadedWikiCrawler.crawl_wiki starting")

        q = queue.PriorityQueue()

        visited = set()

        q.put((0, page_title))

        root_pdist = None
        root_title = None

        # BUGFIX: `for score, page_title in iter(q.get, None)` hung forever
        # once the queue drained -- q.get() blocks, and the (score, title)
        # tuple can never equal the None sentinel.  Drain explicitly instead.
        while not q.empty():
            score, page_title = q.get()

            if page_title in visited:
                continue

            if len(graph) > node_count:
                break

            visited.add(page_title)

            if debug:
                print("{0}".format(page_title))

            revid = self._get_revisionid_before_date(page_title, timestamp, debug)

            if not revid:
                continue

            # Capture the root article on the first resolved page.  Compare
            # with `is None` so an empty (falsy) distribution is not redone.
            if root_pdist is None and div_from_root:
                root_pdist = self._get_revision_word_dist(page_title, revid)

                root_title = page_title

                if debug:
                    print("{0}: root_pdist's top 3: {1}".format(page_title, root_pdist.most_common(3)))

            current_pdist = self._get_revision_word_dist(page_title, revid)

            forward_links = self._get_revision_forward_links(page_title, revid)

            if debug:
                print("{0}: {1} nodes".format(page_title, len(graph)))
                print("{0}: {1} forward links ".format(page_title, len(forward_links)))

            forward_links = [self._get_canonical_page_title(link) for link in forward_links]

            for forward_link in forward_links:
                if forward_link:
                    flink_revid = self._get_revisionid_before_date(forward_link, timestamp)

                    if not flink_revid:
                        continue

                    if div_from_root:
                        # BUGFIX: the cache was probed with the *sorted* pair
                        # but read/written with the *unsorted* pair, so a hit
                        # on a reversed key raised KeyError and the cache
                        # rarely hit.  Use one symmetric key throughout.
                        root_key = tuple(sorted((root_title, forward_link)))
                        if root_key in self.ctitle_and_ctitle_to_div_scores:
                            div = self.ctitle_and_ctitle_to_div_scores[root_key]
                        else:
                            flink_pdist = self._get_revision_word_dist(forward_link, flink_revid)
                            div = div_func(root_pdist, flink_pdist)
                            self.ctitle_and_ctitle_to_div_scores[root_key] = div

                        q.put((div, forward_link))

                    # Edge weights always use the CURRENT article's divergence
                    # (the root divergence above only drives queue priority).
                    cur_key = tuple(sorted((page_title, forward_link)))
                    if cur_key in self.ctitle_and_ctitle_to_div_scores:
                        div = self.ctitle_and_ctitle_to_div_scores[cur_key]
                    else:
                        flink_pdist = self._get_revision_word_dist(forward_link, flink_revid)
                        div = div_func(current_pdist, flink_pdist)
                        self.ctitle_and_ctitle_to_div_scores[cur_key] = div

                    if not div_from_root:
                        q.put((div, forward_link))

                    if debug:
                        print("{0} -> {1} - divergence: {2}".format(page_title, forward_link, div))

                    graph.add_edge(page_title, forward_link, div=div, revid=flink_revid)

            if debug:
                print()

        return