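# Assumed imports (a sketch): networkx provides the Graph used below, and
# CrawlerState is the companion crawl-state class; adjust its import path to
# match the actual project layout.
import networkx as nx

from crawler_state import CrawlerState

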
class Crawler:
    """
    Manages the crawling process. It can be run from the command line or imported and its methods called directly.
    Sample command line:
    crawler.py 2 5 out.graphml out.json

    The Crawler class depends on the implementation of an API class, which is a subclass of the CrawlerAbstractAPI
    class. This must be implemented fresh for every new API to be crawled. You should not need to edit this code at all.

    Variables:
    _state -- A CrawlerState object encapsulating the current state of the crawl
    _net -- A NetworkX Graph object containing the network gathered so far
    _api -- The API object, a concrete instance of a subclass of CrawlerAbstractAPI
    _autosave_interval -- The number of nodes to expand between autosaves. A large number (~100) is recommended
    _autosave_path -- The path where autosave files are stored.
    _max_fail -- The number of consecutive failures allowed before the crawler exits.
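
    Typical usage from Python (a sketch mirroring the sample command line above;
    MyAPI stands in for a concrete CrawlerAbstractAPI subclass):

        crawler = Crawler(2, MyAPI())
        crawler.initialize()
        crawler.crawl_k(5)
        crawler.to_files("out.graphml", "out.json")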
    """

    _state = None
    _net = None
    _api = None
    _autosave_interval = -1  # No autosaving by default (configurable)
    _autosave_path = ""
    _max_fail = 3  # Allow 3 failures (configurable)

    def __init__(self, depth_limit, api):
        """
        Sets up the data structures for the crawl
        """
        self._state = CrawlerState(depth_limit)
        self._net = nx.Graph()
        self._api = api
        self._api.set_state(self._state)
        self._api.set_graph(self._net)

    def get_state(self):
        return self._state

    def get_graph(self):
        return self._net

    def get_api(self):
        return self._api

    def from_files(self, graph_file, state_file):
        """
        Creates a crawl from a pair of files. No error checking is performed to make sure the files match.

        Arguments
        graph_file -- a GraphML file containing a network created in a previous crawl.
        state_file -- a JSON file containing the serialized CrawlerState.
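
        Example (a sketch resuming from a previously autosaved pair of files):
            crawler.from_files("autosave.graphml", "autosave.json")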
        """
        state = CrawlerState()
        state.from_file(state_file)
        self._state = state
        self._net = nx.read_graphml(graph_file)
        self._api.set_state(self._state)
        self._api.set_graph(self._net)

    def from_net(self, net, depth_limit):
        """
        Creates a crawl from an existing network. Assumes that all nodes at the maximum depth need to be expanded.

        Arguments
        net -- a Graph object from a previous crawl.
        depth_limit -- the new depth limit for crawling.
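
        Example (a sketch, where previous_crawler is a hypothetical earlier Crawler instance):
            crawler.from_net(previous_crawler.get_graph(), 3)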
        """
        state = CrawlerState()
        state.from_net(net, depth_limit)
        self._state = state
        self._net = net
        self._api.set_state(self._state)
        self._api.set_graph(self._net)

    def to_files(self, net_file, state_file):
        """
        Saves the state of the crawl.

        Arguments
        net_file -- name of the GraphML file for the network
        state_file -- name of the JSON file for the serialized CrawlerState
        """
        nx.write_graphml(self._net, net_file)
        self._state.to_file(state_file)

    def set_depth_limit(self, limit):
        self._state.set_depth_limit(limit)

    def initialize(self):
        """
        Create initial nodes to start the crawl and set up the data structures

        """

        for label, node in self._api.initial_nodes():

            # We don't want to create another copy of the start point, so we add to the visited set.
            self._state.add_visited(label, 1, node)

            # Add to open queue to initialize search
            self._state.add_open(node)

    def expand_node(self, node):
        """
        Expands a node by getting its children via the API and adding them to the graph
        """
        print("Expanding node: %s" % node)
        ndepth = self._net.nodes[node]['_depth']
        # Don't expand a node at the maximum legal depth
        if not (self._state.is_legal_depth(ndepth + 1)):
            return True
        else:
            # Query the API
            result = self._api.get_children(node, self)
            if result['success']:
                for old_node in result['old']:
                    # Link existing nodes
                    self._net.add_edge(node, old_node)
                for new_node in result['new']:
                    # Link new nodes
                    self._net.add_edge(node, new_node)
                    # Add to visited set
                    label = self._net.nodes[new_node]['label']
                    bipartite = self._net.nodes[new_node]['bipartite']
                    self._state.add_visited(label, bipartite, new_node)
                    # Add new nodes to queue
                    self._state.add_open(new_node)
                # Mark parent as expanded
                self._net.nodes[node]['_expanded'] = 1
                return True
            else:  # The API call failed
                return False

    def set_autosave(self, interval, path):
        """
        Sets up the parameters of the autosave capability

        Arguments
        interval -- number of nodes to expand in between autosaves.
        path -- the path to save the crawler files. The files are named autosave.graphml and autosave.json.
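
        Example (a sketch; autosaves every 100 expansions into a hypothetical "saves/"
        directory; the trailing separator matters because file names are appended to the path):
            crawler.set_autosave(100, "saves/")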
        """
        self._autosave_path = path
        self._autosave_interval = interval

    def crawl_k(self, k):
        """
        Build the network by expanding up to k nodes (but not beyond the depth limit). Halts
        if it encounters more than _max_fail errors trying to expand the same node.

        Arguments
        k -- the number of nodes to expand
        """
        node_count = 0
        fail_count = 0
        while node_count < k and (not self.is_completed()):
            node = self._state.next_node()
            if self.expand_node(node):
                fail_count = 0
                node_count = node_count + 1
                if (self._autosave_interval > 0
                        and node_count % self._autosave_interval == 0):
                    print("Autosaving...")
                    self.to_files(self._autosave_path + "autosave.graphml",
                                  self._autosave_path + "autosave.json")
            else:
                fail_count = fail_count + 1
                # Put the node back on the queue to try again
                self._state.put_back(self._net, node)
                print("API call failed on node %s. Error %i." %
                      (node, fail_count))
                print(self._net.nodes[node])
                # Exit if we fail too many times
                if fail_count > self._max_fail:
                    return False
        return True

    def is_completed(self):
        """
        Returns True if all nodes at the depth limit have been generated (but not expanded)
        """
        return self._state.is_completed()