Example #1
0
def main():
    """Build the crawler components, fork, and run the event loop.

    A frontier seeded with one URL and a robot with a single catch-all
    route are created before ``fork_processes(0)`` is called. The process
    whose returned id equals ``master_id()`` starts a ``Master`` over the
    frontier; every other process starts a ``Worker`` around the robot.
    Each process then starts the ``IOLoop`` instance.
    """
    seeds = [
        ('http://m.sohu.com/', 1),
    ]
    ft = frontier.Frontier(seeds)

    http_fetcher = fetcher.HTTPFetcher()
    routes = [
        ('/.*', AHandler),
    ]
    rb = robot.Robot(http_fetcher, routes)

    # Named process_id rather than `id` so the builtin is not shadowed.
    process_id = fork_processes(0)
    is_master = process_id == master_id()

    # Only the selected role object is constructed, as in an if/else.
    role = master.Master(ft) if is_master else worker.Worker(rb)
    role.start()

    IOLoop.instance().start()
Example #2
0
    def __init__(self, init_url):
        """Initialise crawl state starting from ``init_url``.

        Stores the start URL and the network location parsed from it,
        creates the frontier and adds the start URL to it, then sets the
        deadlink map, URL patterns, option flags and progress counters to
        their initial values.
        """
        self.init_url = init_url
        parsed_start = urlparse.urlparse(init_url)
        self.init_domain = parsed_start.netloc

        # The frontier manages the domains we want to visit or have
        # visited; the start URL is added with None as second argument.
        self.frontier = frontier.Frontier()
        self.frontier.add(init_url, None)

        # Maps each URL to the list of deadlinks found for it,
        # i.e. url1: [deadlink1, deadlink2]
        self.deadlinks = {}

        # Regular expression for the (internal) URLs we are interested in;
        # None initially.
        self._url_match = None

        # NOTE(review): presumably a regular expression for URLs to
        # exclude (judging by the name) -- confirm against its users.
        self._exclude = None

        # Seconds to wait, so that we do not kill our server
        self._wait_time = 0

        # Output switches: verbose on, debug off
        self._verbose = True
        self._debug = False

        # Whether 40x http codes are reported as deadlinks
        self._report40x = False

        # Progress-reporting counters, all starting at zero
        self._pages = self._links = self._via = self._dead = 0
Example #3
0
import sieve as sv
import frontier as fr
import url_filter as uf
import web_graph as wg

if __name__ == '__main__':

    # Crawl parameters
    sieve_limit = 10  # max number of urls in the sieve
    host_politeness = 60.0  # seconds to wait before visiting the host again (float)
    requests_number = 5  # max number of urls, associated with a host, to visit
    number_of_threads = 4  # max number of active threads

    # Seed: loaded from 'seed.txt', then passed through the url filter.
    # NOTE(review): `sl` is not among the imports visible at the top of this
    # script -- confirm it is imported, otherwise this raises NameError.
    seed_urls = sl.load('seed.txt')
    seed = uf.fs_url_filter(seed_urls)

    # Web graph instance; its metrics are printed after the crawl.
    web_graph = wg.Web_graph()

    # Frontier built from the sieve, the parameters and the web graph.
    sieve = sv.Sieve(seed, sieve_limit)
    frontier = fr.Frontier(sieve, host_politeness, requests_number, web_graph)

    # Run the crawl
    print('Crawler Execution\n')
    frontier.execute(number_of_threads)

    # Centrality measures
    print('\nCentralities:\n')
    web_graph.print_metrics()
Example #4
0
 def __init__(self):
     """Create an empty frontier and explored set, with a zero node count."""
     # Plain counter first; it does not depend on the two containers.
     self.numnodes = 0
     # Container objects, constructed in their original relative order.
     self.frontier = frontier.Frontier()
     self.exset = exploredset.ExploredSet()