def main():
    """Wire the crawler components together and start the event loop.

    Builds a frontier seeded with one entry and a robot backed by an HTTP
    fetcher with a single catch-all route, then forks. The process whose id
    equals ``master_id()`` starts a ``Master`` over the frontier; any other
    process starts a ``Worker`` over the robot. Finally the ``IOLoop``
    instance is started.
    """
    # NOTE(review): the meaning of the second tuple element (1) is not
    # visible from here -- confirm against frontier.Frontier.
    seed_frontier = frontier.Frontier([
        ('http://m.sohu.com/', 1),
    ])

    page_fetcher = fetcher.HTTPFetcher()
    # A single route: every path matching '/.*' is handled by AHandler.
    crawl_robot = robot.Robot(page_fetcher, [
        ('/.*', AHandler),
    ])

    # Named process_id rather than id so the builtin id() is not shadowed.
    process_id = fork_processes(0)
    role = (
        master.Master(seed_frontier)
        if process_id == master_id()
        else worker.Worker(crawl_robot)
    )
    role.start()

    IOLoop.instance().start()
def __init__(self, init_url):
    """Set up crawl state starting from ``init_url``.

    Records the start URL and its network location, creates the frontier
    and adds the start URL to it, and resets every option and progress
    counter to its initial value.
    """
    self.init_url = init_url
    parsed_start = urlparse.urlparse(init_url)
    self.init_domain = parsed_start.netloc

    # The frontier tracks the URLs we still want to visit or have visited;
    # it starts out holding only the initial URL.
    self.frontier = frontier.Frontier()
    self.frontier.add(init_url, None)

    # Dead links found per URL, e.g. {url1: [deadlink1, deadlink2]}.
    self.deadlinks = {}

    # Regular expression for the URLs we are interested in (our internal
    # URLs); unset until configured.
    self._url_match = None
    # NOTE(review): presumably a pattern for URLs to skip -- confirm
    # against the code that reads it.
    self._exclude = None

    # Seconds to wait, so that we do not overload the server being crawled.
    self._wait_time = 0

    # Output switches: verbose output on, debug output off.
    self._verbose = True
    self._debug = False

    # Whether 40x HTTP status codes are reported as dead links.
    self._report40x = False

    # Progress-reporting counters, all starting at zero.
    self._pages = self._links = self._via = self._dead = 0
import sieve as sv
import frontier as fr
import url_filter as uf
import web_graph as wg

if __name__ == '__main__':
    # --- Crawl parameters -------------------------------------------------
    number_of_threads = 4    # upper bound on concurrently active threads
    requests_number = 5      # upper bound on urls visited per host
    host_politeness = 60.0   # seconds (float) before revisiting a host
    sieve_limit = 10         # upper bound on urls held in the sieve

    # --- Data structures --------------------------------------------------
    # NOTE(review): `sl` is not imported in the lines visible here --
    # confirm that it is brought into scope elsewhere in the file.
    seed_urls = sl.load('seed.txt')
    seed = uf.fs_url_filter(seed_urls)  # filtered seed loaded from file

    # Web graph shared with the frontier; used afterwards for centralities.
    web_graph = wg.Web_graph()

    sieve = sv.Sieve(seed, sieve_limit)
    frontier = fr.Frontier(sieve, host_politeness, requests_number, web_graph)

    # --- Execution --------------------------------------------------------
    print('Crawler Execution\n')
    frontier.execute(number_of_threads)

    # --- Centrality measures ----------------------------------------------
    print('\nCentralities:\n')
    web_graph.print_metrics()
def __init__(self):
    """Create an empty frontier and explored set and zero the node count."""
    # Fresh frontier with no entries supplied.
    self.frontier = frontier.Frontier()
    # NOTE(review): presumably records the items already explored --
    # confirm against exploredset.ExploredSet.
    self.exset = exploredset.ExploredSet()
    # Node counter, starting at zero.
    self.numnodes = 0