import sys

# FetchingContext, StatsStatusCode, HttpResponse, ParseHtmlAnchors and the
# exceptions referenced below are provided elsewhere in the package.


class Crawler(object):

    def __init__(self, url_provider, storage_server=None):
        self.url_provider = url_provider
        self.storage_server = storage_server
        self.urls_pool = []  # pool of URLs waiting to be fetched
        self.fetching_context = FetchingContext()
        self.stats = StatsStatusCode()

    def push(self, url):
        self.urls_pool.append(url)

    def start(self):
        self.fetching = True
        while self.fetching:
            if not self.urls_pool:
                # TODO Load urls from the stored filesystem
                try:
                    # Ask the url server for more urls
                    anchors = self.url_provider.choose()
                except NoContentError:
                    anchors = list()
                for uri in anchors:
                    self.urls_pool.append(uri)
                del anchors

            # If there still aren't any urls, there's nothing to crawl/fetch
            if not self.urls_pool:
                break

            if len(self.urls_pool) > 0:
                try:
                    anchor = self.urls_pool.pop(0)
                    self.fetching_context.fetch_page(anchor)
                except FullOfWork:
                    # The fetcher is busy; hand the url back to the provider
                    self.url_provider.create(anchor)
                except Unreachable:
                    pass
                except NotSupported:
                    pass
                del anchor

            received_data = self.fetching_context.received_data()
            for data in received_data:
                response = self.url_provider.fetched(data[0])
                # If the URL was already fetched, don't process the page data
                if response.status_code == 304:
                    continue

                # Wrap the raw data in an HttpResponse so it can be processed
                response = HttpResponse(data[1])
                sys.stdout.write("{0}: {1}\n".format(data[0], response.status_code))
                self.stats.compute(response.status_code)

                # TODO Process response header (e.g. Location field, status code)
                parser = ParseHtmlAnchors(response.content, data[0])
                anchors = parser.anchors
                for anchor in anchors:
                    self.url_provider.create(anchor)

                # TODO Then, after all the data has been cleaned, send it to the data analysis
                del response, parser, anchors

            # TODO Send analysed data to the storage server
            del received_data

        return self.stop()

    def stop(self):
        self.fetching = False
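

# A minimal usage sketch, assuming the provider handed to Crawler only needs
# the choose()/create()/fetched() methods exercised above. InMemoryUrlProvider
# and the seed URL are hypothetical illustrations, not part of the project API.
class InMemoryUrlProvider(object):
    """Toy provider that keeps discovered URLs in a plain in-memory list."""

    class _Marker(object):
        # The crawler only inspects .status_code on the object returned by
        # fetched(); anything other than 304 lets the page data be processed.
        status_code = 200

    def __init__(self):
        self.known = []

    def choose(self):
        # Hand back everything discovered so far, or signal that we are empty
        if not self.known:
            raise NoContentError()
        batch, self.known = self.known, []
        return batch

    def create(self, anchor):
        self.known.append(anchor)

    def fetched(self, url):
        return self._Marker()


if __name__ == "__main__":
    crawler = Crawler(InMemoryUrlProvider())
    crawler.push("http://example.com/")  # illustrative seed URL
    crawler.start()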