class Crawler:
    """Drives the crawl loop: pulls URLs from the queue, extracts page data,
    enqueues discovered child URLs, and persists results to the DB store.

    Collaborators (`queuesystem`, `dbstore`, `urltracker`) are taken from
    module-level globals; `UrlExtractionSystem` performs the page fetch/parse.
    """

    def __init__(self):
        # NOTE(review): these names must exist as module-level globals before
        # Crawler() is constructed — confirm initialization order at import time.
        self.queuesystem = queuesystem
        self.dbstore = dbstore
        self.urltracker = urltracker
        self.extractor = UrlExtractionSystem()

    def startcrawler(self, urls):
        """Seed the queue with *urls* and process entries until the queue drains.

        For each non-blank URL not already stored in the DB: extract page info,
        enqueue its child URLs, store the page data, and mark it DONE_PROCESS.
        Blank URLs and already-stored URLs are marked NO_URL and skipped.

        Fix: the original `while True:` had no exit path when the queue was
        empty, so it busy-spun forever on `hasNextUrl()`; the loop now
        terminates (and this method returns) once the queue is exhausted.
        """
        self.queuesystem.storeUrlsToQueue(urls)
        while self.queuesystem.hasNextUrl():
            url = self.queuesystem.nextUrl()
            if url.strip() == "":
                # Blank/whitespace-only entry: record it and move on.
                self.urltracker.setUrlState(url, self.urltracker.NO_URL)
                continue
            self.urltracker.setUrlState(url, self.urltracker.IN_CRAWLER)
            if self.dbstore.findIfUrlStoredInDb(url):
                # NOTE(review): original code marks an already-stored URL as
                # NO_URL — looks like it may have meant a "duplicate/done"
                # state; behavior preserved, confirm intent.
                self.urltracker.setUrlState(url, self.urltracker.NO_URL)
                continue
            self.urltracker.setUrlState(url, self.urltracker.IN_PROGRESS)
            stdLogger.info("Extraction Started!")
            page_info = self.extractor.getPageInfo(url)
            stdLogger.info("Done Extraction!")
            stdLogger.info("Queuing Started!")
            # Newly discovered links go back into the queue for later passes.
            self.queuesystem.storeUrlsToQueue(page_info["child_urls"])
            stdLogger.info("Queuing Ended!")
            stdLogger.info("DBStore Started!")
            self.dbstore.storeUrlDataToDb(page_info)
            stdLogger.info("DBStore Ended!")
            self.urltracker.setUrlState(url, self.urltracker.DONE_PROCESS)
            # Dumps the tracker's internal store; reaches into a private
            # attribute — presumably for debugging only.
            stdLogger.info(self.urltracker._trackingstore)
# NOTE(review): this top-level `__init__` is a byte-for-byte duplicate of
# Crawler.__init__ above and is never attached to any class — it looks like an
# accidental paste and is almost certainly dead code. Kept (formatted) rather
# than deleted; confirm and remove.
def __init__(self):
    """Duplicate of Crawler.__init__: binds module-level collaborators."""
    self.queuesystem = queuesystem
    self.dbstore = dbstore
    self.urltracker = urltracker
    self.extractor = UrlExtractionSystem()