# Example 1
class Crawler:
    """Queue-driven web crawler.

    Pulls URLs from a queue system, extracts page data for URLs not yet
    stored in the database, enqueues any discovered child URLs, persists
    the page data, and records a per-URL processing state in a tracker.
    """

    def __init__(self, queuesystem=None, dbstore=None, urltracker=None,
                 extractor=None):
        """Wire up collaborators.

        All dependencies are injectable (keyword arguments) for testing.
        When omitted, each falls back to the module-level object of the
        same name that the original code relied on, preserving the old
        ``Crawler()`` call.
        """
        import logging  # local import: keeps the block self-contained

        g = globals()
        # NOTE(review): the fallback globals are defined elsewhere in the
        # module (not visible in this chunk) — confirm they exist.
        self.queuesystem = queuesystem if queuesystem is not None else g["queuesystem"]
        self.dbstore = dbstore if dbstore is not None else g["dbstore"]
        self.urltracker = urltracker if urltracker is not None else g["urltracker"]
        self.extractor = extractor if extractor is not None else UrlExtractionSystem()
        # Replaces the previously undefined ``stdLogger`` name.
        self._logger = logging.getLogger(__name__)

    def startcrawler(self, urls):
        """Seed the queue with *urls* and process entries until it is empty.

        For each non-blank URL not already stored in the database: extract
        its page info, enqueue its child URLs, store the page data, and
        mark it DONE_PROCESS. Blank entries are marked NO_URL and skipped.

        Bug fix vs. original: the old ``while True`` loop referenced
        ``url`` before assignment once the queue ran dry (NameError on an
        initially-empty queue, busy-loop otherwise); the loop now simply
        terminates when the queue is exhausted.
        """
        self.queuesystem.storeUrlsToQueue(urls)

        while self.queuesystem.hasNextUrl():
            url = self.queuesystem.nextUrl()

            if url.strip() == "":
                # Blank entry: record it and keep draining the queue.
                self.urltracker.setUrlState(url, self.urltracker.NO_URL)
                continue

            self.urltracker.setUrlState(url, self.urltracker.IN_CRAWLER)

            # Skip URLs that were already persisted, but still mark them
            # DONE_PROCESS below (matches the original control flow).
            if not self.dbstore.findIfUrlStoredInDb(url):
                self.urltracker.setUrlState(url, self.urltracker.IN_PROGRESS)

                self._logger.info("Extraction Started!")
                page_info = self.extractor.getPageInfo(url)
                self._logger.info("Done Extraction!")

                self._logger.info("Queuing Started!")
                self.queuesystem.storeUrlsToQueue(page_info["child_urls"])
                self._logger.info("Queuing Ended!")

                self._logger.info("DBStore Started!")
                self.dbstore.storeUrlDataToDb(page_info)
                self._logger.info("DBStore Ended!")

            self.urltracker.setUrlState(url, self.urltracker.DONE_PROCESS)
            self._logger.info(self.urltracker._trackingstore)
# Example 2 — duplicate __init__ fragment from the class above
 def __init__(self):
     """Wire up the crawler's collaborators.

     Binds the extraction system plus the queue, database store, and
     URL-state tracker used during a crawl.
     """
     # NOTE(review): queuesystem, dbstore and urltracker look like
     # module-level globals defined outside this chunk — confirm.
     self.extractor = UrlExtractionSystem()
     self.urltracker = urltracker
     self.dbstore = dbstore
     self.queuesystem = queuesystem