def crawlingThread(self):
    """Takes URLs from the urlToVisit queue and visits them."""
    logger.log(logging.DEBUG, "CrawlingThread started")

    self.scrapper = scrapping.Scrapper(self.config.userAgent,
                                       self.config.robotParserEnabled,
                                       self.config.domainRestricted,
                                       self.config.crawling)

    while self.isActive:
        try:
            urlList = protocol.deQueue([self.urlToVisit])

            if not urlList:
                time.sleep(0.2)  # temp - for testing
                continue

            for url in urlList:
                session = self.scrapper.visit(url)
                logger.log(logging.DEBUG,
                           "Session \n" + str(session.url) +
                           "\nCode : " + str(session.returnCode) +
                           "\nRequest time : " + str(session.requestTime) +
                           "\nBs time : " + str(session.bsParsingTime))

                if not session.failed:
                    # In dynamic crawling, also report the URLs scrapped
                    # from the page so the server can queue them.
                    if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                        payload = protocol.URLPayload(
                            session.scrappedURLs,
                            protocol.URLPayload.SCRAPPED_URL)
                        packet = protocol.Packet(protocol.URL, payload)
                        self.outputQueue.put(packet)

                    # Report the URL as visited, along with its session data.
                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.VISITED, session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)
                else:
                    # The visit failed or was refused; report it as skipped.
                    logger.log(logging.INFO, "Skipping URL : " + url)
                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.SKIPPED, session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            message = ''.join(traceback.format_exception(
                exc_type, exc_value, exc_traceback))
            logger.log(logging.CRITICAL, message)
            self.isActive = False
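# For reference, a minimal sketch of the session object that
# self.scrapper.visit(url) is assumed to return, inferred only from the
# attributes read above (url, returnCode, requestTime, bsParsingTime,
# failed, scrappedURLs). The real type lives in the scrapping module and
# may differ; the name, types, and defaults here are illustrative
# assumptions, not the project's actual definition.
from dataclasses import dataclass, field


@dataclass
class VisitSession:  # hypothetical stand-in for scrapping's session type
    url: str
    returnCode: int = 0               # HTTP status code of the request
    requestTime: float = 0.0          # seconds spent fetching the page
    bsParsingTime: float = 0.0        # seconds spent parsing (BeautifulSoup)
    failed: bool = False              # True if the visit errored or was refused
    scrappedURLs: list = field(default_factory=list)  # links found on the page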
def mainRoutine(self):
    """Dispatches the URLs to visit to the clients and terminates the
    server once the crawl is complete."""
    logger.log(logging.INFO, "Starting server mainRoutine")

    # Seed the crawl with the configured root URLs.
    for url in self.configurationPayload.config.rootUrls:
        payload = protocol.URLPayload([str(url)], protocol.URLPayload.TOVISIT)
        packet = protocol.Packet(protocol.URL, payload)
        urlVisited[url] = True
        outputQueue.put(packet)

        if (self.configurationPayload.crawlingType ==
                protocol.ConfigurationPayload.STATIC_CRAWLING and
                self.configurationPayload.config.crawlDelay != 0):
            time.sleep(self.configurationPayload.config.crawlDelay)

    while self.isActive:
        try:
            if (self.configurationPayload.crawlingType ==
                    protocol.ConfigurationPayload.DYNAMIC_CRAWLING):
                # Dynamic crawling: keep dispatching newly discovered URLs
                # until the request limit (if any) is reached.
                url = urlToVisit.get(True)
                payload = protocol.URLPayload([str(url)],
                                              protocol.URLPayload.TOVISIT)
                packet = protocol.Packet(protocol.URL, payload)
                outputQueue.put(packet)
                self.requestCount = self.requestCount + 1

                if self.configurationPayload.config.crawlDelay != 0:
                    time.sleep(self.configurationPayload.config.crawlDelay)

                if (self.requestLimit != 0 and
                        len(visitedURLlist) + 1 > self.requestLimit):
                    break
            elif (self.configurationPayload.crawlingType ==
                    protocol.ConfigurationPayload.STATIC_CRAWLING):
                # Static crawling: only the root URLs are visited; stop once
                # each of them has been either visited or skipped.
                if (len(skippedURLlist) + len(visitedURLlist) ==
                        len(self.configurationPayload.config.rootUrls)):
                    break
                else:
                    time.sleep(0.3)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            message = "\n" + ''.join(traceback.format_exception(
                exc_type, exc_value, exc_traceback))
            logger.log(logging.ERROR, message)

    logger.log(logging.INFO, "Scrapping complete. Terminating...")
    self.disconnectAllClient()
    self.isActive = False
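# For reference, a minimal sketch of the protocol interfaces the two
# routines above rely on, inferred purely from their call sites. The
# actual protocol module may define these differently; the tag values
# and the deQueue drain strategy below are assumptions.
import queue

URL = "URL"  # packet type tag; actual value in protocol is an assumption


class URLPayload:
    # Status tags attached to a list of URLs.
    TOVISIT = "toVisit"        # server -> client: visit these URLs
    VISITED = "visited"        # client -> server: URL was visited
    SKIPPED = "skipped"        # client -> server: URL was skipped
    SCRAPPED_URL = "scrapped"  # client -> server: URLs found on a page

    def __init__(self, urlList, status, session=None):
        self.urlList = urlList  # the URLs this payload describes
        self.status = status    # one of the tags above
        self.session = session  # visit metadata, when available


class Packet:
    def __init__(self, packetType, payload):
        self.type = packetType  # packet type tag, e.g. URL
        self.payload = payload


def deQueue(queues):
    """Non-blocking drain: return every item currently queued, if any."""
    items = []
    for q in queues:
        try:
            while True:
                items.append(q.get_nowait())
        except queue.Empty:
            pass
    return items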