def crawlingThread(self):
    """Takes URLs from the urlToVisit queue and visits them.

    Runs until self.isActive is cleared. Any unhandled exception is
    logged as CRITICAL and shuts the thread down.
    """
    logger.log(logging.DEBUG, "CrawlingThread started")

    self.scrapper = scrapping.Scrapper(self.config.userAgent,
                                       self.config.robotParserEnabled,
                                       self.config.domainRestricted,
                                       self.config.crawling)

    while self.isActive:
        try:
            urlList = protocol.deQueue([self.urlToVisit])

            if not urlList:
                time.sleep(0.2)  # temp - For testing
                continue

            for url in urlList:
                session = self.scrapper.visit(url)
                logger.log(
                    logging.DEBUG,
                    "Session \n" + str(session.url) +
                    "\nCode : " + str(session.returnCode) +
                    "\nRequest time : " + str(session.requestTime) +
                    "\nBs time : " + str(session.bsParsingTime))

                if not session.failed:
                    # Dynamic crawling also reports the URLs scrapped from
                    # the page so the server can queue them in turn.
                    if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                        payload = protocol.URLPayload(
                            session.scrappedURLs,
                            protocol.URLPayload.SCRAPPED_URL)
                        packet = protocol.Packet(protocol.URL, payload)
                        self.outputQueue.put(packet)

                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.VISITED, session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)
                else:
                    logger.log(logging.INFO, "Skipping URL : " + url)
                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.SKIPPED, session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)
                    continue

        except Exception:
            # Was a bare `except:`, which would also swallow SystemExit and
            # KeyboardInterrupt; Exception is wide enough for a thread boundary.
            logger.log(logging.CRITICAL, traceback.format_exc())
            self.isActive = False
def outputThread(self):
    """Checks if there are messages to send to the client and sends them"""
    while self.isActive:
        # Throttle once more than 5 packets are in flight.
        if self.sentCount > 5:
            time.sleep(0.03)
            continue

        # NOTE(review): `outputQueue` is unqualified here while other
        # routines in this file use `self.outputQueue` — presumably a
        # module-level queue; confirm this is intentional.
        packetToBroadCast = protocol.deQueue([outputQueue])

        # NOTE(review): busy-waits (no sleep) when the queue is empty.
        if not packetToBroadCast:
            continue

        for packet in packetToBroadCast:
            self.writeSocket(packet)
            self.sentCount = self.sentCount+1
            logger.log(logging.DEBUG, self.formattedAddr + "Sending URL " + str(packet.payload.urlList[0]))
def outputThread(self):
    """Forward queued packets to the client over the socket."""
    while self.isActive:
        # Back off briefly whenever too many packets are in flight.
        if self.sentCount > 5:
            time.sleep(0.03)
            continue

        pending = protocol.deQueue([outputQueue])
        if pending:
            for outPacket in pending:
                self.writeSocket(outPacket)
                self.sentCount += 1
                logger.log(
                    logging.DEBUG,
                    self.formattedAddr + "Sending URL " +
                    str(outPacket.payload.urlList[0]))
def storageRoutine(self):
    """Stores session and data.

    Drains sessionStorageQueue and persists each session to disk.
    Errors are logged and the loop keeps running (best-effort storage).
    """
    logger.log(logging.INFO, "Starting server storageRoutine")

    while self.isActive:
        try:
            # NOTE(review): `sessionStorageQueue` is unqualified — presumably
            # a module-level queue; confirm against the rest of the module.
            sessions = protocol.deQueue([sessionStorageQueue])

            if not sessions:
                continue

            for session in sessions:
                storage.writeToFile(session, session.dataContainer)

        except Exception:
            # Was a bare `except:`; Exception still keeps the thread alive on
            # storage errors without masking SystemExit/KeyboardInterrupt.
            message = "\n" + traceback.format_exc()
            logger.log(logging.ERROR, message)
def storageRoutine(self):
    """Persist incoming sessions and their data containers to disk."""
    logger.log(logging.INFO, "Starting server storageRoutine")
    while self.isActive:
        try:
            batch = protocol.deQueue([sessionStorageQueue])
            if not batch:
                continue
            for storedSession in batch:
                storage.writeToFile(storedSession,
                                    storedSession.dataContainer)
        except:
            # Log the full traceback but keep the storage loop alive.
            excType, excValue, excTrace = sys.exc_info()
            message = "\n" + ''.join(
                traceback.format_exception(excType, excValue, excTrace))
            logger.log(logging.ERROR, message)
def crawlingThread(self):
    """Pull URLs from the urlToVisit queue and scrape each one."""
    logger.log(logging.DEBUG, "CrawlingThread started")

    self.scrapper = scrapping.Scrapper(
        self.config.userAgent,
        self.config.robotParserEnabled,
        self.config.domainRestricted,
        self.config.crawling)

    while self.isActive:
        try:
            pendingUrls = protocol.deQueue([self.urlToVisit])
            if not pendingUrls:
                time.sleep(0.2)  # temp - For testing
                continue

            for url in pendingUrls:
                session = self.scrapper.visit(url)
                sessionReport = ("Session \n" + str(session.url) +
                                 "\nCode : " + str(session.returnCode) +
                                 "\nRequest time : " + str(session.requestTime) +
                                 "\nBs time : " + str(session.bsParsingTime))
                logger.log(logging.DEBUG, sessionReport)

                # Failed visits are reported back as SKIPPED.
                if session.failed:
                    logger.log(logging.INFO, "Skipping URL : " + url)
                    skippedPayload = protocol.URLPayload(
                        [url], protocol.URLPayload.SKIPPED, session)
                    self.outputQueue.put(
                        protocol.Packet(protocol.URL, skippedPayload))
                    continue

                # Dynamic crawling also reports the freshly scrapped URLs.
                if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                    scrappedPayload = protocol.URLPayload(
                        session.scrappedURLs, protocol.URLPayload.SCRAPPED_URL)
                    self.outputQueue.put(
                        protocol.Packet(protocol.URL, scrappedPayload))

                visitedPayload = protocol.URLPayload(
                    [url], protocol.URLPayload.VISITED, session=session)
                self.outputQueue.put(
                    protocol.Packet(protocol.URL, visitedPayload))

        except:
            # Fatal: log the traceback and shut the thread down.
            excType, excValue, excTrace = sys.exc_info()
            logger.log(logging.CRITICAL, ''.join(
                traceback.format_exception(excType, excValue, excTrace)))
            self.isActive = False
def interpretingThread(self):
    """Interprets message from the server other than type URL. (ie: INFO)

    Drains infoQueue and logs INFO packets; any unhandled exception is
    logged as CRITICAL and stops the thread.
    """
    logger.log(logging.DEBUG, "InterpretingThread started")

    while self.isActive:
        try:
            time.sleep(0.01)  # temp - For testing
            packets = protocol.deQueue([self.infoQueue])
            if not packets:
                continue

            for packet in packets:
                if packet.type == protocol.INFO:
                    logger.log(
                        logging.INFO,
                        "Interpreting INFO packet : " +
                        str(packet.payload.urlList))

        except Exception:
            # Was a bare `except:` — narrow so SystemExit/KeyboardInterrupt
            # still propagate.
            logger.log(logging.CRITICAL, traceback.format_exc())
            self.isActive = False
def interpretingThread(self):
    """Logs URL packets pulled from the urlToVisit queue.

    NOTE(review): this variant only logs the visit; the forward to
    outputQueue was disabled in the original and stays disabled here.
    """
    logger.log(logging.DEBUG, "InterpretingThread started")

    while self.isActive:
        try:
            time.sleep(0.01)  # temp - For testing
            packets = protocol.deQueue([self.urlToVisit])
            if not packets:
                continue

            for packet in packets:
                # `==` instead of `is`: identity comparison only works by
                # accident for interned constants.
                if packet.type == protocol.URL:
                    # visiting site
                    logger.log(logging.INFO,
                               "Visiting site : " + str(packet.payload.urlList))

        except Exception:
            # Was a bare `except:`; keep shutdown behavior but let
            # SystemExit/KeyboardInterrupt propagate.
            logger.log(logging.CRITICAL, traceback.format_exc())
            self.isActive = False
def storageRoutine(self):
    """Stores session and data.

    Opens a PostgreSQL connection, then drains sessionStorageQueue and
    inserts each successfully-crawled URL into the `url` table. The
    connection is closed when the routine shuts down.
    """
    logger.log(logging.INFO, "Starting server storageRoutine")

    connection = None
    cursor = None
    try:
        connection = psycopg2.connect(user="******",
                                      password="******",
                                      host="localhost",
                                      port="5432",
                                      database="crawler2")
        cursor = connection.cursor()

        # Print PostgreSQL Connection properties
        print(connection.get_dsn_parameters(), "\n")

        # Print PostgreSQL version
        cursor.execute("SELECT version();")
        record = cursor.fetchone()
        print("You are connected to - ", record, "\n")
    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL", error)

    while self.isActive:
        try:
            sessions = protocol.deQueue([sessionStorageQueue])
            if not sessions:
                continue

            for session in sessions:
                try:
                    if not session.failed:
                        try:
                            # Query parameters must be a sequence — passing
                            # `session.url` bare hands psycopg2 a plain
                            # string, which is rejected.
                            cursor.execute("INSERT INTO url VALUES (%s)",
                                           (session.url,))
                            # Commit, otherwise the INSERT is lost when the
                            # connection closes.
                            connection.commit()
                        except (Exception, psycopg2.DatabaseError) as error:
                            print(error)
                        print("ez")
                    else:
                        print("hola")
                except Exception:
                    logger.log(logging.ERROR,
                               "Unhandled exception in storage.py")
        except Exception:
            # Was a bare `except:`; log the traceback, keep the loop alive.
            message = "\n" + ''.join(
                traceback.format_exception(*sys.exc_info()))
            logger.log(logging.ERROR, message)

    # Closing database connection once the routine stops.
    if cursor:
        cursor.close()
    if connection:
        connection.close()