Example #1
0
    def crawlingThread(self):
        """Visits each URL pulled from the urlToVisit queue and reports the result.

        A Scrapper is built from the configuration and kept on
        ``self.scrapper``. While ``self.isActive`` is truthy the queue is
        polled; every dequeued URL is visited and the outcome is pushed to
        ``self.outputQueue`` as a URL packet:

        * failed session: one SKIPPED payload carrying the session;
        * successful session: a SCRAPPED_URL payload holding the scrapped
          URLs (dynamic crawling only), then a VISITED payload carrying the
          session.

        Any exception is logged at CRITICAL level with its traceback and
        stops the loop by clearing ``self.isActive``.
        """
        logger.log(logging.DEBUG, "CrawlingThread started")

        config = self.config
        self.scrapper = scrapping.Scrapper(
            config.userAgent, config.robotParserEnabled,
            config.domainRestricted, config.crawling)

        while self.isActive:
            try:
                pendingUrls = protocol.deQueue([self.urlToVisit])

                if not pendingUrls:
                    # Nothing to visit yet: wait a little before polling again.
                    time.sleep(0.2)  #temp - For testing
                    continue

                for url in pendingUrls:
                    session = self.scrapper.visit(url)
                    summary = "".join([
                        "Session \n", str(session.url),
                        "\nCode : ", str(session.returnCode),
                        "\nRequest time : ", str(session.requestTime),
                        "\nBs time : ", str(session.bsParsingTime),
                    ])
                    logger.log(logging.DEBUG, summary)

                    if session.failed:
                        # Report the URL as skipped and move on to the next one.
                        logger.log(logging.INFO, "Skipping URL : " + url)
                        skipped = protocol.URLPayload(
                            [url], protocol.URLPayload.SKIPPED, session)
                        self.outputQueue.put(
                            protocol.Packet(protocol.URL, skipped))
                        continue

                    if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                        # Dynamic crawling also sends back the URLs found
                        # on the visited page.
                        scrapped = protocol.URLPayload(
                            session.scrappedURLs,
                            protocol.URLPayload.SCRAPPED_URL)
                        self.outputQueue.put(
                            protocol.Packet(protocol.URL, scrapped))

                    visited = protocol.URLPayload(
                        [url], protocol.URLPayload.VISITED, session=session)
                    self.outputQueue.put(
                        protocol.Packet(protocol.URL, visited))

            except:
                # Outermost boundary of the thread: report the failure and stop.
                logger.log(logging.CRITICAL, traceback.format_exc())
                self.isActive = False
Example #2
0
    def outputThread(self):
        """Sends the packets waiting in the output queue to the client.

        Runs while ``self.isActive`` is truthy. When ``self.sentCount`` is
        above 5 the loop sleeps 30 ms and dequeues nothing. Otherwise every
        dequeued packet is written to the socket, ``self.sentCount`` is
        incremented and the first URL of the packet is logged at DEBUG level.
        """
        while self.isActive:
            throttled = self.sentCount > 5
            if throttled:
                time.sleep(0.03)
                continue

            pending = protocol.deQueue([outputQueue])
            if pending:
                for packet in pending:
                    self.writeSocket(packet)
                    self.sentCount += 1
                    prefix = self.formattedAddr + "Sending URL "
                    logger.log(logging.DEBUG,
                               prefix + str(packet.payload.urlList[0]))
Example #3
0
    def outputThread(self):
        """Forwards queued packets to the client until the handler stops.

        Each pass of the loop either sleeps 30 ms (when ``self.sentCount``
        exceeds 5) or dequeues from the output queue and, for every packet
        obtained, writes it to the socket, bumps ``self.sentCount`` by one
        and logs the packet's first URL at DEBUG level.
        """
        while self.isActive:
            if not self.sentCount > 5:
                batch = protocol.deQueue([outputQueue])

                if not batch:
                    continue

                for outgoing in batch:
                    self.writeSocket(outgoing)
                    self.sentCount = 1 + self.sentCount
                    message = (self.formattedAddr + "Sending URL " +
                               str(outgoing.payload.urlList[0]))
                    logger.log(logging.DEBUG, message)
            else:
                # Too many packets counted as sent: wait before sending more.
                time.sleep(0.03)
Example #4
0
    def storageRoutine(self):
        """Writes every session taken from the session storage queue to file.

        Runs while ``self.isActive`` is truthy. Each dequeued session is
        handed to ``storage.writeToFile`` together with its data container.
        An exception is logged at ERROR level with its traceback; the loop
        then carries on.
        """
        logger.log(logging.INFO, "Starting server storageRoutine")

        while self.isActive:
            try:
                pending = protocol.deQueue([sessionStorageQueue])

                if pending:
                    for session in pending:
                        storage.writeToFile(session, session.dataContainer)
            except:
                # Keep the routine alive: report the failure and poll again.
                logger.log(logging.ERROR, "\n" + traceback.format_exc())
    def storageRoutine(self):
        """Persists queued sessions through ``storage.writeToFile``.

        While ``self.isActive`` is truthy, sessions are dequeued from the
        session storage queue and written one by one along with their
        ``dataContainer``. Errors do not end the loop: they are logged at
        ERROR level, traceback included.
        """
        logger.log(logging.INFO, "Starting server storageRoutine")

        while self.isActive:
            try:
                dequeued = protocol.deQueue([sessionStorageQueue])

                if not dequeued:
                    continue

                for item in dequeued:
                    storage.writeToFile(item, item.dataContainer)
            except:
                details = traceback.format_exception(*sys.exc_info())
                message = "\n" + ''.join(details)
                logger.log(logging.ERROR, message)
Example #6
0
    def crawlingThread(self):
        """Crawls the URLs queued in urlToVisit and queues a report for each.

        Creates the Scrapper from the configuration (stored on
        ``self.scrapper``), then loops while ``self.isActive`` is truthy.
        When the queue yields nothing the thread sleeps 0.2 s. For every URL
        obtained, the visit result is logged at DEBUG level and sent through
        ``self.outputQueue``: SKIPPED for a failed session; otherwise VISITED,
        preceded by SCRAPPED_URL when crawling is dynamic.

        An exception is logged at CRITICAL level with its traceback and
        ends the thread by setting ``self.isActive`` to False.
        """
        logger.log(logging.DEBUG, "CrawlingThread started")

        self.scrapper = scrapping.Scrapper(
            self.config.userAgent,
            self.config.robotParserEnabled,
            self.config.domainRestricted,
            self.config.crawling)

        def send(payload):
            # Wrap the payload in a URL packet and queue it for output.
            self.outputQueue.put(protocol.Packet(protocol.URL, payload))

        while self.isActive:
            try:
                toVisit = protocol.deQueue([self.urlToVisit])

                if toVisit:
                    for url in toVisit:
                        session = self.scrapper.visit(url)
                        report = ("Session \n" + str(session.url) +
                                  "\nCode : " + str(session.returnCode) +
                                  "\nRequest time : " + str(session.requestTime) +
                                  "\nBs time : " + str(session.bsParsingTime))
                        logger.log(logging.DEBUG, report)

                        if session.failed:
                            logger.log(logging.INFO, "Skipping URL : " + url)
                            send(protocol.URLPayload(
                                [url], protocol.URLPayload.SKIPPED, session))
                        else:
                            if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                                send(protocol.URLPayload(
                                    session.scrappedURLs,
                                    protocol.URLPayload.SCRAPPED_URL))

                            send(protocol.URLPayload(
                                [url], protocol.URLPayload.VISITED,
                                session=session))
                else:
                    time.sleep(0.2) #temp - For testing

            except:
                failure = traceback.format_exception(*sys.exc_info())
                logger.log(logging.CRITICAL, ''.join(failure))
                self.isActive = False
Example #7
0
    def interpretingThread(self):
        """Logs the INFO packets received from the server.

        While ``self.isActive`` is truthy the thread sleeps 10 ms, dequeues
        from ``self.infoQueue`` and logs the URL list of every packet whose
        type equals ``protocol.INFO``; packets of another type are ignored.
        An exception is logged at CRITICAL level with its traceback and
        stops the loop.
        """
        logger.log(logging.DEBUG, "InterpretingThread started")

        while self.isActive:
            try:
                time.sleep(0.01) #temp - For testing
                received = protocol.deQueue([self.infoQueue])

                if not received:
                    continue

                # Lazy filter: packets are still examined and logged one by one.
                infoPackets = (p for p in received if p.type == protocol.INFO)
                for packet in infoPackets:
                    logger.log(logging.INFO, "Interpreting INFO packet : " + str(packet.payload.urlList))
            except:
                logger.log(logging.CRITICAL, traceback.format_exc())
                self.isActive = False
Example #8
0
    def interpretingThread(self):
        """Handles server messages other than URLs by logging INFO packets.

        Loops while ``self.isActive`` is truthy: waits 10 ms, takes whatever
        is available from ``self.infoQueue`` and, for each packet of type
        ``protocol.INFO``, logs its URL list at INFO level. If anything
        raises, the traceback is logged at CRITICAL level and
        ``self.isActive`` is set to False.
        """
        logger.log(logging.DEBUG, "InterpretingThread started")

        while self.isActive:
            try:
                time.sleep(0.01) #temp - For testing
                incoming = protocol.deQueue([self.infoQueue])

                if incoming:
                    for message in incoming:
                        if message.type == protocol.INFO:
                            text = "Interpreting INFO packet : " + str(message.payload.urlList)
                            logger.log(logging.INFO, text)
            except:
                errorType, errorValue, errorTrace = sys.exc_info()
                lines = traceback.format_exception(errorType, errorValue, errorTrace)
                logger.log(logging.CRITICAL, ''.join(lines))
                self.isActive = False
Example #9
0
    def interpretingThread(self):
        """Logs the URL packets taken from the urlToVisit queue.

        While ``self.isActive`` is truthy the thread sleeps 10 ms, dequeues
        from ``self.urlToVisit`` and logs the URL list of every packet whose
        type equals ``protocol.URL``. Nothing is visited or forwarded here.
        Any exception is logged at CRITICAL level with its traceback and
        stops the loop by clearing ``self.isActive``.
        """
        logger.log(logging.DEBUG, "InterpretingThread started")

        while self.isActive:
            try:
                time.sleep(0.01) #temp - For testing
                # Only urlToVisit is drained; an earlier variant also read
                # self.infoQueue.
                packets = protocol.deQueue([self.urlToVisit])

                if not packets:
                    continue

                for packet in packets:
                    # Compare by value: an identity test against a constant
                    # only works when both sides happen to be the same object.
                    if packet.type == protocol.URL:
                        logger.log(logging.INFO, "Visiting site : " + str(packet.payload.urlList))
            except:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                message = ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
                logger.log(logging.CRITICAL, message)
                self.isActive = False
Example #10
0
    def storageRoutine(self):
        """Stores the URL of every successfully crawled session in PostgreSQL.

        Opens one connection to the local ``crawler2`` database, then drains
        ``sessionStorageQueue`` while ``self.isActive`` is truthy. For each
        session that did not fail, its URL is inserted into the ``url`` table
        and committed; a failed insert is printed and rolled back so that
        later inserts are not rejected. Failed sessions are not stored. The
        cursor and the connection are closed when the loop ends.

        If the connection cannot be opened the error is printed and the
        queue is still drained, but nothing is stored.
        """
        logger.log(logging.INFO, "Starting server storageRoutine")

        # Bound up front so that neither the loop nor the cleanup below hits
        # a NameError when connecting fails.
        connection = None
        cursor = None
        try:
            # NOTE(review): credentials are hard-coded here; consider moving
            # them to the configuration.
            connection = psycopg2.connect(user="******",
                                          password="******",
                                          host="localhost",
                                          port="5432",
                                          database="crawler2")
            cursor = connection.cursor()
            # Print PostgreSQL Connection properties
            print(connection.get_dsn_parameters(), "\n")
            # Print PostgreSQL version
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            print("You are connected to - ", record, "\n")
        except Exception as error:
            print("Error while connecting to PostgreSQL", error)

        try:
            while self.isActive:
                try:
                    sessions = protocol.deQueue([sessionStorageQueue])

                    if not sessions:
                        continue

                    for session in sessions:
                        try:
                            if not session.failed:
                                if cursor is None:
                                    logger.log(
                                        logging.ERROR,
                                        "No PostgreSQL connection, URL not stored : " +
                                        str(session.url))
                                else:
                                    try:
                                        # Query parameters must be a sequence:
                                        # a bare string would be read as one
                                        # parameter per character.
                                        cursor.execute(
                                            "INSERT INTO url VALUES (%s)",
                                            (session.url,))
                                        # Nothing is persisted until the
                                        # transaction is committed.
                                        connection.commit()
                                    except Exception as error:
                                        print(error)
                                        # Leave the aborted transaction so the
                                        # next insert is not refused.
                                        connection.rollback()
                                print("ez")
                            else:
                                print("hola")
                        except Exception:
                            logger.log(logging.ERROR,
                                       "Unhandled exception in storage.py")

                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    message = "\n" + ''.join(
                        traceback.format_exception(exc_type, exc_value,
                                                   exc_traceback))
                    logger.log(logging.ERROR, message)
        finally:
            # Release the database resources however the loop ended.
            try:
                if cursor is not None:
                    cursor.close()
            finally:
                if connection is not None:
                    connection.close()