Exemple #1
0
    def crawlingThread(self):
        """Visit the URLs taken from the urlToVisit queue and report results.

        A Scrapper is built from the current configuration, then the loop
        runs for as long as ``self.isActive`` is true. Each successfully
        visited URL produces a VISITED packet on ``self.outputQueue``
        (preceded, in dynamic crawling mode, by a SCRAPPED_URL packet holding
        the URLs found on the page). A failed session produces a SKIPPED
        packet instead.

        Any exception raised while processing is logged as CRITICAL and
        stops the loop by clearing ``self.isActive``.
        """
        logger.log(logging.DEBUG, "CrawlingThread started")

        self.scrapper = scrapping.Scrapper(self.config.userAgent,
                                           self.config.robotParserEnabled,
                                           self.config.domainRestricted,
                                           self.config.crawling)

        while self.isActive:
            try:
                urlList = protocol.deQueue([self.urlToVisit])

                if not urlList:
                    # Nothing queued yet: back off briefly instead of spinning.
                    time.sleep(0.2)  # temp - For testing
                    continue

                for url in urlList:
                    session = self.scrapper.visit(url)
                    logger.log(
                        logging.DEBUG, "Session \n" + str(session.url) +
                        "\nCode : " + str(session.returnCode) +
                        "\nRequest time : " + str(session.requestTime) +
                        "\nBs time : " + str(session.bsParsingTime))

                    if session.failed:
                        logger.log(logging.INFO, "Skipping URL : " + url)
                        payload = protocol.URLPayload(
                            [url], protocol.URLPayload.SKIPPED, session)
                    else:
                        if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                            # Dynamic mode: hand the discovered links back
                            # before reporting the visit itself.
                            scrappedPayload = protocol.URLPayload(
                                session.scrappedURLs,
                                protocol.URLPayload.SCRAPPED_URL)
                            self.outputQueue.put(
                                protocol.Packet(protocol.URL, scrappedPayload))

                        payload = protocol.URLPayload(
                            [url],
                            protocol.URLPayload.VISITED,
                            session=session)

                    # Exactly one VISITED or SKIPPED packet per URL.
                    self.outputQueue.put(
                        protocol.Packet(protocol.URL, payload))

            except Exception:
                # Only real errors are handled here; SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                logger.log(logging.CRITICAL, traceback.format_exc())
                self.isActive = False
Exemple #2
0
    def mainRoutine(self):
        """Dispatch URLs to the output queue until the crawl is finished.

        The root URLs from the configuration are queued first (marking each
        one in ``urlVisited``). Then, while ``self.isActive`` is true:

        * dynamic crawling: URLs are taken from ``urlToVisit`` (blocking) and
          forwarded as TOVISIT packets, until ``self.requestLimit`` is
          non-zero and the number of visited URLs plus one exceeds it;
        * static crawling: the routine polls until the number of skipped
          plus visited URLs equals the number of root URLs.

        Errors raised inside the loop are logged as ERROR and the loop
        continues. On exit all clients are disconnected and
        ``self.isActive`` is cleared.
        """
        logger.log(logging.INFO, "Starting server mainRoutine")

        for url in self.configurationPayload.config.rootUrls:
            payload = protocol.URLPayload([str(url)],
                                          protocol.URLPayload.TOVISIT)
            packet = protocol.Packet(protocol.URL, payload)
            urlVisited[url] = True
            outputQueue.put(packet)

            # In static mode the crawl delay is applied between root URLs.
            if (self.configurationPayload.crawlingType == protocol.ConfigurationPayload.STATIC_CRAWLING
                    and self.configurationPayload.config.crawlDelay != 0):
                time.sleep(self.configurationPayload.config.crawlDelay)

        while self.isActive:
            try:
                if self.configurationPayload.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                    url = urlToVisit.get(True)
                    payload = protocol.URLPayload([str(url)],
                                                  protocol.URLPayload.TOVISIT)
                    packet = protocol.Packet(protocol.URL, payload)
                    outputQueue.put(packet)
                    self.requestCount = self.requestCount + 1

                    if self.configurationPayload.config.crawlDelay != 0:
                        time.sleep(self.configurationPayload.config.crawlDelay)

                    # A requestLimit of 0 means "no limit".
                    if self.requestLimit != 0 and len(
                            visitedURLlist) + 1 > self.requestLimit:
                        break

                elif self.configurationPayload.crawlingType == protocol.ConfigurationPayload.STATIC_CRAWLING:
                    # Count without building a throwaway concatenated list.
                    processedCount = len(skippedURLlist) + len(visitedURLlist)
                    if processedCount == len(
                            self.configurationPayload.config.rootUrls):
                        break
                    time.sleep(0.3)
            except Exception:
                # Only real errors are logged and retried; SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                logger.log(logging.ERROR, "\n" + traceback.format_exc())

        logger.log(logging.INFO, "Scrapping complete. Terminating...")
        self.disconnectAllClient()
        self.isActive = False
Exemple #3
0
    def sendConfig(self, configuration):
        """Send the configuration to the client and wait for its ACK.

        Args:
            configuration: payload stored on ``self.configuration`` and sent
                to the client inside a ``protocol.CONFIG`` packet.

        Raises:
            Exception: if the reply is anything other than an INFO packet
                carrying ``CLIENT_ACK``. ``self.isActive`` is cleared before
                raising.
        """
        logger.log(logging.DEBUG, self.formattedAddr + "Sending configuration")
        self.configuration = configuration

        packet = protocol.Packet(protocol.CONFIG, self.configuration)
        self.writeSocket(packet)

        logger.log(logging.DEBUG,
                   self.formattedAddr + "Configuration sent waiting for ACK")
        # NOTE(review): 5 is presumably a timeout in seconds - confirm
        # against readSocket.
        packet = self.readSocket(5)

        acknowledged = (
            packet.type == protocol.INFO and
            packet.payload.info == protocol.InfoPayload.CLIENT_ACK)

        if not acknowledged:
            # A reply of any other packet type used to fall through and
            # return silently, as if the configuration had been accepted.
            self.isActive = False
            raise Exception("Unable to transmit configuration")

        logger.log(
            logging.DEBUG, self.formattedAddr +
            "Working node ACK received (configuration)")
Exemple #4
0
    def readConfig(self):
        """Read the configuration from the server and apply it.

        Does nothing beyond logging when ``self.isActive`` is false.
        Otherwise a packet is read from the socket; if it is a CONFIG packet
        the crawling type and configuration are stored, the ``rule.py`` and
        ``scrapping.py`` sources it carries are written under ``modules/``
        next to the running script, both files are test-compiled, the
        ``modules.scrapping`` module is reloaded and a CLIENT_ACK is sent
        back.

        On any failure (unexpected packet type, I/O error, source that does
        not compile) the error is logged, ``self.isActive`` is cleared and
        no ACK is sent.
        """
        logger.log(logging.DEBUG, "Waiting for configuration from the server.")
        if not self.isActive:
            return

        try:
            deserializedPacket = self.readSocket()
            logger.log(logging.DEBUG, "Configuration received.")

            if deserializedPacket.type != protocol.CONFIG:
                raise Exception("Unable to parse configuration.")

            self.crawlingType = deserializedPacket.payload.crawlingType
            self.config = deserializedPacket.payload.config

            # dynamic module reload
            basePath = os.path.dirname(sys.argv[0])
            if basePath:
                basePath = basePath + "/"

            # path building
            rulePath = basePath + "modules/rule.py"
            scrappingPath = basePath + "modules/scrapping.py"

            # re-writing source .py
            # NOTE(review): this source comes from the server and is
            # reloaded (executed) in this process below - only connect to
            # trusted servers.
            logger.log(logging.INFO, "Importing rule.py from server")
            with open(rulePath, 'w') as ruleFd:
                ruleFd.write(self.config.rule_py)

            logger.log(logging.INFO, "Importing scrapping.py from server")
            with open(scrappingPath, 'w') as scrappingFd:
                scrappingFd.write(self.config.scrapping_py)

            # compilation test: (path, open mode, compile label, error text)
            compileChecks = (
                (rulePath, 'rU', "rule_test",
                 "Unable to compile rule.py (is the syntax right?)"),
                (scrappingPath, 'rb', "scrapping_test",
                 "Unable to compile scrapping.py (is the syntax right?)"),
            )
            for sourcePath, mode, label, errorMessage in compileChecks:
                try:
                    with open(sourcePath, mode) as sourceFd:
                        code = sourceFd.read()
                    compile(code, label, "exec")
                except Exception:
                    logger.log(logging.CRITICAL, traceback.format_exc())
                    logger.log(logging.ERROR, errorMessage)
                    # The former sys.exit(0) here was always caught by the
                    # enclosing handler, so the net effect was to deactivate
                    # the client; do that directly.
                    self.isActive = False
                    return

            # dynamic reload of modules
            # TODO reloading of rule.py should eventually come here
            logger.log(logging.INFO, "Reloading modules imported for server")
            reload(sys.modules["modules.scrapping"])

            payload = protocol.InfoPayload(protocol.InfoPayload.CLIENT_ACK)
            packet = protocol.Packet(protocol.INFO, payload)
            self.writeSocket(packet)

            logger.log(logging.DEBUG, "Sending ACK for configuration.")
        except Exception:
            logger.log(logging.CRITICAL, traceback.format_exc())
            self.isActive = False