Example #1
class BlacklistDownloaderMiddleware(object):
    def __init__(self, crawler):
        """Access the settings of the crawler to connect to Scrapoxy.
        """
        self._http_status_codes = crawler.settings.get(
            'BLACKLIST_HTTP_STATUS_CODES', [503])
        self._sleep_min = crawler.settings.get('SCRAPOXY_SLEEP_MIN', 60)
        self._sleep_max = crawler.settings.get('SCRAPOXY_SLEEP_MAX', 180)

        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD'))

    @classmethod
    def from_crawler(cls, crawler):
        """Call constructor with crawler parameters
        """
        return cls(crawler)

    def process_response(self, request, response, spider):
        """Detect blacklisted response and stop the instance if necessary.
        """
        try:
            if response.status in self._http_status_codes:
                raise BlacklistError(response,
                                     'HTTP status {}'.format(response.status))

            return response

        except BlacklistError as ex:
            spider.log('Ignoring Blacklisted response {0}: {1}'.format(
                response.url, ex.message),
                       level=logging.DEBUG)

            name = response.headers['x-cache-proxyname'].decode('utf-8')
            self._stop_and_sleep(spider, name)

            raise IgnoreRequest()

    def _stop_and_sleep(self, spider, name):
        if name:
            alive = self._commander.stop_instance(name)
            if alive < 0:
                spider.log('Remove: cannot find instance {}'.format(name),
                           level=logging.ERROR)
            elif alive == 0:
                spider.log('Remove: instance removed (no instance remaining)',
                           level=logging.WARNING)
            else:
                spider.log(
                    'Remove: instance removed ({} instances remaining)'.format(
                        alive),
                    level=logging.DEBUG)
        else:
            spider.log('Cannot find instance name in headers',
                       level=logging.ERROR)

        delay = random.randrange(self._sleep_min, self._sleep_max)
        spider.log('Sleeping {} seconds'.format(delay), level=logging.INFO)
        time.sleep(delay)
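
The middleware above raises a BlacklistError and calls a Commander, neither of which is defined in any of these snippets. A minimal sketch of the imports and the exception the examples appear to assume (the Commander import path depends on how the Scrapoxy helper package is installed, so treat it as a placeholder):

import logging
import random
import time

from scrapy import signals
from scrapy.exceptions import IgnoreRequest
from scrapoxy.commander import Commander  # assumed import path for the Scrapoxy helper


class BlacklistError(Exception):
    """Assumed definition: the middlewares read ex.message when logging,
    so the exception keeps both the response and a message."""

    def __init__(self, response, message):
        super(BlacklistError, self).__init__(message)
        self.response = response
        self.message = message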
class BlacklistDownloaderMiddleware(object):

    def __init__(self, crawler):
        """Access the settings of the crawler to connect to Scrapoxy.
        """
        self._http_status_codes = crawler.settings.get('BLACKLIST_HTTP_STATUS_CODES', [503])
        self._sleep_min = crawler.settings.get('SCRAPOXY_SLEEP_MIN', 60)
        self._sleep_max = crawler.settings.get('SCRAPOXY_SLEEP_MAX', 180)

        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD')
        )


    @classmethod
    def from_crawler(cls, crawler):
        """Call constructor with crawler parameters
        """
        return cls(crawler)


    def process_response(self, request, response, spider):
        """Detect blacklisted response and stop the instance if necessary.
        """
        try:
            if response.status in self._http_status_codes:
                raise BlacklistError(response, u'HTTP status {}'.format(response.status))

            return response

        except BlacklistError as ex:
            spider.log(u'Ignoring Blacklisted response {0}: {1}'.format(response.url, ex.message), level=logging.DEBUG)

            name = response.headers.get(u'x-cache-proxyname')
            self._stop_and_sleep(spider, name)

            raise IgnoreRequest()


    def _stop_and_sleep(self, spider, name):
        if name:
            alive = self._commander.stop_instance(name)
            if alive < 0:
                spider.log(u'Remove: cannot find instance {}'.format(name), level=logging.ERROR)
            elif alive == 0:
                spider.log(u'Remove: instance removed (no instance remaining)', level=logging.WARNING)
            else:
                spider.log(u'Remove: instance removed ({} instances remaining)'.format(alive), level=logging.DEBUG)
        else:
            spider.log(u'Cannot find instance name in headers', level=logging.ERROR)

        delay = random.randrange(self._sleep_min, self._sleep_max)
        spider.log(u'Sleeping {} seconds'.format(delay), level=logging.INFO)
        time.sleep(delay)
class ScaleMiddleware(object):

    def __init__(self, crawler):
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD')
        )

        self._WAIT_FOR_SCALE = crawler.settings.get('WAIT_FOR_SCALE') or 120

        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)


    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)


    def spider_opened(self, spider):
        spider.logger.debug(u'[ScaleMiddleware] Upscale Scrapoxy')

        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = max_sc

        self._commander.update_scaling(min_sc, required_sc, max_sc)

        spider.log(u'[ScaleMiddleware] Sleeping {0} seconds to finish upscale'.format(self._WAIT_FOR_SCALE), level=logging.WARNING)
        time.sleep(self._WAIT_FOR_SCALE)


    def spider_closed(self, spider):
        spider.logger.debug(u'[ScaleMiddleware] Downscale Scrapoxy')

        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = min_sc

        self._commander.update_scaling(min_sc, required_sc, max_sc)
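
Both middlewares are driven entirely by Scrapy settings. A hedged settings.py sketch showing how they could be wired up; the endpoint, password, module path, and priorities are illustrative placeholders, not values taken from the examples:

# settings.py sketch -- all values below are illustrative placeholders.
API_SCRAPOXY = 'http://localhost:8889/api'   # Scrapoxy commander API endpoint
API_SCRAPOXY_PASSWORD = 'CHANGE_ME'          # commander password

BLACKLIST_HTTP_STATUS_CODES = [503]  # statuses treated as blacklisted responses
SCRAPOXY_SLEEP_MIN = 60              # seconds to pause after stopping an instance
SCRAPOXY_SLEEP_MAX = 180
WAIT_FOR_SCALE = 120                 # seconds to wait for new instances to start

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.BlacklistDownloaderMiddleware': 950,
    'myproject.middlewares.ScaleMiddleware': 100,
}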
Example #8
class ScaleMiddleware(object):
    def __init__(self, crawler):
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD'))

        self._WAIT_FOR_SCALE = crawler.settings.get('WAIT_FOR_SCALE') or 120

        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.debug('[ScaleMiddleware] Upscale Scrapoxy')

        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = max_sc

        self._commander.update_scaling(min_sc, required_sc, max_sc)

        spider.log(
            '[ScaleMiddleware] Sleeping {0} seconds to finish upscale'.format(
                self._WAIT_FOR_SCALE),
            level=logging.WARNING)
        time.sleep(self._WAIT_FOR_SCALE)

    def spider_closed(self, spider):
        spider.logger.debug('[ScaleMiddleware] Downscale Scrapoxy')

        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = min_sc

        self._commander.update_scaling(min_sc, required_sc, max_sc)
Example #10
File: run.py Project: arbal/recrawl
class App(object):
    def __init__(self):
        self.settings = get_project_settings()
        self.commander = Commander(self.settings.get('API_SCRAPOXY'),
                                   self.settings.get('API_SCRAPOXY_PASSWORD'))
        configure_logging(settings=None, install_root_handler=False)
        logging.config.dictConfig(self.settings['LOGGING_SETTINGS'])

    def prepare_instances(self):
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Do not run crawler over proxy")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        required_sc = max_sc
        self.commander.update_scaling(min_sc, required_sc, max_sc)
        wait_for_scale = self.settings.get('WAIT_FOR_SCALE')
        time.sleep(wait_for_scale)

    def runCrawlers(self):
        process = CrawlerProcess(self.settings)

        crawl_thread = Crawlers(process=process,
                                spiders=[Homegate, Newhome, Immoscout24])
        crawl_thread.start()
        rounds = 0
        while crawl_thread.is_alive():
            if rounds == (4320):  # 4320*10(sleep) = 12h
                logger.info("Run into time out")
                break
            rounds += 1
            time.sleep(10)

        logger.debug("Stopping all crawlers..")
        process.stop()
        while crawl_thread.is_alive():
            logger.debug("Wait for crawlers to clean up...")
            time.sleep(100)

    def shutdown_instances(self):
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Nothing to stop, because no instances were started")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        self.commander.update_scaling(min_sc, 0, max_sc)

    def getCrawledData(self):
        engine = create_engine(self.settings.get('DATABASE_URL'))
        Session = sessionmaker(bind=engine, expire_on_commit=True)
        session = Session()
        from_time = datetime.datetime.now() - datetime.timedelta(days=1)
        ads = session.query(Advertisement).filter(
            Advertisement.last_seen >= from_time).all()
        with open("crawled_ads.csv", "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=';',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(
                [column.key for column in Advertisement.__table__.columns])
            for ad in ads:
                csvwriter.writerow(list(ad))
        print(len(ads))
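
The snippet does not show run.py's entry point. A plausible sketch, assuming the App methods are meant to be called in this order (the try/finally is an assumption to make sure instances are scaled back down even if crawling fails):

if __name__ == '__main__':
    app = App()
    app.prepare_instances()       # scale Scrapoxy up and wait for instances
    try:
        app.runCrawlers()         # run the spiders with a 12h timeout
    finally:
        app.shutdown_instances()  # scale back down, even on failure
    app.getCrawledData()          # dump the last 24h of ads to crawled_ads.csv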