import logging
import time

from scrapy import signals

# Commander is the client shipped with the Scrapoxy Python helper package
# (scrapoxy.commander); adjust the import if your project defines its own.
from scrapoxy.commander import Commander


class ScaleMiddleware(object):
    """Scales the Scrapoxy proxy pool up when the spider opens and back
    down when it closes."""

    def __init__(self, crawler):
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD')
        )

        # Seconds to wait for new proxy instances to boot (default: 120).
        self._WAIT_FOR_SCALE = crawler.settings.get('WAIT_FOR_SCALE') or 120

        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.logger.debug('[ScaleMiddleware] Upscale Scrapoxy')

        # Ask Scrapoxy to spin up the maximum number of proxy instances.
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = max_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)

        # spider.log() is deprecated; use the spider's logger directly.
        spider.logger.warning(
            '[ScaleMiddleware] Sleeping {0} seconds to finish upscale'.format(
                self._WAIT_FOR_SCALE))
        time.sleep(self._WAIT_FOR_SCALE)

    def spider_closed(self, spider):
        spider.logger.debug('[ScaleMiddleware] Downscale Scrapoxy')

        # Scale back down to the minimum number of proxy instances.
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = min_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)
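
# Usage sketch (assumption, not part of the original example): the Scrapy
# settings that ScaleMiddleware reads. The endpoint URL, password and the
# 'myproject.middlewares' dotted path are placeholders; only the setting
# names (API_SCRAPOXY, API_SCRAPOXY_PASSWORD, WAIT_FOR_SCALE,
# DOWNLOADER_MIDDLEWARES) come from the code above.

# settings.py
API_SCRAPOXY = 'http://localhost:8889/api'
API_SCRAPOXY_PASSWORD = 'CHANGE_ME'
WAIT_FOR_SCALE = 120

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ScaleMiddleware': 100,
}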
import csv
import datetime
import logging
import logging.config
import time

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Project-specific pieces used below: Commander (the Scrapoxy client imported
# above), Crawlers (presumably a threading.Thread subclass that runs the given
# spiders on the CrawlerProcess), the Homegate/Newhome/Immoscout24 spiders and
# the Advertisement SQLAlchemy model are defined elsewhere in the project.

logger = logging.getLogger(__name__)


class App(object):
    def __init__(self):
        self.settings = get_project_settings()
        self.commander = Commander(self.settings.get('API_SCRAPOXY'),
                                   self.settings.get('API_SCRAPOXY_PASSWORD'))
        configure_logging(settings=None, install_root_handler=False)
        logging.config.dictConfig(self.settings['LOGGING_SETTINGS'])

    def prepare_instances(self):
        # With only the default middleware configured, the crawl does not go
        # through Scrapoxy, so there is nothing to scale up.
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Not running the crawler over the proxy")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        required_sc = max_sc
        self.commander.update_scaling(min_sc, required_sc, max_sc)
        # Give the new proxy instances time to boot (fall back to 120 s).
        wait_for_scale = self.settings.get('WAIT_FOR_SCALE') or 120
        time.sleep(wait_for_scale)

    def runCrawlers(self):
        process = CrawlerProcess(self.settings)

        crawl_thread = Crawlers(process=process,
                                spiders=[Homegate, Newhome, Immoscout24])
        crawl_thread.start()
        rounds = 0
        while crawl_thread.is_alive():
            if rounds == 4320:  # 4320 * 10 s sleep = 12 h timeout
                logger.info("Ran into the 12 h timeout")
                break
            rounds += 1
            time.sleep(10)

        logger.debug("Stopping all crawlers...")
        process.stop()
        while crawl_thread.is_alive():
            logger.debug("Waiting for crawlers to clean up...")
            time.sleep(100)

    def shutdown_instances(self):
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Nothing to stop, because no instances were started")
            return
        # Scale the proxy pool down to zero required instances.
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        self.commander.update_scaling(min_sc, 0, max_sc)

    def getCrawledData(self):
        engine = create_engine(self.settings.get('DATABASE_URL'))
        Session = sessionmaker(bind=engine, expire_on_commit=True)
        session = Session()
        # Export every advertisement seen within the last 24 hours.
        from_time = datetime.datetime.now() - datetime.timedelta(days=1)
        ads = session.query(Advertisement).filter(
            Advertisement.last_seen >= from_time).all()
        with open("crawled_ads.csv", "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile,
                                   delimiter=';',
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(
                [column.key for column in Advertisement.__table__.columns])
            for ad in ads:
                # Assumes Advertisement defines __iter__; otherwise write
                # [getattr(ad, col.key) for col in Advertisement.__table__.columns].
                csvwriter.writerow(list(ad))
        print(len(ads))
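
# Driver sketch (assumption): how the App methods above chain together. The
# ordering is inferred from the method bodies; the __main__ guard itself is
# not part of the original example.
if __name__ == '__main__':
    app = App()
    app.prepare_instances()       # scale the Scrapoxy pool up before crawling
    try:
        app.runCrawlers()         # run all spiders, bounded by the 12 h timeout
    finally:
        app.shutdown_instances()  # always scale the proxy pool back down
    app.getCrawledData()          # dump the last 24 h of ads to crawled_ads.csv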