import logging
import time

from scrapy import signals
# Commander is the client for the Scrapoxy commander REST API; the import path
# assumes the Scrapoxy Python connector and may need adjusting to your setup.
from scrapoxy.commander import Commander


class ScaleMiddleware(object):

    def __init__(self, crawler):
        self._commander = Commander(
            crawler.settings.get('API_SCRAPOXY'),
            crawler.settings.get('API_SCRAPOXY_PASSWORD'))
        self._WAIT_FOR_SCALE = crawler.settings.get('WAIT_FOR_SCALE') or 120

        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        # Scale Scrapoxy up to its maximum number of instances before crawling.
        spider.logger.debug('[ScaleMiddleware] Upscale Scrapoxy')
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = max_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)
        spider.log(
            '[ScaleMiddleware] Sleeping {0} seconds to finish upscale'.format(
                self._WAIT_FOR_SCALE),
            level=logging.WARNING)
        time.sleep(self._WAIT_FOR_SCALE)

    def spider_closed(self, spider):
        # Scale back down to the minimum once the spider has finished.
        spider.logger.debug('[ScaleMiddleware] Downscale Scrapoxy')
        min_sc, required_sc, max_sc = self._commander.get_scaling()
        required_sc = min_sc
        self._commander.update_scaling(min_sc, required_sc, max_sc)
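For the middleware to do anything, the Scrapoxy credentials and the scaling wait time have to be present in the Scrapy project settings, and the middleware must be registered in DOWNLOADER_MIDDLEWARES. The following settings.py sketch shows the entries the code above reads; the module path myproject.middlewares, the priority value and the URL are placeholders, not the project's actual values.

# settings.py (sketch) -- module path, priority and URL are placeholders
API_SCRAPOXY = 'http://localhost:8889/api'      # URL of the Scrapoxy commander API (adjust to your deployment)
API_SCRAPOXY_PASSWORD = 'CHANGE_ME'             # commander password
WAIT_FOR_SCALE = 120                            # seconds to wait after requesting the upscale

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ScaleMiddleware': 100,   # hypothetical module path
}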
import csv
import datetime
import logging
import logging.config
import time

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Project-specific imports (Commander, the Crawlers thread wrapper, the
# Homegate/Newhome/Immoscout24 spiders and the Advertisement model) are
# assumed to come from this project's own modules.

logger = logging.getLogger(__name__)


class App(object):

    def __init__(self):
        self.settings = get_project_settings()
        self.commander = Commander(
            self.settings.get('API_SCRAPOXY'),
            self.settings.get('API_SCRAPOXY_PASSWORD'))
        configure_logging(settings=None, install_root_handler=False)
        logging.config.dictConfig(self.settings['LOGGING_SETTINGS'])

    def prepare_instances(self):
        # Only scale up if the proxy middlewares are actually enabled.
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Do not run crawler over proxy")
            return
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        required_sc = max_sc
        self.commander.update_scaling(min_sc, required_sc, max_sc)
        wait_for_scale = self.settings.get('WAIT_FOR_SCALE')
        time.sleep(wait_for_scale)

    def runCrawlers(self):
        process = CrawlerProcess(self.settings)
        crawl_thread = Crawlers(process=process,
                                spiders=[Homegate, Newhome, Immoscout24])
        crawl_thread.start()
        rounds = 0
        while crawl_thread.is_alive():
            if rounds == 4320:  # 4320 * 10 s sleep = 12 h timeout
                logger.info("Run into time out")
                break
            rounds += 1
            time.sleep(10)

        logger.debug("Stopping all crawlers..")
        process.stop()
        while crawl_thread.is_alive():
            logger.debug("Wait for crawlers to clean up...")
            time.sleep(100)

    def shutdown_instances(self):
        if len(self.settings.get('DOWNLOADER_MIDDLEWARES', {})) <= 1:
            logger.info("Nothing to stop, because no instances were started")
            return
        # Set the required number of instances to zero, keeping min/max as is.
        min_sc, required_sc, max_sc = self.commander.get_scaling()
        self.commander.update_scaling(min_sc, 0, max_sc)

    def getCrawledData(self):
        engine = create_engine(self.settings.get('DATABASE_URL'))
        Session = sessionmaker(bind=engine, expire_on_commit=True)
        session = Session()

        # Export all advertisements seen within the last 24 hours to CSV.
        from_time = datetime.datetime.now() - datetime.timedelta(days=1)
        ads = session.query(Advertisement).filter(
            Advertisement.last_seen >= from_time).all()

        with open("crawled_ads.csv", "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=';', quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(
                [column.key for column in Advertisement.__table__.columns])
            for ad in ads:
                csvwriter.writerow(list(ad))
        print(len(ads))
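The App class is driven by a small entry point that scales the Scrapoxy instances up, runs the crawlers, scales back down and finally exports the data. A minimal sketch, assuming the class lives in a module that can be executed directly; the try/finally wrapper is an illustration, not part of the original code:

# run.py (sketch): wires the App steps together, error handling kept minimal
if __name__ == '__main__':
    app = App()
    try:
        app.prepare_instances()   # upscale Scrapoxy and wait for the instances
        app.runCrawlers()         # run all spiders with a 12 h timeout
    finally:
        app.shutdown_instances()  # always downscale, even if a crawl failed
    app.getCrawledData()          # export the last 24 h of ads to CSV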