def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0'),
    ]
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.requests = self.spider.start_requests()
def execute(self):
    start_time = datetime.now()
    date_columns = {
        "weekend": dh.next_friday(),
        "today": datetime.now(),
        "autumn": dh.autumn_date(),
        "summer": dh.summer_date(),
        "spring": dh.spring_date(),
        "winter": dh.winter_date(),
        "new_year": dh.new_year_date(),
        "one_month": dh.current_date_plus_months(1),
        "three_months": dh.current_date_plus_months(3),
        "five_months": dh.current_date_plus_months(5),
    }
    spiders_executor = CrawlerProcess()
    for process_name, date in date_columns.items():
        booking_crawler = Crawler(bs, get_project_settings())
        booking_crawler.signals.connect(self.spider_done, signals.spider_closed)
        spiders_executor.crawl(booking_crawler, column_name=process_name,
                               date=date, city=self.__city)
        self.RUNNING_SPIDERS.append(booking_crawler.spider)
    if len(spiders_executor.crawlers) < len(date_columns):
        print("Less crawlers than date_columns")
        self.execute()
    spiders_executor.start()
    print("Scraping {} in {}s".format(
        self.__city, str((datetime.now() - start_time).seconds)))
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(os.path.dirname(fixture_path)))

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)
    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings
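# Hedged usage sketch for prepare_callback_replay() above. The fixture path is
# illustrative; the keys a caller would then read out of `data` (recorded
# request, response, expected results) depend on the fixture format and are
# not shown by the snippet:
data, crawler, spider, settings = prepare_callback_replay('fixtures/test_1.bin')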
def __init__(self, spider):
    Process.__init__(self)
    # Crawler requires a settings object even under the legacy API;
    # get_project_settings is assumed to be imported from scrapy.utils.project.
    self.crawler = Crawler(get_project_settings())
    self.crawler.configure()
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
def __init__(self, splash_url, crawler_depth_limit):
    self.process = CrawlerProcess({'LOG_ENABLED': False})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': 50,
        'DEPTH_LIMIT': crawler_depth_limit,
    })
def __init__(self, splash_url, useragent, depth=1,
             log_enabled=False, log_level='WARNING'):
    self.process = CrawlerProcess({'LOG_ENABLED': log_enabled})
    self.crawler = Crawler(self.ScrapySplashWrapperSpider, {
        'LOG_ENABLED': log_enabled,
        'LOG_LEVEL': log_level,
        'USER_AGENT': useragent,
        'SPLASH_URL': splash_url,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
            'scrapysplashwrapper.ScrapySplashWrapperDepthMiddleware': 110,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'DEPTH_LIMIT': depth,
    })
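# Hedged sketch of how a wrapper like the one above is typically kicked off.
# The crawl() method and its url parameter are assumptions, not part of the
# snippet; CrawlerProcess.crawl() does accept a Crawler instance, and start()
# blocks until all crawls finish.
def crawl(self, url):
    self.process.crawl(self.crawler, url=url)
    self.process.start()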
def crawl(cls, sites):
    stat = {"spiders": 0}

    def soft_stop_reactor():
        stat["spiders"] -= 1
        if not stat["spiders"]:
            reactor.stop()

    for site in sites:
        try:
            spider = site.parser.spider(site)
        except (NotImplementedError, ObjectDoesNotExist):
            # gettext's _() takes only the message; interpolate separately
            logger.error(
                _('Spider not implemented for "%s" site') % site.label)
        else:
            stat["spiders"] += 1
            with spider_project(spider) as settings:
                crawler = Crawler(settings)
                crawler.signals.connect(
                    soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                crawler.configure()
                crawler.crawl(spider)
                crawler.start()

    logfile = open('crawl.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
    log_observer.start()

    # the script will block here until the spider_closed signal is sent
    reactor.run()
def main():
    """Set up the item signal and run the spider."""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')

    # set up signal to catch items scraped
    crawler.signals.connect(catch_item, signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
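# For comparison, a hedged sketch of the same flow on the modern API
# (Scrapy >= 1.0), where Crawler.install()/configure() no longer exist and
# CrawlerProcess manages the reactor; MySpider is the spider defined elsewhere:
from scrapy import signals
from scrapy.crawler import CrawlerProcess

def catch_item(sender, item, **kwargs):
    print("Got:", item)

process = CrawlerProcess({'LOG_ENABLED': False})
crawler = process.create_crawler(MySpider)
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)
process.start()  # blocks until the spider closes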
def setUp(self):
    """Initialize the test."""
    settings.LOG_LEVEL = 'DEBUG'
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
def test_start_requests_http_error(spider_name):
    # See scrapy.crawler.CrawlerRunner._create_crawler
    spidercls = runner.spider_loader.load(spider_name)
    crawler = Crawler(spidercls, runner.settings)

    start_time = datetime(2001, 2, 3, 4, 5, 6)
    crawler.stats.set_value('start_time', start_time)

    try:
        # See scrapy.crawler.Crawler._create_spider
        spider = crawler.spidercls.from_crawler(crawler)

        for request in spider.start_requests():
            # See scrapy.core.scraper.Scraper.call_spider
            callback = request.callback or spider.parse
            response = Response('http://example.com', status=555, request=request)
            # If `max_retries` is set, the spider handles (and retries) error responses.
            if hasattr(spider, 'max_retries'):
                response.request.meta['retries'] = spider.max_retries

            items = list(callback(response))

            assert len(items) == 1
            for item in items:
                assert type(item) is FileError
                assert len(item) == 3
                assert item['errors'] == {'http_code': 555}
                assert item['file_name']
                assert item['url']
    except MissingEnvVarError as e:
        warnings.warn(f'{spidercls.name}: {e}')
def setup(self):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(self._next_crawl, signal=signals.spider_closed)
    crawler.crawl(self.spider)
    crawler.start()
def kickoff(self):
    """
    Starts a new crawler
    :return:
    """
    settings = Settings()

    # settings.set("USER_AGENT", "Test")
    settings.set('JOBDIR', self.args.data_dir)
    self.spider = MavenDataSpider()

    # Wrap with crawler, configure
    crawler = Crawler(self.spider, settings)
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)

    logger.info('Starting crawler')
    crawler.crawl(self.spider, app=self, dbsess=self.session)

    self.spider = crawler.spider
    self.spider.link_queue_mode = False
    if self.args.debug:
        coloredlogs.install(level=logging.DEBUG)

    # Keeping thread working
    reactor.run()
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environmental variables needed for the pipelines payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name=str('pos/sample_pos_record.xml'),
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
def call_spider(file):
    """
    Creates the spider and runs the reactor.
    Copies the crawl results to .json files, which are then
    transformed into the corresponding data.json files.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
    domains = []
    urls = []
    created_files = []
    for u in list_url:
        domain = u.strip('\n')
        url_aux = domain.split("/")
        domain_type = False
        if len(url_aux) > 1:
            domain = url_aux[0]
            url = "http://" + url_aux[0] + "/datos/data"
            if domain == 'www.paraguay.gov.py':
                url = "http://" + url_aux[0] + "/datos"
        else:
            url = "http://" + u.strip('\n') + "/data"
            domain_type = True
        print "============= Domain " + domain
        print "============= Start url " + url
        response = requests.get(url + "/data.json")
        if response.status_code == 200:
            filename = FileController.FileController(
            ).save_existing_data_json(response, domain, True)
            created_files.append({
                'modalidad': 'recolecta',
                'archivo': filename
            })
        else:
            domains.append(domain)
            urls.append(url)

    spider = DataSpider(domains=domains, start_urls=urls,
                        domain_type=domain_type)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
    reactor.run()  # the script will block here

    # Copy the data to the .json files
    data_spider.copy_items_to_files()
    # Remove temporary files
    FileController.FileController().clean_tmp_files()
    # Convert the .json files to data.json (POD format)
    for domain in domains:
        filename = DataJson.DataJson().convert(domain)
        created_files.append({
            'modalidad': 'data-hunting',
            'archivo': filename
        })
    return created_files
def create_crawler2(self, spidercls_cls, spidercls_str):
    cls_settings = self.settings.copy()
    spidercls_info = self.get_spidercls_info(spidercls_str)
    extra_info = {'project_id': spidercls_info.get('project_id')}
    cls_settings.setdict(extra_info)
    logger.info(cls_settings)
    return Crawler(spidercls_cls, cls_settings)
def test_not_enabled(self):
    settings: Settings = Settings({'HTTPPROXY_ENABLED': False})
    crawler: Crawler = Crawler(_spider, settings)
    self.assertRaises(
        NotConfigured, partial(HttpProxyMiddleware.from_crawler, crawler)
    )
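# A hedged counterpart sketch (not in the original test): HTTPPROXY_ENABLED
# defaults to True, so with default settings from_crawler() should succeed.
def test_enabled(self):
    crawler = Crawler(_spider, Settings())
    mw = HttpProxyMiddleware.from_crawler(crawler)
    self.assertIsInstance(mw, HttpProxyMiddleware)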
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
# crawl() yields Deferreds from runner.crawl(), so it must run under
# inlineCallbacks (the decorator is implied by the yield/reactor.stop() pattern)
@defer.inlineCallbacks
def crawl():
    for spider_test in spider_tests:
        print("Running spider: %s"
              % spider_test[SETTING]['HTTPCACHE_DIR'].split('/')[-1])
        crawler = Crawler(spider_test[SPIDER], spider_test[SETTING])
        yield runner.crawl(crawler)
    reactor.stop()
def handle(self, url_slug, **options):
    page = Page.objects.get(url_slug=url_slug)
    feed = page.feed
    store = page.store
    store_slug = store.slug.lower()

    opts = {
        'recreate_tiles': options['recreate_tiles'],
        'skip_images': not options['update_images'],
        'skip_tiles': True,
    }

    start_urls = []
    for tile in feed.tiles.all():
        if tile.product:
            start_urls.append(tile.product.url)
        for content in tile.content.all():
            for prod in content.tagged_products.all():
                start_urls.append(prod.url)
    start_urls = set(start_urls)

    # set up standard framework for running spider in a script
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()

    spider = crawler.spiders.create(store_slug, **opts)
    spider.start_urls = start_urls
    spider.feed_id = feed.id
    crawler.crawl(spider)

    logging.info('Starting spider with options: {}'.format(opts))
    crawler.start()
    reactor.run()
def _setup(self, project):
    spider = crawlspider.LinkSpider(project)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    self.add_crawler()
def setup(self):
    logging.disable(logging.DEBUG)

    # Setting pipeline
    settings = {
        "DB_HOST": os.getenv("DB_HOST"),
        "DB_PORT": os.getenv("DB_PORT"),
        "DB_DATABASE": os.getenv("DB_DATABASE"),
        "DB_USERNAME": os.getenv("DB_USERNAME"),
        "DB_PASSWORD": os.getenv("DB_PASSWORD"),
    }
    crawler = Crawler(HorseRacingSpider, settings)
    self.pipeline = PostgreSQLPipeline.from_crawler(crawler)

    # Setting db
    with self.pipeline.session() as sess:
        sess.query(RaceInfoData).delete()
        sess.query(RacePayoffData).delete()
        sess.query(RaceResultData).delete()
        sess.query(RaceDenmaData).delete()
        sess.query(HorseData).delete()
        sess.query(TrainerData).delete()
        sess.query(JockeyData).delete()
        sess.query(OddsWinPlaceData).delete()
        sess.query(OddsBracketQuinellaData).delete()
        sess.query(OddsExactaData).delete()
        sess.query(OddsQuinellaData).delete()
        sess.query(OddsQuinellaPlaceData).delete()
        sess.query(OddsTrifectaData).delete()
        sess.query(OddsTrioData).delete()
        sess.commit()

    self.sess = self.pipeline.session()
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object
    '''
    # Deferred means other functions can wait on this finishing
    # Wait until the callback is triggered by spider close
    # See twisted docs
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks of the deferred
        # (we don't use it, so True could've been False, None, w/e)
        d.callback(True)

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Keep a ref to foo, otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider
    # N.B. dispatch returns spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B. log is the scrapy log; log2 is the python color logger.
    # The crawler arg is necessary for log_count/{ERROR, DEBUG, INFO, ...} stats,
    # which you will want for the stats mailer extension.
    # Starting this each time will cause a big torrent of ESMTP errors.
    # log.start(crawler=crawler)
    crawler.start()
    return d
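# On Scrapy >= 1.0 the same "wait for the spider to close" contract comes for
# free: CrawlerRunner.crawl() itself returns a Deferred. A hedged sketch
# (MySpider is illustrative):
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

runner = CrawlerRunner(get_project_settings())
d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())  # fires on finish or failure
reactor.run()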
def _crawl_next(self, spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(self._done_task, signal=signals.spider_closed)
    crawler.crawl(spider)
    crawler.start()
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],  # /!\ overwritten by lua script
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False,
    })
def __init__(self, spider):
    Process.__init__(self)
    settings = get_project_settings()
    self.crawler = Crawler(spider.__class__, settings)
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
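# The snippet shows only __init__; a hedged sketch of the run() method such a
# Process wrapper would need (assuming Scrapy >= 1.0, where Crawler.crawl()
# builds the spider from the spider class and returns a Deferred; note it does
# not forward self.spider's constructor args, and reactor.stop is already
# wired to spider_closed above):
def run(self):
    self.crawler.crawl()
    reactor.run()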
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def test_spider_custom_settings_log_level(self):
    log_file = self.mktemp()

    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {
            'LOG_LEVEL': 'INFO',
            'LOG_FILE': log_file,
            # disable telnet if not available to avoid an extra warning
            'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
        }

    configure_logging()
    self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
    crawler = Crawler(MySpider, {})
    self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
    info_count = crawler.stats.get_value('log_count/INFO')
    logging.debug('debug message')
    logging.info('info message')
    logging.warning('warning message')
    logging.error('error message')

    with open(log_file, 'rb') as fo:
        logged = fo.read().decode('utf8')

    self.assertNotIn('debug message', logged)
    self.assertIn('info message', logged)
    self.assertIn('warning message', logged)
    self.assertIn('error message', logged)
    self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
    self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
    self.assertEqual(
        crawler.stats.get_value('log_count/INFO') - info_count, 1)
    self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
def setUp(self):
    self.spider = Spider("foo")
    settings = Settings()
    settings.setmodule(default_settings)
    self.crawler = Crawler(Spider, settings)
    self.mw = CookiesMiddleware.from_crawler(self.crawler)
    self.mw.spider_opened(self.spider)
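# A hedged sketch of a test exercising the middleware set up above (URL and
# cookie values are illustrative; assumes `from scrapy.http import Request`):
# process_request() folds a request's cookies into its Cookie header and
# returns None.
def test_request_cookies(self):
    req = Request('http://scrapytest.org/', cookies={'currency': 'USD'})
    assert self.mw.process_request(req, self.spider) is None
    self.assertEqual(req.headers.get('Cookie'), b'currency=USD')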