def run_scraper(self, scraper):
    start_date = datetime.now().strftime('%m-%d-%Y')
    end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')
    print(f'{datetime.now()} starting {scraper.__name__}')
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(scraper, start_date, end_date)
    # join() returns a Deferred that is discarded here, so this assumes a
    # reactor is already running (e.g. managed by the surrounding application)
    runner.join()
def scrapNtucAmazon(item, ntuc, amazonprime, flag):
    runner = CrawlerRunner()
    runner.crawl(NtucSpider, item=item, ntuc=ntuc, flag=flag)
    runner.crawl(AmazonPrimeSpider, item=item, amazonprime=amazonprime, flag=flag)
    runner.join()
    # busy-wait until both spiders have signalled completion through `flag`;
    # this assumes the Twisted reactor is running in another thread
    while len(flag) != 2:  # ntuc and amazon crawl have not finished
        pass
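# If no reactor is running elsewhere, the busy-wait above can be replaced by
# the Deferred that runner.join() returns, which fires once both crawls are
# done. A minimal sketch, assuming the same spider classes as above:
from twisted.internet import reactor

def scrapNtucAmazonDeferred(item, ntuc, amazonprime):
    runner = CrawlerRunner()
    runner.crawl(NtucSpider, item=item, ntuc=ntuc)
    runner.crawl(AmazonPrimeSpider, item=item, amazonprime=amazonprime)
    d = runner.join()                      # fires when both crawls finish
    d.addBoth(lambda _: reactor.stop())
    reactor.run()                          # blocks until then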
def _exec_crawler(movie, file):
    SPIDERS_NAME = ['zimuku']
    settings = get_project_settings()
    settings.set('file', file)
    runner = CrawlerRunner(settings)
    for name in SPIDERS_NAME:
        runner.crawl(name, movie)
    runner.join().addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
def f(q, spider_amazon, spider_google, start_amazon_urls, start_google_urls):
    try:
        runner = CrawlerRunner()
        runner.crawl(spider_amazon, start_urls=start_amazon_urls)
        runner.crawl(spider_google, start_urls=start_google_urls)
        # one join() is enough; the original called it twice and discarded
        # the first Deferred
        deferred = runner.join()
        deferred.addBoth(lambda _: reactor.stop())
        reactor.run()
        q.put(None)
    except Exception as e:
        q.put(e)
def main():
    # the credential prompts were masked in the source; plain input() calls
    # are the minimal reconstruction
    username = input('Username:')
    password = input('Password:')
    with open('config.json') as conf_file:
        rule = json.load(conf_file)
    settings = get_project_settings()
    file_store_location = settings.get("FILES_STORE") + "\\" + rule["run_date"]
    settings.set("FILES_STORE", file_store_location, priority='cmdline')
    runner = CrawlerRunner(settings)
    runner.crawl(TorrentSpider,
                 user={'username': username, 'password': password},
                 rule=rule)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider, crawl version (a basic version also exists in
    # tutorial.spiders.booktoscrape_basic)
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl

    runner = CrawlerRunner(settings)
    # pass the spider class, not an instance; CrawlerRunner instantiates it
    runner.crawl(BookToscrapeSpider_crawl)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
class scheduler():
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Import the spider's module dynamically and schedule the crawl.
        try:
            module = importlib.import_module(self.modulePath + spiderModulePath)
            class_ = getattr(module, spiderClass)
            # pass the spider class itself; CrawlerRunner.crawl rejects instances
            self.sched.add_job(self.process.crawl, 'date',
                               args=[class_], run_date=scheduleTime)
        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
        except Exception as error:
            print(error)
def associates_task():
    print("=== ASSOCIATES SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(associatesHandler)
    # the Deferred from join() is left unconsumed; the reactor is presumably
    # managed by setup() or by the surrounding task scheduler
    d = runner.join()
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    settings.update(options)
    runner = CrawlerRunner(settings)
    # pass the spider class, not an instance; CrawlerRunner instantiates it
    runner.crawl(EntertainmentcareersSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def ctkhdetails_task():
    print("=== CTKH DETAILS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(ctkhDetailsHandler)
    d = runner.join()
def majorshareholders_task():
    print("=== MAJOR SHAREHOLDERS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(majorShareHoldersHandler)
    d = runner.join()
def ownerstructure_task():
    print("=== OWNER STRUCTURE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(ownerStructureHandler)
    d = runner.join()
def run_all_modules():
    sorted_modules = cullModules()
    configure_logging()
    if len(sorted_modules) != 0:
        runner = CrawlerRunner(get_project_settings())
        for module in sorted_modules:
            try:
                if issubclass(module, Spider):
                    runner.crawl(module)
                else:
                    module.run()
                logger.debug(module.name + " successfully loaded")
            except Exception:
                logger.critical("Module " + module.name + " could not be started")
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run(installSignalHandlers=False)
    else:
        logger.debug("No modules enabled, try again")
def counterparts_task():
    print("=== COUNTERPARTS SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(counterPartsHandler)
    d = runner.join()
def corporateAZExpress_task():
    print("=== CORPORATEAZ-Express SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(corporateazExpressHandler)
    d = runner.join()
def finance_task():
    print("=== FINANCE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(financeInfoHandler)
    d = runner.join()
def viewprofile_task():
    print("=== VIEW PROFILE SPIDER CRAWLING ===")
    setup()
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    runner.crawl(viewProfileHandlder)
    d = runner.join()
def booking(self, *, func=None):
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.project import get_project_settings
    from scripture.spiders.booking import BookingSpider as Spider

    if func and not func.startswith("--"):
        print("func is", func)
        # keep the original string around: after the lookup, func may be None
        name = func
        func = globals().get(name) or locals().get(name)
        if func:
            make_requests = func
        else:
            import importlib
            make_requests = importlib.import_module(name)
        Spider.start_requests = make_requests

    Spider._entrypoint_page = (
        "https://www.booking.com/searchresults.zh-cn.html?ss"
    )
    Spider.dbname = "bookings"

    _settings = get_project_settings()
    runner = CrawlerRunner(_settings)
    runner.crawl(Spider)
    daemon = runner.join()
    daemon.addBoth(lambda _: reactor.stop())  # pylint: disable=E1101
    reactor.run()  # pylint: disable=E1101
def crawl(keyword, topic_id):
    configure_logging()
    runner = CrawlerRunner()
    runner.crawl(CommentSpider, keyword=keyword, topic_id=topic_id,
                 start_page=0, end_page=9)
    runner.crawl(CommentSpider, keyword=keyword, topic_id=topic_id,
                 start_page=10, end_page=19, username='******', password='******')
    runner.crawl(CommentSpider, keyword=keyword, topic_id=topic_id,
                 start_page=20, end_page=29, username='******', password='******')
    runner.crawl(CommentSpider, keyword=keyword, topic_id=topic_id,
                 start_page=30, end_page=39, username='******', password='******')
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print('crawlers.py -i <inputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('crawlers.py -i <inputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):  # getopt strips the '=' from long options
            inputfile = arg
    ticks = list(read_file(inputfile))

    # Create and run spiders
    configure_logging()
    crawler_settings = Settings()
    crawler_settings.setmodule(my_settings)
    runner = CrawlerRunner(settings=crawler_settings)
    for tick in ticks:
        kwargs = {'tick': tick}
        runner.crawl(MWSpider, **kwargs)
        runner.crawl(ReutersSpider, **kwargs)
        runner.crawl(BloSpider, **kwargs)
        runner.crawl(MSNBCSpider, **kwargs)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def __init__(self, link):
    configure_logging()
    crawler = CrawlerRunner()
    crawler.crawl(Spider)
    d = crawler.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def handle(self, *args, **options):
    configure_logging()
    process = CrawlerRunner(get_project_settings())
    # film crawlers
    process.crawl('acfilm')
    process.crawl('fpp')
    process.crawl('retrospekt')
    process.crawl('brooklyn-film')
    process.crawl('precision-film')
    process.crawl('bhfilm')
    process.crawl('freestyle')
    process.crawl('moment')
    process.crawl('ultrafine')
    # camera crawlers
    process.crawl('brooklyn')
    process.crawl('austin_camera')
    process.crawl('precision')
    process.crawl('keh')
    process.crawl('bh')
    # not super impressed with etsy, tbh
    # process.crawl('etsy')
    d = process.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [loader.load(spid) for spid in spids if spid in loader.list()]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
def run(self):
    # set custom settings here if you want,
    # e.g. self.spider.custom_settings = {'RETRY_TIMES': 10}
    logging.info('Spider thread is running. Calling spiders: %s' % self.spiders)
    if not self.spiders:
        raise ValidationError('Missing spiders')
    # crawler = CrawlerProcess(get_project_settings())
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'app.spiders.spider_robot.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    crawler = CrawlerRunner(settings)
    live_spider_names = SpiderConfigure.get_all_live_spider_names()
    is_crawl = False
    logging.info(live_spider_names)
    for spider in self.spiders:
        if spider.name in live_spider_names:
            is_crawl = True
            crawler.crawl(spider, params=self.params)
    if is_crawl:
        d = crawler.join()
        d.addBoth(lambda _: reactor.stop())
        # the script will block here until all crawlers are finished
        reactor.run()
    else:
        logging.warning('No live spider found')
    self.result_queue.put(self.items)
def crawlingMonth(end_year, end_month, end_day):
    """Crawl one month (30 days) of data ending on the given date.

    :type end_year: Integer
    :param end_year: year of the last day to crawl
    :type end_month: Integer
    :param end_month: month of the last day to crawl
    :type end_day: Integer
    :param end_day: day of the last day to crawl
    :raises: None
    :rtype: None
    """
    runner = CrawlerRunner({
        'USER_AGENT': 'MOZILLA/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_FORMAT': 'csv',
        'LOG_ENABLED': 'False'
    })
    end_date = datetime.date(end_year, end_month, end_day)
    for i in range(0, 30):
        dateStr = end_date.strftime("%Y%m%d")
        # each crawl() call creates a Crawler that copies the settings at that
        # moment, so the per-iteration FEED_URI values are preserved
        runner.settings['FEED_URI'] = "./generated/" + dateStr + ".csv"
        runner.settings['TARGET_DATE'] = dateStr
        runner.crawl(HsmoabotSpider, target_date=dateStr)
        end_date = end_date - datetime.timedelta(1)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
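# Example invocation of crawlingMonth above: crawl the 30 days ending on
# 2018-03-31 (the date is illustrative), writing one CSV per day under
# ./generated/.
crawlingMonth(2018, 3, 31)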
def crawl_job():
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for spider in spiders:
        print('spider crawl')
        runner.crawl(spider)
    return runner.join()
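# Because crawl_job() returns the Deferred from runner.join(), a caller can
# chain work onto it. A minimal sketch that re-runs the job periodically, in
# the style of the other snippets here (the one-hour delay is an assumption):
from twisted.internet import reactor

def _schedule_next(_result, delay=3600):
    # once the previous run's Deferred has fired, queue the next run
    reactor.callLater(delay, _run)

def _run():
    d = crawl_job()
    d.addCallback(_schedule_next)
    d.addErrback(lambda failure: failure.printTraceback())

_run()
reactor.run()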
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.

    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you
    don't assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run(self):
    settings = get_project_settings()
    myrunner = CrawlerRunner(settings)
    myrunner.crawl(HkexSpider)
    d = myrunner.join()
    d.addBoth(lambda _: reactor.stop())
    # the script will block here until the crawling is finished
    reactor.run()
    return True
def run_spiders():
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    runner.crawl(RedisSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def crawl(cls, *args, **kwargs):
    crawler = CrawlerRunner(get_project_settings())
    crawler.crawl(cls, *args, **kwargs)
    d = crawler.join()
    print(MeituanArticleSpider.runing)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
@defer.inlineCallbacks
def test_async_def_asyncio_parse(self):
    runner = CrawlerRunner({"ASYNCIO_REACTOR": True})
    runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"),
                 mockserver=self.mockserver)
    with LogCapture() as log:
        yield runner.join()
    self.assertIn("Got response 200", str(log))
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    # settings.set('FEED_FORMAT', 'json')
    # settings.set('FEED_URI', 'result.json')
    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # the script will block here until the crawling is finished
    result = reactor.run()
    print(result)
class Runner(object):
    def __init__(self, *args, **kwargs):
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings)

    def add(self, *a, **kw):
        crawler = Crawler(BroadSpider, self.settings)
        self.runner.crawl(crawler, *a, **kw)

    def start(self):
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
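# A usage sketch for the Runner class above; add() forwards its arguments
# through Crawler to BroadSpider's constructor, so the kwargs shown are
# hypothetical spider parameters.
r = Runner()
r.add(start_urls=['http://example.com'])  # hypothetical kwargs for BroadSpider
r.start()  # blocks in reactor.run() until the crawl finishes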
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
# -*- coding: utf-8 -*-
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from tmall import TmallSpider

crawler = CrawlerRunner()
# pass the spider class; constructor kwargs go through crawl()
crawler.crawl(TmallSpider, domain='tmall.com')
d = crawler.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
from time import sleep

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from PTTRank.spiders.ptt import PttSpider

settings = get_project_settings()
runner = CrawlerRunner(settings)
runner.crawl(PttSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
def callSpiderWithCrawlerRunner(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(self.spider)
    # shutdown is delegated to self.spider_closing via the signal below; the
    # Deferred from join() is discarded, so the reactor must be managed there
    dispatcher.connect(self.spider_closing, signal=signals.spider_closed)
    runner.join()