import logging
import signal

from pydispatch import dispatcher  # PyDispatcher; older Scrapy bundled this as scrapy.xlib.pydispatch
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from twisted.internet import defer, reactor

# send_mail and the spiders' get_dataset() hook are project-specific.


class Executor(object):

    def __init__(self, settings, spider_cls_list):
        self.runner = CrawlerRunner(settings)
        self.spider_cls_list = spider_cls_list
        self.logger = logging.getLogger('Executor')
        self.interrupted = False
        dispatcher.connect(self._handle_spider_close,
                           signal=signals.spider_closed)
        signal.signal(signal.SIGTERM, self._handler_sys_signal)
        signal.signal(signal.SIGINT, self._handler_sys_signal)

    @defer.inlineCallbacks
    def _crawl(self):
        while True:
            finished = True
            for spider_cls in self.spider_cls_list:
                if self.interrupted:
                    self.logger.info('interrupted, cancel pending jobs')
                    break
                if spider_cls.get_dataset().count() > 0:
                    finished = False
                    self.logger.info('Starting {:s}'.format(spider_cls.__name__))
                    yield self.runner.crawl(spider_cls)
            if finished or self.interrupted:
                break
        reactor.stop()
        self.logger.info('finished')
        if not self.interrupted:
            send_mail('finished')

    def execute(self):
        self._crawl()
        reactor.run(installSignalHandlers=False)

    def _handle_spider_close(self, spider, reason):
        self.logger.info('Spider "{:s}" stopped with reason "{:s}"'.format(
            spider.name, reason))
        if reason not in ['finished', 'shutdown']:
            self.logger.error('Spider stopped abnormally. Stopping CrawlerRunner')
            self.interrupted = True
            self.runner.stop()
            send_mail(reason)

    def _handler_sys_signal(self, signum, frame):
        self.logger.info('KeyboardInterrupt. Stopping CrawlerRunner')
        self.interrupted = True
        self.runner.stop()
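
# A minimal usage sketch, assuming the project's spiders expose the
# get_dataset() hook polled above (MySpiderA/MySpiderB are hypothetical):
if __name__ == '__main__':
    from scrapy.utils.project import get_project_settings

    executor = Executor(get_project_settings(), [MySpiderA, MySpiderB])
    executor.execute()  # blocks until every dataset is drained or a signal arrives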

# twisted.trial's TestCase is used so the deferred returned by the
# inlineCallbacks test is waited on; DefaultSpider is a suite fixture.
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from twisted.internet import defer
from twisted.trial import unittest


class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')
        d = self.crawler_runner.crawl(CustomSettingsSpider)
        # grab the crawler before yielding: the crawlers set is emptied
        # once the crawl finishes
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
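
# The DefaultSpider fixture above is not included in this snippet; since the
# test only inspects settings propagation, a minimal stand-in (hypothetical)
# could be as simple as:
import scrapy

class DefaultSpider(scrapy.Spider):
    name = 'default_spider'
    start_urls = []  # issues no requests; the crawl finishes immediately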

from scrapy.crawler import CrawlerRunner
from scrapy.exceptions import DontCloseSpider
from twisted.python import log
from twisted.web.resource import Resource

# SlydSpider is a project-specific spider class.


class Bot(Resource):
    spider = SlydSpider()

    def __init__(self, settings, spec_manager):
        # twisted base class is old-style so we cannot use super()
        Resource.__init__(self)
        self.spec_manager = spec_manager
        settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
        self.runner = CrawlerRunner(settings)
        log.msg("bot initialized", level=log.DEBUG)

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")

    def stop(self):
        """Stop the crawler"""
        self.runner.stop()
        log.msg("bot stopped", level=log.DEBUG)
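
# Scrapy only honours DontCloseSpider when it is raised from a spider_idle
# signal handler, so keep_spider_alive only takes effect once connected to
# that signal. A hedged sketch of the hookup (the crawler object here is
# illustrative, not part of the original snippet):
from scrapy import signals

def connect_keep_alive(bot, crawler):
    # While connected, the spider stays open even when its queue is empty.
    crawler.signals.connect(bot.keep_spider_alive, signal=signals.spider_idle)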

from scrapy.crawler import Crawler, CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

# BroadSpider is a project-specific spider class.


class Runner(object):

    def __init__(self, *args, **kwargs):
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings)

    def add(self, *a, **kw):
        crawler = Crawler(BroadSpider, self.settings)
        self.runner.crawl(crawler, *a, **kw)

    def start(self):
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
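
# A minimal usage sketch for the Runner above (the keyword arguments are
# passed through to BroadSpider and are illustrative only):
runner = Runner()
runner.add(start_urls=['http://example.com'])
runner.start()  # blocks until the crawl finishes and the reactor stops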

import logging

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

# `checkpoint` is the project-specific module providing the
# restore/clear/create_checkpoint helpers used below.

logger = logging.getLogger(__name__)


class checkpointed_crawler(object):
    """
    Crawl with checkpointing enabled for crash recovery.

    Multiple instances of this class enable multiple spiders to crawl
    simultaneously within a single process.

    Args:
        spider_name: name of a spider within this project
        cp_interval: interval, in seconds, between checkpoints; expected
            to be in the hundreds at a minimum
        add_settings: additional settings specific to this spider

    Preconditions:
        JOBDIR must be set either in add_settings or in settings.py
    """

    # state of crawler
    FINISHED = 'finished'
    RUNNING = 'running'

    def __init__(self, spider_name: str, cp_interval: int,
                 add_settings: dict = None):
        add_settings = add_settings or {}  # avoid a shared mutable default
        self._cps = 0
        self._cp_int = cp_interval
        self._cp_path = None
        self._runner = None
        self._settings = get_project_settings().copy()
        self._spider = spider_name
        self._state = None
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

        # update settings
        for k, v in add_settings.items():
            self._settings[k] = v

        # set path to the checkpoint folder; JOBDIR may come from either
        # add_settings or settings.py, so read it from the merged settings
        if 'JOBDIR' not in self._settings:
            raise KeyError('JOBDIR not set')
        self._cp_path = self._settings['JOBDIR']

        # restore checkpoint, if any
        logger.info("Restoring checkpoint: {}".format(self._cp_path))
        checkpoint.restore_checkpoint(self._cp_path)
        logger.info("Checkpoint successfully restored: {}".format(self._cp_path))

        # create runner
        self._runner = CrawlerRunner(self._settings)

        # start crawling
        self._start_crawling()

    def _start_crawling(self):
        # start crawling
        dfd_finish = self._runner.crawl(self._spider)
        self._state = self.RUNNING
        # add finishing deferred call
        dfd_finish.addCallback(self._finish)
        # schedule a checkpoint
        reactor.callLater(self._cp_int, self._stop)

    def _finish(self, _):
        # ignore triggers due to checkpointing
        if self._cps > 0:
            self._cps -= 1
            return
        # clear saved checkpoints
        logger.info("Clearing checkpoint: {}".format(self._cp_path))
        checkpoint.clear_checkpoint(self._cp_path)
        logger.info("Successfully cleared checkpoint: {}".format(self._cp_path))
        self._state = self.FINISHED
        # stop reactor
        reactor.stop()

    def _stop(self):
        # ignore event if crawler has finished
        if self._state == self.FINISHED:
            return
        # stop crawling and trigger checkpointing
        self._cps += 1
        dfd_stop = self._runner.stop()
        dfd_stop.addCallback(self._checkpoint)

    def _checkpoint(self, _):
        # checkpoint state
        logger.info("Creating checkpoint: {}".format(self._cp_path))
        checkpoint.create_checkpoint(self._cp_path)
        logger.info("Successfully created checkpoint: {}".format(self._cp_path))
        # start crawling again
        self._start_crawling()

    @staticmethod
    def start_crawler():
        """Start executing all the crawlers."""
        reactor.run()
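
# A minimal usage sketch, assuming a spider named 'my_spider' is registered
# in the project and JOBDIR points at a writable folder:
cc = checkpointed_crawler('my_spider', cp_interval=300,
                          add_settings={'JOBDIR': 'crawls/my_spider'})
checkpointed_crawler.start_crawler()  # runs the reactor until _finish() stops it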

import json
import tempfile
import uuid
from datetime import datetime

from flask import Flask, jsonify, request, send_file
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

# DatabaseConnector, DATABASE_URL and the who_iris_spider/nice_spider
# modules are project-specific imports.


class ScrAPI(Flask):

    def __init__(self, import_name=__package__, **kwargs):
        super(ScrAPI, self).__init__(import_name, **kwargs)
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        self._init_url_rules()
        self.process = CrawlerRunner(get_project_settings())
        self.tp = reactor.getThreadPool()
        self.database = DatabaseConnector(DATABASE_URL)
        self.response_meta = {"meta": {"project": "WSF Web Scraper"}}

    def __del__(self):
        self.database._close_all_spiders()
        self.database.cursor.close()
        self.database.connection.close()

    def run(self, host=None, port=None, debug=None, **options):
        super(ScrAPI, self).run(host, port, debug, **options)

    def _get_meta_response(self, res):
        res.update(self.response_meta)
        return res

    def _init_url_rules(self):
        """Attach the endpoints to run spiders and list the spiders
        that are available in the API.
        """
        self.add_url_rule('/spiders', view_func=self.list_spiders,
                          methods=['GET'])
        self.add_url_rule('/spiders', view_func=self.run_spider,
                          methods=['POST'])
        # spider ids are uuid4 strings, so the original <int:spider_id>
        # converter could never match; use the default string converter
        self.add_url_rule('/spiders/<spider_id>', view_func=self.close_spider,
                          methods=['DELETE'])
        self.add_url_rule('/database', view_func=self.import_db,
                          methods=['POST'])
        self.add_url_rule('/database', view_func=self.export_db,
                          methods=['GET'])
        self.add_url_rule('/database', view_func=self.clear_scraps,
                          methods=['DELETE'])
        self.add_url_rule('/crawls', view_func=self.list_crawls,
                          methods=['GET'])
        self.add_url_rule('/crawls', view_func=self.stop,
                          methods=['DELETE'])
        self.add_url_rule('/', view_func=self.home,
                          methods=['GET'])

    def home(self):
        routes = [
            {"url": "/spiders", "method": "GET"},
            {"url": "/spiders", "method": "POST",
             "arguments": {"spider": "name of the spider to run"}},
            {"url": "/spiders/:spider_id", "method": "DELETE",
             "arguments": {"spider_id": "uuid of the spider to close"}},
            {"url": "/crawls", "method": "GET"},
            {"url": "/crawls", "method": "DELETE"},
            {"url": "/database", "method": "GET"},
            {"url": "/database", "method": "POST",
             "arguments": {"file": "json file containing the database dump"}},
            {"url": "/database", "method": "DELETE"},
        ]
        result = self._get_meta_response({"routes": routes})
        return jsonify(result), 200

    def list_spiders(self):
        spiders = self.process.spider_loader.list()
        return jsonify({"spiders": spiders, "status": "success"}), 200

    def run_spider(self):
        post_data = request.get_json()
        spider = post_data.get('spider')
        if spider == 'who_iris':
            spider = who_iris_spider.WhoIrisSpider()
        elif spider == 'nice':
            spider = nice_spider.NiceSpider()
        else:
            return '', 404
        spider_id = str(uuid.uuid4())
        # note: recent Scrapy versions require a spider class here rather
        # than an instance; this code relies on the older behaviour
        self.process.crawl(spider, uuid=spider_id)
        crawl = self.process.join()
        self.database.insert_spider(spider.name, spider_id)
        crawl.addBoth(self.on_success)
        return jsonify({
            "data": {
                "status": "running",
                "spider": spider.name,
                "_id": spider_id
            }
        }), 200

    def on_success(self, data):
        self.database._close_all_spiders()

    def close_spider(self, spider_id):
        for crawl in self.process.crawlers:
            # compare against the requested id, not the `uuid` module
            if crawl.spider.uuid == spider_id:
                crawl.stop()
                return jsonify({
                    "data": {
                        "status": "success",
                        "_id": spider_id
                    }
                }), 200
        return '', 400

    def list_crawls(self):
        crawls = self.process.crawlers
        running_spiders = []
        for crawl in crawls:
            start_time = crawl.stats.get_value('start_time')
            spider = {
                '_id': crawl.spider.uuid,
                'spider': crawl.spider.name,
                'start_time': start_time,
                'total_time': str(datetime.now() - start_time),
                'item_dropped': crawl.stats.get_value('item_dropped_count'),
                'item_scraped': crawl.stats.get_value('item_scraped_count'),
                'total_requests': crawl.stats.get_value('downloader/request_count'),
            }
            running_spiders.append(spider)
        finished_spiders = []
        for spider in self.database.get_finished_crawls():
            finished_spiders.append(spider)
        spiders = {"crawling": running_spiders, "finished": finished_spiders}
        return jsonify({"data": {"spiders": spiders}}), 200

    def stop(self):
        self.process.stop()
        return jsonify({"data": {"status": "success"}}), 200

    def export_db(self):
        articles_rows = self.database.get_articles()
        articles = []
        now = datetime.now()
        for title, file_hash, url in articles_rows:
            articles.append({
                'title': title,
                'file_hash': file_hash,
                'url': url,
            })
        json_file = tempfile.NamedTemporaryFile()
        json_file.write(json.dumps(articles).encode('utf-8'))
        json_file.seek(0)
        return send_file(json_file,
                         mimetype='application/json',
                         as_attachment=True,
                         attachment_filename=f'export-{now}.json')

    def import_db(self):
        if request.files:
            data_file = request.files.get('file')
            if data_file.filename == '':
                return 'Filename must not be blank', 400
            if data_file.content_type == 'application/json':
                json_file = data_file.stream.read()
            else:
                return 'File format is not json.', 415
            try:
                json_dict = json.loads(json_file)
                for article in json_dict:
                    self.database.insert_article(article.get('title'),
                                                 article.get('file_hash'),
                                                 article.get('url'))
                return '', 201
            except Exception as e:
                result = {"errors": [str(e)]}
                return jsonify(result), 400
        else:
            return 'No JSON file in request', 400

    def clear_scraps(self):
        try:
            self.database.reset_scraped()
            return '', 204
        except Exception as e:
            return str(e), 500
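
# A hedged sketch of one way to host an app like this: CrawlerRunner needs a
# running Twisted reactor, while Flask's dev server blocks, so the reactor can
# be started in a daemon thread first (host/port are illustrative). Strictly,
# reactor work queued from Flask's request threads should go through
# reactor.callFromThread.
import threading

from twisted.internet import reactor

def run_app():
    app = ScrAPI(__name__)
    threading.Thread(
        target=lambda: reactor.run(installSignalHandlers=False),
        daemon=True,
    ).start()
    app.run(host='0.0.0.0', port=5000)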

from unittest.mock import patch

from scrapy.crawler import CrawlerRunner
from scrapy.http import Response
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler
from twisted.internet import defer
from twisted.trial.unittest import TestCase

# TEST_SETTINGS and the TestSpider*/TestDownloadHandler fixtures are
# defined elsewhere in this test suite.


class FronteraSchedulerTest(TestCase):

    def setUp(self):
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.runner.stop()
        while TestDownloadHandler.results:
            TestDownloadHandler.results.pop()

    @defer.inlineCallbacks
    def test_start_requests(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com'),
            ])

            with patch('frontera.contrib.backends.memory.MemoryBaseBackend.links_extracted') as mocked_links_extracted:
                mocked_links_extracted.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
                self.assertTrue(crawler.spider.success2)
                mocked_links_extracted.assert_not_called()

    @defer.inlineCallbacks
    def test_cf_store(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com', body=b'cf_store'),
            ])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
                self.assertEqual(mocked_schedule.call_count, 1)

    @defer.inlineCallbacks
    def test_callback_requests_to_frontier(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
            ])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse2'],
                })
                crawler = get_crawler(TestSpider2, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
                self.assertFalse(crawler.spider.success2)
                self.assertEqual(mocked_schedule.call_count, 1)

    @defer.inlineCallbacks
    def test_callback_requests_to_frontier_with_implicit_callback(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com'),
            ])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                })
                crawler = get_crawler(TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
                self.assertEqual(mocked_schedule.call_count, 1)

    @defer.inlineCallbacks
    def test_callback_requests_slot_map(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            resp1 = Response(url='http://example.com')
            resp2 = Response(url='http://example2.com')
            mocked_handler.return_value.set_results([resp1, resp2])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                    'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot'},
                })
                crawler = get_crawler(TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
                self.assertEqual(mocked_schedule.call_count, 1)
                frontera_request = mocked_schedule.call_args_list[0][0][0][0][2]
                self.assertEqual(frontera_request.url, resp2.url)
                self.assertEqual(frontera_request.meta[b'frontier_slot_prefix'], 'myslot')

    @defer.inlineCallbacks
    def test_callback_requests_slot_map_with_num_slots(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            resp1 = Response(url='http://example.com')
            resp2 = Response(url='http://example2.com')
            mocked_handler.return_value.set_results([resp1, resp2])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                settings.setdict({
                    'FRONTERA_SCHEDULER_REQUEST_CALLBACKS_TO_FRONTIER': ['parse'],
                    'FRONTERA_SCHEDULER_CALLBACK_SLOT_PREFIX_MAP': {'parse': 'myslot/5'},
                })
                crawler = get_crawler(TestSpider3, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(crawler.spider.success, 1)
                self.assertEqual(mocked_schedule.call_count, 1)
                frontera_request = mocked_schedule.call_args_list[0][0][0][0][2]
                self.assertEqual(frontera_request.url, resp2.url)
                self.assertEqual(frontera_request.meta[b'frontier_slot_prefix'], 'myslot')
                self.assertEqual(frontera_request.meta[b'frontier_number_of_slots'], 5)

    @defer.inlineCallbacks
    def test_start_requests_to_frontier(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com'),
            ])

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            settings.setdict({
                'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
            })
            crawler = get_crawler(TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
            self.assertTrue(crawler.spider.success2)

    @defer.inlineCallbacks
    def test_start_requests_to_frontier_ii(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()

            with patch('frontera.contrib.backends.memory.MemoryBaseBackend.add_seeds') as mocked_add_seeds:
                mocked_add_seeds.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                settings.setdict({
                    'FRONTERA_SCHEDULER_START_REQUESTS_TO_FRONTIER': True,
                })
                crawler = get_crawler(TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertEqual(mocked_add_seeds.call_count, 1)

    @defer.inlineCallbacks
    def test_start_handle_errback(self):
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com', status=501),
                Response(url='http://example3.com'),
            ])

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
            self.assertFalse(crawler.spider.success2)
            self.assertTrue(crawler.spider.error)
            self.assertTrue(crawler.spider.success3)

    @defer.inlineCallbacks
    def test_start_handle_errback_with_cf_store(self):
        """Test that we get the expected result with errback cf_store."""
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com', status=501, body=b'cf_store'),
                Response(url='http://example3.com'),
            ])

            settings = Settings()
            settings.setdict(TEST_SETTINGS, priority='cmdline')
            crawler = get_crawler(TestSpider, settings)

            yield self.runner.crawl(crawler)
            self.assertTrue(crawler.spider.success)
            self.assertFalse(crawler.spider.success2)
            self.assertTrue(crawler.spider.error)
            self.assertTrue(crawler.spider.success3)

    @defer.inlineCallbacks
    def test_start_handle_errback_with_cf_store_ii(self):
        """Test that the cf_store request is scheduled on the backend queue."""
        with patch('scrapy.core.downloader.handlers.http11.HTTP11DownloadHandler') as mocked_handler:
            mocked_handler.return_value = TestDownloadHandler()
            mocked_handler.return_value.set_results([
                Response(url='http://example.com'),
                Response(url='http://example2.com', status=501, body=b'cf_store'),
                Response(url='http://example3.com'),
            ])

            with patch('frontera.contrib.backends.memory.MemoryDequeQueue.schedule') as mocked_schedule:
                mocked_schedule.return_value = None
                settings = Settings()
                settings.setdict(TEST_SETTINGS, priority='cmdline')
                crawler = get_crawler(TestSpider, settings)

                yield self.runner.crawl(crawler)
                self.assertTrue(crawler.spider.success)
                self.assertFalse(crawler.spider.success2)
                self.assertTrue(crawler.spider.error)
                self.assertEqual(mocked_schedule.call_count, 1)
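
# The TestDownloadHandler fixture used throughout is defined elsewhere in the
# suite; a minimal stand-in (hypothetical) that serves canned responses in
# order, using the class-level list that tearDown drains, could look roughly
# like this:
class TestDownloadHandler:
    results = []

    def set_results(self, results):
        self.results.extend(results)

    def download_request(self, request, spider):
        # hand back the next canned response, tied to the incoming request
        response = self.results.pop(0)
        return response.replace(request=request)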