def test_crawl_stderr_str(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting('fail_spider')
    spider_settings.extra_requirements = ['s']
    target = VenvRunner(eggf)
    try:
        yield target.crawl(spider_settings)
        self.fail('Did not catch ProcessFailed exception.')
    except ProcessFailed as ex:
        self.assertIsInstance(ex.std_output, str)
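# For reference, a plausible shape of the ProcessFailed exception the
# tests here catch: the assertions above and below imply it carries the
# child process's decoded stdout and stderr. This is an assumption about
# the scrapydd API, not its verified definition.
class ProcessFailedSketch(Exception):
    def __init__(self, message='', std_output=None, err_output=None):
        super().__init__(message)
        self.std_output = std_output  # decoded stdout text (str)
        self.err_output = err_output  # decoded stderr text (str)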
def test_to_json(self):
    spider_name = 'abc'
    target = SpiderSetting(spider_name)
    json_text = target.to_json()
    json_deserialized = json.loads(json_text)
    self.assertEqual(json_deserialized['spider_name'], spider_name)
    self.assertIsNone(json_deserialized['project_name'])
    self.assertEqual(json_deserialized['extra_requirements'], [])
    self.assertEqual(json_deserialized['spider_parameters'], {})
def test_from_json(self):
    spider_name = 'abc'
    json_text = '''
    {
        "spider_name": "abc",
        "project_name": "xyz",
        "extra_requirements": ["scrapy", "beautifulsoup4"],
        "spider_parameters": {
            "parameter_a": "value_a",
            "parameter_b": "value_b"
        }
    }
    '''
    target = SpiderSetting.from_json(json_text)
    self.assertEqual(target.spider_name, spider_name)
    self.assertEqual(target.project_name, 'xyz')
    self.assertEqual(target.extra_requirements, ['scrapy', 'beautifulsoup4'])
    self.assertEqual(target.spider_parameters, {
        'parameter_a': 'value_a',
        'parameter_b': 'value_b',
    })
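# A minimal sketch of the serialization pair exercised by the two tests
# above, assuming SpiderSetting keeps its fields as plain attributes and
# round-trips through a dict. Illustrative only; the real scrapydd class
# may differ.
import json


class SpiderSettingSketch:
    def __init__(self, spider_name, project_name=None,
                 extra_requirements=None, spider_parameters=None):
        self.spider_name = spider_name
        self.project_name = project_name
        self.extra_requirements = extra_requirements or []
        self.spider_parameters = spider_parameters or {}

    def to_json(self):
        # Serialize the known fields to a JSON string.
        return json.dumps({
            'spider_name': self.spider_name,
            'project_name': self.project_name,
            'extra_requirements': self.extra_requirements,
            'spider_parameters': self.spider_parameters,
        })

    @classmethod
    def from_json(cls, json_text):
        return cls.from_dict(json.loads(json_text))

    @classmethod
    def from_dict(cls, dic):
        # Missing keys fall back to the constructor defaults.
        return cls(dic['spider_name'],
                   project_name=dic.get('project_name'),
                   extra_requirements=dic.get('extra_requirements'),
                   spider_parameters=dic.get('spider_parameters'))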
def test_crawl_overwrite_setting(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting(
        'log_spider', spider_parameters={'SOME_SETTING': 'abc'})
    spider_settings.base_settings_module = 'test_project.settings'
    target = DockerRunner(eggf)
    target.image = 'scrapydd:develop'
    ret = yield target.crawl(spider_settings)
    self.assertIsNotNone(ret)
    self.assertEqual(0, ret.ret_code)
    self.assertIsNotNone(ret.items_file)
    self.assertTrue(os.path.exists(ret.items_file))
    self.assertIsNotNone(ret.crawl_logfile)
    self.assertTrue(os.path.exists(ret.crawl_logfile))
    with open(ret.crawl_logfile, 'r') as f:
        crawl_log = f.read()
    self.assertIn('SOME_SETTING: abc', crawl_log)
def main():
    """
    Plugin packages (eggs) must be placed in the `plugin` folder first.
    :return:
    """
    parser = ArgumentParser()
    parser.add_argument('-f', '--file', dest='file', required=False,
                        default='spider.json',
                        help='The spider settings json file')
    args = parser.parse_args()
    file_ext = os.path.splitext(args.file)[1]
    if file_ext.lower() in ('.yaml', '.yml'):
        with open(args.file, 'r') as f:
            dic = yaml.load(f, yaml.Loader)
    elif file_ext.lower() == '.json':
        with open(args.file, 'r') as f:
            dic = json.load(f)
    else:
        raise Exception(f'Unsupported file type: {args.file}')

    spider_setting = SpiderSetting.from_dict(dic)
    plugin_settings = spider_setting.plugin_settings
    extra_requirements = spider_setting.extra_requirements
    if extra_requirements:
        for requirement in extra_requirements:
            _pip_installer(requirement)

    # Create the temp package before the try block so the cleanup in
    # `finally` never references an unbound name.
    settings_module = 'settings_' + randomString(6)
    settings_package = tempfile.mkdtemp()
    try:
        # Generate a temporary settings module that merges the base
        # settings with the plugin settings, then expose it to scrapy
        # via SCRAPY_EXTRA_SETTINGS_MODULE.
        settings_path = os.path.join(settings_package,
                                     settings_module + '.py')
        with open(settings_path, 'w') as settings_stream:
            if plugin_settings:
                perform(base_module=spider_setting.base_settings_module,
                        output_file=settings_stream,
                        input_file=plugin_settings)
        sys.path.append(settings_package)
        os.environ['SCRAPY_EXTRA_SETTINGS_MODULE'] = settings_module
        output_file = spider_setting.output_file or 'items.jl'
        argv = ['scrapy', 'crawl', spider_setting.spider_name,
                '-o', output_file]
        for param_key, param_value in spider_setting.spider_parameters.items():
            argv += ['-s', '%s=%s' % (param_key, param_value)]
        runner_main(argv)
    except SystemExit:
        pass
    finally:
        if os.path.exists(settings_package):
            shutil.rmtree(settings_package)
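# Hedged sketches of two helpers main() relies on but which are not shown
# here: randomString and _pip_installer. The names come from the code
# above; these bodies are assumptions, not the project's actual
# implementations.
import random
import string
import subprocess
import sys


def randomString(length):
    # Random lowercase suffix used to make the temporary settings module
    # name unique (assumed behavior).
    return ''.join(random.choice(string.ascii_lowercase)
                   for _ in range(length))


def _pip_installer(requirement):
    # Install one extra requirement into the current interpreter's
    # environment via pip (assumed behavior).
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', requirement])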
def test_crawl_process_fail(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting('NO_EXIST_SPIDER')
    target = VenvRunner(eggf)
    try:
        yield target.crawl(spider_settings)
        self.fail('Did not catch ProcessFailed exception.')
    except ProcessFailed as e:
        self.assertIsNotNone(e.err_output)
        self.assertIn("KeyError: 'Spider not found: NO_EXIST_SPIDER'",
                      e.err_output)
def test_crawl(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting('fail_spider')
    target = VenvRunner(eggf)
    ret = yield target.crawl(spider_settings)
    self.assertIsNotNone(ret)
    self.assertEqual(0, ret.ret_code)
    self.assertIsNotNone(ret.items_file)
    self.assertTrue(os.path.exists(ret.items_file))
    self.assertIsNotNone(ret.crawl_logfile)
    self.assertTrue(os.path.exists(ret.crawl_logfile))
def test_kill_list(self):
    eggf = open(test_project_file, 'rb')
    target = DockerRunner(eggf)
    target.image = 'scrapydd:develop'
    future = target.list()
    target.kill()
    try:
        yield future
        self.fail('Did not catch ProcessFailed exception.')
    except ProcessFailed:
        pass
def test_kill_crawl(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting('fail_spider')
    target = VenvRunner(eggf)
    future = target.crawl(spider_settings)
    target.kill()
    try:
        yield future
        self.fail('Did not catch ProcessFailed exception.')
    except ProcessFailed:
        pass
def test_clear(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting('fail_spider')
    target = VenvRunner(eggf)
    ret = yield target.crawl(spider_settings)
    self.assertTrue(os.path.exists(ret.items_file))
    self.assertTrue(os.path.exists(ret.crawl_logfile))
    target.clear()
    self.assertFalse(os.path.exists(target._work_dir))
    self.assertFalse(os.path.exists(ret.items_file))
    self.assertFalse(os.path.exists(ret.crawl_logfile))
def test_crawl_overwrite_setting(self):
    eggf = open(test_project_file, 'rb')
    spider_settings = SpiderSetting(
        'log_spider', spider_parameters={'SOME_SETTING': '2'})
    target = VenvRunner(eggf)
    ret = yield target.crawl(spider_settings)
    self.assertIsNotNone(ret)
    self.assertEqual(0, ret.ret_code)
    self.assertIsNotNone(ret.items_file)
    self.assertTrue(os.path.exists(ret.items_file))
    self.assertIsNotNone(ret.crawl_logfile)
    self.assertTrue(os.path.exists(ret.crawl_logfile))
    with open(ret.crawl_logfile, 'r') as f:
        crawl_log = f.read()
    self.assertIn('SOME_SETTING: 2', crawl_log)