Example #1
 def test_crawl_stderr_str(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting('fail_spider')
     spider_settings.extra_requirements = ['s']
     target = VenvRunner(eggf)
     try:
         yield target.crawl(spider_settings)
     except ProcessFailed as ex:
         self.assertEqual(str, type(ex.std_output))
Example #2
    def test_to_json(self):
        spider_name = 'abc'
        target = SpiderSetting(spider_name)

        json_text = target.to_json()
        json_deserialized = json.loads(json_text)

        self.assertEqual(json_deserialized['spider_name'], spider_name)
        self.assertEqual(json_deserialized['project_name'], None)
        self.assertEqual(json_deserialized['extra_requirements'], [])
        self.assertEqual(json_deserialized['spider_parameters'], {})
Example #3
 def test_from_json(self):
     spider_name = 'abc'
     json_text = '''
     {
         "spider_name": "abc",
         "project_name": "xyz",
         "extra_requirements": [
             "scrapy",
             "beautifulsoup4"
         ],
         "spider_parameters" : {
             "parameter_a" : "value_a",
             "parameter_b" : "value_b"
         }
     }
     '''
     target = SpiderSetting.from_json(json_text)
     self.assertEqual(target.spider_name, spider_name)
     self.assertEqual(target.project_name, 'xyz')
     self.assertEqual(target.extra_requirements,
                      ["scrapy", "beautifulsoup4"])
     self.assertEqual(target.spider_parameters, {
         'parameter_a': 'value_a',
         'parameter_b': 'value_b'
     })
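
The two tests above exercise SpiderSetting.to_json and SpiderSetting.from_json separately. A minimal round-trip sketch (not part of the original listing; it relies only on the constructor, to_json and from_json calls already shown, and assumes SpiderSetting is imported as in these tests):

import json

original = SpiderSetting('abc', spider_parameters={'parameter_a': 'value_a'})
json_text = original.to_json()

# from_json should restore the fields that to_json wrote out.
restored = SpiderSetting.from_json(json_text)
assert restored.spider_name == original.spider_name
assert restored.spider_parameters == original.spider_parameters

# The serialized form is plain JSON, as checked in test_to_json above.
print(json.loads(json_text)['spider_name'])  # -> 'abc'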
Example #4
 def test_crawl_overwrite_setting(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting(
         'log_spider', spider_parameters={'SOME_SETTING': 'abc'})
     spider_settings.base_settings_module = 'test_project.settings'
     target = DockerRunner(eggf)
     target.image = 'scrapydd:develop'
     ret = yield target.crawl(spider_settings)
     self.assertIsNotNone(ret)
     self.assertEqual(0, ret.ret_code)
     self.assertIsNotNone(ret.items_file)
     self.assertTrue(os.path.exists(ret.items_file))
     self.assertIsNotNone(ret.crawl_logfile)
     self.assertTrue(os.path.exists(ret.crawl_logfile))
     with open(ret.crawl_logfile, 'r') as f:
         crawl_log = f.read()
     self.assertIn('SOME_SETTING: abc', crawl_log)
Example #5
def main():
    """
      Need put plugin packages(eggs) in the `plugin` folder first.
    :return:
    """
    parser = ArgumentParser()
    parser.add_argument('-f',
                        '--file',
                        dest='file',
                        required=False,
                        default='spider.json',
                        help='The spider settings file '
                        '(json or yaml)')
    args = parser.parse_args()
    file_ext = os.path.splitext(args.file)[1]
    if file_ext.lower() in ('.yaml', '.yml'):
        with open(args.file, 'r') as f:
            dic = yaml.load(f, yaml.Loader)
    elif file_ext.lower() == '.json':
        with open(args.file, 'r') as f:
            dic = json.load(f)
    else:
        raise Exception(f'Unsupported file type: {args.file}')

    spider_setting = SpiderSetting.from_dict(dic)
    plugin_settings = spider_setting.plugin_settings
    extra_requirements = spider_setting.extra_requirements
    if extra_requirements:
        for requirement in extra_requirements:
            _pip_installer(requirement)
    try:
        settings_module = 'settings_' + randomString(6)
        settings_package = tempfile.mkdtemp()

        settings_stream = open(
            os.path.join(settings_package, settings_module + '.py'), 'w')
        if plugin_settings:
            perform(base_module=spider_setting.base_settings_module,
                    output_file=settings_stream,
                    input_file=plugin_settings)
        settings_stream.close()
        sys.path.append(settings_package)
        os.environ['SCRAPY_EXTRA_SETTINGS_MODULE'] = settings_module
        output_file = spider_setting.output_file or 'items.jl'
        argv = [
            'scrapy', 'crawl', spider_setting.spider_name, '-o', output_file
        ]
        for param_key, param_value in spider_setting.spider_parameters.items():
            argv += ['-s', '%s=%s' % (param_key, param_value)]
        runner_main(argv)
    except SystemExit:
        pass
    finally:
        if os.path.exists(settings_package):
            shutil.rmtree(settings_package)
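
main() above accepts a JSON or YAML settings file and turns it into a SpiderSetting via SpiderSetting.from_dict. A hedged sketch of a minimal settings file, written and loaded the same way main() does (the key names are taken from the from_json test in Example #3; the values are placeholders):

import json

settings_dict = {
    'spider_name': 'abc',
    'project_name': 'xyz',
    'extra_requirements': ['scrapy'],
    'spider_parameters': {'parameter_a': 'value_a'},
}
with open('spider.json', 'w') as f:
    json.dump(settings_dict, f)

# Equivalent to what main() does after parsing the file:
spider_setting = SpiderSetting.from_dict(settings_dict)
print(spider_setting.spider_name)  # -> 'abc'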
Example #6
 def test_crawl_process_fail(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting('NO_EXIST_SPIDER')
     target = VenvRunner(eggf)
     try:
         ret = yield target.crawl(spider_settings)
         self.fail('Did not catch ProcessFailed exception.')
     except ProcessFailed as e:
         self.assertIsNotNone(e.err_output)
         self.assertTrue("KeyError: 'Spider not found: NO_EXIST_SPIDER'" in
                         e.err_output)
Example #7
 def test_crawl(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting('fail_spider')
     target = VenvRunner(eggf)
     ret = yield target.crawl(spider_settings)
     self.assertIsNotNone(ret)
     self.assertEqual(0, ret.ret_code)
     self.assertIsNotNone(ret.items_file)
     self.assertTrue(os.path.exists(ret.items_file))
     self.assertIsNotNone(ret.crawl_logfile)
     self.assertTrue(os.path.exists(ret.crawl_logfile))
Example #8
 def test_kill_list(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting('fail_spider')
     target = DockerRunner(eggf)
     target.image = 'scrapydd:develop'
     future = target.list()
     target.kill()
     try:
         ret = yield future
         self.fail("Did not caught the ProcessFailed")
     except ProcessFailed:
         pass
Example #9
 def test_kill_crawl(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting('fail_spider')
     target = VenvRunner(eggf)
     target.image = 'scrapydd:develop'
     future = target.crawl(spider_settings)
     target.kill()
     try:
         ret = yield future
         self.fail("Didnot caught ProcessFailed exception")
     except ProcessFailed:
         pass
Example #10
    def test_clear(self):
        eggf = open(test_project_file, 'rb')
        spider_settings = SpiderSetting('fail_spider')
        target = VenvRunner(eggf)
        ret = yield target.crawl(spider_settings)
        self.assertTrue(os.path.exists(ret.items_file))
        self.assertTrue(os.path.exists(ret.crawl_logfile))

        target.clear()
        self.assertFalse(os.path.exists(target._work_dir))
        self.assertFalse(os.path.exists(ret.items_file))
        self.assertFalse(os.path.exists(ret.crawl_logfile))
Example #11
 def test_crawl_overwrite_setting(self):
     eggf = open(test_project_file, 'rb')
     spider_settings = SpiderSetting(
         'log_spider', spider_parameters={'SOME_SETTING': '2'})
     target = VenvRunner(eggf)
     ret = yield target.crawl(spider_settings)
     self.assertIsNotNone(ret)
     self.assertEqual(0, ret.ret_code)
     self.assertIsNotNone(ret.items_file)
     self.assertTrue(os.path.exists(ret.items_file))
     self.assertIsNotNone(ret.crawl_logfile)
     self.assertTrue(os.path.exists(ret.crawl_logfile))
     with open(ret.crawl_logfile, 'r') as f:
         crawl_log = f.read()
     self.assertTrue('SOME_SETTING: 2' in crawl_log)
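
The runner tests above all follow the same flow: open the project egg, crawl, inspect the result files, then clear the work directory. A hedged sketch of that flow outside the test class, assuming a Tornado coroutine context like the one the tests run in (the egg path and spider name are placeholders; the VenvRunner and SpiderSetting imports are the same ones the test modules use):

from tornado import gen
from tornado.ioloop import IOLoop

@gen.coroutine
def run_spider(egg_path, spider_name):
    eggf = open(egg_path, 'rb')  # placeholder path to a packaged project egg
    runner = VenvRunner(eggf)
    settings = SpiderSetting(spider_name)
    result = yield runner.crawl(settings)
    print(result.ret_code, result.items_file, result.crawl_logfile)
    runner.clear()  # removes the runner's work dir and the result files

# IOLoop.current().run_sync(lambda: run_spider('project.egg', 'some_spider'))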