Beispiel #1
0
def test_from_crawler_missing_arguments():
    """Check that from_crawler raises NotConfigured when no settings are supplied.

    The spider is created without any settings, and the error message is
    expected to name the missing FILES_STORE setting.
    """
    bare_spider = spider_with_crawler()

    with pytest.raises(NotConfigured) as raised:
        KingfisherFilesStore.from_crawler(bare_spider.crawler)

    message = str(raised.value)
    assert message == 'FILES_STORE is not set.'
Beispiel #2
0
def test_item_scraped_with_build_file_item():
    """Check that item_scraped returns None for an item made by build_file_item."""
    with TemporaryDirectory() as tmp_root:
        store_path = os.path.join(tmp_root, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': store_path})
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

        file_item = spider.build_file_item()
        outcome = store_extension.item_scraped(file_item, spider)

        assert outcome is None
Beispiel #3
0
def test_item_scraped_with_build_file(sample, path, tmpdir):
    """Check what item_scraped writes and records for an item made by build_file.

    After scraping, the test expects the data at ``path`` under ``tmpdir``,
    a sibling ``.fileinfo`` JSON file holding the URL, data type and encoding,
    and the item's ``path`` and ``files_store`` keys to be populated.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)

    remote_url = 'https://example.com/remote.json'
    payload = b'{"key": "value"}'

    item = spider.build_file(
        file_name='file.json',
        url=remote_url,
        data=payload,
        data_type='release_package',
        encoding='iso-8859-1',
    )
    extension.item_scraped(item, spider)

    # The stored file must contain the item's data.
    with open(tmpdir.join(path)) as stored:
        contents = stored.read()
    assert contents == '{"key": "value"}'

    # The companion metadata file must describe the item.
    with open(tmpdir.join(path + '.fileinfo')) as info:
        metadata = json.load(info)
    assert metadata == {
        'url': 'https://example.com/remote.json',
        'data_type': 'release_package',
        'encoding': 'iso-8859-1',
    }

    assert item['path'] == path
    assert item['files_store'] == tmpdir
def test_build_file_with_existing_directory():
    """Check that item_scraped copes with a pre-existing directory in the files store.

    The ``test/20010203_040506`` directory is created under the files store
    before the item is scraped; no FileExistsError should be raised.
    """
    spider = spider_with_crawler()

    with TemporaryDirectory() as tmp_root:
        store_path = os.path.join(tmp_root, 'data')
        spider.crawler.settings['FILES_STORE'] = store_path
        extension = KingfisherFilesStore.from_crawler(spider.crawler)

        existing_dir = os.path.join(store_path, 'test', '20010203_040506')
        os.makedirs(existing_dir)

        item = spider.build_file(b'{"key": "value"}', 'file.json')

        # Must complete without raising FileExistsError.
        extension.item_scraped(item, spider)
Beispiel #5
0
def test_item_scraped_with_build_file_and_existing_directory():
    """Check that item_scraped copes with a directory created after the item is built.

    The ``test/20010203_040506`` directory is created under the files store
    between building the item and scraping it; no FileExistsError should be
    raised.
    """
    with TemporaryDirectory() as tmp_root:
        store_path = os.path.join(tmp_root, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': store_path})
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

        built_item = spider.build_file(
            file_name='file.json',
            data=b'{"key": "value"}',
        )

        existing_dir = os.path.join(store_path, 'test', '20010203_040506')
        os.makedirs(existing_dir)

        # Must complete without raising FileExistsError.
        store_extension.item_scraped(built_item, spider)
Beispiel #6
0
def test_item_scraped_with_file_and_file_item(sample, directory, data, item,
                                              expected_file_name, tmpdir):
    """Check where item_scraped stores a supplied item and which keys it updates.

    The item receives ``data`` before being scraped. Afterwards the file is
    expected at ``directory/expected_file_name`` under ``tmpdir``, the item's
    ``path`` and ``files_store`` keys are set, and ``file_name`` is unchanged.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

    expected_path = os.path.join(directory, expected_file_name)
    file_name_before = item['file_name']

    item['data'] = data
    store_extension.item_scraped(item, spider)

    with open(tmpdir.join(expected_path)) as stored:
        contents = stored.read()
    assert contents == '{"key": "value"}'

    assert item['path'] == expected_path
    assert item['files_store'] == tmpdir
    # The extension must not rewrite the item's file name.
    assert item['file_name'] == file_name_before
Beispiel #7
0
def test_item_scraped_with_build_file_from_response(sample, path, tmpdir):
    """Check that item_scraped stores an item made by build_file_from_response.

    A mocked response supplies the body and request URL. After scraping, the
    body is expected at ``path`` under ``tmpdir`` and the item's ``path`` and
    ``files_store`` keys are set.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

    # Stand-in for a response: only the attributes set here are configured.
    fake_request = Mock(
        url='https://example.com/remote.json',
        meta={'file_name': 'file.json'},
    )
    fake_response = Mock(body=b'{"key": "value"}', request=fake_request)

    item = spider.build_file_from_response(
        fake_response,
        file_name='file.json',
        data_type='release_package',
        encoding='iso-8859-1',
    )
    store_extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as stored:
        contents = stored.read()
    assert contents == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2,
                           directory, ok, post_to_api, crawl_time, tmpdir,
                           caplog):
    """Check what KingfisherProcessAPI.item_scraped posts for a file item.

    The item is first stored with KingfisherFilesStore, then passed to the
    API extension. When ``post_to_api`` is truthy, the echoed request (method,
    URL, headers, form fields, uploaded files) is compared with the expected
    values; otherwise the extension is expected to return None. When ``ok`` is
    falsy, the logged warning (or its absence) is checked as well.

    NOTE(review): this is a generator function (it uses ``yield``), so it
    presumably runs under an inline-callbacks style decorator that is not
    visible here, together with the parametrization that supplies the
    arguments - confirm against the full file.
    """
    # Force the response status code seen by the extension: 200 when ``ok``, else 400.
    with patch('treq.response._Response.code',
               new_callable=PropertyMock) as mocked:
        mocked.return_value = 200 if ok else 400

        settings = {}
        if directory:
            settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(
                tmpdir.join('xxx'))
        spider = spider_with_files_store(tmpdir,
                                         settings=settings,
                                         sample=sample,
                                         note=note,
                                         crawl_time=crawl_time)
        extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        # Only pass ``encoding`` to build_file when the test case supplies one.
        kwargs = {}
        if encoding:
            kwargs['encoding'] = encoding
        item = spider.build_file(
            file_name='file.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            post_to_api=post_to_api,
            **kwargs,
        )

        # Store the item before handing it to the API extension.
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)
        store_extension.item_scraped(item, spider)

        response = yield extension.item_scraped(item, spider)

        if post_to_api:
            # The response body is parsed as JSON describing the request that was sent.
            data = yield response.json()

            form = {
                'collection_source': 'test',
                'collection_data_version': '2001-02-03 04:05:06',
                'collection_sample': str(is_sample),
                'file_name': 'file.json',
                'url': 'https://example.com/remote.json',
                # Specific to File.
                'data_type': 'release_package',
                'encoding': encoding2,
            }
            if note:
                form['collection_note'] = note
            if crawl_time:
                # A custom crawl time changes both the posted version and the on-disk path.
                form['collection_data_version'] = '2020-01-01 00:00:00'
                path = path.replace('20010203_040506', '20200101_000000')
            if directory:
                form['local_file_name'] = tmpdir.join('xxx', path)

            with open(tmpdir.join(path)) as f:
                assert data['method'] == 'POST'
                assert data[
                    'url'] == 'http://httpbin.org/anything/api/v1/submit/file/'
                assert data['headers']['Authorization'] == 'ApiKey xxx'
                assert data['form'] == form
                assert data['args'] == {}
                assert data['data'] == ''
                if directory:
                    # With a local directory configured, no file is uploaded.
                    assert data['files'] == {}
                else:
                    assert data['files'] == {'file': f.read()}
        else:
            assert response is None

        if not ok:
            if post_to_api:
                message = 'create_file failed (https://example.com/remote.json) with status code: 400'

                assert len(caplog.records) == 1
                assert caplog.records[0].name == 'test'
                assert caplog.records[0].levelname == 'WARNING'
                assert caplog.records[0].message == message
            else:
                # Nothing was posted, so nothing should have been logged.
                assert len(caplog.records) == 0
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api, tmpdir,
                           caplog):
    """Check the ``requests.post`` call made by KingfisherProcessAPI.item_scraped.

    The item is stored with KingfisherFilesStore first. ``requests.post`` is
    patched to return a mock response whose ``ok`` attribute follows the
    ``ok`` argument and whose status code is always 400. The test then checks
    the logged warning (when ``ok`` is falsy) and the arguments of the post
    call (or that no call was made when ``post_to_api`` is falsy).
    """
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)

    if directory:
        local_directory = str(tmpdir.join('xxx'))
        spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = local_directory

    store_extension = KingfisherFilesStore.from_crawler(spider.crawler)
    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    # Only pass ``encoding`` to build_file when the test case supplies one.
    build_kwargs = {'encoding': encoding} if encoding else {}
    item = spider.build_file(
        b'{"key": "value"}',
        'file.json',
        url='https://example.com/remote.json',
        data_type='release_package',
        post_to_api=post_to_api,
        **build_kwargs,
    )

    store_extension.item_scraped(item, spider)

    with patch('requests.post') as post_mock:
        post_mock.return_value = Mock(ok=ok, status_code=400)

        api_extension.item_scraped(item, spider)

        if not ok and not post_to_api:
            # Nothing was posted, so nothing should have been logged.
            assert len(caplog.records) == 0
        elif not ok:
            message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == message

        expected = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
        }
        if note:
            expected['collection_note'] = note
        if directory:
            expected['local_file_name'] = tmpdir.join('xxx', path)

        if not post_to_api:
            assert post_mock.call_count == 0
            return

        with open(tmpdir.join(path), 'rb') as stored:
            assert post_mock.call_count == 1

            positional, keywords = post_mock.call_args
            assert positional == ('http://httpbin.org/anything/api/v1/submit/file/',)
            assert keywords['headers'] == {'Authorization': 'ApiKey xxx'}
            assert keywords['data'] == expected
            assert len(keywords) == 3

            uploads = keywords['files']
            if directory:
                # With a local directory configured, no file is uploaded.
                assert uploads == {}
            else:
                assert len(uploads) == 1
                assert len(uploads['file']) == 3
                upload_name, upload_handle, upload_type = uploads['file']
                assert upload_name == 'file.json'
                assert upload_handle.read() == stored.read()
                assert upload_type == 'application/json'