コード例 #1
0
def test_from_crawler_missing_arguments(api_url, api_key):
    """Expect ``from_crawler`` to raise ``NotConfigured`` for the given URI/key settings."""
    expected_message = 'KINGFISHER_API_URI and/or KINGFISHER_API_KEY is not set.'

    spider = spider_with_crawler()
    overrides = (
        ('KINGFISHER_API_URI', api_url),
        ('KINGFISHER_API_KEY', api_key),
    )
    for setting_name, setting_value in overrides:
        spider.crawler.settings[setting_name] = setting_value

    with pytest.raises(NotConfigured) as raised:
        KingfisherProcessAPI.from_crawler(spider.crawler)

    assert str(raised.value) == expected_message
コード例 #2
0
def test_spider_closed_other_reason(tmpdir):
    """Closing the spider with the reason ``'xxx'`` is expected to produce no response."""
    spider = spider_with_files_store(tmpdir)
    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    # Generator-style test: the yielded value is sent back in as the result.
    result = yield api_extension.spider_closed(spider, 'xxx')

    assert result is None
コード例 #3
0
def test_spider_closed(sample, is_sample, ok, tmpdir, caplog):
    """Check the request made when the spider closes, and the warning logged on failure.

    The response status code is patched to 200 when ``ok`` is truthy and to
    400 otherwise. The JSON body of the response is then compared against the
    expected method, URL, headers and form fields (presumably the endpoint
    echoes the request back -- confirm against the test settings).
    """
    status_code = 200 if ok else 400
    close_reason = 'sample' if is_sample else 'finished'

    with patch('treq.response._Response.code', new_callable=PropertyMock) as code_mock:
        code_mock.return_value = status_code

        spider = spider_with_files_store(tmpdir, sample=sample)
        api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        response = yield api_extension.spider_closed(spider, close_reason)
        payload = yield response.json()

        expected_url = 'http://httpbin.org/anything/api/v1/submit/end_collection_store/'
        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
        }

        assert payload['method'] == 'POST'
        assert payload['url'] == expected_url
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            # A failed request must log exactly one warning.
            expected_message = 'end_collection_store failed (test) with status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message
コード例 #4
0
def test_spider_closed(sample, is_sample, ok, tmpdir, caplog):
    """Check the ``requests.post`` call made when the spider closes with reason ``'finished'``.

    ``requests.post`` is patched to return a response whose ``ok`` attribute
    is the ``ok`` argument and whose ``status_code`` is 400. When ``ok`` is
    falsy, exactly one warning is expected in the log.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    expected_url = 'http://httpbin.org/anything/api/v1/submit/end_collection_store/'
    expected_headers = {'Authorization': 'ApiKey xxx'}
    expected_data = {
        'collection_source': 'test',
        'collection_data_version': '2001-02-03 04:05:06',
        'collection_sample': is_sample,
    }

    with patch('requests.post') as post_mock:
        post_mock.return_value = Mock(ok=ok, status_code=400)

        api_extension.spider_closed(spider, 'finished')

        post_mock.assert_called_once_with(
            expected_url,
            headers=expected_headers,
            data=expected_data,
        )

        if not ok:
            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == 'Failed to post End Collection Store. API status code: 400'
コード例 #5
0
def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2,
                                ok, tmpdir, caplog):
    """Check the ``requests.post`` call made for a scraped file item.

    ``encoding`` is the encoding handed to ``build_file_item`` (omitted when
    falsy), and ``encoding2`` is the encoding expected in the posted data.
    ``requests.post`` is patched to return a response whose ``ok`` attribute
    is the ``ok`` argument and whose ``status_code`` is 400. When ``ok`` is
    falsy, exactly one warning is expected in the log.
    """
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)

    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as mocked:
        response = Mock()
        response.ok = ok
        response.status_code = 400
        mocked.return_value = response

        # Forward ``encoding`` only when it is set, so that the case without
        # an explicit encoding is actually exercised. Previously ``kwargs``
        # was built but never used, and the expected value (``encoding2``)
        # was passed in directly.
        # NOTE(review): assumes build_file_item has a default for
        # ``encoding`` -- confirm against the spider base class.
        kwargs = {}
        if encoding:
            kwargs['encoding'] = encoding
        item = spider.build_file_item(
            number=1,
            file_name='data.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            **kwargs,
        )

        api_extension.item_scraped(item, spider)

        if not ok:
            message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

            assert len(caplog.records) == 1
            assert caplog.records[0].name == 'test'
            assert caplog.records[0].levelname == 'WARNING'
            assert caplog.records[0].message == message

        expected = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'data.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
            'number': 1,
            'data': b'{"key": "value"}',
        }
        if note:
            expected['collection_note'] = note

        mocked.assert_called_once_with(
            'http://httpbin.org/anything/api/v1/submit/item/',
            headers={
                'Authorization': 'ApiKey xxx',
            },
            proxies={
                'http': None,
                'https': None,
            },
            data=expected,
        )
コード例 #6
0
def test_spider_closed_exception(tmpdir, caplog):
    """Expect ``ExpectedError`` to propagate out of ``spider_closed``.

    The response status code property is patched so that reading it raises
    ``ExpectedError``.
    """
    with patch('treq.response._Response.code', new_callable=PropertyMock) as code_mock:
        code_mock.side_effect = ExpectedError

        spider = spider_with_files_store(tmpdir)
        api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        with pytest.raises(ExpectedError):
            yield api_extension.spider_closed(spider, 'finished')
コード例 #7
0
def test_spider_closed_other_reason(tmpdir):
    """Closing the spider with the reason ``'xxx'`` must not call ``requests.post``."""
    spider = spider_with_files_store(tmpdir)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as post_mock:
        extension.spider_closed(spider, 'xxx')

        post_mock.assert_not_called()
コード例 #8
0
def test_from_crawler():
    """Check that ``from_crawler`` exposes the configured local directory as ``directory``."""
    overrides = {
        'KINGFISHER_API_URI': 'http://httpbin.org/anything',
        'KINGFISHER_API_KEY': 'xxx',
        'KINGFISHER_API_LOCAL_DIRECTORY': 'localdir',
    }

    spider = spider_with_crawler()
    # Assign one setting at a time, in the same order as listed above.
    for setting_name, setting_value in overrides.items():
        spider.crawler.settings[setting_name] = setting_value

    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert extension.directory == 'localdir'
コード例 #9
0
def test_from_crawler():
    """Check that ``from_crawler`` exposes the configured local directory as ``directory``."""
    crawler_settings = {
        'KINGFISHER_API_URI': 'http://httpbin.org/anything',
        'KINGFISHER_API_KEY': 'xxx',
        'KINGFISHER_API_LOCAL_DIRECTORY': 'localdir',
    }
    spider = spider_with_crawler(settings=crawler_settings)

    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert api_extension.directory == 'localdir'
コード例 #10
0
def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2,
                                ok, tmpdir, caplog):
    """Check the request made for a scraped file item, and the warning logged on failure.

    ``encoding`` is passed to ``build_file_item`` only when truthy;
    ``encoding2`` is the encoding expected in the submitted form. The response
    status code is patched to 200 when ``ok`` is truthy and to 400 otherwise.
    """
    status_code = 200 if ok else 400

    with patch('treq.response._Response.code', new_callable=PropertyMock) as code_mock:
        code_mock.return_value = status_code

        spider = spider_with_files_store(tmpdir, sample=sample, note=note)
        api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        # Leave ``encoding`` out entirely when it is not set.
        build_kwargs = {'encoding': encoding} if encoding else {}
        item = spider.build_file_item(
            number=1,
            file_name='data.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            **build_kwargs,
        )

        response = yield api_extension.item_scraped(item, spider)
        payload = yield response.json()

        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
            'file_name': 'data.json',
            'url': 'https://example.com/remote.json',
            # Specific to FileItem.
            'data_type': 'release_package',
            'encoding': encoding2,
            'number': '1',
            'data': '{"key": "value"}',
        }
        if note:
            expected_form['collection_note'] = note

        assert payload['method'] == 'POST'
        assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/item/'
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            expected_message = 'create_file_item failed (https://example.com/remote.json) with status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message
コード例 #11
0
def test_item_scraped_file_error(sample, is_sample, ok, tmpdir, caplog):
    """Check the ``requests.post`` call made for a scraped ``FileError`` item.

    ``requests.post`` is patched to return a response whose ``ok`` attribute
    is the ``ok`` argument and whose ``status_code`` is 400. When ``ok`` is
    falsy, exactly one warning is expected in the log.
    """
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    with patch('requests.post') as post_mock:
        post_mock.return_value = Mock(ok=ok, status_code=400)

        item = FileError({
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            'errors': {
                'http_code': 500
            },
        })

        extension.item_scraped(item, spider)

        if not ok:
            expected_message = 'Failed to post [https://example.com/remote.json]. File Errors API status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message

        post_mock.assert_called_once_with(
            'http://httpbin.org/anything/api/v1/submit/file_errors/',
            headers={'Authorization': 'ApiKey xxx'},
            proxies={'http': None, 'https': None},
            data={
                'collection_source': 'test',
                'collection_data_version': '2001-02-03 04:05:06',
                'collection_sample': is_sample,
                'file_name': 'file.json',
                'url': 'https://example.com/remote.json',
                # Specific to this test case.
                'errors': '{"http_code": 500}',
            },
        )
コード例 #12
0
def test_item_scraped_file_error(sample, is_sample, ok, tmpdir, caplog):
    """Check the request made for a scraped ``FileError`` item, and the warning logged on failure.

    The response status code is patched to 200 when ``ok`` is truthy and to
    400 otherwise.
    """
    status_code = 200 if ok else 400

    with patch('treq.response._Response.code', new_callable=PropertyMock) as code_mock:
        code_mock.return_value = status_code

        spider = spider_with_files_store(tmpdir, sample=sample)
        api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        file_error = FileError({
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            'errors': {
                'http_code': 500
            },
        })

        response = yield api_extension.item_scraped(file_error, spider)
        payload = yield response.json()

        expected_url = 'http://httpbin.org/anything/api/v1/submit/file_errors/'
        expected_form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to FileError.
            'errors': '{"http_code": 500}',
        }

        assert payload['method'] == 'POST'
        assert payload['url'] == expected_url
        assert payload['headers']['Authorization'] == 'ApiKey xxx'
        assert payload['form'] == expected_form
        assert payload['args'] == {}
        assert payload['data'] == ''
        assert payload['files'] == {}

        if not ok:
            expected_message = 'create_file_error failed (https://example.com/remote.json) with status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message
コード例 #13
0
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2,
                           directory, ok, post_to_api, crawl_time, tmpdir,
                           caplog):
    """Check the request made for a scraped file, and the warning logged on failure.

    The file is first written through ``KingfisherFilesStore`` and then passed
    to ``KingfisherProcessAPI``. When ``post_to_api`` is falsy, no response and
    no log record are expected. ``directory`` switches on the
    ``KINGFISHER_API_LOCAL_DIRECTORY`` setting, in which case a
    ``local_file_name`` form field is expected instead of an uploaded file.
    The response status code is patched to 200 when ``ok`` is truthy and to
    400 otherwise.
    """
    status_code = 200 if ok else 400

    with patch('treq.response._Response.code', new_callable=PropertyMock) as code_mock:
        code_mock.return_value = status_code

        if directory:
            settings = {'KINGFISHER_API_LOCAL_DIRECTORY': str(tmpdir.join('xxx'))}
        else:
            settings = {}
        spider = spider_with_files_store(
            tmpdir,
            settings=settings,
            sample=sample,
            note=note,
            crawl_time=crawl_time,
        )
        api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        # Leave ``encoding`` out entirely when it is not set.
        build_kwargs = {'encoding': encoding} if encoding else {}
        item = spider.build_file(
            file_name='file.json',
            url='https://example.com/remote.json',
            data=b'{"key": "value"}',
            data_type='release_package',
            post_to_api=post_to_api,
            **build_kwargs,
        )

        # The file is stored before the API extension handles the item.
        files_store = KingfisherFilesStore.from_crawler(spider.crawler)
        files_store.item_scraped(item, spider)

        response = yield api_extension.item_scraped(item, spider)

        if not post_to_api:
            assert response is None
        else:
            payload = yield response.json()

            expected_form = {
                'collection_source': 'test',
                'collection_data_version': '2001-02-03 04:05:06',
                'collection_sample': str(is_sample),
                'file_name': 'file.json',
                'url': 'https://example.com/remote.json',
                # Specific to File.
                'data_type': 'release_package',
                'encoding': encoding2,
            }
            if note:
                expected_form['collection_note'] = note
            if crawl_time:
                # A crawl time changes both the data version and the stored path.
                expected_form['collection_data_version'] = '2020-01-01 00:00:00'
                path = path.replace('20010203_040506', '20200101_000000')
            if directory:
                expected_form['local_file_name'] = tmpdir.join('xxx', path)

            with open(tmpdir.join(path)) as stored_file:
                assert payload['method'] == 'POST'
                assert payload['url'] == 'http://httpbin.org/anything/api/v1/submit/file/'
                assert payload['headers']['Authorization'] == 'ApiKey xxx'
                assert payload['form'] == expected_form
                assert payload['args'] == {}
                assert payload['data'] == ''
                if directory:
                    assert payload['files'] == {}
                else:
                    assert payload['files'] == {'file': stored_file.read()}

        if not ok and post_to_api:
            expected_message = 'create_file failed (https://example.com/remote.json) with status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message
        elif not ok:
            assert len(caplog.records) == 0
コード例 #14
0
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api, tmpdir,
                           caplog):
    """Check the ``requests.post`` call made for a scraped file.

    The file is first written through ``KingfisherFilesStore`` and then passed
    to ``KingfisherProcessAPI`` with ``requests.post`` patched. When
    ``post_to_api`` is falsy, no call and no log record are expected.
    ``directory`` switches on the ``KINGFISHER_API_LOCAL_DIRECTORY`` setting,
    in which case a ``local_file_name`` data field and an empty ``files``
    argument are expected instead of an uploaded file.
    """
    spider = spider_with_files_store(tmpdir, sample=sample, note=note)

    if directory:
        spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))

    files_store = KingfisherFilesStore.from_crawler(spider.crawler)
    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    # Leave ``encoding`` out entirely when it is not set.
    build_kwargs = {'encoding': encoding} if encoding else {}
    item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json',
                             data_type='release_package', post_to_api=post_to_api, **build_kwargs)

    files_store.item_scraped(item, spider)

    with patch('requests.post') as post_mock:
        post_mock.return_value = Mock(ok=ok, status_code=400)

        extension.item_scraped(item, spider)

        if not ok and post_to_api:
            expected_message = 'Failed to post [https://example.com/remote.json]. API status code: 400'

            assert len(caplog.records) == 1
            record = caplog.records[0]
            assert record.name == 'test'
            assert record.levelname == 'WARNING'
            assert record.message == expected_message
        elif not ok:
            assert len(caplog.records) == 0

        expected_data = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': is_sample,
            'file_name': 'file.json',
            'url': 'https://example.com/remote.json',
            # Specific to this test case.
            'data_type': 'release_package',
            'encoding': encoding2,
        }
        if note:
            expected_data['collection_note'] = note
        if directory:
            expected_data['local_file_name'] = tmpdir.join('xxx', path)

        if post_to_api:
            with open(tmpdir.join(path), 'rb') as stored_file:
                assert post_mock.call_count == 1

                # call_args is an (args, kwargs) pair for the single recorded call.
                post_args, post_kwargs = post_mock.call_args
                assert post_args == ('http://httpbin.org/anything/api/v1/submit/file/',)
                assert post_kwargs['headers'] == {'Authorization': 'ApiKey xxx'}
                assert post_kwargs['data'] == expected_data
                assert len(post_kwargs) == 3

                if directory:
                    assert post_kwargs['files'] == {}
                else:
                    assert len(post_kwargs['files']) == 1
                    upload = post_kwargs['files']['file']
                    assert len(upload) == 3
                    assert upload[0] == 'file.json'
                    assert upload[1].read() == stored_file.read()
                    assert upload[2] == 'application/json'
        else:
            assert post_mock.call_count == 0