def test_crawl_time_invalid():
    expected = "spider argument `crawl_time`: invalid date value: time data '2020' does not match format " \
               "'%Y-%m-%dT%H:%M:%S'"

    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(crawl_time='2020')

    assert str(e.value) == expected

def test_until_date_invalid():
    expected = "spider argument `until_date`: invalid date value: time data 'invalid' does not match format '%Y-%m-%d'"

    with pytest.raises(SpiderArgumentError) as e:
        spider_with_crawler(until_date='invalid', default_from_date='2000-01-01')

    assert str(e.value) == expected

def test_from_crawler():
    with pytest.raises(SpiderArgumentError) as excinfo:
        spider_with_crawler(release_pointer='/date', package_pointer='/publishedDate')

    assert str(excinfo.value) == 'You cannot specify both package_pointer and release_pointer spider arguments.'

def test_custom_collection_data_version():
    error_message = "time data '2020' does not match format '%Y-%m-%dT%H:%M:%S'"

    assert spider_with_crawler(crawl_time='2020-01-01T00:00:00')

    with pytest.raises(SpiderArgumentError) as e:
        assert spider_with_crawler(crawl_time='2020')

    assert str(e.value) == f'spider argument `crawl_time`: invalid date value: {error_message}'

def test_parse():
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', '{}')

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'{}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    with pytest.raises(StopIteration):
        next(generator)

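# NOTE: the @pytest.mark.parametrize decorator for `sample` is missing from this
# excerpt; the values below are an assumed reconstruction (full crawl vs. sample crawl).
@pytest.mark.parametrize('sample', [None, 1])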
def test_line_delimited_json_middleware_compressed(sample):
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.line_delimited = True

    middleware = LineDelimitedMiddleware()

    content = []
    for i in range(1, 21):
        content.append('{"key": %s}\n' % i)

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', ''.join(content))

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    generator = middleware.process_spider_output(response, [item], spider)
    transformed_items = list(generator)

    for i, item in enumerate(transformed_items, 1):
        assert type(item) is FileItem
        assert item == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': i,
            'data': '{"key": %s}\n' % i,
            'data_type': 'release_package',
            'encoding': 'utf-8',
        }

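# NOTE: the @pytest.mark.parametrize decorator for (meta, expected) is missing from
# this excerpt. The cases below are assumptions: without a `wait_time` the middleware
# passes the request through (returns None); with one, it delays it and returns a
# Deferred (assumes `from twisted.internet.defer import Deferred`).
@pytest.mark.parametrize('meta,expected', [
    (None, type(None)),
    ({'wait_time': 1}, Deferred),
])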
def test_middleware_output(meta, expected):
    spider = spider_with_crawler()
    middleware = DelayedRequestMiddleware()

    request = Request('http://example.com', meta=meta)
    output = middleware.process_request(request, spider)

    assert isinstance(output, expected)

def test_item_scraped_with_build_file_item():
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': files_store})
        extension = KingfisherFilesStore.from_crawler(spider.crawler)

        assert extension.item_scraped(spider.build_file_item(), spider) is None

def test_parse_rar_file():
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'
    spider.archive_format = 'rar'

    # The rar library doesn't support write mode, so we use a static RAR file.
    rar_file_path = os.path.join(pathlib.Path(__file__).parent.absolute(), 'data', 'test.rar')
    with open(rar_file_path, 'rb') as f:
        io = BytesIO(f.read())

    response = response_fixture(body=io.getvalue())
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    with pytest.raises(StopIteration):
        next(generator)

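# NOTE: the @pytest.mark.parametrize decorator for (sample, len_releases) is missing
# from this excerpt. Assumed values: the 200-release package is resized into
# 100-release packages on a full crawl, and truncated further on a sample crawl.
@pytest.mark.parametrize('sample,len_releases', [(None, 100), (1, 10)])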
def test_parse_release_package(sample, len_releases):
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.resize_package = True

    middleware = ResizePackageMiddleware()

    package = {'releases': []}
    for _ in range(200):
        package['releases'].append({'key': 'value'})

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', json.dumps(package))

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    generator = middleware.process_spider_output(response, [item], spider)
    transformed_items = list(generator)

    for i, item in enumerate(transformed_items, 1):
        assert type(item) is FileItem
        assert len(item) == 6
        assert item['file_name'] == 'test.json'
        assert item['url'] == 'http://example.com'
        assert item['number'] == i
        assert len(json.loads(item['data'])['releases']) == len_releases
        assert item['data_type'] == 'release_package'
        assert item['encoding'] == 'utf-8'

def test_next_link_condition():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.from_date = spider.until_date = date(2002, 12, 31)

    request = spider.next_link(response_fixture(body='{"links": {"next": ""}}'))

    assert request is None

def test_parse_zipfile_200():
    spider = spider_with_crawler(spider_class=ZipSpider)

    response = text.TextResponse('test')
    response.status = 200
    response.request = Mock()
    response.request.meta = {'kf_filename': 'test.json'}
    response.request.url = 'url'

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store

        tmp = os.path.join(files_store, 'test', '20010203_040506')
        os.makedirs(tmp)

        with open(os.path.join(tmp, 'test'), 'w'):
            pass
        with ZipFile(os.path.join(tmp, 'test.zip'), 'w') as z:
            z.write(os.path.join(tmp, 'test'))
        with open(os.path.join(tmp, 'test.zip'), 'rb') as z:
            response = response.replace(body=z.read())

        actual = next(spider.parse_zipfile(response, None))

        assert isinstance(actual, File)
        # str.find() returns -1 (truthy) when not found, so test membership instead.
        assert '.json' in actual['file_name']

def spider_with_files_store(files_store, **kwargs):
    spider = spider_with_crawler(**kwargs)
    spider.crawler.settings['FILES_STORE'] = files_store
    spider.crawler.settings['KINGFISHER_API_URI'] = 'http://httpbin.org/anything'
    spider.crawler.settings['KINGFISHER_API_KEY'] = 'xxx'
    return spider

def test_parse_200():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.data_type = 'release_package'
    spider.next_page_formatter = lambda url: 'next.json'

    generator = spider.parse(response_fixture())
    item = next(generator)
    request = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test',
        'url': 'http://example.com',
        'data': b'{"links": {"next": "http://example.com/next"}}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    assert type(request) is Request
    assert request.url == 'http://example.com/next'
    assert request.meta == {'file_name': 'next.json'}

    with pytest.raises(StopIteration):
        next(generator)

def test_from_crawler_missing_arguments():
    spider = spider_with_crawler()

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherFilesStore.from_crawler(spider.crawler)

    assert str(excinfo.value) == 'FILES_STORE is not set.'

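# NOTE: the @pytest.mark.parametrize decorator for (kwargs, expected) is missing from
# this excerpt. A hypothetical case, assuming `qs:`-prefixed spider arguments are
# appended to the query string of each request:
@pytest.mark.parametrize('kwargs,expected', [({'qs:param': 'value'}, 'param=value')])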
def test_qs_parameters(kwargs, expected):
    test_spider = type('TestSpider', (BaseSpider,), {
        'start_requests': lambda _self: [scrapy.Request('http://example.com')],
    })
    spider = spider_with_crawler(test_spider, **kwargs)

    for request in spider.start_requests():
        assert expected in request.url

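# NOTE: the @pytest.mark.parametrize decorator for (middleware_class, item) is
# missing from this excerpt; presumably each middleware is paired with an item
# it should yield unchanged.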
def test_yield_items(middleware_class, item):
    spider = spider_with_crawler()
    middleware = middleware_class()

    generator = middleware.process_spider_output(None, [item], spider)
    returned_item = next(generator)

    assert item == returned_item

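# NOTE: the @pytest.mark.parametrize decorator for (api_url, api_key) is missing from
# this excerpt. Assumed cases: each combination in which at least one setting is unset.
@pytest.mark.parametrize('api_url,api_key', [
    (None, None),
    ('http://httpbin.org/anything', None),
    (None, 'xxx'),
])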
def test_from_crawler_missing_arguments(api_url, api_key):
    spider = spider_with_crawler()
    spider.crawler.settings['KINGFISHER_API_URI'] = api_url
    spider.crawler.settings['KINGFISHER_API_KEY'] = api_key

    with pytest.raises(NotConfigured) as excinfo:
        KingfisherProcessAPI.from_crawler(spider.crawler)

    assert str(excinfo.value) == 'KINGFISHER_API_URI and/or KINGFISHER_API_KEY is not set.'

def test_from_crawler():
    spider = spider_with_crawler()
    spider.crawler.settings['KINGFISHER_API_URI'] = 'http://httpbin.org/anything'
    spider.crawler.settings['KINGFISHER_API_KEY'] = 'xxx'
    spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = 'localdir'

    api_extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert api_extension.directory == 'localdir'

def test_next_link():
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.next_page_formatter = lambda url: 'next.json'

    request = spider.next_link(response_fixture())

    assert type(request) is Request
    assert request.url == 'http://example.com/next'
    assert request.meta == {'file_name': 'next.json'}

def test_process_file_without_sample():
    pipeline = Sample()
    spider = spider_with_crawler()
    item = File({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
    })

    assert pipeline.process_item(item, spider) == item

def test_process_item_file_error():
    pipeline = Sample()
    spider = spider_with_crawler(sample=1)
    item = FileError({
        'file_name': 'test',
        'url': 'http://test.com',
        'errors': 'error',
    })

    with pytest.raises(DropItem):
        pipeline.process_item(item, spider)

def test_from_crawler():
    spider = spider_with_crawler(settings={
        'KINGFISHER_API_URI': 'http://httpbin.org/anything',
        'KINGFISHER_API_KEY': 'xxx',
        'KINGFISHER_API_LOCAL_DIRECTORY': 'localdir',
    })

    extension = KingfisherProcessAPI.from_crawler(spider.crawler)

    assert extension.directory == 'localdir'

def test_next_link():
    spider = spider_with_crawler(spider_class=LinksSpider)

    url = 'https://example.com/remote.json'
    text_response = text.TextResponse('test')
    response = text_response.replace(body='{"links": {"next": "' + url + '"}}')

    actual = spider.next_link(response)

    assert actual.url == url

def test_date_arguments():
    test_date = '2000-01-01'
    error_message = "time data 'test' does not match format '%Y-%m-%d'"

    assert spider_with_crawler(from_date=test_date)

    with pytest.raises(SpiderArgumentError) as e:
        assert spider_with_crawler(from_date='test')

    assert str(e.value) == f'spider argument `from_date`: invalid date value: {error_message}'

    assert spider_with_crawler(until_date=test_date, default_from_date=test_date)

    with pytest.raises(SpiderArgumentError) as e:
        assert spider_with_crawler(until_date='test', default_from_date=test_date)

    assert str(e.value) == f'spider argument `until_date`: invalid date value: {error_message}'

def test_spider_closed_without_items():
    with TemporaryDirectory() as tmpdirname:
        spider = spider_with_crawler(settings={'KINGFISHER_PLUCK_PATH': tmpdirname}, release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)

        extension.spider_closed(spider, 'itemcount')

        with open(os.path.join(tmpdirname, 'pluck-release-date.csv')) as f:
            assert f.read() == 'closed: itemcount,test\n'

def test_process_item_xlsx():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.xlsx',
        'data': save_virtual_workbook(Workbook()),
        'data_type': 'release_package',
        'url': 'http://test.com/test.xlsx',
    })

    assert pipeline.process_item(item, spider) == item

def test_parse_zipfile_release_package():
    spider = spider_with_crawler(spider_class=ZipSpider)

    response = text.TextResponse('test')
    response.status = 200
    response.request = Mock()
    response.request.meta = {'kf_filename': 'test.json'}
    response.request.url = 'url'

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store

        tmp = os.path.join(files_store, 'test', '20010203_040506')
        os.makedirs(tmp)

        with open(os.path.join(tmp, 'test.json'), 'w') as f:
            release = {
                'releases': [],
                'publisher': {'name': 'test'},
                'extensions': ['a', 'b'],
                'license': 'test',
                'extra': 1.1,
            }
            for _ in range(110):
                release['releases'].append({'key': 'value'})
            json.dump(release, f)

        with ZipFile(os.path.join(tmp, 'test.zip'), 'w') as z:
            z.write(os.path.join(tmp, 'test.json'))
        with open(os.path.join(tmp, 'test.zip'), 'rb') as z:
            response = response.replace(body=z.read())

        actual = next(spider.parse_zipfile(response, None, file_format='release_package'))
        data = json.loads(actual['data'])

        assert isinstance(actual, FileItem)
        assert actual['number'] == 1
        assert data['publisher']['name'] == 'test'
        assert data['extensions'] == ['a', 'b']
        assert len(data['releases']) == spider.MAX_RELEASES_PER_PACKAGE

        spider.sample = True
        total = 0
        for item in spider.parse_zipfile(response, None, file_format='release_package'):
            total += 1
            data = json.loads(item['data'])

            assert isinstance(item, FileItem)
            assert item['number'] == total
            assert len(data['releases']) == spider.MAX_SAMPLE

        assert total == 1

def test_build_file_with_existing_directory():
    spider = spider_with_crawler()

    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider.crawler.settings['FILES_STORE'] = files_store
        store_extension = KingfisherFilesStore.from_crawler(spider.crawler)

        os.makedirs(os.path.join(files_store, 'test', '20010203_040506'))

        # No FileExistsError exception.
        store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider)

def test_process_item_csv():
    spider = spider_with_crawler(unflatten=True)
    pipeline = Unflatten()
    item = File({
        'file_name': 'test.csv',
        'data': b'data',
        'data_type': 'release_package',
        'url': 'http://test.com/test.csv',
    })

    assert pipeline.process_item(item, spider) == item