def test_next_link_not_found():
    """A missing "next" link raises on the first page and silently ends pagination on later pages."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.filter_arguments = []
    body = '{"links": {"next": ""}}'

    # On the first page (depth 0), the spider must fail loudly.
    with pytest.raises(MissingNextLinkError) as excinfo:
        spider.next_link(response_fixture(meta={'file_name': 'test', 'depth': 0}, body=body))
    assert str(excinfo.value) == 'next link not found on the first page: http://example.com'

    # Past the first page, a missing link just stops the crawl.
    assert spider.next_link(response_fixture(meta={'file_name': 'test', 'depth': 10}, body=body)) is None
def test_parse_200():
    """A 200 response yields a File item followed by a Request for the next page, then stops."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.data_type = 'release_package'
    spider.next_page_formatter = lambda url: 'next.json'

    generator = spider.parse(response_fixture())
    item, request = next(generator), next(generator)

    assert type(item) is File
    expected_item = {
        'file_name': 'test',
        'url': 'http://example.com',
        'data': b'{"links": {"next": "http://example.com/next"}}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }
    assert item == expected_item

    assert type(request) is Request
    assert (request.url, request.meta) == ('http://example.com/next', {'file_name': 'next.json'})

    with pytest.raises(StopIteration):
        next(generator)
def test_parse_release_package(sample, len_releases):
    """ResizePackageMiddleware splits a zipped 200-release package into FileItems of at most
    ``len_releases`` releases each.

    :param sample: forwarded to the crawler fixture (sample-mode flag)
    :param len_releases: expected number of releases per resized FileItem
    """
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.resize_package = True
    middleware = ResizePackageMiddleware()

    # Build a package of 200 identical releases (comprehension instead of a manual append loop).
    package = {'releases': [{'key': 'value'} for _ in range(200)]}

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', json.dumps(package))
    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})

    generator = spider.parse(response)
    item = next(generator)
    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    for i, item in enumerate(transformed_items, 1):
        assert type(item) is FileItem
        assert len(item) == 6
        assert item['file_name'] == 'test.json'
        assert item['url'] == 'http://example.com'
        assert item['number'] == i
        assert len(json.loads(item['data'])['releases']) == len_releases
        assert item['data_type'] == 'release_package'
        assert item['encoding'] == 'utf-8'
def test_line_delimited_json_middleware_compressed(sample):
    """LineDelimitedMiddleware turns a zipped line-delimited JSON file into one FileItem per line."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.line_delimited = True
    middleware = LineDelimitedMiddleware()

    # Twenty JSON lines, keyed 1..20, written into a single zip member.
    lines = ''.join('{"key": %s}\n' % number for number in range(1, 21))
    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', lines)
    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})

    item = next(spider.parse(response))
    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    for number, transformed in enumerate(transformed_items, 1):
        assert type(transformed) is FileItem
        assert transformed == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': number,
            'data': '{"key": %s}\n' % number,
            'data_type': 'release_package',
            'encoding': 'utf-8'
        }
def test_parse_rar_file():
    """The spider extracts the single member of a RAR archive and yields it as a File item."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'
    spider.archive_format = 'rar'

    # The rar library doesn't support write mode, so we use a pre-built static rar file.
    rar_file_path = os.path.join(pathlib.Path(__file__).parent.absolute(), 'data', 'test.rar')
    with open(rar_file_path, 'rb') as f:
        # Read the raw bytes directly; the previous BytesIO(f.read()).getvalue()
        # round-trip just copied the same bytes through an extra buffer.
        body = f.read()

    response = response_fixture(body=body)
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True
    }
    with pytest.raises(StopIteration):
        next(generator)
def test_parse():
    """A zip archive with one JSON member produces exactly one File item, then stops."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', '{}')

    generator = spider.parse(response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'}))

    item = next(generator)
    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'{}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }
    with pytest.raises(StopIteration):
        next(generator)
def test_next_link_condition():
    """next_link returns None (no follow-up request) when from_date/until_date cover a single day."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.from_date = spider.until_date = date(2002, 12, 31)

    request = spider.next_link(response_fixture(body='{"links": {"next": ""}}'))

    # `is None` is the idiomatic check; `type(request) is NoneType` needlessly
    # depended on the NoneType name being in scope.
    assert request is None
def test_next_link():
    """next_link returns a Request for the "next" URL, with the formatted file name in its meta."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.next_page_formatter = lambda url: 'next.json'

    request = spider.next_link(response_fixture())

    assert type(request) is Request
    assert (request.url, request.meta) == ('http://example.com/next', {'file_name': 'next.json'})
def test_parse_zip_empty_dir():
    """A zip archive containing only an empty directory entry yields no items."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        # Zip member names always use forward slashes, independent of the host OS.
        # The previous os.path.join('test', 'test', '/') collapsed to '/' on POSIX
        # (an absolute final component discards the earlier ones), so the intended
        # nested empty directory entry was never actually written.
        zipfile.writestr(ZipInfo('test/test/'), '')
    response = response_fixture(body=io.getvalue())

    generator = spider.parse(response)
    with pytest.raises(StopIteration):
        next(generator)
def test_parse_404():
    """A 404 response yields a single FileError describing the HTTP failure, then stops."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    body = b'{"links": {"next": "http://example.com/next"}}'

    generator = spider.parse(response_fixture(status=404, body=body))

    error = next(generator)
    assert type(error) is FileError
    assert error == {
        'file_name': 'test',
        'url': 'http://example.com',
        'errors': {'http_code': 404},
    }
    with pytest.raises(StopIteration):
        next(generator)
def test_read_decompressed_middleware():
    """ReadDataMiddleware replaces the item's open file handle with the decompressed bytes."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'
    middleware = ReadDataMiddleware()

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', '{}')
    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})

    item = next(spider.parse(response))
    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    assert len(transformed_items) == 1
    assert transformed_items[0]['data'] == b'{}'
def test_parse_json_lines(sample, len_items):
    """A zipped JSON-lines file yields one File for the archive plus one FileItem per line."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.compressed_file_format = 'json_lines'

    lines = ''.join('{"key": %s}\n' % number for number in range(1, 21))
    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', lines)
    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})

    generator = spider.parse(response)
    archive_item = next(generator)
    line_items = list(generator)

    # The archive itself is reported first, as a zip-typed File that is not posted to the API.
    assert type(archive_item) is File
    assert len(archive_item) == 6
    assert archive_item['file_name'] == 'test.zip'
    assert archive_item['url'] == 'http://example.com'
    assert archive_item['data_type'] == 'zip'
    assert archive_item['encoding'] == 'utf-8'
    assert archive_item['post_to_api'] is False

    assert len(line_items) == len_items
    for number, line_item in enumerate(line_items, 1):
        assert type(line_item) is FileItem
        assert line_item == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': number,
            'data': '{"key": %s}\n' % number,
            'data_type': 'release_package',
            'encoding': 'utf-8',
        }
def test_parse_release_package(sample, len_items, len_releases):
    """BigFileSpider yields one File whose data is split into 'package' and 'data' parts."""
    spider = spider_with_crawler(spider_class=BigFileSpider, sample=sample)

    package = {'releases': [{'key': 'value'} for _ in range(200)]}
    response = response_fixture(body=json.dumps(package).encode(), meta={'file_name': 'test.json'})

    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert len(item) == 5
    assert item['file_name'] == 'test.json'
    assert item['url'] == 'http://example.com'
    assert item['data_type'] == 'release_package'
    assert item['encoding'] == 'utf-8'
    # The big file is decomposed into package metadata and the releases payload.
    assert item['data']['package'] is not None
    assert item['data']['data'] is not None

    with pytest.raises(StopIteration):
        next(generator)
def test_parse_release_package(sample, len_items, len_releases):
    """A zipped release package is split into FileItems of at most ``len_releases`` releases each."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.compressed_file_format = 'release_package'

    package = {'releases': [{'key': 'value'} for _ in range(200)]}
    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', json.dumps(package))
    response = response_fixture(body=buffer.getvalue())

    generator = spider.parse(response)
    archive_item = next(generator)
    split_items = list(generator)

    # The archive itself comes first, as a zip-typed File that is not posted to the API.
    assert type(archive_item) is File
    assert len(archive_item) == 6
    assert archive_item['file_name'] == 'test'
    assert archive_item['url'] == 'http://example.com'
    assert archive_item['data_type'] == 'zip'
    assert archive_item['encoding'] == 'utf-8'
    assert archive_item['post_to_api'] is False

    assert len(split_items) == len_items
    for number, split_item in enumerate(split_items, 1):
        assert type(split_item) is FileItem
        assert len(split_item) == 6
        assert split_item['file_name'] == 'test.json'
        assert split_item['url'] == 'http://example.com'
        assert split_item['number'] == number
        assert len(json.loads(split_item['data'])['releases']) == len_releases
        assert split_item['data_type'] == 'release_package'
        assert split_item['encoding'] == 'utf-8'