Code example #1
def test_next_link_not_found():
    """A missing 'next' link raises on the first page but yields None deeper in the crawl."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.filter_arguments = []
    body = '{"links": {"next": ""}}'

    # At depth 0 (the first page) the spider must fail loudly.
    with pytest.raises(MissingNextLinkError) as excinfo:
        spider.next_link(response_fixture(meta={'file_name': 'test', 'depth': 0}, body=body))
    assert str(excinfo.value) == 'next link not found on the first page: http://example.com'

    # Deeper in the crawl, pagination simply stops.
    result = spider.next_link(response_fixture(meta={'file_name': 'test', 'depth': 10}, body=body))
    assert result is None
Code example #2
def test_parse_200():
    """A 200 response yields a File item followed by a Request for the next page."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.data_type = 'release_package'
    spider.next_page_formatter = lambda url: 'next.json'

    results = spider.parse(response_fixture())
    file_item = next(results)
    next_request = next(results)

    assert type(file_item) is File
    expected_item = {
        'file_name': 'test',
        'url': 'http://example.com',
        'data': b'{"links": {"next": "http://example.com/next"}}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }
    assert file_item == expected_item

    assert type(next_request) is Request
    assert next_request.url == 'http://example.com/next'
    assert next_request.meta == {'file_name': 'next.json'}

    # Exactly two results are produced.
    with pytest.raises(StopIteration):
        next(results)
def test_parse_release_package(sample, len_releases):
    """ResizePackageMiddleware splits a zipped 200-release package into FileItem chunks."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.resize_package = True

    middleware = ResizePackageMiddleware()

    # A package with 200 identical releases, zipped into a single entry.
    package = {'releases': [{'key': 'value'} for _ in range(200)]}

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', json.dumps(package))

    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})
    item = next(spider.parse(response))

    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    for number, transformed in enumerate(transformed_items, 1):
        assert type(transformed) is FileItem
        assert len(transformed) == 6
        assert transformed['file_name'] == 'test.json'
        assert transformed['url'] == 'http://example.com'
        assert transformed['number'] == number
        assert len(json.loads(transformed['data'])['releases']) == len_releases
        assert transformed['data_type'] == 'release_package'
        assert transformed['encoding'] == 'utf-8'
def test_line_delimited_json_middleware_compressed(sample):
    """LineDelimitedMiddleware yields one FileItem per JSON line of a zipped file."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.line_delimited = True

    middleware = LineDelimitedMiddleware()

    # Twenty newline-terminated JSON objects, zipped into a single entry.
    lines = ['{"key": %s}\n' % number for number in range(1, 21)]

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', ''.join(lines))

    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})
    item = next(spider.parse(response))

    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    for number, transformed in enumerate(transformed_items, 1):
        assert type(transformed) is FileItem
        assert transformed == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': number,
            'data': '{"key": %s}\n' % number,
            'data_type': 'release_package',
            'encoding': 'utf-8'
        }
Code example #5
def test_parse_rar_file():
    """A RAR archive is unpacked and its single entry yielded as a File."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'
    spider.archive_format = 'rar'

    # The rar library doesn't support write mode, so a static fixture file is used.
    rar_path = pathlib.Path(__file__).parent.absolute() / 'data' / 'test.rar'
    rar_bytes = rar_path.read_bytes()

    generator = spider.parse(response_fixture(body=rar_bytes))
    item = next(generator)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True
    }

    # The archive holds only one entry.
    with pytest.raises(StopIteration):
        next(generator)
Code example #6
def test_parse():
    """A zipped JSON file is decompressed and yielded as a single File."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', '{}')

    results = spider.parse(response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'}))
    item = next(results)

    assert type(item) is File
    assert item == {
        'file_name': 'test.json',
        'url': 'http://example.com',
        'data': b'{}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    }

    # Nothing beyond the single archive entry.
    with pytest.raises(StopIteration):
        next(results)
Code example #7
def test_next_link_condition():
    """next_link returns None (no follow-up request) when from_date equals until_date,
    even though the response carries a (blank) 'next' link."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.from_date = spider.until_date = date(2002, 12, 31)

    request = spider.next_link(response_fixture(body='{"links": {"next": ""}}'))

    # PEP 8: compare to the None singleton with `is`, not by inspecting its type.
    assert request is None
Code example #8
def test_next_link():
    """next_link builds a Request for the URL found in the response's 'next' link."""
    spider = spider_with_crawler(spider_class=LinksSpider)
    spider.next_page_formatter = lambda url: 'next.json'

    result = spider.next_link(response_fixture())

    assert type(result) is Request
    assert (result.url, result.meta) == ('http://example.com/next', {'file_name': 'next.json'})
Code example #9
def test_parse_zip_empty_dir():
    """An archive containing only an empty directory entry yields nothing."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        # A directory-only entry: a path with a trailing separator and empty content.
        archive.writestr(ZipInfo(os.path.join('test', 'test', '/')), '')

    generator = spider.parse(response_fixture(body=buffer.getvalue()))
    with pytest.raises(StopIteration):
        next(generator)
Code example #10
def test_parse_404():
    """A 404 response yields a single FileError and nothing else."""
    spider = spider_with_crawler(spider_class=LinksSpider)

    results = spider.parse(response_fixture(status=404, body=b'{"links": {"next": "http://example.com/next"}}'))
    error_item = next(results)

    assert type(error_item) is FileError
    assert error_item == {
        'file_name': 'test',
        'url': 'http://example.com',
        'errors': {'http_code': 404},
    }

    # The 'next' link in the body is ignored on error responses.
    with pytest.raises(StopIteration):
        next(results)
def test_read_decompressed_middleware():
    """ReadDataMiddleware passes through one item whose data is the decompressed content."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    middleware = ReadDataMiddleware()

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', '{}')

    response = response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'})
    item = next(spider.parse(response))

    transformed_items = list(middleware.process_spider_output(response, [item], spider))

    assert len(transformed_items) == 1
    assert transformed_items[0]['data'] == b'{}'
Code example #12
def test_parse_json_lines(sample, len_items):
    """In json_lines mode the spider yields the zip as a File, then one FileItem per line."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.compressed_file_format = 'json_lines'

    # Twenty newline-terminated JSON objects, zipped into a single entry.
    lines = ['{"key": %s}\n' % number for number in range(1, 21)]

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', ''.join(lines))

    generator = spider.parse(response_fixture(body=buffer.getvalue(), meta={'file_name': 'test.zip'}))
    archive_item = next(generator)
    line_items = list(generator)

    # The archive itself is yielded first and is not posted to the API.
    assert type(archive_item) is File
    assert len(archive_item) == 6
    assert archive_item['file_name'] == 'test.zip'
    assert archive_item['url'] == 'http://example.com'
    assert archive_item['data_type'] == 'zip'
    assert archive_item['encoding'] == 'utf-8'
    assert archive_item['post_to_api'] is False

    assert len(line_items) == len_items

    for number, line_item in enumerate(line_items, 1):
        assert type(line_item) is FileItem
        assert line_item == {
            'file_name': 'test.json',
            'url': 'http://example.com',
            'number': number,
            'data': '{"key": %s}\n' % number,
            'data_type': 'release_package',
            'encoding': 'utf-8',
        }
Code example #13
def test_parse_release_package(sample, len_items, len_releases):
    """BigFileSpider yields one File whose data carries both 'package' and 'data' parts."""
    spider = spider_with_crawler(spider_class=BigFileSpider, sample=sample)

    # A package with 200 identical releases.
    package = {'releases': [{'key': 'value'} for _ in range(200)]}

    response = response_fixture(body=json.dumps(package).encode(), meta={'file_name': 'test.json'})
    results = spider.parse(response)
    parsed = next(results)

    assert type(parsed) is File
    assert len(parsed) == 5
    assert parsed['file_name'] == 'test.json'
    assert parsed['url'] == 'http://example.com'
    assert parsed['data_type'] == 'release_package'
    assert parsed['encoding'] == 'utf-8'
    assert parsed['data']['package'] is not None
    assert parsed['data']['data'] is not None

    with pytest.raises(StopIteration):
        next(results)
Code example #14
def test_parse_release_package(sample, len_items, len_releases):
    """In release_package mode the spider yields the zip File, then resized FileItem chunks."""
    spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
    spider.data_type = 'release_package'
    spider.compressed_file_format = 'release_package'

    # A package with 200 identical releases, zipped into a single entry.
    package = {'releases': [{'key': 'value'} for _ in range(200)]}

    buffer = BytesIO()
    with ZipFile(buffer, 'w', compression=ZIP_DEFLATED) as archive:
        archive.writestr('test.json', json.dumps(package))

    generator = spider.parse(response_fixture(body=buffer.getvalue()))
    archive_item = next(generator)
    chunks = list(generator)

    # The archive itself is yielded first and is not posted to the API.
    assert type(archive_item) is File
    assert len(archive_item) == 6
    assert archive_item['file_name'] == 'test'
    assert archive_item['url'] == 'http://example.com'
    assert archive_item['data_type'] == 'zip'
    assert archive_item['encoding'] == 'utf-8'
    assert archive_item['post_to_api'] is False

    assert len(chunks) == len_items

    for number, chunk in enumerate(chunks, 1):
        assert type(chunk) is FileItem
        assert len(chunk) == 6
        assert chunk['file_name'] == 'test.json'
        assert chunk['url'] == 'http://example.com'
        assert chunk['number'] == number
        assert len(json.loads(chunk['data'])['releases']) == len_releases
        assert chunk['data_type'] == 'release_package'
        assert chunk['encoding'] == 'utf-8'