Example #1
    def process_spider_output(self, response, result, spider):
        for item in result:
            if not isinstance(item, (File, FileItem)) or not spider.root_path:
                yield item
                continue

            data = item['data']

            for number, obj in enumerate(ijson.items(data, spider.root_path), 1):
                # Avoid reading the rest of a large file, since the rest of the items will be dropped.
                if spider.sample and number > spider.sample:
                    return

                if isinstance(item, File):
                    yield FileItem({
                        'number': number,
                        'file_name': item['file_name'],
                        'data': obj,
                        'data_type': item['data_type'],
                        'url': item['url'],
                        'encoding': item['encoding'],
                    })
                else:
                    # If the JSON file is line-delimited and the root path points to a JSON array, this method yields
                    # multiple FileItems per input FileItem. To avoid duplicate numbers, the input FileItem's number
                    # (minus 1) is multiplied by the maximum length of the JSON array at the root path, which the
                    # spider must declare as a ``root_path_max_length`` class attribute. Note that, to be stored by
                    # Kingfisher Process, the number must be within PostgreSQL's integer range.
                    #
                    # https://www.postgresql.org/docs/11/datatype-numeric.html
                    yield FileItem({
                        'number': (item['number'] - 1) * spider.root_path_max_length + number,
                        'file_name': item['file_name'],
                        'data': obj,
                        'data_type': item['data_type'],
                        'url': item['url'],
                        'encoding': item['encoding'],
                    })
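As a quick sanity check on the numbering scheme above, a minimal sketch (the value of ``root_path_max_length`` is illustrative):

ROOT_PATH_MAX_LENGTH = 100  # illustrative stand-in for spider.root_path_max_length

def file_item_number(input_number, position):
    # input_number: 1-based number of the input FileItem (one per line).
    # position: 1-based position of the object within that line's JSON array.
    return (input_number - 1) * ROOT_PATH_MAX_LENGTH + position

assert file_item_number(1, 1) == 1
assert file_item_number(3, 2) == 202  # cannot collide with numbers from lines 1 and 2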
Example #2
    def process_spider_output(self, response, result, spider):
        for item in result:
            if not isinstance(item, File) or not getattr(spider, 'resize_package', False):
                yield item
                continue

            if spider.sample:
                size = spider.sample
            else:
                size = 100

            package = self._get_package_metadata(item['data']['package'], 'releases', item['data_type'])
            releases = ijson.items(item['data']['data'], 'releases.item')
            # Yield release packages containing at most ``size`` releases (100 by default).
            for number, items in enumerate(util.grouper(releases, size), 1):
                # Avoid reading the rest of a large file, since the rest of the items will be dropped.
                if spider.sample and number > spider.sample:
                    return

                package['releases'] = filter(None, items)
                data = json.dumps(package, default=util.default)

                yield FileItem({
                    'number': number,
                    'file_name': item['file_name'],
                    'data': data,
                    'data_type': item['data_type'],
                    'url': item['url'],
                    'encoding': item['encoding'],
                })
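Example #2 leans on ``util.grouper`` to cut the stream of releases into fixed-size chunks, and on ``filter(None, items)`` to drop the padding in the final, possibly short, chunk. A minimal sketch, assuming grouper follows the standard itertools recipe:

import itertools

def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last one with fillvalue.
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

chunks = [list(filter(None, chunk)) for chunk in grouper(range(1, 8), 3)]
assert chunks == [[1, 2, 3], [4, 5, 6], [7]]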
Example #3
    def process_spider_output(self, response, result, spider):
        for item in result:
            if not isinstance(item, File) or not spider.line_delimited:
                yield item
                continue

            data = item['data']

            # Data can be bytes or a file-like object.
            if isinstance(data, bytes):
                data = data.decode(encoding=item['encoding']).splitlines(True)

            for number, line in enumerate(data, 1):
                # Avoid reading the rest of a large file, since the rest of the items will be dropped.
                if spider.sample and number > spider.sample:
                    return

                if isinstance(line, bytes):
                    line = line.decode(encoding=item['encoding'])

                yield FileItem({
                    'number': number,
                    'file_name': item['file_name'],
                    'data': line,
                    'data_type': item['data_type'],
                    'url': item['url'],
                    'encoding': item['encoding'],
                })
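Note that ``splitlines(True)`` keeps the line terminators, so each FileItem's data is a complete line of the original file:

text = '{"a": 1}\n{"b": 2}\n'
lines = text.splitlines(True)
assert lines == ['{"a": 1}\n', '{"b": 2}\n']
assert ''.join(lines) == text  # lossless: the lines reassemble the input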
Example #4
    def build_file_item(self, number, data, data_type, url, encoding, file_name):
        return FileItem({
            'number': number,
            'file_name': file_name,
            'data': data,
            'data_type': data_type,
            'url': url,
            'encoding': encoding,
        })
Example #5
def test_process_file_item():
    pipeline = Validate()
    item = FileItem({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
        'number': 1
    })
    assert pipeline.process_item(item, None) == item
Example #6
def test_duplicate_file_item(caplog):
    pipeline = Validate()
    spider = spider_with_crawler()
    item = FileItem({
        'file_name': 'test1',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://example.com',
        'number': 1
    })

    pipeline.process_item(item, spider)
    pipeline.process_item(item, spider)
    item2 = item.copy()
    item2['number'] = 2
    pipeline.process_item(item2, spider)

    assert len(caplog.messages) == 1
    assert caplog.messages[0] == "Duplicate FileItem: ('test1', 1)"
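Examples #5 and #6 together suggest that Validate remembers each (file_name, number) pair it has processed and logs a warning on repeats. A minimal sketch of that part of the pipeline (the attribute name is an assumption):

import logging

logger = logging.getLogger(__name__)

class Validate:
    def __init__(self):
        self.file_items_seen = set()  # assumed attribute name

    def process_item(self, item, spider):
        if 'number' in item:  # FileItems carry a number; plain Files do not
            key = (item['file_name'], item['number'])
            if key in self.file_items_seen:
                logger.warning('Duplicate FileItem: %s', key)
            else:
                self.file_items_seen.add(key)
        return item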
Example #7
    def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
        """
        Returns a FileItem to yield.
        """
        return FileItem({
            'number': number,
            'file_name': file_name,
            'data': data,
            'data_type': data_type,
            'url': url,
            'encoding': encoding,
        })
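Compared with Example #4, the ``*`` makes every argument keyword-only and ``encoding`` defaults to 'utf-8', so call sites name each field explicitly and can omit the most common encoding.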
Example #8
def test_process_file_item_error():
    pipeline = Validate()
    item = FileItem({
        'file_name': 'test',
        'data': 'data',
        'data_type': 'release_package',
        'url': 'http://test.com',
        'number': "2"
    })
    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)
    item['number'] = None
    with pytest.raises(ValidationError):
        pipeline.process_item(item, None)
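Example #8 implies Validate also type-checks fields, rejecting a string or missing number. A minimal sketch of such a check (hand-rolled here; the real pipeline may use a schema library):

class ValidationError(Exception):
    pass

def validate_number(item):
    # 'number' must be an int; bool is a subclass of int, so exclude it explicitly.
    number = item.get('number')
    if not isinstance(number, int) or isinstance(number, bool):
        raise ValidationError(f"invalid 'number': {number!r}")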
Example #9
def test_item_scraped_file_item(sample, is_sample, note, encoding, ok, tmpdir, caplog):
    with patch('treq.response._Response.code', new_callable=PropertyMock) as mocked:
        mocked.return_value = 200 if ok else 400

        spider = spider_with_files_store(tmpdir, sample=sample, note=note)
        extension = KingfisherProcessAPI.from_crawler(spider.crawler)

        item = FileItem({
            'number': 1,
            'file_name': 'data.json',
            'data': b'{"key": "value"}',
            'data_type': 'release_package',
            'url': 'https://example.com/remote.json',
            'encoding': encoding,
        })

        response = yield extension.item_scraped(item, spider)
        data = yield response.json()

        form = {
            'collection_source': 'test',
            'collection_data_version': '2001-02-03 04:05:06',
            'collection_sample': str(is_sample),
            'file_name': 'data.json',
            'url': 'https://example.com/remote.json',
            # Specific to FileItem.
            'data_type': 'release_package',
            'encoding': encoding,
            'number': '1',
            'data': '{"key": "value"}',
        }
        if note:
            form['collection_note'] = note

        assert data['method'] == 'POST'
        assert data['url'] == 'http://httpbin.org/anything/api/v1/submit/item/'
        assert data['headers']['Authorization'] == 'ApiKey xxx'
        assert data['form'] == form
        assert data['args'] == {}
        assert data['data'] == ''
        assert data['files'] == {}

        if not ok:
            message = 'create_file_item failed (https://example.com/remote.json) with status code: 400'

            assert len(caplog.records) == 1
            assert caplog.records[0].name == 'test'
            assert caplog.records[0].levelname == 'WARNING'
            assert caplog.records[0].message == message
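The form dictionary asserted in Example #9 pins down the payload that KingfisherProcessAPI posts for a FileItem. A sketch of assembling it (the helper and its parameters are assumptions, shaped to match the assertions):

def build_form(source, data_version, is_sample, item, note=None):
    # Hypothetical helper mirroring the form asserted in Example #9.
    form = {
        'collection_source': source,
        'collection_data_version': data_version,
        'collection_sample': str(is_sample),
        'file_name': item['file_name'],
        'url': item['url'],
        # Specific to FileItem.
        'data_type': item['data_type'],
        'encoding': item['encoding'],
        'number': str(item['number']),
        'data': item['data'],
    }
    if note:
        form['collection_note'] = note
    return form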
Example #10
def test_item_scraped_file_item(caplog):
    spider = spider_with_crawler()
    item_extension = KingfisherItemCount.from_crawler(spider.crawler)
    item = FileItem({
        'number': 1,
        'file_name': 'file.json',
        'data': b'{"key": "value"}',
        'data_type': 'release_package',
        'url': 'https://example.com/remote.json',
        'encoding': 'utf-8',
    })

    item_extension.item_scraped(item, spider)

    assert item_extension.stats.get_value('fileitem_count') == 1
    assert item_extension.stats.get_value('file_count', 0) == 0
    assert item_extension.stats.get_value('fileerror_count', 0) == 0
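The assertions in Example #10 imply the extension keys its counters on the item's class name. A minimal sketch of such an extension (consistent with the test; the from_crawler wiring is standard Scrapy):

from scrapy import signals

class KingfisherItemCount:
    # One counter per item class, e.g. 'fileitem_count', 'file_count', 'fileerror_count'.
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        extension = cls(crawler.stats)
        crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
        return extension

    def item_scraped(self, item, spider):
        self.stats.inc_value(type(item).__name__.lower() + '_count')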
Example #11
@pytest.mark.parametrize('sample,directory', [
    (None, os.path.join('test', '20010203_040506')),
    ('true', os.path.join('test_sample', '20010203_040506')),
])
@pytest.mark.parametrize('data', [b'{"key": "value"}', {"key": "value"}])
@pytest.mark.parametrize('item,expected_file_name', [
    (File({'file_name': 'file.json', 'encoding': 'iso-8859-1'}), 'file.json'),
    (FileItem({'number': 1, 'file_name': 'file.json'}), 'file-1.json'),
])
def test_item_scraped_with_file_and_file_item(sample, directory, data, item, expected_file_name, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)
    path = os.path.join(directory, expected_file_name)
    original_file_name = item['file_name']
    item['data'] = data
    extension.item_scraped(item, spider)
    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir
    assert item['file_name'] == original_file_name
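The expected_file_name values in Example #11 show how the files store disambiguates FileItems: the item's number is inserted before the extension. A sketch of that naming rule (the function name is an assumption):

import os

def _file_name(item):
    # For a FileItem, insert the number before the extension: file.json -> file-1.json.
    name, extension = os.path.splitext(item['file_name'])
    if 'number' in item:
        return f"{name}-{item['number']}{extension}"
    return item['file_name']

assert _file_name({'file_name': 'file.json', 'number': 1}) == 'file-1.json'
assert _file_name({'file_name': 'file.json'}) == 'file.json'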