Example #1
0
def test_spider_closed_without_items():
    """Closing a spider that scraped no items writes the close reason to the CSV."""
    with TemporaryDirectory() as tmpdirname:
        spider = spider_with_crawler(
            settings={'KINGFISHER_PLUCK_PATH': tmpdirname},
            release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)

        extension.spider_closed(spider, 'itemcount')

        csv_path = os.path.join(tmpdirname, 'pluck-release-date.csv')
        with open(csv_path) as f:
            assert f.read() == 'closed: itemcount,test\n'
Example #2
0
def test_disabled():
    """Without a release pointer the extension is disabled and writes no CSV file."""
    with TemporaryDirectory() as tmpdirname:
        spider = spider_with_crawler(
            settings={'KINGFISHER_PLUCK_PATH': tmpdirname})
        extension = KingfisherPluck.from_crawler(spider.crawler)

        extension.item_scraped(PluckedItem({'value': '2020-10-01'}), spider)
        extension.spider_closed(spider, 'itemcount')

        # No pluck output of any kind should exist in the directory.
        pattern = os.path.join(tmpdirname, 'pluck*.csv')
        assert glob(pattern) == []
Example #3
0
def test_bytes_received_dont_stop_download():
    """Receiving fewer bytes than the configured maximum only accumulates the count."""
    with TemporaryDirectory() as tmpdirname:
        settings = {
            'KINGFISHER_PLUCK_PATH': tmpdirname,
            'KINGFISHER_PLUCK_MAX_BYTES': 10,
        }
        spider = spider_with_crawler(settings=settings, release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)
        request = Request('http://example.com',
                          meta={'file_name': 'test.json'})

        extension.bytes_received(data=b'12345', spider=spider, request=request)

        # 5 bytes received is below the 10-byte cap, so nothing is stopped.
        assert 5 == extension.total_bytes_received
        assert 10 == extension.max_bytes
Example #4
0
def test_item_scraped():
    """The plucked value is written once; repeated items from the same spider are ignored."""
    with TemporaryDirectory() as tmpdirname:
        spider = spider_with_crawler(
            settings={'KINGFISHER_PLUCK_PATH': tmpdirname},
            release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)
        item = PluckedItem({'value': '2020-10-01'})
        csv_path = os.path.join(tmpdirname, 'pluck-release-date.csv')

        # Scrape the same item twice: only one row per spider is ever written,
        # so the file content is identical after each pass.
        for _ in range(2):
            extension.item_scraped(item, spider)
            with open(csv_path) as f:
                assert f.read() == '2020-10-01,test\n'
Example #5
0
def test_bytes_received_ignored_requests(test_request, spider_class,
                                         attributes):
    """Requests matching an ignore condition contribute nothing to the byte total."""
    with TemporaryDirectory() as tmpdirname:
        settings = {
            'KINGFISHER_PLUCK_PATH': tmpdirname,
            'KINGFISHER_PLUCK_MAX_BYTES': 10,
        }
        spider = spider_with_crawler(spider_class=spider_class,
                                     release_pointer='/date',
                                     settings=settings)
        # Apply the parametrized spider attributes that mark the request as ignored.
        for attr, value in attributes.items():
            setattr(spider, attr, value)

        extension = KingfisherPluck.from_crawler(spider.crawler)

        extension.bytes_received(data=b'12345',
                                 spider=spider,
                                 request=test_request)

        assert 0 == extension.total_bytes_received