Beispiel #1
0
    def test_should_return_python_objects_when_reading_file_without_custom_decoder(self, tmp_path, create_msgpack_file):
        """Check that read_mp, called without a decoder, gives back the prepared data.

        The file is addressed once as a path object and once as a plain string.
        """
        expected = [[1, 2], 'hello', {'fruit': 'apple'}]
        target = tmp_path / 'data.mp'
        create_msgpack_file(target, expected)

        # Same file, two ways of naming it: path object and its string form.
        for location in (target, f'{target}'):
            assert list(read_mp(location)) == expected
Beispiel #2
0
    def test_should_write_bytes_when_giving_content_in_write_mode_without_custom_encoder(self, tmp_path):
        """Check that write_mp in 'w' mode reports a positive length and the content reads back."""
        payload = {'name': 'Kevin', 'fruit': 'water melon'}
        target = tmp_path / 'data.mp'

        written = write_mp(target, payload, mode='w')
        assert written > 0

        # A single write must produce exactly one readable item.
        stored = list(read_mp(target))
        assert [payload] == stored
Beispiel #3
0
    def test_should_save_correct_output_when_giving_file_url(
            self, page_1_file_url, tmp_path, browser):
        """Run a SeleniumSpider from a file URL and verify its statistics and backup file.

        The spider starts at ``page_1_file_url`` with ``self.parse`` and
        ``self.processor``. The statistics must report the two derived URLs as
        followed, three requests in total and no unreachable or robot-excluded
        URLs. Every item read back from the backup file must carry a
        ``datetime`` under ``'date'``, and exactly three items must have
        Albert Einstein as author.
        """
        backup_path = tmp_path / 'backup.mp'
        config = Configuration(item_processors=[self.processor],
                               backup_filename=f'{backup_path}',
                               selenium_driver_log_file=None,
                               selenium_browser=browser)
        spider = SeleniumSpider(urls=[page_1_file_url],
                                parse=self.parse,
                                config=config)
        spider.run()
        stats = spider.statistics()
        # NOTE(review): str.replace substitutes every '1' in the URL, not only
        # the page number - presumably the fixture URL contains a single '1';
        # confirm against the fixture.
        followed_urls = {
            page_1_file_url.replace('1', '2'),
            page_1_file_url.replace('1', '3')
        }

        assert followed_urls == stats.followed_urls
        assert {page_1_file_url} | followed_urls == stats.reachable_urls
        assert 3 == stats.request_counter
        assert stats.total_time > 0
        assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter
        assert set() == stats.unreachable_urls
        assert set() == stats.robot_excluded_urls

        # Every saved item must have a decoded datetime; count one author's items.
        albert_count = 0
        for item in read_mp(backup_path, decoder=datetime_decoder):
            assert isinstance(item['date'], datetime)
            if item['author'] == 'Albert Einstein':
                albert_count += 1

        assert albert_count == 3
Beispiel #4
0
    def test_should_return_python_object_when_reading_file_with_custom_decoder(
        self, tmp_path, create_msgpack_file, decode_datetime
    ):
        """Check that read_mp with the decode_datetime fixture gives back the prepared data,
        including a datetime value."""
        expected = ['hello', datetime.now()]
        target = tmp_path / 'data.mp'
        create_msgpack_file(target, expected)

        decoded = list(read_mp(target, decoder=decode_datetime))
        assert decoded == expected
Beispiel #5
0
    def test_should_write_bytes_when_giving_content_in_append_mode_without_custom_encoder(self, tmp_path):
        """Check that items appended one by one with write_mp are read back in the same order."""
        entries = ['foo', 4, {'fruit': 'water melon'}, [1, 4]]
        target = tmp_path / 'data.mp'

        # Each append must report a positive length.
        for entry in entries:
            assert write_mp(target, entry, mode='a') > 0

        assert entries == list(read_mp(target))
Beispiel #6
0
    def test_should_write_bytes_when_giving_content_in_write_mode_with_custom_encoder(
        self, tmp_path, encode_datetime, decode_datetime
    ):
        """Check a 'w'-mode write with the encode_datetime fixture, read back with decode_datetime.

        The file name is passed to write_mp as a string and to read_mp as a path object.
        """
        payload = {'name': 'Kevin', 'date': datetime.now()}
        target = tmp_path / 'data.mp'

        written = write_mp(f'{target}', payload, mode='w', encoder=encode_datetime)
        assert written > 0

        stored = list(read_mp(target, decoder=decode_datetime))
        assert [payload] == stored
Beispiel #7
0
    def test_should_write_bytes_when_giving_content_in_append_mode_with_custom_encoder(
        self, tmp_path, encode_datetime, decode_datetime
    ):
        """Check that items appended with the encode_datetime fixture are read back in order
        when decoded with decode_datetime."""
        entries = ['foo', datetime.now()]
        target = tmp_path / 'data.mp'

        # Each append must report a positive length.
        for entry in entries:
            assert write_mp(f'{target}', entry, mode='a', encoder=encode_datetime) > 0

        assert entries == list(read_mp(target, decoder=decode_datetime))
Beispiel #8
0
    def test_should_raise_error_when_decoder_is_not_callable(self, decoder):
        """Check that consuming read_mp with the given decoder raises TypeError with the
        expected message."""
        # The call stays inside the context manager so the error is caught whether
        # it is raised when read_mp is called or when the first item is requested.
        with pytest.raises(TypeError) as raised:
            next(read_mp('foo.mp', decoder=decoder))

        expected_message = f'{decoder} is not callable'
        assert expected_message == str(raised.value)
Beispiel #9
0
    next_link = None
    try:
        element = response.driver.find_element_by_xpath(
            '//nav/ul/li[@class="next"]/a')
        next_link = element.get_attribute('href')
    except NoSuchElementException:
        pass

    if next_link is not None:
        response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Stamp *item* with the current local time under the ``'date'`` key.

    The dictionary is modified in place and the same object is returned.
    """
    item.update(date=datetime.now())
    return item


if __name__ == '__main__':
    # Saved items go to a backup file located next to this script.
    backup_path = Path(__file__).parent / 'backup.mp'
    spider_config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup_path}',
        item_processors=[date_processor],
    )
    sel_spider = SeleniumSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=spider_config,
    )
    sel_spider.run()
    print(sel_spider.statistics())
    # you can do whatever you want with the results
    for quote_data in read_mp(filename=backup_path, decoder=datetime_decoder):
        print(quote_data)
Beispiel #10
0
            'message': quote.xpath('./span[@class="text"]/text()').get(),
            'author': quote.xpath('./span/small/text()').get(),
            'tags': quote.xpath('./div/a/text()').getall(),
        }
        spider.save_item(data)

    next_link = response.xpath('//nav/ul/li[@class="next"]/a').xpath(
        '@href').get()
    if next_link is not None:
        response.follow(next_link)


def date_processor(item: dict) -> dict:
    """Stamp *item* with the current local time under the ``'date'`` key.

    The dictionary is modified in place and the same object is returned.
    """
    item.update(date=datetime.now())
    return item


if __name__ == '__main__':
    # Saved items go to a backup file located next to this script.
    backup_path = Path(__file__).parent / 'backup.mp'
    spider_config = Configuration(
        backup_filename=f'{backup_path}',
        item_processors=[date_processor],
    )
    spider = StaticSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=spider_config,
    )
    spider.run()
    print(spider.statistics())
    # you can do whatever you want with the results
    # The decoder is taken from the spider's own configuration.
    saved_quotes = read_mp(filename=backup_path,
                           decoder=spider.config.msgpack_decoder)
    for quote_data in saved_quotes:
        print(quote_data)