async def test_should_raise_error_when_decoder_is_not_callable(self, decoder):
    """Check that iterating ``read_mp`` with the given ``decoder`` raises ``TypeError``.

    The error message is expected to be ``'<decoder> is not callable'``.
    ``decoder`` is supplied from outside this function (its source is not
    visible here).
    """
    expected_message = f'{decoder} is not callable'

    with pytest.raises(TypeError) as raised:
        # Consume the async iterator inside the pytest.raises context.
        async for record in read_mp('foo', decoder=decoder):
            print(record)

    assert str(raised.value) == expected_message
async def test_should_save_correct_output_when_giving_file_url(
    self, page_1_file_url, tmp_path, browser
):
    """Run a ``SeleniumSpider`` on a file url and check its statistics and backup file.

    The spider is configured with ``self.processor`` as item processor and a
    backup file under ``tmp_path``. After the run, the test checks the followed
    and reachable urls, the request counter, the timing statistics, and that
    the backup file holds exactly three items whose author is
    'Albert Einstein', each item carrying a ``datetime`` under ``'date'``.
    """
    backup_path = tmp_path / 'backup.mp'
    config = Configuration(
        item_processors=[self.processor],
        backup_filename=f'{backup_path}',
        selenium_driver_log_file=None,
        selenium_browser=browser,
    )
    spider = SeleniumSpider(urls=[page_1_file_url], parse=self.parse, config=config)
    await spider.run()
    stats = spider.statistics()

    # NOTE(review): str.replace swaps every '1' in the url, so this assumes the
    # page number is the only '1' it contains - confirm against the fixture.
    followed_urls = {
        page_1_file_url.replace('1', '2'),
        page_1_file_url.replace('1', '3'),
    }

    assert followed_urls == stats.followed_urls
    assert {page_1_file_url} | followed_urls == stats.reachable_urls
    assert set() == stats.unreachable_urls
    assert set() == stats.robot_excluded_urls
    # Checked before the average below so that a zero counter shows up as a
    # failed assertion rather than as a ZeroDivisionError.
    assert 3 == stats.request_counter
    assert stats.total_time > 0
    assert stats.average_fetch_time == spider._total_fetch_time / stats.request_counter

    albert_count = 0
    async for item in read_mp(backup_path, decoder=datetime_decoder):
        assert isinstance(item['date'], datetime)
        if item['author'] == 'Albert Einstein':
            albert_count += 1

    assert albert_count == 3
async def test_should_write_bytes_when_giving_content_in_write_mode_without_custom_encoder(
    self, trio_tmp_path
):
    """Write one dict with ``write_mp`` in ``'w'`` mode and read it back with ``read_mp``.

    The value returned by ``write_mp`` must be positive and the file must yield
    exactly the dict that was written.
    """
    payload = {'name': 'Kevin', 'fruit': 'water melon'}
    target = trio_tmp_path / 'data.mp'

    written = await write_mp(target, payload, mode='w')
    assert written > 0

    stored = []
    async for record in read_mp(target):
        stored.append(record)
    assert [payload] == stored
async def test_should_return_python_objects_when_reading_file_without_custom_decoder(
    self, tmp_path, create_msgpack_file
):
    """Read a file prepared by ``create_msgpack_file`` and compare with the given data.

    ``read_mp`` is exercised twice on the same file: once with the path as a
    string and once as a ``trio.Path``.
    """
    expected = [[1, 2], 'hello', {'fruit': 'apple'}]
    mp_file = tmp_path / 'data.mp'
    create_msgpack_file(mp_file, expected)

    for source in [f'{mp_file}', trio.Path(mp_file)]:
        loaded = []
        async for record in read_mp(source):
            loaded.append(record)
        assert loaded == expected
async def test_should_write_bytes_when_giving_content_in_append_mode_without_custom_encoder(
    self, trio_tmp_path
):
    """Append several objects one by one with ``write_mp`` and read them all back.

    Every ``write_mp`` call (mode ``'a'``) must return a positive value, and
    ``read_mp`` must yield the objects in the order they were written.
    """
    entries = ['foo', 4, {'fruit': 'water melon'}, [1, 4]]
    target = trio_tmp_path / 'data.mp'

    for entry in entries:
        written = await write_mp(target, entry, mode='a')
        assert written > 0

    stored = []
    async for record in read_mp(target):
        stored.append(record)
    assert entries == stored
async def test_should_return_python_objects_when_reading_file_with_custom_decoder(
    self, tmp_path, decode_datetime, create_msgpack_file
):
    """Read a file containing a ``datetime`` using ``decode_datetime`` as decoder.

    The file is prepared by ``create_msgpack_file`` and read twice through
    ``read_mp``: once with the path as a string and once as a ``trio.Path``.
    """
    expected = ['hello', datetime.now()]
    mp_file = tmp_path / 'data.mp'
    create_msgpack_file(mp_file, expected)

    for source in [str(mp_file), trio.Path(mp_file)]:
        loaded = []
        async for record in read_mp(source, decoder=decode_datetime):
            loaded.append(record)
        assert loaded == expected
async def common_assert(stats: SpiderStatistics, backup_path: Path):
    """Run the checks shared by several tests on spider statistics and the backup file.

    Asserts that no url is unreachable or robot-excluded, that the total time
    is positive, that every item read from ``backup_path`` has a ``datetime``
    under ``'date'``, and that exactly three items have 'Albert Einstein' as
    author.
    """
    assert set() == stats.unreachable_urls
    assert set() == stats.robot_excluded_urls
    assert stats.total_time > 0

    einstein_items = 0
    async for record in read_mp(backup_path, decoder=datetime_decoder):
        assert isinstance(record['date'], datetime)
        if record['author'] != 'Albert Einstein':
            continue
        einstein_items += 1

    assert einstein_items == 3
async def main() -> None:
    """Crawl quotes.toscrape.com with a ``StaticSpider`` and print what was saved.

    Items are backed up in ``backup.mp`` next to this script. Once the run is
    over, the spider statistics are printed, followed by every item read back
    from the backup file.
    """
    script_dir = Path(__file__).parent
    backup = script_dir / 'backup.mp'

    config = Configuration(
        backup_filename=f'{backup}',
        item_processors=[date_processor],
    )
    spider = StaticSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=config,
    )
    await spider.run()
    print(spider.statistics())

    # you can do whatever you want with the results
    saved_items = read_mp(backup, decoder=spider.config.msgpack_decoder)
    async for saved in saved_items:
        print(saved)
async def main() -> None:
    """Crawl quotes.toscrape.com with a ``SeleniumSpider`` and print what was saved.

    Items are backed up in ``backup.mp`` next to this script. Once the run is
    over, the spider statistics are printed, followed by every quote read back
    from the backup file with ``datetime_decoder``.
    """
    script_dir = Path(__file__).parent
    backup = script_dir / 'backup.mp'

    config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup}',
        item_processors=[date_processor],
    )
    spider = SeleniumSpider(
        urls=['http://quotes.toscrape.com'],
        parse=parse,
        config=config,
    )
    await spider.run()
    print(spider.statistics())

    # you can do whatever you want with the results
    saved_quotes = read_mp(filename=backup, decoder=datetime_decoder)
    async for saved in saved_quotes:
        print(saved)
async def test_should_write_bytes_when_giving_content_in_write_mode_with_custom_encoder(
    self, trio_tmp_path, encode_datetime, decode_datetime
):
    """Write a dict holding a ``datetime`` in ``'w'`` mode with a custom encoder.

    ``write_mp`` receives the file name as a string and ``encode_datetime`` as
    encoder; the content is then read back through ``read_mp`` with
    ``decode_datetime`` and compared with what was written.
    """
    payload = {'name': 'Kevin', 'date': datetime.now()}
    target = trio_tmp_path / 'data.mp'

    written = await write_mp(f'{target}', payload, mode='w', encoder=encode_datetime)
    assert written > 0

    stored = []
    async for record in read_mp(target, decoder=decode_datetime):
        stored.append(record)
    assert [payload] == stored
async def test_should_write_bytes_when_giving_content_in_append_mode_with_custom_encoder(
    self, trio_tmp_path, encode_datetime, decode_datetime
):
    """Append a string and a ``datetime`` one by one with a custom encoder.

    Every ``write_mp`` call (mode ``'a'``, ``encode_datetime`` as encoder) must
    return a positive value; reading back through ``read_mp`` with
    ``decode_datetime`` must give the objects in the order they were written.
    """
    entries = ['foo', datetime.now()]
    target = trio_tmp_path / 'data.mp'

    for entry in entries:
        written = await write_mp(f'{target}', entry, mode='a', encoder=encode_datetime)
        assert written > 0

    stored = []
    async for record in read_mp(target, decoder=decode_datetime):
        stored.append(record)
    assert entries == stored
async def main() -> None:
    """Crawl httpbin.org with a ``SeleniumSpider`` and pretty-print the saved items.

    Items are backed up in ``backup.mp`` next to this script. Once the run is
    over, the spider statistics are printed; then, for every item read back
    from the backup file, its title, description and list of operations
    (method, path, description) are printed.
    """
    script_dir = Path(__file__).parent
    backup = script_dir / 'backup.mp'

    config = Configuration(
        selenium_driver_log_file=None,
        backup_filename=f'{backup}',
        item_processors=[date_processor],
    )
    sel_spider = SeleniumSpider(
        urls=['http://httpbin.org/'],
        parse=parse,
        config=config,
    )
    await sel_spider.run()
    print(sel_spider.statistics())

    # you can do whatever you want with the results
    async for section in read_mp(filename=backup, decoder=datetime_decoder):
        print('****', section['title'], '****')
        print(section['description'])
        print('== operations ==')
        for op in section['operations']:
            print('\tmethod:', op['method'])
            print('\tpath:', op['path'])
            print('\tdescription:', op['description'], end='\n\n')
async def test_should_save_content_to_backup_file(self, tmp_path, capsys):
    """Check that ``save_item`` runs the item processors and fills the backup file.

    Two items are saved through a ``StaticSpider`` configured with a processor
    that prints a marker and returns the item unchanged. The backup file must
    then hold both items in order, and the marker must appear on stdout.
    """

    def processor(item):
        print("I'm a processor")
        return item

    backup = tmp_path / 'backup.mp'
    fruit_1 = {'fruit': 'pineapple'}
    fruit_2 = {'fruit': 'orange'}

    config = Configuration(
        backup_filename=f'{backup.resolve()}',
        item_processors=[processor],
    )
    static_spider = StaticSpider(
        urls=['https://foo.com'],
        parse=lambda x, y: None,
        config=config,
    )

    for fruit in (fruit_1, fruit_2):
        await static_spider.save_item(fruit)

    captured = capsys.readouterr()

    saved = []
    async for record in read_mp(f'{backup.resolve()}'):
        saved.append(record)

    assert [fruit_1, fruit_2] == saved
    assert "I'm a processor" in captured.out