async def test_handled_404s():
    """A 404 response should emit a link-rot message carrying the identifier."""
    redis = FakeRedis()
    stats = StatsManager(redis)
    kafka = FakeProducer()
    rot_producer = AsyncProducer(kafka, 'foo')
    session = RateLimitedClientSession(
        FakeAioSession(corrupt=True, status=404), redis
    )
    ident = '4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d'
    await process_image(
        persister=validate_thumbnail,
        session=session,
        url='fake_url',
        identifier=ident,
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        rot_producer=rot_producer
    )
    # Drain the producer briefly; listen() never returns on its own, so we
    # cancel it via a short timeout.
    producer_task = asyncio.create_task(rot_producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except asyncio.TimeoutError:
        # wait_for raises asyncio.TimeoutError (not the concurrent.futures
        # class, which is only an alias of it on Python 3.8+).
        pass
    rot_msg = kafka.messages[0]
    parsed = json.loads(str(rot_msg, 'utf-8'))
    assert ident == parsed['identifier']
async def producer_fixture():
    """Run one processing task and capture its output in fake Kafka producers.

    :return: tuple of (metadata FakeProducer, retry FakeProducer)
    """
    redis = FakeRedis()
    stats = StatsManager(redis)
    meta_producer = FakeProducer()
    retry_producer = FakeProducer()
    producer = AsyncProducer(meta_producer, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=RateLimitedClientSession(FakeAioSession(), redis),
        url='https://example.gov/hello.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        metadata_producer=producer,
        retry_producer=retry_producer
    )
    # listen() loops forever; run it just long enough to flush messages.
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except asyncio.TimeoutError:
        # asyncio.wait_for raises asyncio.TimeoutError; the previous
        # concurrent.futures.TimeoutError is only an alias on Python 3.8+.
        pass
    return meta_producer, retry_producer
async def test_records_errors():
    """A 403 must record error stats in Redis and queue a retry message."""
    redis = FakeRedis()
    stats = StatsManager(redis)
    session = RateLimitedClientSession(FakeAioSession(status=403), redis)
    retry_producer = FakeProducer()
    producer = AsyncProducer(retry_producer, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=session,
        url='https://example.gov/image.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        retry_producer=producer
    )
    expected_keys = [
        'resize_errors',
        'resize_errors:example',
        'resize_errors:example:403',
        'status60s:example',
        'status1hr:example',
        'status12hr:example'
    ]
    for key in expected_keys:
        val = redis.store[key]
        # Counters are plain ints; the status keys hold one-element windows.
        assert val == 1 or len(val) == 1
    # Flush the retry producer via a short-lived listen() task.
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except asyncio.TimeoutError:
        # wait_for raises asyncio.TimeoutError; concurrent.futures'
        # class is only an alias of it on Python 3.8+.
        pass
    retry = retry_producer.messages[0]
    parsed = json.loads(str(retry, 'utf-8'))
    assert parsed['attempts'] == 1
async def test_handles_corrupt_images_gracefully():
    """Processing a corrupt image must complete without raising."""
    fake_redis = FakeRedis()
    stat_manager = StatsManager(fake_redis)
    corrupt_session = RateLimitedClientSession(
        FakeAioSession(corrupt=True), fake_redis
    )
    await process_image(
        persister=validate_thumbnail,
        session=corrupt_session,
        url='fake_url',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stat_manager,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000)
    )
async def test_pipeline():
    """
    Test that the image processor completes with a fake image.
    """
    # The validate_thumbnail persister callback performs the actual
    # image-content assertions; here we only check the recorded stats.
    fake_redis = FakeRedis()
    stat_manager = StatsManager(fake_redis)
    await process_image(
        persister=validate_thumbnail,
        session=RateLimitedClientSession(FakeAioSession(), fake_redis),
        url='https://example.gov/hello.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stat_manager,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000)
    )
    store = fake_redis.store
    assert store['num_resized'] == 1
    assert store['num_resized:example'] == 1
    assert len(store['status60s:example']) == 1
async def test_records_errors():
    """A 403 response must be tallied under every error/status key."""
    fake_redis = FakeRedis()
    stat_manager = StatsManager(fake_redis)
    forbidden_session = RateLimitedClientSession(
        FakeAioSession(status=403), fake_redis
    )
    await process_image(
        persister=validate_thumbnail,
        session=forbidden_session,
        url='https://example.gov/image.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stat_manager,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000)
    )
    # Counter keys hold the int 1; status keys hold a one-element window.
    for key in (
        'resize_errors',
        'resize_errors:example',
        'resize_errors:example:403',
        'status60s:example',
        'status1hr:example',
        'status12hr:example',
    ):
        recorded = fake_redis.store[key]
        assert recorded == 1 or len(recorded) == 1
async def test_handles_corrupt_images_gracefully():
    """A corrupt image must not raise, even with a metadata producer wired in."""
    redis = FakeRedis()
    stats = StatsManager(redis)
    kafka = FakeProducer()
    producer = AsyncProducer(kafka, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=RateLimitedClientSession(FakeAioSession(corrupt=True), redis),
        url='fake_url',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        metadata_producer=producer
    )
    # Give listen() a brief chance to run so producer-side errors surface.
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except asyncio.TimeoutError:
        # wait_for raises asyncio.TimeoutError (concurrent.futures'
        # TimeoutError is only an alias of it on Python 3.8+).
        pass
async def setup_io():
    """
    Set up all IO used by the scheduler.

    :return A tuple of awaitable tasks
    """
    # S3 persister client, sized to the maximum concurrent task count.
    s3_client = boto3.client(
        's3',
        settings.AWS_DEFAULT_REGION,
        config=botocore.client.Config(max_pool_connections=settings.MAX_TASKS)
    )
    # One underlying Kafka producer fanned out over three topics.
    kafka_producer = Producer({'bootstrap.servers': settings.KAFKA_HOSTS})
    meta_producer = AsyncProducer(kafka_producer, 'image_metadata_updates')
    retry_producer = AsyncProducer(kafka_producer, 'inbound_images')
    rot_producer = AsyncProducer(kafka_producer, 'link_rot')
    redis_client = aredis.StrictRedis(host=settings.REDIS_HOST)
    http_session = RateLimitedClientSession(
        aioclient=aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(ssl=False)
        ),
        redis=redis_client
    )
    stats = StatsManager(redis_client)
    # Pre-bind every dependency of process_image except the per-image args.
    image_processor = partial(
        process_image,
        session=http_session,
        persister=partial(save_thumbnail_s3, s3_client=s3_client),
        stats=stats,
        metadata_producer=meta_producer,
        retry_producer=retry_producer,
        rot_producer=rot_producer
    )
    consumer_settings = {
        'bootstrap.servers': settings.KAFKA_HOSTS,
        'group.id': 'image_handlers',
        'auto.offset.reset': 'earliest'
    }
    scheduler = CrawlScheduler(consumer_settings, redis_client, image_processor)
    return (
        meta_producer.listen(),
        retry_producer.listen(),
        rot_producer.listen(),
        scheduler.schedule_loop()
    )
async def setup_io():
    """
    Set up all IO used by the scheduler.
    """
    kafka_client = kafka_connect()
    # S3 persister client, sized to the maximum concurrent task count.
    s3_client = boto3.client(
        's3',
        settings.AWS_DEFAULT_REGION,
        config=botocore.client.Config(max_pool_connections=settings.MAX_TASKS)
    )
    metadata_topic = kafka_client.topics['image_metadata_updates']
    producer = MetadataProducer(
        producer=metadata_topic.get_producer(use_rdkafka=True)
    )
    redis_client = aredis.StrictRedis(host=settings.REDIS_HOST)
    http_session = RateLimitedClientSession(
        aioclient=aiohttp.ClientSession(),
        redis=redis_client
    )
    stats = StatsManager(redis_client)
    # Pre-bind every dependency of process_image except the per-image args.
    image_processor = partial(
        process_image,
        session=http_session,
        persister=partial(save_thumbnail_s3, s3_client=s3_client),
        stats=stats,
        metadata_producer=producer
    )
    scheduler = CrawlScheduler(kafka_client, redis_client, image_processor)
    return producer.listen(), scheduler.schedule_loop()