Example 1
async def test_handled_404s():
    # A 404 for the image URL should be handled and reported to the link rot producer.
    redis = FakeRedis()
    stats = StatsManager(redis)
    kafka = FakeProducer()
    rot_producer = AsyncProducer(kafka, 'foo')
    session = RateLimitedClientSession(
        FakeAioSession(corrupt=True, status=404), redis
    )
    ident = '4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d'
    await process_image(
        persister=validate_thumbnail,
        session=session,
        url='fake_url',
        identifier=ident,
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        rot_producer=rot_producer
    )
    producer_task = asyncio.create_task(rot_producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except concurrent.futures.TimeoutError:
        pass
    rot_msg = kafka.messages[0]
    parsed = json.loads(str(rot_msg, 'utf-8'))
    assert ident == parsed['identifier']
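The last several lines above show a pattern repeated throughout these tests: start the producer's listen() loop as a task, give it a brief window to flush its messages, and swallow the timeout. A minimal sketch of that pattern as a reusable helper follows; the helper name and the combined exception tuple are assumptions, not taken from the source.

import asyncio
import concurrent.futures

async def drain_producer(producer, timeout=0.01):
    # Run the producer's listen() loop just long enough to flush any
    # pending messages; expiry of the window is the expected outcome.
    task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(task, timeout)
    except (asyncio.TimeoutError, concurrent.futures.TimeoutError):
        pass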
Example 2
async def producer_fixture():
    # Run a processing task and capture the metadata results in a mock kafka
    # producer
    redis = FakeRedis()
    stats = StatsManager(redis)
    meta_producer = FakeProducer()
    retry_producer = FakeProducer()
    producer = AsyncProducer(meta_producer, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=RateLimitedClientSession(FakeAioSession(), redis),
        url='https://example.gov/hello.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        metadata_producer=producer,
        retry_producer=retry_producer
    )
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except concurrent.futures.TimeoutError:
        pass
    return meta_producer, retry_producer
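A sketch of how this fixture's return values might be consumed, awaited directly rather than through pytest's fixture machinery; the test name and exact message counts are illustrative assumptions (FakeProducer's messages list is visible in the other examples).

async def test_sends_metadata():
    meta_producer, retry_producer = await producer_fixture()
    # On the happy path sketched above we would expect metadata to be
    # published and no retry to be queued (counts are an assumption).
    assert len(meta_producer.messages) == 1
    assert not retry_producer.messages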
Example 3
async def test_records_errors():
    # A 403 response should be recorded under every error/status key and the
    # image queued for retry with an attempt count.
    redis = FakeRedis()
    stats = StatsManager(redis)
    session = RateLimitedClientSession(FakeAioSession(status=403), redis)
    retry_producer = FakeProducer()
    producer = AsyncProducer(retry_producer, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=session,
        url='https://example.gov/image.jpg',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        retry_producer=producer
    )
    expected_keys = [
        'resize_errors',
        'resize_errors:example',
        'resize_errors:example:403',
        'status60s:example',
        'status1hr:example',
        'status12hr:example'
    ]
    for key in expected_keys:
        val = redis.store[key]
        assert val == 1 or len(val) == 1
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except concurrent.futures.TimeoutError:
        pass
    retry = retry_producer.messages[0]
    parsed = json.loads(str(retry, 'utf-8'))
    assert parsed['attempts'] == 1
Example 4
async def test_handles_corrupt_images_gracefully():
    # Corrupt image data should be handled without raising an exception.
    redis = FakeRedis()
    stats = StatsManager(redis)
    await process_image(persister=validate_thumbnail,
                        session=RateLimitedClientSession(
                            FakeAioSession(corrupt=True), redis),
                        url='fake_url',
                        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
                        stats=stats,
                        source='example',
                        semaphore=asyncio.BoundedSemaphore(1000))
Example 5
async def test_pipeline():
    """ Test that the image processor completes with a fake image. """
    # validate_thumbnail callback performs the actual assertions
    redis = FakeRedis()
    stats = StatsManager(redis)
    await process_image(persister=validate_thumbnail,
                        session=RateLimitedClientSession(
                            FakeAioSession(), redis),
                        url='https://example.gov/hello.jpg',
                        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
                        stats=stats,
                        source='example',
                        semaphore=asyncio.BoundedSemaphore(1000))
    assert redis.store['num_resized'] == 1
    assert redis.store['num_resized:example'] == 1
    assert len(redis.store['status60s:example']) == 1
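The validate_thumbnail persister referenced in the comment performs the actual assertions on the resized image. Its real definition is not shown in these examples; a hypothetical sketch of its shape, with an assumed signature and assumed checks, might look like this.

def validate_thumbnail(img, identifier):
    # Hypothetical stand-in for the project's persister callback: verify a
    # decoded image arrived with the expected identifier. Both checks are
    # assumptions about what the real helper asserts.
    assert img.size[0] > 0 and img.size[1] > 0
    assert identifier == '4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d'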
Example 6
async def test_records_errors():
    # A 403 response should be recorded under every expected error and status key.
    redis = FakeRedis()
    stats = StatsManager(redis)
    session = RateLimitedClientSession(FakeAioSession(status=403), redis)
    await process_image(persister=validate_thumbnail,
                        session=session,
                        url='https://example.gov/image.jpg',
                        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
                        stats=stats,
                        source='example',
                        semaphore=asyncio.BoundedSemaphore(1000))
    expected_keys = [
        'resize_errors', 'resize_errors:example', 'resize_errors:example:403',
        'status60s:example', 'status1hr:example', 'status12hr:example'
    ]
    for key in expected_keys:
        val = redis.store[key]
        assert val == 1 or len(val) == 1
Example 7
async def test_handles_corrupt_images_gracefully():
    # Corrupt image data should be handled without raising, even with a
    # metadata producer attached.
    redis = FakeRedis()
    stats = StatsManager(redis)
    kafka = FakeProducer()
    producer = AsyncProducer(kafka, 'foo')
    await process_image(
        persister=validate_thumbnail,
        session=RateLimitedClientSession(FakeAioSession(corrupt=True), redis),
        url='fake_url',
        identifier='4bbfe191-1cca-4b9e-aff0-1d3044ef3f2d',
        stats=stats,
        source='example',
        semaphore=asyncio.BoundedSemaphore(1000),
        metadata_producer=producer
    )
    producer_task = asyncio.create_task(producer.listen())
    try:
        await asyncio.wait_for(producer_task, 0.01)
    except concurrent.futures.TimeoutError:
        pass
Example 8
async def setup_io():
    """
    Set up all IO used by the scheduler.

    :return: A tuple of awaitable tasks.
    """
    s3 = boto3.client(
        's3',
        settings.AWS_DEFAULT_REGION,
        config=botocore.client.Config(max_pool_connections=settings.MAX_TASKS))
    producer = Producer({'bootstrap.servers': settings.KAFKA_HOSTS})
    metadata_producer = AsyncProducer(producer, 'image_metadata_updates')
    retry_producer = AsyncProducer(producer, 'inbound_images')
    link_rot_producer = AsyncProducer(producer, 'link_rot')
    redis_client = aredis.StrictRedis(host=settings.REDIS_HOST)
    connector = aiohttp.TCPConnector(ssl=False)
    aiosession = RateLimitedClientSession(
        aioclient=aiohttp.ClientSession(connector=connector),
        redis=redis_client)
    stats = StatsManager(redis_client)
    image_processor = partial(process_image,
                              session=aiosession,
                              persister=partial(save_thumbnail_s3,
                                                s3_client=s3),
                              stats=stats,
                              metadata_producer=metadata_producer,
                              retry_producer=retry_producer,
                              rot_producer=link_rot_producer)
    consumer_settings = {
        'bootstrap.servers': settings.KAFKA_HOSTS,
        'group.id': 'image_handlers',
        'auto.offset.reset': 'earliest'
    }
    scheduler = CrawlScheduler(consumer_settings, redis_client,
                               image_processor)
    return (metadata_producer.listen(), retry_producer.listen(),
            link_rot_producer.listen(), scheduler.schedule_loop())
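The tuple of awaitables returned here is meant to be driven concurrently by the caller. A minimal sketch of such a driver, assuming a plain asyncio entry point (the main() wrapper is not part of the source):

import asyncio

async def main():
    tasks = await setup_io()
    # Run the producer listeners and the scheduler loop side by side.
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    asyncio.run(main())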
Example 9
async def setup_io():
    """
    Set up all IO used by the scheduler.
    """
    kafka_client = kafka_connect()
    s3 = boto3.client(
        's3',
        settings.AWS_DEFAULT_REGION,
        config=botocore.client.Config(max_pool_connections=settings.MAX_TASKS))
    metadata_updates = kafka_client.topics['image_metadata_updates'] \
        .get_producer(use_rdkafka=True)
    producer = MetadataProducer(producer=metadata_updates)
    redis_client = aredis.StrictRedis(host=settings.REDIS_HOST)
    aiosession = RateLimitedClientSession(aioclient=aiohttp.ClientSession(),
                                          redis=redis_client)
    stats = StatsManager(redis_client)
    image_processor = partial(process_image,
                              session=aiosession,
                              persister=partial(save_thumbnail_s3,
                                                s3_client=s3),
                              stats=stats,
                              metadata_producer=producer)
    scheduler = CrawlScheduler(kafka_client, redis_client, image_processor)
    return producer.listen(), scheduler.schedule_loop()