Ejemplo n.º 1
0
async def test_task_monitor(autojump_clock, nursery):
    ''' Check that a task monitor subscription reports the task tree.

    Two dummy parent tasks (each with two children) are started next to the
    subscription itself. The first event must arrive right away and describe
    that tree; the second event must not arrive before the 2 second period
    configured on the subscription has elapsed. '''
    # To simplify testing, we pick the current task as the root task:
    root_task = trio.hazmat.current_task()
    websocket = MockWebsocket()
    subscription = TaskMonitorSubscription(id_=1,
                                           websocket=websocket,
                                           period=2.0,
                                           root_task=root_task)

    # We create a few dummy tasks that will show up in the task monitor.
    async def dummy_parent(task_status):
        async with trio.open_nursery() as inner:
            await inner.start(dummy_child, name='Dummy Child 1')
            await inner.start(dummy_child, name='Dummy Child 2')
            task_status.started()

    async def dummy_child(task_status):
        task_status.started()
        await trio.sleep_forever()

    await nursery.start(dummy_parent, name='Dummy Parent 1')
    await nursery.start(dummy_parent, name='Dummy Parent 2')
    nursery.start_soon(subscription.run, name='Task Monitor Subscription')

    # We should receive the first event right away.
    with assert_max_elapsed(0.1):
        data = await websocket.get_message()
        event1 = ServerMessage.FromString(data).event
        assert event1.subscription_id == 1
        task_tree = event1.task_tree
        assert task_tree.name == '<Root>'
        subtask_1 = task_tree.subtasks[0]
        assert subtask_1.name == 'Dummy Parent 1'
        subtask_1_1 = subtask_1.subtasks[0]
        assert subtask_1_1.name == 'Dummy Child 1'
        subtask_1_2 = subtask_1.subtasks[1]
        assert subtask_1_2.name == 'Dummy Child 2'
        subtask_2 = task_tree.subtasks[1]
        assert subtask_2.name == 'Dummy Parent 2'
        subtask_2_1 = subtask_2.subtasks[0]
        assert subtask_2_1.name == 'Dummy Child 1'
        subtask_2_2 = subtask_2.subtasks[1]
        assert subtask_2_2.name == 'Dummy Child 2'
        subtask_3 = task_tree.subtasks[2]
        assert subtask_3.name == 'Task Monitor Subscription'

    # The second event won't arrive for two more seconds.
    with assert_min_elapsed(2.0):
        data = await websocket.get_message()
        event2 = ServerMessage.FromString(data).event
        # Inspect the freshly received event, not the first one again.
        assert event2.subscription_id == 1
        task_tree = event2.task_tree
        assert task_tree.name == '<Root>'
        assert len(task_tree.subtasks) == 3

    subscription.cancel()
Ejemplo n.º 2
0
async def test_profile(client):
    ''' Request a performance profile and sanity-check the response. '''
    # Ask for the top 5 functions, sorted by call count.
    req = Request()
    req.request_id = 1
    req.performance_profile.duration = 0.1
    req.performance_profile.sort_by = 'calls'
    req.performance_profile.top_n = 5
    await client.send_message(req.SerializeToString())

    # Decode the reply and check that it answers our request.
    reply = ServerMessage.FromString(await client.get_message())
    response = reply.response
    assert response.request_id == 1

    profile = response.performance_profile
    assert profile.total_calls > 1
    assert profile.total_time > 0.1
    assert len(profile.functions) == 5
Ejemplo n.º 3
0
async def test_job_state_subscription(autojump_clock, nursery):
    ''' Check that a job status subscription sends full, then partial, updates.

    Two running jobs are registered with a stats tracker. The first event must
    contain both jobs; each later event must arrive no sooner than the
    2 second ``min_interval`` and contain only the job whose document was
    modified. After cancelling, no further events may be sent. '''
    job1_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa')
    job2_id = UUID('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb')
    job1_doc = {
        'id': str(job1_id),
        'name': 'Job #1',
        'seeds': ['https://job1.example'],
        'tags': ['tag1a', 'tag1b'],
        'item_count': 10,
        'http_success_count': 7,
        'http_error_count': 2,
        'exception_count': 1,
        'http_status_counts': {
            200: 7,
            404: 2
        },
        'started_at': datetime(2019, 1, 25, 14, 44, 0, tzinfo=timezone.utc),
        'completed_at': None,
        'run_state': 'RUNNING',
    }
    job2_doc = {
        'id': str(job2_id),
        'name': 'Job #2',
        'seeds': ['https://job2.example'],
        'tags': ['tag2a'],
        'item_count': 20,
        'http_success_count': 14,
        'http_error_count': 4,
        'exception_count': 2,
        'http_status_counts': {
            200: 14,
            404: 4
        },
        'started_at': datetime(2019, 1, 25, 14, 55, 0, tzinfo=timezone.utc),
        'completed_at': None,
        'run_state': 'RUNNING',
    }
    stats_tracker = StatsTracker(timedelta(seconds=60))
    stats_tracker.add_job(job1_doc)
    stats_tracker.add_job(job2_doc)
    websocket = MockWebsocket()
    subscription = JobStatusSubscription(id_=1,
                                         stats_tracker=stats_tracker,
                                         websocket=websocket,
                                         min_interval=2)
    assert repr(subscription) == '<JobStatusSubscription id=1>'
    with pytest.raises(Exception):
        # Can't cancel before it starts running:
        subscription.cancel()
    nursery.start_soon(subscription.run)

    # The first two items should be received immediately and in full.
    with assert_max_elapsed(0.1):
        data = await websocket.get_message()
        message1 = ServerMessage.FromString(data).event
        assert message1.subscription_id == 1
        assert len(message1.job_list.jobs) == 2
        job1 = message1.job_list.jobs[0]
        assert job1.job_id == job1_id.bytes
        assert job1.name == 'Job #1'
        assert job1.seeds[0] == 'https://job1.example'
        assert job1.tags[0] == 'tag1a'
        assert job1.tags[1] == 'tag1b'
        assert job1.item_count == 10
        assert job1.http_success_count == 7
        assert job1.http_error_count == 2
        assert job1.exception_count == 1
        assert job1.http_status_counts[200] == 7
        assert job1.http_status_counts[404] == 2
        assert job1.started_at == '2019-01-25T14:44:00+00:00'
        assert not job1.HasField('completed_at')
        assert job1.run_state == JobRunState.Value('RUNNING')

        job2 = message1.job_list.jobs[1]
        assert job2.job_id == job2_id.bytes
        assert job2.name == 'Job #2'
        assert job2.seeds[0] == 'https://job2.example'
        assert job2.tags[0] == 'tag2a'
        assert job2.item_count == 20
        assert job2.http_success_count == 14
        assert job2.http_error_count == 4
        assert job2.exception_count == 2
        assert job2.http_status_counts[200] == 14
        assert job2.http_status_counts[404] == 4
        assert job2.started_at == '2019-01-25T14:55:00+00:00'
        assert not job2.HasField('completed_at')
        assert job2.run_state == JobRunState.Value('RUNNING')

    # Add 1 item to job 1. Two seconds later, we should get an update for job 1
    # but not job 2.
    with assert_min_elapsed(2):
        # The document is mutated in place: it is the same dict object that
        # was handed to the stats tracker above.
        job1_doc.update({
            'item_count': 11,
            'http_success_count': 8,
            'http_status_counts': {
                200: 8,
                404: 2
            },
        })
        data = await websocket.get_message()
        message2 = ServerMessage.FromString(data).event
        assert message2.subscription_id == 1
        assert len(message2.job_list.jobs) == 1
        job1 = message2.job_list.jobs[0]
        assert job1.name == 'Job #1'
        assert job1.seeds[0] == 'https://job1.example'
        assert job1.tags[0] == 'tag1a'
        assert job1.tags[1] == 'tag1b'
        assert job1.item_count == 11
        assert job1.http_success_count == 8
        assert job1.http_error_count == 2
        assert job1.exception_count == 1
        assert job1.http_status_counts[200] == 8
        assert job1.http_status_counts[404] == 2
        assert job1.started_at == '2019-01-25T14:44:00+00:00'
        assert not job1.HasField('completed_at')
        assert job1.run_state == JobRunState.Value('RUNNING')

    # Add 2 items to job 2. Two seconds later, we should get an update for job 2
    # but not job 1.
    with assert_min_elapsed(2):
        job2_doc.update({
            'item_count': 22,
            'http_success_count': 15,
            'http_error_count': 5,
            'http_status_counts': {
                200: 15,
                404: 5
            },
        })
        data = await websocket.get_message()
        message3 = ServerMessage.FromString(data).event
        assert message3.subscription_id == 1
        assert len(message3.job_list.jobs) == 1
        job2 = message3.job_list.jobs[0]
        assert job2.name == 'Job #2'
        assert job2.seeds[0] == 'https://job2.example'
        assert job2.tags[0] == 'tag2a'
        assert job2.item_count == 22
        assert job2.http_success_count == 15
        assert job2.http_error_count == 5
        assert job2.exception_count == 2
        assert job2.http_status_counts[200] == 15
        assert job2.http_status_counts[404] == 5
        assert job2.started_at == '2019-01-25T14:55:00+00:00'
        assert job2.run_state == JobRunState.Value('RUNNING')

    # Cancel the subscription and wait 2 seconds to make sure it doesn't send us
    # any more events.
    subscription.cancel()
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(2):
            await websocket.get_message()
Ejemplo n.º 4
0
    # Instantiate subscription. Ask for 3 historical measurements, but only 2
    # are available so it should just send those 2.
    websocket = MockWebsocket()
    subscription = ResourceMonitorSubscription(
        id_=1,
        websocket=websocket,
        resource_monitor=resource_monitor,
        history=3)
    assert repr(subscription) == '<ResourceMonitorSubscription id=1>'
    nursery.start_soon(subscription.run)

    # We should be able to read two events immediately.
    with assert_max_elapsed(0.1):
        data = await websocket.get_message()
        event1 = ServerMessage.FromString(data).event
        assert event1.subscription_id == 1
        frame1 = event1.resource_frame
        assert frame1.timestamp == '2019-01-25T00:00:00+00:00'
        assert frame1.cpus[0].usage == 0.99
        assert frame1.cpus[1].usage == 0.55
        assert frame1.memory.used == 1_000_000
        assert frame1.memory.total == 2_000_000
        assert frame1.disks[0].mount == '/root'
        assert frame1.disks[0].used == 3_000_000
        assert frame1.disks[0].total == 4_000_000
        assert frame1.disks[1].mount == '/home'
        assert frame1.disks[1].used == 5_000_000
        assert frame1.disks[1].total == 6_000_000
        assert frame1.networks[0].name == 'eth0'
        assert frame1.networks[0].sent == 7_000_000
Ejemplo n.º 5
0
async def test_subscribe_to_crawl(db_pool, job_table, response_table,
                                  response_body_table, nursery):
    """ Sync a completed job holding 3 items, then resume from a sync token.

    A first subscription reads two items and is cancelled. A second
    subscription is started with the token taken from the first item; it is
    expected to deliver the second item again, then the third item, then a
    close event. """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")
    job_key = str(job_id)

    # Pre-compressed bodies used by the first and third responses.
    gzip_body_1 = (b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                   b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00")
    gzip_body_2 = (b"\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM"
                   b"\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00")

    # Create sample data: a job with 3 downloaded items.
    async with db_pool.connection() as db_conn:
        await r.table("job").insert({
            "id": job_key,
            "run_state": RunState.COMPLETED,
        }).run(db_conn)

        # Item 1: compressed body, HTTP 200, with headers.
        # Response bodies are keyed by the blake2 hash of the body.
        await r.table("response_body").insert({
            "id": b"\x00" * 32,
            "is_compressed": True,
            "body": gzip_body_1,
        }).run(db_conn)
        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000000",
            "body_id": b"\x00" * 32,
            "sequence": 1,
            "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 1.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/",
            "url_can": "https://www.example/",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": ["Server", "FakeServer 1.0", "X-Foo", "Bar"],
        }).run(db_conn)

        # Item 2: uncompressed body, HTTP 404.
        await r.table("response_body").insert({
            "id": b"\x01" * 32,
            "is_compressed": False,
            "body": b"File not found",
        }).run(db_conn)
        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000001",
            "body_id": b"\x01" * 32,
            "sequence": 3,
            "started_at": datetime(2019, 1, 1, 1, 1, 2, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 3, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": False,
            "job_id": job_key,
            "url": "https://www.example/foo",
            "url_can": "https://www.example/foo",
            "status_code": 404,
            "content_type": "text/plain",
            "headers": [],
        }).run(db_conn)

        # Item 3: compressed body, HTTP 200.
        await r.table("response_body").insert({
            "id": b"\x02" * 32,
            "is_compressed": True,
            "body": gzip_body_2,
        }).run(db_conn)
        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000002",
            "body_id": b"\x02" * 32,
            "sequence": 5,
            "started_at": datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/bar",
            "url_can": "https://www.example/bar",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [],
        }).run(db_conn)

    # Instantiate subscription
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # Read from subscription: first item.
    first_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert first_event.subscription_id == 1
    first_item = first_event.sync_item.item
    assert first_item.job_id == job_id.bytes
    assert first_item.url == "https://www.example/"
    assert first_item.url_can == "https://www.example/"
    assert first_item.started_at == "2019-01-01T01:01:00+00:00"
    assert first_item.completed_at == "2019-01-01T01:01:01+00:00"
    assert first_item.cost == 1.0
    assert first_item.duration == 1.0
    assert first_item.status_code == 200
    assert first_item.headers[0].key == "Server"
    assert first_item.headers[0].value == "FakeServer 1.0"
    assert first_item.headers[1].key == "X-Foo"
    assert first_item.headers[1].value == "Bar"
    assert first_item.is_success
    assert first_item.is_compressed
    assert gzip.decompress(first_item.body) == b"Test document #1"
    # Remember where we are so that the sync can be resumed below.
    resume_token = first_event.sync_item.token

    # Second item.
    second_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert second_event.subscription_id == 1
    second_item = second_event.sync_item.item
    assert second_item.job_id == job_id.bytes
    assert second_item.url == "https://www.example/foo"
    assert second_item.url_can == "https://www.example/foo"
    assert second_item.started_at == "2019-01-01T01:01:02+00:00"
    assert second_item.completed_at == "2019-01-01T01:01:03+00:00"
    assert second_item.cost == 2.0
    assert second_item.duration == 1.0
    assert second_item.status_code == 404
    assert not second_item.is_success
    assert not second_item.is_compressed
    assert second_item.body == b"File not found"

    # Act as if the subscription was interrupted in between the first and second
    # items, and then resume from there.
    subscription.cancel()
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=2,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=resume_token,
    )
    assert repr(subscription) == "<JobSyncSubscription id=2 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # The next message will be a repeat of the previous, since we "interrupted"
    # the sync before the previous item finished.
    repeated_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert repeated_event.subscription_id == 2
    repeated_item = repeated_event.sync_item.item
    assert repeated_item.url == "https://www.example/foo"

    # Third (last) item.
    last_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert last_event.subscription_id == 2
    last_item = last_event.sync_item.item
    assert last_item.job_id == job_id.bytes
    assert last_item.url == "https://www.example/bar"
    assert last_item.url_can == "https://www.example/bar"
    assert last_item.started_at == "2019-01-01T01:01:04+00:00"
    assert last_item.completed_at == "2019-01-01T01:01:05+00:00"
    assert last_item.cost == 2.0
    assert last_item.duration == 1.0
    assert last_item.status_code == 200
    assert last_item.is_success
    assert last_item.is_compressed
    assert gzip.decompress(last_item.body) == b"Test document #2"

    # Finally the subscription reports that it is complete.
    close_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert close_event.subscription_id == 2
    assert close_event.subscription_closed.reason == SubscriptionClosed.COMPLETE
Ejemplo n.º 6
0
async def test_subscribe_to_unfinished_crawl(db_pool, job_table,
                                             response_table,
                                             response_body_table, nursery):
    """ Subscribe to a running job that currently has 1 item. After the first
    item is received, a second item is added and the job is reported as
    completed. The subscription should send the second item and then close. """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")
    job_key = str(job_id)

    # Pre-compressed bodies for the two responses.
    gzip_body_1 = (b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                   b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00")
    gzip_body_2 = (b"\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM"
                   b"\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00")

    # Create sample data: a job with 1 downloaded item.
    async with db_pool.connection() as db_conn:
        await r.table("job").insert({
            "id": job_key,
            "run_state": RunState.RUNNING,
        }).run(db_conn)

        await r.table("response_body").insert({
            "id": b"\x00" * 32,
            "is_compressed": True,
            "body": gzip_body_1,
        }).run(db_conn)
        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000000",
            "body_id": b"\x00" * 32,
            "sequence": 1,
            "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 1.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/",
            "url_can": "https://www.example/",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": ["Server", "FakeServer 1.0", "X-Foo", "Bar"],
        }).run(db_conn)

    # Instantiate subscription
    logger.info("Set up subscription…")
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info("Read first event…")
    first_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert first_event.subscription_id == 1
    first_item = first_event.sync_item.item
    assert first_item.job_id == job_id.bytes
    assert first_item.url == "https://www.example/"
    assert first_item.url_can == "https://www.example/"
    assert first_item.started_at == "2019-01-01T01:01:00+00:00"
    assert first_item.completed_at == "2019-01-01T01:01:01+00:00"
    assert first_item.cost == 1.0
    assert first_item.duration == 1.0
    assert first_item.status_code == 200
    assert first_item.headers[0].key == "Server"
    assert first_item.headers[0].value == "FakeServer 1.0"
    assert first_item.headers[1].key == "X-Foo"
    assert first_item.headers[1].value == "Bar"
    assert first_item.is_success
    assert first_item.is_compressed
    assert gzip.decompress(first_item.body) == b"Test document #1"

    # The subscription should time out because there are no items to send:
    logger.info("Time out on next event…")
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(1) as timeout_scope:
            raw = await websocket.get_message()

    # Now add second result and mark the crawl as completed:
    logger.info("Add second result…")
    async with db_pool.connection() as db_conn:
        await r.table("response_body").insert({
            "id": b"\x02" * 32,
            "is_compressed": True,
            "body": gzip_body_2,
        }).run(db_conn)
        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000002",
            "body_id": b"\x02" * 32,
            "sequence": 5,
            "started_at": datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/bar",
            "url_can": "https://www.example/bar",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [],
        }).run(db_conn)
    # Tell the subscription, via its job state channel, that the job is done.
    completed_event = JobStateEvent(
        job_id=job_key,
        schedule_id=None,
        run_state=RunState.COMPLETED,
        event_time=datetime.now(timezone.utc),
    )
    await job_send.send(completed_event)

    # Now wait to receive the second result
    logger.info("Read second event…")
    second_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert second_event.subscription_id == 1
    second_item = second_event.sync_item.item
    assert second_item.job_id == job_id.bytes
    assert second_item.url == "https://www.example/bar"
    assert second_item.url_can == "https://www.example/bar"
    assert second_item.started_at == "2019-01-01T01:01:04+00:00"
    assert second_item.completed_at == "2019-01-01T01:01:05+00:00"
    assert second_item.cost == 2.0
    assert second_item.duration == 1.0
    assert second_item.status_code == 200
    assert second_item.is_success
    assert second_item.is_compressed
    assert gzip.decompress(second_item.body) == b"Test document #2"

    logger.info("Read subscription close…")
    close_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert close_event.subscription_id == 1
    assert close_event.subscription_closed.reason == SubscriptionClosed.COMPLETE
Ejemplo n.º 7
0
async def test_subscribe_to_crawl_decompress(db_pool, job_table,
                                             response_table,
                                             response_body_table, nursery):
    """ With ``compression_ok=False``, a body stored compressed is expected to
    be delivered decompressed. """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")
    job_key = str(job_id)

    # Pre-compressed body for the single response.
    gzip_body = (b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                 b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00")

    # Create sample data: a job with 1 downloaded item.
    async with db_pool.connection() as db_conn:
        await r.table("job").insert({
            "id": job_key,
            "run_state": RunState.COMPLETED,
        }).run(db_conn)

        await r.table("response_body").insert({
            "id": b"\x00" * 32,
            "is_compressed": True,
            "body": gzip_body,
        }).run(db_conn)
        await r.table("response").insert({
            "id": UUID("bbbbbbbb-bbbb-bbbb-bbbb-000000000000").bytes,
            "body_id": b"\x00" * 32,
            "sequence": 1,
            "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 1.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/",
            "url_can": "https://www.example/",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [],
        }).run(db_conn)

    # Instantiate subscription
    logger.info("Creating subscription…")
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=False,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info("Reading one event…")
    item_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert item_event.subscription_id == 1
    item = item_event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == "https://www.example/"
    assert item.url_can == "https://www.example/"
    assert item.started_at == "2019-01-01T01:01:00+00:00"
    assert item.completed_at == "2019-01-01T01:01:01+00:00"
    assert item.cost == 1.0
    assert item.duration == 1.0
    assert item.status_code == 200
    assert item.is_success
    # The body must arrive as plain bytes, not gzip data.
    assert not item.is_compressed
    assert item.body == b"Test document #1"

    logger.info("Reading subscription close…")
    close_event = ServerMessage.FromString(
        await websocket.get_message()).event
    assert close_event.subscription_id == 1
    assert close_event.subscription_closed.reason == SubscriptionClosed.COMPLETE
Ejemplo n.º 8
0
    async def _handle_request(self, request_data):
        '''
        Handle a single API request and send the response to the client.

        The request is decoded, the handler registered for its command is
        looked up in ``_handlers``, and the handler is called with the
        dependencies named in its signature. A response message is always
        sent unless the exception propagates out of this method.

        :param request_data: A serialized protobuf ``Request`` message.
        :raises DecodeError: If ``request_data`` cannot be decoded.
        '''
        start = trio.current_time()
        message = ServerMessage()
        message.response.is_success = False
        request = None

        try:
            # Prepare response.
            request = Request.FromString(request_data)
            message.response.request_id = request.request_id

            # Find an appropriate handler.
            command_name = request.WhichOneof('Command')
            if command_name is None:
                raise InvalidRequestException('No command specified')
            command = getattr(request, command_name)
            try:
                handler = _handlers[command_name]
            except KeyError:
                raise InvalidRequestException(
                    'Invalid command name: {}'.format(command_name)) from None

            # Inject dependencies into argument list, then call the handler.
            # Each positional parameter name of the handler selects the object
            # that is passed in that position.
            argspec = inspect.getfullargspec(handler)
            args = list()
            for var in argspec.args:
                if var == 'command':
                    args.append(command)
                elif var == 'crawl_manager':
                    args.append(self._crawl_manager)
                elif var == 'nursery':
                    args.append(self._nursery)
                elif var == 'rate_limiter':
                    args.append(self._rate_limiter)
                elif var == 'resource_monitor':
                    args.append(self._resource_monitor)
                elif var == 'response':
                    args.append(message.response)
                elif var == 'scheduler':
                    args.append(self._scheduler)
                elif var == 'server_db':
                    args.append(self._server_db)
                elif var == 'subscription_manager':
                    args.append(self._subscription_manager)
                elif var == 'stats_tracker':
                    args.append(self._stats_tracker)
                elif var == 'websocket':
                    args.append(self._ws)
                else:
                    raise Exception(
                        'Unknown dependency "{}" in handler {}()'.format(
                            var, command_name))

            await handler(*args)
            message.response.is_success = True
            elapsed = trio.current_time() - start
            logger.info('Request OK %s %s %0.3fs', self._client_ip,
                        command_name, elapsed)
        except DecodeError:
            # Failure to decode a protobuf message means that the connection
            # is severely damaged; raise to the nursery so we can close the
            # entire connection.
            raise
        except InvalidRequestException as ire:
            error_message = str(ire)
            logger.error('Request ERROR %s %s (%s)', command_name,
                         self._client_ip, error_message)
            message.response.error_message = error_message
        except Exception:
            # Only ordinary exceptions are turned into an error response.
            # BaseException subclasses such as trio.Cancelled and
            # KeyboardInterrupt must propagate, otherwise cancellation of
            # this task would be swallowed here.
            logger.exception('Exception while handling request:\n%r', request)
            message.response.error_message = 'A server exception occurred'

        message_data = message.SerializeToString()
        await self._ws.send_message(message_data)
Ejemplo n.º 9
0
async def send_test_command(client, command):
    ''' Send ``command`` through ``client`` and return the decoded response. '''
    payload = command.SerializeToString()
    await client.send_message(payload)
    reply = ServerMessage.FromString(await client.get_message())
    return reply.response
Ejemplo n.º 10
0
async def test_subscribe_to_crawl(db_pool, job_table, response_table,
                                  response_body_table, nursery):
    ''' Sync a completed job that has 3 items, then simulate an interrupted
    sync by resuming on a second subscription with a sync token. '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')
    job_id_str = str(job_id)

    def at_second(second):
        ''' Build a UTC timestamp in the minute 2019-01-01T01:01. '''
        return datetime(2019, 1, 1, 1, 1, second, tzinfo=timezone.utc)

    async def read_event(ws):
        ''' Receive one message from ``ws`` and return its ``event`` field. '''
        raw = await ws.get_message()
        return ServerMessage.FromString(raw).event

    # Sample data: a job with 3 downloaded items, written as (table, document)
    # pairs in the exact order they are inserted.
    sample_docs = [
        ('job', {
            'id': job_id_str,
            'run_state': RunState.COMPLETED,
        }),
        # Response bodies are keyed by the blake2 hash of the body.
        ('response_body', {
            'id': b'\x00' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                    b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
        }),
        ('response', {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000000',
            'body_id': b'\x00' * 32,
            'sequence': 1,
            'started_at': at_second(0),
            'completed_at': at_second(1),
            'duration': 1.0,
            'cost': 1.0,
            'is_success': True,
            'job_id': job_id_str,
            'url': 'https://www.example/',
            'url_can': 'https://www.example/',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [
                'Server',
                'FakeServer 1.0',
                'X-Foo',
                'Bar',
            ],
        }),
        ('response_body', {
            'id': b'\x01' * 32,
            'is_compressed': False,
            'body': b'File not found',
        }),
        ('response', {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000001',
            'body_id': b'\x01' * 32,
            'sequence': 3,
            'started_at': at_second(2),
            'completed_at': at_second(3),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': False,
            'job_id': job_id_str,
            'url': 'https://www.example/foo',
            'url_can': 'https://www.example/foo',
            'status_code': 404,
            'content_type': 'text/plain',
            'headers': [],
        }),
        ('response_body', {
            'id': b'\x02' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM'
                    b'\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00',
        }),
        ('response', {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000002',
            'body_id': b'\x02' * 32,
            'sequence': 5,
            'started_at': at_second(4),
            'completed_at': at_second(5),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': True,
            'job_id': job_id_str,
            'url': 'https://www.example/bar',
            'url_can': 'https://www.example/bar',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [],
        }),
    ]
    async with db_pool.connection() as conn:
        for table_name, doc in sample_docs:
            await r.table(table_name).insert(doc).run(conn)

    # First subscription: start syncing from the beginning (no sync token).
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription = CrawlSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_id_str,
        subscription_db=SubscriptionDb(db_pool),
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # First item: compressed body and two headers.
    event = await read_event(websocket)
    assert event.subscription_id == 1
    item = event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == 'https://www.example/'
    assert item.url_can == 'https://www.example/'
    assert item.started_at == '2019-01-01T01:01:00+00:00'
    assert item.completed_at == '2019-01-01T01:01:01+00:00'
    assert item.cost == 1.0
    assert item.duration == 1.0
    assert item.status_code == 200
    server_header = item.headers[0]
    assert server_header.key == 'Server'
    assert server_header.value == 'FakeServer 1.0'
    foo_header = item.headers[1]
    assert foo_header.key == 'X-Foo'
    assert foo_header.value == 'Bar'
    assert item.is_success
    assert item.is_compressed
    assert gzip.decompress(item.body) == b'Test document #1'
    # Keep the token issued with the first item so the sync can be resumed.
    sync_token = event.sync_item.token

    # Second item: an unsuccessful response with an uncompressed body.
    event = await read_event(websocket)
    assert event.subscription_id == 1
    item = event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == 'https://www.example/foo'
    assert item.url_can == 'https://www.example/foo'
    assert item.started_at == '2019-01-01T01:01:02+00:00'
    assert item.completed_at == '2019-01-01T01:01:03+00:00'
    assert item.cost == 2.0
    assert item.duration == 1.0
    assert item.status_code == 404
    assert not item.is_success
    assert not item.is_compressed
    assert item.body == b'File not found'

    # Pretend the sync was interrupted between the first and second items:
    # cancel, then start a new subscription from the saved token.
    subscription.cancel()
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription = CrawlSyncSubscription(
        id_=2,
        websocket=websocket,
        job_id=job_id_str,
        subscription_db=SubscriptionDb(db_pool),
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=sync_token,
    )
    assert repr(subscription) == '<CrawlSyncSubscription id=2 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # The token was taken after the first item, so the second item is sent
    # again on the new subscription.
    event = await read_event(websocket)
    assert event.subscription_id == 2
    assert event.sync_item.item.url == 'https://www.example/foo'

    # Third item.
    event = await read_event(websocket)
    assert event.subscription_id == 2
    item = event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == 'https://www.example/bar'
    assert item.url_can == 'https://www.example/bar'
    assert item.started_at == '2019-01-01T01:01:04+00:00'
    assert item.completed_at == '2019-01-01T01:01:05+00:00'
    assert item.cost == 2.0
    assert item.duration == 1.0
    assert item.status_code == 200
    assert item.is_success
    assert item.is_compressed
    assert gzip.decompress(item.body) == b'Test document #2'

    # With every item delivered, the subscription reports completion.
    event = await read_event(websocket)
    assert event.subscription_id == 2
    assert event.subscription_closed.reason == SubscriptionClosed.COMPLETE
Ejemplo n.º 11
0
async def test_subscribe_to_unfinished_crawl(db_pool, job_table,
                                             response_table,
                                             response_body_table, nursery):
    ''' Subscribe to a running job that currently has 1 item. After the first
    item is received, the crawl adds a second item and finishes. The
    subscription should send the second item and then close as well. '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')

    # Create sample data: a running job with 1 downloaded item.
    async with db_pool.connection() as conn:
        await r.table('job').insert({
            'id': str(job_id),
            'run_state': RunState.RUNNING,
        }).run(conn)

        await r.table('response_body').insert({
            'id': b'\x00' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                    b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
        }).run(conn)
        await r.table('response').insert({
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000000',
            'body_id': b'\x00' * 32,
            'sequence': 1,
            'started_at': datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 1.0,
            'is_success': True,
            'job_id': str(job_id),
            'url': 'https://www.example/',
            'url_can': 'https://www.example/',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [
                'Server',
                'FakeServer 1.0',
                'X-Foo',
                'Bar',
            ]
        }).run(conn)

    # Instantiate subscription
    logger.info('Set up subscription…')
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = CrawlSyncSubscription(id_=1,
                                         websocket=websocket,
                                         job_id=str(job_id),
                                         subscription_db=subscription_db,
                                         compression_ok=True,
                                         job_state_recv=job_recv,
                                         sync_token=None)
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info('Read first event…')
    data = await websocket.get_message()
    message1 = ServerMessage.FromString(data).event
    assert message1.subscription_id == 1
    item1 = message1.sync_item.item
    assert item1.job_id == job_id.bytes
    assert item1.url == 'https://www.example/'
    assert item1.url_can == 'https://www.example/'
    assert item1.started_at == '2019-01-01T01:01:00+00:00'
    assert item1.completed_at == '2019-01-01T01:01:01+00:00'
    assert item1.cost == 1.0
    assert item1.duration == 1.0
    assert item1.status_code == 200
    assert item1.headers[0].key == 'Server'
    assert item1.headers[0].value == 'FakeServer 1.0'
    assert item1.headers[1].key == 'X-Foo'
    assert item1.headers[1].value == 'Bar'
    assert item1.is_success
    assert item1.is_compressed
    assert gzip.decompress(item1.body) == b'Test document #1'

    # The job is still running and has no more items, so no message should
    # arrive before the 1 second deadline. Nothing is expected to be received
    # here, so neither the cancel scope nor the result is bound to a name.
    logger.info('Time out on next event…')
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(1):
            await websocket.get_message()

    # Now add second result and mark the crawl as completed:
    logger.info('Add second result…')
    async with db_pool.connection() as conn:
        await r.table('response_body').insert({
            'id': b'\x02' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM'
                    b'\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00'
        }).run(conn)
        await r.table('response').insert({
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000002',
            'body_id': b'\x02' * 32,
            'sequence': 5,
            'started_at': datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': True,
            'job_id': str(job_id),
            'url': 'https://www.example/bar',
            'url_can': 'https://www.example/bar',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': []
        }).run(conn)
    # Notify the subscription, through its job state channel, that the job
    # has completed.
    await job_send.send(
        JobStateEvent(job_id=str(job_id),
                      schedule_id=None,
                      run_state=RunState.COMPLETED,
                      event_time=datetime.now(timezone.utc)))

    # Now wait to receive the second result
    logger.info('Read second event…')
    data = await websocket.get_message()
    message2 = ServerMessage.FromString(data).event
    assert message2.subscription_id == 1
    item2 = message2.sync_item.item
    assert item2.job_id == job_id.bytes
    assert item2.url == 'https://www.example/bar'
    assert item2.url_can == 'https://www.example/bar'
    assert item2.started_at == '2019-01-01T01:01:04+00:00'
    assert item2.completed_at == '2019-01-01T01:01:05+00:00'
    assert item2.cost == 2.0
    assert item2.duration == 1.0
    assert item2.status_code == 200
    assert item2.is_success
    assert item2.is_compressed
    assert gzip.decompress(item2.body) == b'Test document #2'

    # The final event reports that the subscription closed as complete.
    logger.info('Read subscription close…')
    data = await websocket.get_message()
    message3 = ServerMessage.FromString(data).event
    assert message3.subscription_id == 1
    assert message3.subscription_closed.reason == SubscriptionClosed.COMPLETE
Ejemplo n.º 12
0
async def test_subscribe_to_crawl_decompress(db_pool, job_table,
                                             response_table,
                                             response_body_table, nursery):
    ''' A subscription created with ``compression_ok=False`` delivers a
    stored compressed body in decompressed form. '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')
    job_id_str = str(job_id)

    async def read_event(ws):
        ''' Receive one message from ``ws`` and return its ``event`` field. '''
        raw = await ws.get_message()
        return ServerMessage.FromString(raw).event

    # Sample data: a job with 1 downloaded item, written as (table, document)
    # pairs in the exact order they are inserted.
    sample_docs = [
        ('job', {
            'id': job_id_str,
            'run_state': RunState.COMPLETED,
        }),
        ('response_body', {
            'id': b'\x00' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                    b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
        }),
        ('response', {
            'id': UUID('bbbbbbbb-bbbb-bbbb-bbbb-000000000000').bytes,
            'body_id': b'\x00' * 32,
            'sequence': 1,
            'started_at': datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 1.0,
            'is_success': True,
            'job_id': job_id_str,
            'url': 'https://www.example/',
            'url_can': 'https://www.example/',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [],
        }),
    ]
    async with db_pool.connection() as conn:
        for table_name, doc in sample_docs:
            await r.table(table_name).insert(doc).run(conn)

    # Start a subscription that does not accept compressed bodies.
    logger.info('Creating subscription…')
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription = CrawlSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_id_str,
        subscription_db=SubscriptionDb(db_pool),
        compression_ok=False,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # The only item arrives flagged as uncompressed, with a plain body.
    logger.info('Reading one event…')
    event = await read_event(websocket)
    assert event.subscription_id == 1
    item = event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == 'https://www.example/'
    assert item.url_can == 'https://www.example/'
    assert item.started_at == '2019-01-01T01:01:00+00:00'
    assert item.completed_at == '2019-01-01T01:01:01+00:00'
    assert item.cost == 1.0
    assert item.duration == 1.0
    assert item.status_code == 200
    assert item.is_success
    assert not item.is_compressed
    assert item.body == b'Test document #1'

    # With the single item delivered, the subscription reports completion.
    logger.info('Reading subscription close…')
    event = await read_event(websocket)
    assert event.subscription_id == 1
    assert event.subscription_closed.reason == SubscriptionClosed.COMPLETE