async def test_task_monitor(autojump_clock, nursery):
    '''
    A task monitor subscription sends the task tree right away and then again
    after its period has elapsed.

    The current task serves as the root of the monitored tree. Two dummy
    parents (each with two sleeping children) plus the subscription itself are
    started in ``nursery``, so the root is expected to report three subtasks.
    '''
    # To simplify testing, we pick the current task as the root task.
    root_task = trio.hazmat.current_task()
    websocket = MockWebsocket()
    subscription = TaskMonitorSubscription(id_=1, websocket=websocket,
        period=2.0, root_task=root_task)

    # We create a few dummy tasks that will show up in the task monitor.
    async def dummy_parent(task_status):
        async with trio.open_nursery() as inner:
            await inner.start(dummy_child, name='Dummy Child 1')
            await inner.start(dummy_child, name='Dummy Child 2')
            # Report "started" only once both children exist, so the tree is
            # complete by the time ``nursery.start()`` returns.
            task_status.started()

    async def dummy_child(task_status):
        task_status.started()
        await trio.sleep_forever()

    await nursery.start(dummy_parent, name='Dummy Parent 1')
    await nursery.start(dummy_parent, name='Dummy Parent 2')
    nursery.start_soon(subscription.run, name='Task Monitor Subscription')

    # We should receive the first event right away.
    with assert_max_elapsed(0.1):
        data = await websocket.get_message()
    event1 = ServerMessage.FromString(data).event
    assert event1.subscription_id == 1
    task_tree = event1.task_tree
    assert task_tree.name == '<Root>'
    # Index explicitly (rather than zip) so a missing subtask fails loudly.
    for index, parent_name in enumerate(('Dummy Parent 1', 'Dummy Parent 2')):
        parent = task_tree.subtasks[index]
        assert parent.name == parent_name
        assert parent.subtasks[0].name == 'Dummy Child 1'
        assert parent.subtasks[1].name == 'Dummy Child 2'
    assert task_tree.subtasks[2].name == 'Task Monitor Subscription'

    # The second event won't arrive for two more seconds.
    with assert_min_elapsed(2.0):
        data = await websocket.get_message()
    # Validate the *second* event; the checks previously re-read ``event1``
    # and therefore never looked at the new message.
    event2 = ServerMessage.FromString(data).event
    assert event2.subscription_id == 1
    task_tree = event2.task_tree
    assert task_tree.name == '<Root>'
    assert len(task_tree.subtasks) == 3

    subscription.cancel()
async def test_profile(client):
    '''
    Ask the server for a short performance profile and sanity-check the
    response: matching request ID, non-trivial totals, and exactly the number
    of functions requested by ``top_n``.
    '''
    request = Request()
    request.request_id = 1
    # Mutating the sub-message in place populates the request's command field.
    profile_command = request.performance_profile
    profile_command.duration = 0.1
    profile_command.sort_by = 'calls'
    profile_command.top_n = 5
    await client.send_message(request.SerializeToString())

    reply = ServerMessage.FromString(await client.get_message())
    response = reply.response
    assert response.request_id == 1

    profile = response.performance_profile
    assert profile.total_calls > 1
    assert profile.total_time > 0.1
    assert len(profile.functions) == 5
async def test_job_state_subscription(autojump_clock, nursery):
    '''
    A job status subscription first sends every tracked job, then sends only
    the jobs whose documents changed, and stops sending once cancelled.

    Two job documents are registered with a ``StatsTracker``. The test mutates
    one document at a time and checks that each following event contains just
    that job and does not arrive sooner than ``min_interval`` (2 seconds).
    '''
    job1_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa')
    job2_id = UUID('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb')
    job1_doc = {
        'id': str(job1_id),
        'name': 'Job #1',
        'seeds': ['https://job1.example'],
        'tags': ['tag1a', 'tag1b'],
        'item_count': 10,
        'http_success_count': 7,
        'http_error_count': 2,
        'exception_count': 1,
        'http_status_counts': {200: 7, 404: 2},
        'started_at': datetime(2019, 1, 25, 14, 44, 0, tzinfo=timezone.utc),
        'completed_at': None,
        'run_state': 'RUNNING',
    }
    job2_doc = {
        'id': str(job2_id),
        'name': 'Job #2',
        'seeds': ['https://job2.example'],
        'tags': ['tag2a'],
        'item_count': 20,
        'http_success_count': 14,
        'http_error_count': 4,
        'exception_count': 2,
        'http_status_counts': {200: 14, 404: 4},
        'started_at': datetime(2019, 1, 25, 14, 55, 0, tzinfo=timezone.utc),
        'completed_at': None,
        'run_state': 'RUNNING',
    }
    stats_tracker = StatsTracker(timedelta(seconds=60))
    stats_tracker.add_job(job1_doc)
    stats_tracker.add_job(job2_doc)
    websocket = MockWebsocket()
    subscription = JobStatusSubscription(id_=1, stats_tracker=stats_tracker,
        websocket=websocket, min_interval=2)
    assert repr(subscription) == '<JobStatusSubscription id=1>'
    with pytest.raises(Exception):
        # Can't cancel before it starts running:
        subscription.cancel()
    nursery.start_soon(subscription.run)

    # The first two items should be received immediately and in full.
    with assert_max_elapsed(0.1):
        data = await websocket.get_message()
    message1 = ServerMessage.FromString(data).event
    assert message1.subscription_id == 1
    assert len(message1.job_list.jobs) == 2

    job1 = message1.job_list.jobs[0]
    assert job1.job_id == job1_id.bytes
    assert job1.name == 'Job #1'
    assert job1.seeds[0] == 'https://job1.example'
    assert job1.tags[0] == 'tag1a'
    assert job1.tags[1] == 'tag1b'
    assert job1.item_count == 10
    assert job1.http_success_count == 7
    assert job1.http_error_count == 2
    assert job1.exception_count == 1
    assert job1.http_status_counts[200] == 7
    assert job1.http_status_counts[404] == 2
    assert job1.started_at == '2019-01-25T14:44:00+00:00'
    assert not job1.HasField('completed_at')
    assert job1.run_state == JobRunState.Value('RUNNING')

    job2 = message1.job_list.jobs[1]
    assert job2.job_id == job2_id.bytes
    assert job2.name == 'Job #2'
    assert job2.seeds[0] == 'https://job2.example'
    assert job2.tags[0] == 'tag2a'
    assert job2.item_count == 20
    assert job2.http_success_count == 14
    assert job2.http_error_count == 4
    assert job2.exception_count == 2
    assert job2.http_status_counts[200] == 14
    assert job2.http_status_counts[404] == 4
    assert job2.started_at == '2019-01-25T14:55:00+00:00'
    assert not job2.HasField('completed_at')
    assert job2.run_state == JobRunState.Value('RUNNING')

    # Add 1 item to job 1. Two seconds later, we should get an update for job 1
    # but not job 2.
    with assert_min_elapsed(2):
        job1_doc.update({
            'item_count': 11,
            'http_success_count': 8,
            'http_status_counts': {200: 8, 404: 2},
        })
        data = await websocket.get_message()
    message2 = ServerMessage.FromString(data).event
    assert message2.subscription_id == 1
    assert len(message2.job_list.jobs) == 1

    job1 = message2.job_list.jobs[0]
    assert job1.name == 'Job #1'
    assert job1.seeds[0] == 'https://job1.example'
    assert job1.tags[0] == 'tag1a'
    assert job1.tags[1] == 'tag1b'
    assert job1.item_count == 11
    assert job1.http_success_count == 8
    assert job1.http_error_count == 2
    assert job1.exception_count == 1
    assert job1.http_status_counts[200] == 8
    assert job1.http_status_counts[404] == 2
    assert job1.started_at == '2019-01-25T14:44:00+00:00'
    assert not job1.HasField('completed_at')
    assert job1.run_state == JobRunState.Value('RUNNING')

    # Add 2 items to job 2. Two seconds later, we should get an update for job 2
    # but not job 1. (An unused ``completed_at`` local used to be created
    # here; the job stays RUNNING in this test, so it has been dropped.)
    with assert_min_elapsed(2):
        job2_doc.update({
            'item_count': 22,
            'http_success_count': 15,
            'http_error_count': 5,
            'http_status_counts': {200: 15, 404: 5},
        })
        data = await websocket.get_message()
    message3 = ServerMessage.FromString(data).event
    assert message3.subscription_id == 1
    assert len(message3.job_list.jobs) == 1

    job2 = message3.job_list.jobs[0]
    assert job2.name == 'Job #2'
    assert job2.seeds[0] == 'https://job2.example'
    assert job2.tags[0] == 'tag2a'
    assert job2.item_count == 22
    assert job2.http_success_count == 15
    assert job2.http_error_count == 5
    assert job2.exception_count == 2
    assert job2.http_status_counts[200] == 15
    assert job2.http_status_counts[404] == 5
    assert job2.started_at == '2019-01-25T14:55:00+00:00'
    assert job2.run_state == JobRunState.Value('RUNNING')

    # Cancel the subscription and wait 2 seconds to make sure it doesn't send us
    # any more events.
    subscription.cancel()
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(2):
            data = await websocket.get_message()
# Instantiate subscription. Ask for 3 historical measurements, but only 2 # are available so it should just send those 2. websocket = MockWebsocket() subscription = ResourceMonitorSubscription( id_=1, websocket=websocket, resource_monitor=resource_monitor, history=3) assert repr(subscription) == '<ResourceMonitorSubscription id=1>' nursery.start_soon(subscription.run) # We should be able to read two events immediately. with assert_max_elapsed(0.1): data = await websocket.get_message() event1 = ServerMessage.FromString(data).event assert event1.subscription_id == 1 frame1 = event1.resource_frame assert frame1.timestamp == '2019-01-25T00:00:00+00:00' assert frame1.cpus[0].usage == 0.99 assert frame1.cpus[1].usage == 0.55 assert frame1.memory.used == 1_000_000 assert frame1.memory.total == 2_000_000 assert frame1.disks[0].mount == '/root' assert frame1.disks[0].used == 3_000_000 assert frame1.disks[0].total == 4_000_000 assert frame1.disks[1].mount == '/home' assert frame1.disks[1].used == 5_000_000 assert frame1.disks[1].total == 6_000_000 assert frame1.networks[0].name == 'eth0' assert frame1.networks[0].sent == 7_000_000
async def test_subscribe_to_crawl(db_pool, job_table, response_table,
        response_body_table, nursery):
    """
    Sync a completed job that has 3 items, then simulate an interruption by
    cancelling the subscription and resuming with the sync token captured
    from the first item.
    """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")
    job_key = str(job_id)

    async def next_event(ws):
        """ Read one message from ``ws`` and return the decoded event. """
        return ServerMessage.FromString(await ws.get_message()).event

    # Sample data: response bodies are keyed by the blake2 hash of the body.
    body_docs = [
        {
            "id": b"\x00" * 32,
            "is_compressed": True,
            "body": b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                    b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00",
        },
        {
            "id": b"\x01" * 32,
            "is_compressed": False,
            "body": b"File not found",
        },
        {
            "id": b"\x02" * 32,
            "is_compressed": True,
            "body": b"\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM"
                    b"\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00",
        },
    ]
    response_docs = [
        {
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000000",
            "body_id": b"\x00" * 32,
            "sequence": 1,
            "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 1.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/",
            "url_can": "https://www.example/",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [
                "Server", "FakeServer 1.0",
                "X-Foo", "Bar",
            ],
        },
        {
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000001",
            "body_id": b"\x01" * 32,
            "sequence": 3,
            "started_at": datetime(2019, 1, 1, 1, 1, 2, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 3, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": False,
            "job_id": job_key,
            "url": "https://www.example/foo",
            "url_can": "https://www.example/foo",
            "status_code": 404,
            "content_type": "text/plain",
            "headers": [],
        },
        {
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000002",
            "body_id": b"\x02" * 32,
            "sequence": 5,
            "started_at": datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": True,
            "job_id": job_key,
            "url": "https://www.example/bar",
            "url_can": "https://www.example/bar",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [],
        },
    ]

    # Insert the job first, then each body immediately followed by the
    # response that refers to it.
    async with db_pool.connection() as conn:
        await r.table("job").insert({
            "id": job_key,
            "run_state": RunState.COMPLETED,
        }).run(conn)
        for body_doc, response_doc in zip(body_docs, response_docs):
            await r.table("response_body").insert(body_doc).run(conn)
            await r.table("response").insert(response_doc).run(conn)

    # Instantiate subscription
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # First item: the compressed "Test document #1" response.
    event1 = await next_event(websocket)
    assert event1.subscription_id == 1
    first = event1.sync_item.item
    assert first.job_id == job_id.bytes
    assert first.url == "https://www.example/"
    assert first.url_can == "https://www.example/"
    assert first.started_at == "2019-01-01T01:01:00+00:00"
    assert first.completed_at == "2019-01-01T01:01:01+00:00"
    assert first.cost == 1.0
    assert first.duration == 1.0
    assert first.status_code == 200
    assert first.headers[0].key == "Server"
    assert first.headers[0].value == "FakeServer 1.0"
    assert first.headers[1].key == "X-Foo"
    assert first.headers[1].value == "Bar"
    assert first.is_success
    assert first.is_compressed
    assert gzip.decompress(first.body) == b"Test document #1"
    # Remember where we are so the sync can be resumed from this point.
    sync_token = event1.sync_item.token

    # Second item: the uncompressed 404 response.
    event2 = await next_event(websocket)
    assert event2.subscription_id == 1
    second = event2.sync_item.item
    assert second.job_id == job_id.bytes
    assert second.url == "https://www.example/foo"
    assert second.url_can == "https://www.example/foo"
    assert second.started_at == "2019-01-01T01:01:02+00:00"
    assert second.completed_at == "2019-01-01T01:01:03+00:00"
    assert second.cost == 2.0
    assert second.duration == 1.0
    assert second.status_code == 404
    assert not second.is_success
    assert not second.is_compressed
    assert second.body == b"File not found"

    # Act as if the subscription was interrupted in between the first and second
    # items, and then resume from there.
    subscription.cancel()
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=2,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=sync_token,
    )
    assert repr(subscription) == "<JobSyncSubscription id=2 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # The next message will be a repeat of the previous, since we "interrupted"
    # the sync before the previous item finished.
    event3 = await next_event(websocket)
    assert event3.subscription_id == 2
    repeated = event3.sync_item.item
    assert repeated.url == "https://www.example/foo"

    # Third item: the compressed "Test document #2" response.
    event4 = await next_event(websocket)
    assert event4.subscription_id == 2
    third = event4.sync_item.item
    assert third.job_id == job_id.bytes
    assert third.url == "https://www.example/bar"
    assert third.url_can == "https://www.example/bar"
    assert third.started_at == "2019-01-01T01:01:04+00:00"
    assert third.completed_at == "2019-01-01T01:01:05+00:00"
    assert third.cost == 2.0
    assert third.duration == 1.0
    assert third.status_code == 200
    assert third.is_success
    assert third.is_compressed
    assert gzip.decompress(third.body) == b"Test document #2"

    # Finally, the subscription reports that the sync is complete.
    event5 = await next_event(websocket)
    assert event5.subscription_id == 2
    assert event5.subscription_closed.reason == SubscriptionClosed.COMPLETE
async def test_subscribe_to_unfinished_crawl(db_pool, job_table, response_table,
        response_body_table, nursery):
    """
    Subscribe to a running job that currently has 1 item. After receiving the
    first item, the crawl adds a second item and finishes. The subscription
    should send the second item and then close with reason ``COMPLETE``.
    """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")

    # Create sample data: a job with 1 downloaded item.
    async with db_pool.connection() as conn:
        await r.table("job").insert({
            "id": str(job_id),
            "run_state": RunState.RUNNING,
        }).run(conn)

        await r.table("response_body").insert({
            "id": b"\x00" * 32,
            "is_compressed": True,
            "body": b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                    b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00",
        }).run(conn)

        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000000",
            "body_id": b"\x00" * 32,
            "sequence": 1,
            "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 1.0,
            "is_success": True,
            "job_id": str(job_id),
            "url": "https://www.example/",
            "url_can": "https://www.example/",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [
                "Server", "FakeServer 1.0",
                "X-Foo", "Bar",
            ],
        }).run(conn)

    # Instantiate subscription
    logger.info("Set up subscription…")
    websocket = MockWebsocket()
    # ``job_send`` is used below to announce that the job has completed.
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=str(job_id),
        subscription_db=subscription_db,
        compression_ok=True,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info("Read first event…")
    data = await websocket.get_message()
    message1 = ServerMessage.FromString(data).event
    assert message1.subscription_id == 1
    item1 = message1.sync_item.item
    assert item1.job_id == job_id.bytes
    assert item1.url == "https://www.example/"
    assert item1.url_can == "https://www.example/"
    assert item1.started_at == "2019-01-01T01:01:00+00:00"
    assert item1.completed_at == "2019-01-01T01:01:01+00:00"
    assert item1.cost == 1.0
    assert item1.duration == 1.0
    assert item1.status_code == 200
    assert item1.headers[0].key == "Server"
    assert item1.headers[0].value == "FakeServer 1.0"
    assert item1.headers[1].key == "X-Foo"
    assert item1.headers[1].value == "Bar"
    assert item1.is_success
    assert item1.is_compressed
    assert gzip.decompress(item1.body) == b"Test document #1"

    # The subscription should time out because there are no items to send.
    # Nothing is expected to arrive, so neither the cancel scope nor the
    # message is bound to a name.
    logger.info("Time out on next event…")
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(1):
            await websocket.get_message()

    # Now add second result and mark the crawl as completed:
    logger.info("Add second result…")
    async with db_pool.connection() as conn:
        await r.table("response_body").insert({
            "id": b"\x02" * 32,
            "is_compressed": True,
            "body": b"\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM"
                    b"\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00",
        }).run(conn)

        await r.table("response").insert({
            "id": "bbbbbbbb-bbbb-bbbb-bbbb-000000000002",
            "body_id": b"\x02" * 32,
            "sequence": 5,
            "started_at": datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            "completed_at": datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            "duration": 1.0,
            "cost": 2.0,
            "is_success": True,
            "job_id": str(job_id),
            "url": "https://www.example/bar",
            "url_can": "https://www.example/bar",
            "status_code": 200,
            "content_type": "text/plain",
            "headers": [],
        }).run(conn)

    await job_send.send(
        JobStateEvent(
            job_id=str(job_id),
            schedule_id=None,
            run_state=RunState.COMPLETED,
            event_time=datetime.now(timezone.utc),
        ))

    # Now wait to receive the second result
    logger.info("Read second event…")
    data = await websocket.get_message()
    message2 = ServerMessage.FromString(data).event
    assert message2.subscription_id == 1
    item2 = message2.sync_item.item
    assert item2.job_id == job_id.bytes
    assert item2.url == "https://www.example/bar"
    assert item2.url_can == "https://www.example/bar"
    assert item2.started_at == "2019-01-01T01:01:04+00:00"
    assert item2.completed_at == "2019-01-01T01:01:05+00:00"
    assert item2.cost == 2.0
    assert item2.duration == 1.0
    assert item2.status_code == 200
    assert item2.is_success
    assert item2.is_compressed
    assert gzip.decompress(item2.body) == b"Test document #2"

    logger.info("Read subscription close…")
    data = await websocket.get_message()
    message3 = ServerMessage.FromString(data).event
    assert message3.subscription_id == 1
    assert message3.subscription_closed.reason == SubscriptionClosed.COMPLETE
async def test_subscribe_to_crawl_decompress(db_pool, job_table, response_table,
        response_body_table, nursery):
    """
    When the subscription is created with ``compression_ok=False``, a body
    that is stored compressed is delivered decompressed and flagged as such.
    """
    job_id = UUID("aaaaaaaa-aaaa-aaaa-aaaa-000000000000")
    job_key = str(job_id)
    body_key = b"\x00" * 32

    # Sample data: a completed job with a single gzip-compressed item.
    job_doc = {
        "id": job_key,
        "run_state": RunState.COMPLETED,
    }
    body_doc = {
        "id": body_key,
        "is_compressed": True,
        "body": b"\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O"
                b".\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00",
    }
    response_doc = {
        "id": UUID("bbbbbbbb-bbbb-bbbb-bbbb-000000000000").bytes,
        "body_id": body_key,
        "sequence": 1,
        "started_at": datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
        "completed_at": datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
        "duration": 1.0,
        "cost": 1.0,
        "is_success": True,
        "job_id": job_key,
        "url": "https://www.example/",
        "url_can": "https://www.example/",
        "status_code": 200,
        "content_type": "text/plain",
        "headers": [],
    }
    async with db_pool.connection() as conn:
        await r.table("job").insert(job_doc).run(conn)
        await r.table("response_body").insert(body_doc).run(conn)
        await r.table("response").insert(response_doc).run(conn)

    # Instantiate subscription
    logger.info("Creating subscription…")
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = JobSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=subscription_db,
        compression_ok=False,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == "<JobSyncSubscription id=1 job_id=aaaaaaaa>"
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info("Reading one event…")
    item_event = ServerMessage.FromString(await websocket.get_message()).event
    assert item_event.subscription_id == 1
    item = item_event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == "https://www.example/"
    assert item.url_can == "https://www.example/"
    assert item.started_at == "2019-01-01T01:01:00+00:00"
    assert item.completed_at == "2019-01-01T01:01:01+00:00"
    assert item.cost == 1.0
    assert item.duration == 1.0
    assert item.status_code == 200
    assert item.is_success
    # The body arrives as plain bytes rather than gzip data.
    assert not item.is_compressed
    assert item.body == b"Test document #1"

    logger.info("Reading subscription close…")
    close_event = ServerMessage.FromString(await websocket.get_message()).event
    assert close_event.subscription_id == 1
    assert close_event.subscription_closed.reason == SubscriptionClosed.COMPLETE
async def _handle_request(self, request_data):
    '''
    Handle a single API request and send one response message for it.

    The request is decoded, the handler registered in ``_handlers`` for its
    command is looked up, and the handler is called with whichever
    dependencies its positional argument names ask for. A ``ServerMessage``
    response is then serialized and sent on the websocket, with
    ``is_success`` set only if the handler returned without raising.

    :param request_data: A serialized protobuf ``Request``.
    :raises DecodeError: If ``request_data`` cannot be decoded. No response
        is sent in that case.
    '''
    # Handler argument names that are satisfied by an attribute of ``self``.
    # The attribute is only read when a handler actually asks for it.
    injectable_attrs = {
        'crawl_manager': '_crawl_manager',
        'nursery': '_nursery',
        'rate_limiter': '_rate_limiter',
        'resource_monitor': '_resource_monitor',
        'scheduler': '_scheduler',
        'server_db': '_server_db',
        'subscription_manager': '_subscription_manager',
        'stats_tracker': '_stats_tracker',
        'websocket': '_ws',
    }

    start = trio.current_time()
    message = ServerMessage()
    message.response.is_success = False
    request = None
    # Bound up front so the error handlers below can always refer to it.
    command_name = None

    try:
        # Prepare response.
        request = Request.FromString(request_data)
        message.response.request_id = request.request_id

        # Find an appropriate handler.
        command_name = request.WhichOneof('Command')
        if command_name is None:
            raise InvalidRequestException('No command specified')
        command = getattr(request, command_name)
        try:
            handler = _handlers[command_name]
        except KeyError:
            raise InvalidRequestException(
                'Invalid command name: {}'.format(command_name)) from None

        # Inject dependencies into argument list, then call the handler.
        args = []
        for var in inspect.getfullargspec(handler).args:
            if var == 'command':
                args.append(command)
            elif var == 'response':
                args.append(message.response)
            elif var in injectable_attrs:
                args.append(getattr(self, injectable_attrs[var]))
            else:
                raise Exception(
                    'Unknown dependency "{}" in handler {}()'.format(
                    var, command_name))

        await handler(*args)
        message.response.is_success = True
        elapsed = trio.current_time() - start
        logger.info('Request OK %s %s %0.3fs', self._client_ip,
            command_name, elapsed)
    except DecodeError:
        # Failure to decode a protobuf message means that the connection
        # is severely damaged; raise to the nursery so we can close the
        # entire connection.
        raise
    except InvalidRequestException as ire:
        error_message = str(ire)
        # Same argument order as the "Request OK" line: client IP first.
        logger.error('Request ERROR %s %s (%s)', self._client_ip,
            command_name, error_message)
        message.response.error_message = error_message
    except Exception:
        # Deliberately not a bare ``except``: BaseException subclasses such
        # as trio.Cancelled and KeyboardInterrupt must propagate instead of
        # being logged and reported as a server error.
        logger.exception('Exception while handling request:\n%r',
            request)
        message.response.error_message = 'A server exception occurred'

    message_data = message.SerializeToString()
    await self._ws.send_message(message_data)
async def send_test_command(client, command):
    '''
    Test helper: serialize ``command``, send it through ``client``, wait for
    the next message and return the ``response`` part of the decoded
    ``ServerMessage``.
    '''
    payload = command.SerializeToString()
    await client.send_message(payload)
    reply_bytes = await client.get_message()
    reply = ServerMessage.FromString(reply_bytes)
    return reply.response
async def test_subscribe_to_crawl(db_pool, job_table, response_table,
        response_body_table, nursery):
    '''
    Sync a completed job that has 3 items, then simulate an interruption by
    cancelling the subscription and resuming with the sync token captured
    from the first item.
    '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')
    job_key = str(job_id)

    async def next_event(ws):
        ''' Read one message from ``ws`` and return the decoded event. '''
        return ServerMessage.FromString(await ws.get_message()).event

    # Sample data: response bodies are keyed by the blake2 hash of the body.
    body_docs = [
        {
            'id': b'\x00' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                    b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
        },
        {
            'id': b'\x01' * 32,
            'is_compressed': False,
            'body': b'File not found',
        },
        {
            'id': b'\x02' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM'
                    b'\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00'
        },
    ]
    response_docs = [
        {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000000',
            'body_id': b'\x00' * 32,
            'sequence': 1,
            'started_at': datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 1.0,
            'is_success': True,
            'job_id': job_key,
            'url': 'https://www.example/',
            'url_can': 'https://www.example/',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [
                'Server', 'FakeServer 1.0',
                'X-Foo', 'Bar',
            ]
        },
        {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000001',
            'body_id': b'\x01' * 32,
            'sequence': 3,
            'started_at': datetime(2019, 1, 1, 1, 1, 2, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 3, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': False,
            'job_id': job_key,
            'url': 'https://www.example/foo',
            'url_can': 'https://www.example/foo',
            'status_code': 404,
            'content_type': 'text/plain',
            'headers': []
        },
        {
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000002',
            'body_id': b'\x02' * 32,
            'sequence': 5,
            'started_at': datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': True,
            'job_id': job_key,
            'url': 'https://www.example/bar',
            'url_can': 'https://www.example/bar',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': []
        },
    ]

    # Insert the job first, then each body immediately followed by the
    # response that refers to it.
    async with db_pool.connection() as conn:
        await r.table('job').insert({
            'id': job_key,
            'run_state': RunState.COMPLETED,
        }).run(conn)
        for body_doc, response_doc in zip(body_docs, response_docs):
            await r.table('response_body').insert(body_doc).run(conn)
            await r.table('response').insert(response_doc).run(conn)

    # Instantiate subscription
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = CrawlSyncSubscription(id_=1, websocket=websocket,
        job_id=job_key, subscription_db=subscription_db,
        compression_ok=True, job_state_recv=job_recv, sync_token=None)
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # First item: the compressed "Test document #1" response.
    event1 = await next_event(websocket)
    assert event1.subscription_id == 1
    first = event1.sync_item.item
    assert first.job_id == job_id.bytes
    assert first.url == 'https://www.example/'
    assert first.url_can == 'https://www.example/'
    assert first.started_at == '2019-01-01T01:01:00+00:00'
    assert first.completed_at == '2019-01-01T01:01:01+00:00'
    assert first.cost == 1.0
    assert first.duration == 1.0
    assert first.status_code == 200
    assert first.headers[0].key == 'Server'
    assert first.headers[0].value == 'FakeServer 1.0'
    assert first.headers[1].key == 'X-Foo'
    assert first.headers[1].value == 'Bar'
    assert first.is_success
    assert first.is_compressed
    assert gzip.decompress(first.body) == b'Test document #1'
    # Remember where we are so the sync can be resumed from this point.
    sync_token = event1.sync_item.token

    # Second item: the uncompressed 404 response.
    event2 = await next_event(websocket)
    assert event2.subscription_id == 1
    second = event2.sync_item.item
    assert second.job_id == job_id.bytes
    assert second.url == 'https://www.example/foo'
    assert second.url_can == 'https://www.example/foo'
    assert second.started_at == '2019-01-01T01:01:02+00:00'
    assert second.completed_at == '2019-01-01T01:01:03+00:00'
    assert second.cost == 2.0
    assert second.duration == 1.0
    assert second.status_code == 404
    assert not second.is_success
    assert not second.is_compressed
    assert second.body == b'File not found'

    # Act as if the subscription was interrupted in between the first and second
    # items, and then resume from there.
    subscription.cancel()
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = CrawlSyncSubscription(id_=2, websocket=websocket,
        job_id=job_key, subscription_db=subscription_db,
        compression_ok=True, job_state_recv=job_recv, sync_token=sync_token)
    assert repr(subscription) == '<CrawlSyncSubscription id=2 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # The next message will be a repeat of the previous, since we "interrupted"
    # the sync before the previous item finished.
    event3 = await next_event(websocket)
    assert event3.subscription_id == 2
    repeated = event3.sync_item.item
    assert repeated.url == 'https://www.example/foo'

    # Third item: the compressed "Test document #2" response.
    event4 = await next_event(websocket)
    assert event4.subscription_id == 2
    third = event4.sync_item.item
    assert third.job_id == job_id.bytes
    assert third.url == 'https://www.example/bar'
    assert third.url_can == 'https://www.example/bar'
    assert third.started_at == '2019-01-01T01:01:04+00:00'
    assert third.completed_at == '2019-01-01T01:01:05+00:00'
    assert third.cost == 2.0
    assert third.duration == 1.0
    assert third.status_code == 200
    assert third.is_success
    assert third.is_compressed
    assert gzip.decompress(third.body) == b'Test document #2'

    # Finally, the subscription reports that the sync is complete.
    event5 = await next_event(websocket)
    assert event5.subscription_id == 2
    assert event5.subscription_closed.reason == SubscriptionClosed.COMPLETE
async def test_subscribe_to_unfinished_crawl(db_pool, job_table, response_table,
        response_body_table, nursery):
    '''
    Subscribe to a running job that currently has one item. After receiving the
    first item, the crawl adds a second item and finishes. The subscription
    should send the second item and then close with reason ``COMPLETE``.
    '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')

    # Create sample data: a running job with one downloaded item.
    async with db_pool.connection() as conn:
        await r.table('job').insert({
            'id': str(job_id),
            'run_state': RunState.RUNNING,
        }).run(conn)
        await r.table('response_body').insert({
            'id': b'\x00' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                    b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
        }).run(conn)
        await r.table('response').insert({
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000000',
            'body_id': b'\x00' * 32,
            'sequence': 1,
            'started_at': datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 1.0,
            'is_success': True,
            'job_id': str(job_id),
            'url': 'https://www.example/',
            'url_can': 'https://www.example/',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': [
                'Server', 'FakeServer 1.0',
                'X-Foo', 'Bar',
            ]
        }).run(conn)

    # Instantiate subscription. The test keeps the send side of the job state
    # channel so that it can later announce that the crawl has completed.
    logger.info('Set up subscription…')
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription_db = SubscriptionDb(db_pool)
    subscription = CrawlSyncSubscription(id_=1, websocket=websocket,
        job_id=str(job_id), subscription_db=subscription_db,
        compression_ok=True, job_state_recv=job_recv, sync_token=None)
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # Read from subscription
    logger.info('Read first event…')
    data = await websocket.get_message()
    message1 = ServerMessage.FromString(data).event
    assert message1.subscription_id == 1
    item1 = message1.sync_item.item
    assert item1.job_id == job_id.bytes
    assert item1.url == 'https://www.example/'
    assert item1.url_can == 'https://www.example/'
    assert item1.started_at == '2019-01-01T01:01:00+00:00'
    assert item1.completed_at == '2019-01-01T01:01:01+00:00'
    assert item1.cost == 1.0
    assert item1.duration == 1.0
    assert item1.status_code == 200
    assert item1.headers[0].key == 'Server'
    assert item1.headers[0].value == 'FakeServer 1.0'
    assert item1.headers[1].key == 'X-Foo'
    assert item1.headers[1].value == 'Bar'
    assert item1.is_success
    assert item1.is_compressed
    assert gzip.decompress(item1.body) == b'Test document #1'

    # The subscription should time out because there are no items to send.
    # Nothing is bound here: the read is expected to raise, so neither the
    # cancel scope nor a result is needed.
    logger.info('Time out on next event…')
    with pytest.raises(trio.TooSlowError):
        with trio.fail_after(1):
            await websocket.get_message()

    # Now add second result and mark the crawl as completed:
    logger.info('Add second result…')
    async with db_pool.connection() as conn:
        await r.table('response_body').insert({
            'id': b'\x02' * 32,
            'is_compressed': True,
            'body': b'\x1f\x8b\x08\x00\xe7\x01J\\\x02\xff\x0bI-.QH\xc9O.\xcdM'
                    b'\xcd+QP6\x02\x00R\xda\x93\n\x10\x00\x00\x00'
        }).run(conn)
        await r.table('response').insert({
            'id': 'bbbbbbbb-bbbb-bbbb-bbbb-000000000002',
            'body_id': b'\x02' * 32,
            'sequence': 5,
            'started_at': datetime(2019, 1, 1, 1, 1, 4, tzinfo=timezone.utc),
            'completed_at': datetime(2019, 1, 1, 1, 1, 5, tzinfo=timezone.utc),
            'duration': 1.0,
            'cost': 2.0,
            'is_success': True,
            'job_id': str(job_id),
            'url': 'https://www.example/bar',
            'url_can': 'https://www.example/bar',
            'status_code': 200,
            'content_type': 'text/plain',
            'headers': []
        }).run(conn)
    await job_send.send(
        JobStateEvent(job_id=str(job_id), schedule_id=None,
            run_state=RunState.COMPLETED,
            event_time=datetime.now(timezone.utc)))

    # Now wait to receive the second result
    logger.info('Read second event…')
    data = await websocket.get_message()
    message2 = ServerMessage.FromString(data).event
    assert message2.subscription_id == 1
    item2 = message2.sync_item.item
    assert item2.job_id == job_id.bytes
    assert item2.url == 'https://www.example/bar'
    assert item2.url_can == 'https://www.example/bar'
    assert item2.started_at == '2019-01-01T01:01:04+00:00'
    assert item2.completed_at == '2019-01-01T01:01:05+00:00'
    assert item2.cost == 2.0
    assert item2.duration == 1.0
    assert item2.status_code == 200
    assert item2.is_success
    assert item2.is_compressed
    assert gzip.decompress(item2.body) == b'Test document #2'

    # Once the job is reported as completed and all items have been sent, the
    # subscription should announce that it is closing.
    logger.info('Read subscription close…')
    data = await websocket.get_message()
    message3 = ServerMessage.FromString(data).event
    assert message3.subscription_id == 1
    assert message3.subscription_closed.reason == SubscriptionClosed.COMPLETE
async def test_subscribe_to_crawl_decompress(db_pool, job_table, response_table,
        response_body_table, nursery):
    '''
    A subscription created with ``compression_ok=False`` should deliver the
    response body decompressed, even though the body is stored compressed.
    '''
    job_id = UUID('aaaaaaaa-aaaa-aaaa-aaaa-000000000000')
    job_key = str(job_id)
    body_key = b'\x00' * 32

    # Sample data: a completed job with a single downloaded item whose body is
    # stored gzip-compressed.
    job_doc = {
        'id': job_key,
        'run_state': RunState.COMPLETED,
    }
    body_doc = {
        'id': body_key,
        'is_compressed': True,
        'body': b'\x1f\x8b\x08\x00\x0b\xf0I\\\x02\xff\x0bI-.QH\xc9O'
                b'.\xcdM\xcd+QP6\x04\x00\xe8\x8b\x9a\x93\x10\x00\x00\x00',
    }
    response_doc = {
        'id': UUID('bbbbbbbb-bbbb-bbbb-bbbb-000000000000').bytes,
        'body_id': body_key,
        'sequence': 1,
        'started_at': datetime(2019, 1, 1, 1, 1, 0, tzinfo=timezone.utc),
        'completed_at': datetime(2019, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
        'duration': 1.0,
        'cost': 1.0,
        'is_success': True,
        'job_id': job_key,
        'url': 'https://www.example/',
        'url_can': 'https://www.example/',
        'status_code': 200,
        'content_type': 'text/plain',
        'headers': []
    }

    # Insert in the same order as the documents are declared above.
    fixtures = (
        ('job', job_doc),
        ('response_body', body_doc),
        ('response', response_doc),
    )
    async with db_pool.connection() as conn:
        for table_name, document in fixtures:
            await r.table(table_name).insert(document).run(conn)

    # Build the subscription and launch it in the background. The send side of
    # the job state channel is never used in this test; only the receive side
    # is handed to the subscription.
    logger.info('Creating subscription…')
    websocket = MockWebsocket()
    job_send, job_recv = trio.open_memory_channel(0)
    subscription = CrawlSyncSubscription(
        id_=1,
        websocket=websocket,
        job_id=job_key,
        subscription_db=SubscriptionDb(db_pool),
        compression_ok=False,
        job_state_recv=job_recv,
        sync_token=None,
    )
    assert repr(subscription) == '<CrawlSyncSubscription id=1 job_id=aaaaaaaa>'
    nursery.start_soon(subscription.run)

    # The first event carries the item, with its body already decompressed.
    logger.info('Reading one event…')
    item_event = ServerMessage.FromString(await websocket.get_message()).event
    assert item_event.subscription_id == 1
    item = item_event.sync_item.item
    assert item.job_id == job_id.bytes
    assert item.url == 'https://www.example/'
    assert item.url_can == 'https://www.example/'
    assert item.started_at == '2019-01-01T01:01:00+00:00'
    assert item.completed_at == '2019-01-01T01:01:01+00:00'
    assert item.cost == 1.0
    assert item.duration == 1.0
    assert item.status_code == 200
    assert item.is_success
    assert not item.is_compressed
    assert item.body == b'Test document #1'

    # The job is already complete, so the next event closes the subscription.
    logger.info('Reading subscription close…')
    close_event = ServerMessage.FromString(await websocket.get_message()).event
    assert close_event.subscription_id == 1
    assert close_event.subscription_closed.reason == SubscriptionClosed.COMPLETE