def test_bot_update_pubsub_error(self):
    """Checks bot_update_task() when the PubSub publish fails, then works."""
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            pubsub_topic='projects/abc/topics/def'),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual('localhost', run_result.bot_id)

    def finish_task():
        # The very same terminating update is sent twice; only the PubSub mock
        # installed at the time of the call differs.
        return task_scheduler.bot_update_task(
            run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
            None)

    # Attempt to terminate the task with success, but make PubSub call fail.
    self.mock_pub_sub(publish_successful=False)
    self.assertEqual((False, False), finish_task())

    # Bot retries bot_update, now PubSub works and notification is sent.
    pub_sub_calls = self.mock_pub_sub(publish_successful=True)
    self.assertEqual((True, True), finish_task())
    # Exactly one notification is sent.
    self.assertEqual(1, len(pub_sub_calls))
def test_bot_update_task_new_overwrite(self):
    """Sends two output chunks, the second starting at offset 1.

    The stored output is expected to be 'hhey'.
    """
    run_result = _quick_reap()
    for chunk, chunk_start in (('hi', 0), ('hey', 1)):
        outcome = task_scheduler.bot_update_task(
            run_result.key, 'localhost', chunk, chunk_start, None, None, False,
            False, 0.1)
        self.assertEqual((True, False), outcome)
    stored = list(run_result.key.get().get_outputs())
    self.assertEqual(['hhey'], stored)
def test_bot_update_task(self):
    """Sends two consecutive output chunks and checks the stored output."""
    run_result = _quick_reap()
    # (expected return value, output chunk, output_chunk_start)
    steps = (
        ((True, True), 'hi', 0),
        ((True, False), 'hey', 2),
    )
    for expected, chunk, chunk_start in steps:
        outcome = task_scheduler.bot_update_task(
            run_result.key, 'localhost', chunk, chunk_start, 0, 0.1, False,
            False, 0.1)
        self.assertEqual(expected, outcome)
    stored = list(run_result.key.get().get_outputs())
    self.assertEqual(['hihey'], stored)
def test_bot_update_task_new_overwrite(self):
    """Sends two output chunks, the second starting at offset 1.

    The stored output is expected to be 'hhey'.
    """
    run_result = _quick_reap()
    for chunk, chunk_start in (('hi', 0), ('hey', 1)):
        outcome = task_scheduler.bot_update_task(
            run_result.key, 'localhost', chunk, chunk_start, None, None, False,
            False, 0.1, None)
        self.assertEqual((True, False), outcome)
    stored = list(run_result.key.get().get_outputs())
    self.assertEqual(['hhey'], stored)
def _bot_update_timeouts(self, hard, io):
    """Runs one task to completion with the given timeout flags.

    `hard` and `io` are forwarded to bot_update_task(); both the result summary
    and the run result are then expected to be in State.TIMED_OUT.
    """
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    _reaped_request, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, hard, io, 0.1)
    self.assertEqual((True, True), outcome)

    # Expected TaskResultSummary content.
    expected_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.key.get().to_dict())

    # Expected TaskRunResult content.
    expected_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'cost_usd': 0.1,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': self.now,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
def _bot_update_timeouts(self, hard, io):
    """Runs one task to completion with the given timeout flags.

    `hard` and `io` are forwarded to bot_update_task(); both the result summary
    and the run result are then expected to be in State.TIMED_OUT.
    """
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    _reaped_request, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, hard, io, 0.1)
    self.assertEqual((True, True), outcome)

    # Expected TaskResultSummary content.
    expected_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.key.get().to_dict())

    # Expected TaskRunResult content.
    expected_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'cost_usd': 0.1,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': self.now,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
def test_bot_update_exception(self):
    """bot_update_task() reports (False, False) when ndb.put_multi raises."""
    run_result = _quick_reap()

    def failing_put_multi(*_):
        # Stand-in for ndb.put_multi that always fails the commit.
        raise datastore_utils.CommitError('Sorry!')

    self.mock(ndb, 'put_multi', failing_put_multi)
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((False, False), outcome)
def test_bot_update_exception(self):
    """bot_update_task() reports (False, False) when ndb.put_multi raises."""
    run_result = _quick_reap()

    def failing_put_multi(*_):
        # Stand-in for ndb.put_multi that always fails the commit.
        raise datastore_utils.CommitError('Sorry!')

    self.mock(ndb, 'put_multi', failing_put_multi)
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((False, False), outcome)
def test_task_parent_isolated(self):
    """Checks that a child task gets recorded on its isolated parent task.

    A parent task using inputs_ref is run to completion, then a child request
    naming it as parent_task_id is scheduled; both the parent's run result and
    its result summary must list the child.
    """
    parent_properties = {
        'commands': None,
        'dimensions': {u'OS': u'Windows-3.1.1'},
        'inputs_ref': {
            'isolated': '1' * 40,
            'isolatedserver': 'http://localhost:1',
            'namespace': 'default-gzip',
        },
    }
    request = task_request.make_request(
        _gen_request(properties=parent_properties), True)
    _result_summary = task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # It's important to terminate the task with success.
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), outcome)

    # Schedule the child, pointing at the task that just ran.
    parent_id = run_result.task_id
    request = task_request.make_request(
        _gen_request(
            parent_task_id=parent_id,
            properties={'dimensions': {u'OS': u'Windows-3.1.1'}}),
        True)
    result_summary = task_scheduler.schedule_request(request)
    self.assertEqual([], result_summary.children_task_ids)
    self.assertEqual(parent_id, request.parent_task_id)

    parent_run_result_key = task_pack.unpack_run_result_key(parent_id)
    parent_res_summary_key = task_pack.run_result_key_to_result_summary_key(
        parent_run_result_key)
    expected_children = [result_summary.task_id]
    self.assertEqual(
        expected_children, parent_run_result_key.get().children_task_ids)
    self.assertEqual(
        expected_children, parent_res_summary_key.get().children_task_ids)
def _task_ran_successfully(self):
    """Runs an idempotent task to successful completion.

    Returns the unicode form of the run result's key_packed.
    """
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(
                dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)))
    _result_summary = task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # It's important to terminate the task with success.
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((True, True), outcome)
    return unicode(run_result.key_packed)
def _task_ran_successfully(self):
    """Runs an idempotent task to successful completion.

    Returns the unicode form of the run result's task_id.
    """
    request = task_request.make_request(
        _gen_request(
            properties=dict(
                dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)),
        True)
    _result_summary = task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # It's important to terminate the task with success.
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), outcome)
    return unicode(run_result.task_id)
def post(self, task_id=None):
    """Handles a task update sent by a bot and records the bot event.

    The `task_id` argument is not used as given: it is overwritten with the
    'task_id' value of the request body. The response carries 'ok' and
    'must_stop', the latter being true when the resulting state is KILLED.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if msg:
        self.abort_with_error(400, error=msg)

    bot_id = request['id']
    task_id = request['task_id']

    bot_info = bot_management.get_info_key(bot_id).get()
    machine_type = bot_info.machine_type if bot_info else None

    # Make sure bot self-reported ID matches the authentication token. Raises
    # auth.AuthorizationError if not.
    bot_auth.validate_bot_id_and_fetch_config(bot_id, machine_type)

    # Optional fields of the update.
    bot_overhead = request.get('bot_overhead')
    cipd_pins = request.get('cipd_pins')
    cipd_stats = request.get('cipd_stats')
    cost_usd = request.get('cost_usd', 0)
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    isolated_stats = request.get('isolated_stats')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    # Any kind of stats is only accepted along with bot_overhead.
    if bot_overhead is None and (isolated_stats or cipd_stats):
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % task_id)
        self.abort_with_error(
            400,
            error=(
                'isolated_stats and cipd_stats require bot_overhead to be set'
                '\nbot_overhead: %s\nisolate_stats: %s' %
                (bot_overhead, isolated_stats)))

    run_result_key = task_pack.unpack_run_result_key(task_id)

    performance_stats = None
    if bot_overhead is not None:
        performance_stats = task_result.PerformanceStats(
            bot_overhead=bot_overhead)
        if isolated_stats:
            download = isolated_stats.get('download') or {}
            upload = isolated_stats.get('upload') or {}

            def unpack_base64(stats, key):
                # Yields None when the value is missing or empty.
                encoded = stats.get(key)
                return base64.b64decode(encoded) if encoded else None

            performance_stats.isolated_download = task_result.OperationStats(
                duration=download.get('duration'),
                initial_number_items=download.get('initial_number_items'),
                initial_size=download.get('initial_size'),
                items_cold=unpack_base64(download, 'items_cold'),
                items_hot=unpack_base64(download, 'items_hot'))
            performance_stats.isolated_upload = task_result.OperationStats(
                duration=upload.get('duration'),
                items_cold=unpack_base64(upload, 'items_cold'),
                items_hot=unpack_base64(upload, 'items_hot'))
        if cipd_stats:
            performance_stats.package_installation = (
                task_result.OperationStats(
                    duration=cipd_stats.get('duration')))

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    if outputs_ref:
        outputs_ref = task_request.FilesRef(**outputs_ref)

    if cipd_pins:
        client_package = task_request.CipdPackage(
            **cipd_pins['client_package'])
        packages = [
            task_request.CipdPackage(**kwargs)
            for kwargs in cipd_pins['packages']
        ]
        cipd_pins = task_result.CipdPins(
            client_package=client_package, packages=packages)

    try:
        state = task_scheduler.bot_update_task(
            run_result_key=run_result_key,
            bot_id=bot_id,
            output=output,
            output_chunk_start=output_chunk_start,
            exit_code=exit_code,
            duration=duration,
            hard_timeout=hard_timeout,
            io_timeout=io_timeout,
            cost_usd=cost_usd,
            outputs_ref=outputs_ref,
            cipd_pins=cipd_pins,
            performance_stats=performance_stats)
        if not state:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        # Map the resulting task state to the bot event name to record.
        states = task_result.State
        if state in (states.COMPLETED, states.TIMED_OUT):
            action = 'task_completed'
        elif state == states.KILLED:
            action = 'task_killed'
        else:
            assert state in (states.BOT_DIED, states.RUNNING), state
            action = 'task_update'
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            authenticated_as=auth.get_peer_identity().to_bytes(),
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            maintenance_msg=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors propagate as-is instead of being rewrapped below.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    must_stop = state == task_result.State.KILLED
    self.send_response({'must_stop': must_stop, 'ok': True})
def post(self, task_id=None):
    """Handles a task update sent by a bot and records the bot event.

    The `task_id` argument is not used as given: it is overwritten with the
    'task_id' value of the request body.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if msg:
        self.abort_with_error(400, error=msg)

    # Fields read with [] raise KeyError when absent.
    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']

    # Optional fields of the update.
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    try:
        success, completed = task_scheduler.bot_update_task(
            run_result_key, bot_id, output, output_chunk_start, exit_code,
            duration, hard_timeout, io_timeout, cost_usd, outputs_ref)
        if not success:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        if completed:
            action = 'task_completed'
        else:
            action = 'task_update'
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors propagate as-is instead of being rewrapped below.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})
def test_get_results(self):
    """Follows get_results() through pending, running and completed states."""
    # TODO(maruel): Split in more focused tests.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created_ts = self.now
    self.mock_now(created_ts)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    _result_summary = task_scheduler.schedule_request(request)

    # The TaskRequest was enqueued, the TaskResultSummary was created but no
    # TaskRunResult exist yet since the task was not scheduled on any bot.
    result_summary, run_results = get_results(request.key)
    pending_summary = {
        'abandoned_ts': None,
        'bot_id': None,
        'bot_version': None,
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': created_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [],
        'started_ts': None,
        'state': State.PENDING,
        'try_number': None,
        'user': u'Jesus',
    }
    self.assertEqual(pending_summary, result_summary.to_dict())
    self.assertEqual([], run_results)

    # A bot reaps the TaskToRun.
    reaped_ts = self.now + datetime.timedelta(seconds=60)
    self.mock_now(reaped_ts)
    reaped_request, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)
    self.assertTrue(run_result)
    result_summary, run_results = get_results(request.key)
    running_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.0],
        'cost_saved_usd': None,
        # Time the TaskRequest was created.
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': reaped_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.RUNNING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(running_summary, result_summary.to_dict())
    running_runs = [
        {
            'abandoned_ts': None,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': None,
            'cost_usd': 0.0,
            'durations': [],
            'exit_codes': [],
            'failure': False,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': reaped_ts,
            'server_versions': [u'v1a'],
            'started_ts': reaped_ts,
            'state': State.RUNNING,
            'try_number': 1,
        },
    ]
    self.assertEqual(running_runs, [r.to_dict() for r in run_results])

    # The bot completes the task.
    done_ts = self.now + datetime.timedelta(seconds=120)
    self.mock_now(done_ts)
    first_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((True, True), first_update)
    second_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Bar22', 0, 0, 0.2, False, False, 0.1)
    self.assertEqual((True, False), second_update)
    result_summary, run_results = get_results(request.key)
    completed_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': done_ts,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [0.1, 0.2],
        'exit_codes': [0, 0],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': done_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.COMPLETED,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(completed_summary, result_summary.to_dict())
    completed_runs = [
        {
            'abandoned_ts': None,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': done_ts,
            'cost_usd': 0.1,
            'durations': [0.1, 0.2],
            'exit_codes': [0, 0],
            'failure': False,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': done_ts,
            'server_versions': [u'v1a'],
            'started_ts': reaped_ts,
            'state': State.COMPLETED,
            'try_number': 1,
        },
    ]
    self.assertEqual(completed_runs, [r.to_dict() for r in run_results])
def test_exit_code_failure(self):
    """A task reporting exit code 1 ends COMPLETED with failure set."""
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    _result_summary = task_scheduler.schedule_request(request)
    bot_dimensions = {'OS': 'Windows-3.1.1'}
    reaped_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)

    # The fifth positional argument is the non-zero exit code.
    outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 1, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), outcome)

    result_summary, run_results = get_results(request.key)
    expected_summary = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [1],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.COMPLETED,
        'try_number': 1,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.to_dict())

    expected_runs = [
        {
            'abandoned_ts': None,
            'bot_dimensions': bot_dimensions,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': self.now,
            'cost_usd': 0.1,
            'durations': [0.1],
            'exit_codes': [1],
            'failure': True,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': self.now,
            'outputs_ref': None,
            'server_versions': [u'v1a'],
            'started_ts': self.now,
            'state': State.COMPLETED,
            'try_number': 1,
        },
    ]
    self.assertEqual(expected_runs, [r.to_dict() for r in run_results])
def test_get_results(self):
    """Follows get_results() through pending, running and completed states."""
    # TODO(maruel): Split in more focused tests.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created_ts = self.now
    self.mock_now(created_ts)
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    _result_summary = task_scheduler.schedule_request(request)

    # The TaskRequest was enqueued, the TaskResultSummary was created but no
    # TaskRunResult exist yet since the task was not scheduled on any bot.
    result_summary, run_results = get_results(request.key)
    pending_summary = {
        'abandoned_ts': None,
        'bot_dimensions': None,
        'bot_id': None,
        'bot_version': None,
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': created_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [],
        'started_ts': None,
        'state': State.PENDING,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': None,
        'user': u'Jesus',
    }
    self.assertEqual(pending_summary, result_summary.to_dict())
    self.assertEqual([], run_results)

    # A bot reaps the TaskToRun.
    reaped_ts = self.now + datetime.timedelta(seconds=60)
    self.mock_now(reaped_ts)
    bot_dimensions = {u'OS': u'Windows-3.1.1'}
    reaped_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)
    self.assertTrue(run_result)
    result_summary, run_results = get_results(request.key)
    running_summary = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.0],
        'cost_saved_usd': None,
        # Time the TaskRequest was created.
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': reaped_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.RUNNING,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(running_summary, result_summary.to_dict())
    running_runs = [
        {
            'abandoned_ts': None,
            'bot_dimensions': bot_dimensions,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': None,
            'cost_usd': 0.0,
            'durations': [],
            'exit_codes': [],
            'failure': False,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': reaped_ts,
            'outputs_ref': None,
            'server_versions': [u'v1a'],
            'started_ts': reaped_ts,
            'state': State.RUNNING,
            'try_number': 1,
        },
    ]
    self.assertEqual(running_runs, [r.to_dict() for r in run_results])

    # The bot completes the task.
    done_ts = self.now + datetime.timedelta(seconds=120)
    self.mock_now(done_ts)
    first_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), first_update)
    second_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Bar22', 0, 0, 0.2, False, False, 0.1,
        None)
    self.assertEqual((True, False), second_update)
    result_summary, run_results = get_results(request.key)
    completed_summary = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': done_ts,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': done_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.COMPLETED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(completed_summary, result_summary.to_dict())
    completed_runs = [
        {
            'abandoned_ts': None,
            'bot_dimensions': bot_dimensions,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': done_ts,
            'cost_usd': 0.1,
            'durations': [0.1],
            'exit_codes': [0],
            'failure': False,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': done_ts,
            'outputs_ref': None,
            'server_versions': [u'v1a'],
            'started_ts': reaped_ts,
            'state': State.COMPLETED,
            'try_number': 1,
        },
    ]
    self.assertEqual(completed_runs, [r.to_dict() for r in run_results])
def post(self, task_id=None):
    """Handles a task update sent by a bot and records the bot event.

    The `task_id` argument is not used as given: it is overwritten with the
    'task_id' value of the request body.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
        'keys')
    if msg:
        self.abort_with_error(400, error=msg)

    # Fields read with [] raise KeyError when absent.
    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']

    # Optional fields of the update.
    bot_overhead = request.get('bot_overhead')
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    isolated_stats = request.get('isolated_stats')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    # bot_overhead and isolated_stats are only accepted together.
    has_overhead = bot_overhead is not None
    if bool(isolated_stats) != has_overhead:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % task_id)
        self.abort_with_error(
            400,
            error=(
                'Both bot_overhead and isolated_stats must be set '
                'simultaneously\nbot_overhead: %s\nisolated_stats: %s' %
                (bot_overhead, isolated_stats)))

    run_result_key = task_pack.unpack_run_result_key(task_id)

    performance_stats = None
    if isolated_stats:
        download = isolated_stats['download']
        upload = isolated_stats['upload']
        download_operation = task_result.IsolatedOperation(
            duration=download['duration'],
            initial_number_items=download['initial_number_items'],
            initial_size=download['initial_size'],
            items_cold=base64.b64decode(download['items_cold']),
            items_hot=base64.b64decode(download['items_hot']))
        upload_operation = task_result.IsolatedOperation(
            duration=upload['duration'],
            items_cold=base64.b64decode(upload['items_cold']),
            items_hot=base64.b64decode(upload['items_hot']))
        performance_stats = task_result.PerformanceStats(
            bot_overhead=bot_overhead,
            isolated_download=download_operation,
            isolated_upload=upload_operation)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as err:
            logging.error('Failed to decode output\n%s\n%r', err, output)
            output = output.encode('ascii', 'replace')
        except TypeError as err:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', err, output)

    if outputs_ref:
        outputs_ref = task_request.FilesRef(**outputs_ref)

    try:
        state = task_scheduler.bot_update_task(
            run_result_key=run_result_key,
            bot_id=bot_id,
            output=output,
            output_chunk_start=output_chunk_start,
            exit_code=exit_code,
            duration=duration,
            hard_timeout=hard_timeout,
            io_timeout=io_timeout,
            cost_usd=cost_usd,
            outputs_ref=outputs_ref,
            performance_stats=performance_stats)
        if not state:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        # Map the resulting task state to the bot event name to record.
        states = task_result.State
        if state in (states.COMPLETED, states.TIMED_OUT):
            action = 'task_completed'
        else:
            assert state in (states.BOT_DIED, states.RUNNING), state
            action = 'task_update'
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None)
    except ValueError as err:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % err)
        self.abort_with_error(400, error=str(err))
    except webob.exc.HTTPException:
        # Let HTTP errors propagate as-is instead of being rewrapped below.
        raise
    except Exception as err:
        logging.exception('Internal error: %s', err)
        self.abort_with_error(500, error=str(err))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})
def test_cron_handle_bot_died(self):
    """A task whose first try ends as BOT_DIED is retried and then completes.

    Also verifies that no PubSub notification goes out until the second try
    has completed.
    """
    pub_sub_calls = self.mock_pub_sub()

    # Test first retry, then success.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    now = utils.utcnow()
    request_data = _gen_request(
        properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
        created_ts=now,
        expiration_ts=now + datetime.timedelta(seconds=600),
        pubsub_topic='projects/abc/topics/def')
    request = task_request.make_request(request_data, True)
    task_scheduler.schedule_request(request)

    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    # Move the clock by BOT_PING_TOLERANCE before running the cron job.
    now_1 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual(
        ([], 1, 0), task_scheduler.cron_handle_bot_died('f.local'))

    # Refresh and compare:
    expected_run_result = {
        'abandoned_ts': now_1,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': now_1,
        'outputs_ref': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected_run_result, run_result.key.get().to_dict())

    # The summary is back to PENDING while the task waits for its retry.
    pending_summary = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_1,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(
        pending_summary, run_result.result_summary_key.get().to_dict())

    # No PubSub notifications yet.
    self.assertEqual(0, len(pub_sub_calls))

    # Task was retried.
    now_2 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
    self.assertEqual(2, run_result.try_number)
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost-second', 'Foo1', 0, 0, 0.1, False, False,
        0.1, None)
    self.assertEqual((True, True), update_result)

    # Only the entries touched by the second try differ from the pending
    # summary checked above.
    completed_summary = dict(pending_summary)
    completed_summary.update({
        'bot_id': u'localhost-second',
        'completed_ts': now_2,
        'costs_usd': [0., 0.1],
        'created_ts': self.now,
        'durations': [0.1],
        'exit_codes': [0],
        'modified_ts': now_2,
        'started_ts': now_2,
        'state': task_result.State.COMPLETED,
        'try_number': 2,
    })
    self.assertEqual(
        completed_summary, run_result.result_summary_key.get().to_dict())
    self.assertEqual(0.1, run_result.key.get().cost_usd)

    # PubSub notification is sent.
    self.assertEqual(1, len(pub_sub_calls))
def test_cron_handle_bot_died(self):
    """A task whose first try ends as BOT_DIED is retried and then completes."""
    # Test first retry, then success.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request_data = _gen_request_data(
        properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
        scheduling_expiration_secs=600)
    request = task_request.make_request(request_data)
    task_scheduler.schedule_request(request)

    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    # Move the clock by BOT_PING_TOLERANCE before running the cron job.
    now_1 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())

    # Refresh and compare:
    expected_run_result = {
        'abandoned_ts': now_1,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': now_1,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected_run_result, run_result.key.get().to_dict())

    # The summary is back to PENDING while the task waits for its retry.
    pending_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_1,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(
        pending_summary, run_result.result_summary_key.get().to_dict())

    # Task was retried.
    now_2 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
    self.assertEqual(2, run_result.try_number)
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost-second', 'Foo1', 0, 0, 0.1, False, False,
        0.1)
    self.assertEqual((True, True), update_result)

    # Only the entries touched by the second try differ from the pending
    # summary checked above.
    completed_summary = dict(pending_summary)
    completed_summary.update({
        'bot_id': u'localhost-second',
        'completed_ts': now_2,
        'costs_usd': [0., 0.1],
        'created_ts': self.now,
        'durations': [0.1],
        'exit_codes': [0],
        'modified_ts': now_2,
        'started_ts': now_2,
        'state': task_result.State.COMPLETED,
        'try_number': 2,
    })
    self.assertEqual(
        completed_summary, run_result.result_summary_key.get().to_dict())
    self.assertEqual(0.1, run_result.key.get().cost_usd)
def test_cron_handle_bot_died(self):
    """Checks the retry path: first try ends as BOT_DIED, second completes."""
    # Test first retry, then success.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            scheduling_expiration_secs=600))
    task_scheduler.schedule_request(request)

    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    first_try = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')[1]
    self.assertEqual(1, first_try.try_number)
    self.assertEqual(task_result.State.RUNNING, first_try.state)

    # Move the clock by BOT_PING_TOLERANCE before running the cron job.
    now_1 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())

    # Refresh and compare:
    expected_first_try = {
        'abandoned_ts': now_1,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': now_1,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected_first_try, first_try.key.get().to_dict())

    # The summary is back to PENDING while the task waits for its retry.
    summary_after_death = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_1,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(
        summary_after_death, first_try.result_summary_key.get().to_dict())

    # Task was retried.
    now_2 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    second_try = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')[1]
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
    self.assertEqual(2, second_try.try_number)
    self.assertEqual(
        (True, True),
        task_scheduler.bot_update_task(
            second_try.key, 'localhost-second', 'Foo1', 0, 0, 0.1, False,
            False, 0.1))

    summary_after_retry = {
        'abandoned_ts': None,
        'bot_id': u'localhost-second',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': now_2,
        'costs_usd': [0., 0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_2,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': now_2,
        'state': task_result.State.COMPLETED,
        'try_number': 2,
        'user': u'Jesus',
    }
    self.assertEqual(
        summary_after_retry, second_try.result_summary_key.get().to_dict())
    self.assertEqual(0.1, second_try.key.get().cost_usd)
def post(self, task_id=None):
    """Handles a task update sent by a bot.

    Checks the keys of the request body, base64-decodes the optional output
    chunk, forwards the update to task_scheduler.bot_update_task() and records
    a bot event. Replies with {"ok": True} when the update went through.

    The task_id argument is not read; it is overwritten with the "task_id"
    entry of the request body.

    Error replies:
      400: unexpected/missing keys, or bot_update_task() raised ValueError.
      500: the update was not stored ("please retry") or an unexpected
           exception occurred.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, "bot", "keys"
    )
    if msg:
        self.abort_with_error(400, error=msg)

    bot_id = request["id"]
    cost_usd = request["cost_usd"]
    task_id = request["task_id"]

    duration = request.get("duration")
    exit_code = request.get("exit_code")
    hard_timeout = request.get("hard_timeout")
    io_timeout = request.get("io_timeout")
    output = request.get("output")
    output_chunk_start = request.get("output_chunk_start")
    outputs_ref = request.get("outputs_ref")

    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as e:
            logging.error("Failed to decode output\n%s\n%r", e, output)
            output = output.encode("ascii", "replace")
        except TypeError as e:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error("Failed to decode output\n%s\n%r", e, output)

    try:
        success, completed = task_scheduler.bot_update_task(
            run_result_key,
            bot_id,
            output,
            output_chunk_start,
            exit_code,
            duration,
            hard_timeout,
            io_timeout,
            cost_usd,
            outputs_ref,
        )
        if not success:
            self.abort_with_error(500, error="Failed to update, please retry")

        action = "task_completed" if completed else "task_update"
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None,
        )
    except ValueError as e:
        ereporter2.log_request(
            request=self.request,
            source="server",
            category="task_failure",
            message="Failed to update task: %s" % e,
        )
        self.abort_with_error(400, error=str(e))
    except webob.exc.HTTPException:
        # An HTTP abort issued inside this try block (the "please retry" 500
        # above) must reach the client unchanged. Without this clause the
        # generic handler below would catch it and replace its error message.
        raise
    except Exception as e:
        # Keep the traceback in the logs; the reply only carries str(e).
        logging.exception("Internal error: %s", e)
        self.abort_with_error(500, error=str(e))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({"ok": True})
def post(self, task_id=None):
    """Handles a task update sent by a bot.

    Checks the keys of the request body, base64-decodes the optional output
    chunk, forwards the update to task_scheduler.bot_update_task() and records
    a bot event. Replies with {'ok': True} when the update went through.

    The task_id argument is not read; it is overwritten with the 'task_id'
    entry of the request body.
    """
    # Unlike handshake and poll, we do not accept invalid keys here. This code
    # path is much more strict.
    request = self.parse_body()
    bad_keys_msg = log_unexpected_subset_keys(
        self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request,
        'bot', 'keys')
    if bad_keys_msg:
        self.abort_with_error(400, error=bad_keys_msg)

    # Mandatory entries.
    bot_id = request['id']
    cost_usd = request['cost_usd']
    task_id = request['task_id']

    # Optional entries; each is None when absent.
    duration = request.get('duration')
    exit_code = request.get('exit_code')
    hard_timeout = request.get('hard_timeout')
    io_timeout = request.get('io_timeout')
    output = request.get('output')
    output_chunk_start = request.get('output_chunk_start')
    outputs_ref = request.get('outputs_ref')

    run_result_key = task_pack.unpack_run_result_key(task_id)

    if output is not None:
        try:
            output = base64.b64decode(output)
        except UnicodeEncodeError as e:
            logging.error('Failed to decode output\n%s\n%r', e, output)
            output = output.encode('ascii', 'replace')
        except TypeError as e:
            # Save the output as-is instead. The error will be logged in
            # ereporter2 and returning a HTTP 500 would only force the bot to
            # stay in a retry loop.
            logging.error('Failed to decode output\n%s\n%r', e, output)

    try:
        success, completed = task_scheduler.bot_update_task(
            run_result_key,
            bot_id,
            output,
            output_chunk_start,
            exit_code,
            duration,
            hard_timeout,
            io_timeout,
            cost_usd,
            outputs_ref)
        if not success:
            logging.info('Failed to update, please retry')
            self.abort_with_error(500, error='Failed to update, please retry')

        if completed:
            action = 'task_completed'
        else:
            action = 'task_update'
        bot_management.bot_event(
            event_type=action,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            task_id=task_id,
            task_name=None)
    except ValueError as e:
        ereporter2.log_request(
            request=self.request,
            source='server',
            category='task_failure',
            message='Failed to update task: %s' % e)
        self.abort_with_error(400, error=str(e))
    except webob.exc.HTTPException:
        # HTTP aborts issued inside the try block go out untouched.
        raise
    except Exception as e:
        logging.exception('Internal error: %s', e)
        self.abort_with_error(500, error=str(e))

    # TODO(maruel): When a task is canceled, reply with 'DIE' so that the bot
    # reboots itself to abort the task abruptly. It is useful when a task hangs
    # and the timeout was set too long or the task was superseded by a newer
    # task with more recent executable (e.g. a new Try Server job on a newer
    # patchset on Rietveld).
    self.send_response({'ok': True})