def test_cron_handle_bot_died_second(self):
    # Two tries that both end as internal_failure must leave the result
    # summary in BOT_DIED state.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created = utils.utcnow()
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            created_ts=created,
            expiration_ts=created + datetime.timedelta(seconds=600)),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }

    # First try on 'localhost'; the clock is then moved past the ping
    # tolerance before the cron job runs.
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)
    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual(
        ([], 1, 0), task_scheduler.cron_handle_bot_died('f.local'))

    # Second try. It must be a different bot.
    second_reap_ts = self.mock_now(
        self.now + task_result.BOT_PING_TOLERANCE, 2)
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    died_ts = self.mock_now(self.now + 2 * task_result.BOT_PING_TOLERANCE, 3)
    self.assertEqual(
        (['1d69b9f088008812'], 0, 0),
        task_scheduler.cron_handle_bot_died('f.local'))
    # Running the cron job again finds nothing left to handle.
    self.assertEqual(
        ([], 0, 0), task_scheduler.cron_handle_bot_died('f.local'))

    expected = {
        'abandoned_ts': died_ts,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost-second',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0., 0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': True,
        'modified_ts': died_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': second_reap_ts,
        'state': task_result.State.BOT_DIED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 2,
        'user': u'Jesus',
    }
    summary = run_result.result_summary_key.get()
    self.assertEqual(expected, summary.to_dict())
def test_cron_handle_bot_died_second(self):
    # Two tries that both end as internal_failure must leave the result
    # summary in BOT_DIED state.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created = utils.utcnow()
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            created_ts=created,
            expiration_ts=created + datetime.timedelta(seconds=600)),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }

    # First try on 'localhost'; the clock is then moved past the ping
    # tolerance before the cron job runs.
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)
    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())

    # Second try. It must be a different bot.
    second_reap_ts = self.mock_now(
        self.now + task_result.BOT_PING_TOLERANCE, 2)
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    died_ts = self.mock_now(self.now + 2 * task_result.BOT_PING_TOLERANCE, 3)
    self.assertEqual((1, 0, 0), task_scheduler.cron_handle_bot_died())
    # Running the cron job again finds nothing left to handle.
    self.assertEqual((0, 0, 0), task_scheduler.cron_handle_bot_died())

    expected = {
        'abandoned_ts': died_ts,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost-second',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0., 0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': True,
        'modified_ts': died_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': second_reap_ts,
        'state': task_result.State.BOT_DIED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 2,
        'user': u'Jesus',
    }
    summary = run_result.result_summary_key.get()
    self.assertEqual(expected, summary.to_dict())
def test_bot_kill_task(self):
    # bot_kill_task() issued by the bot that reaped the task returns None and
    # leaves both the result summary and the run result in BOT_DIED state.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    dimensions = {u'OS': u'Windows-3.1.1'}
    request = task_request.make_request(
        _gen_request(properties={'dimensions': dimensions}), True)
    result_summary = task_scheduler.schedule_request(request)
    _, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    kill_error = task_scheduler.bot_kill_task(run_result.key, 'localhost')
    self.assertEqual(None, kill_error)

    expected_summary = {
        'abandoned_ts': self.now,
        'bot_dimensions': dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': True,
        'modified_ts': self.now,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.BOT_DIED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.key.get().to_dict())

    expected_run = {
        'abandoned_ts': self.now,
        'bot_dimensions': dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': self.now,
        'outputs_ref': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
def test_bot_update_pubsub_error(self):
    # bot_update_task() reports (False, False) while the PubSub publish is
    # mocked to fail, and (True, True) with one notification once the same
    # update is retried with a working PubSub mock.
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            pubsub_topic='projects/abc/topics/def'),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual('localhost', run_result.bot_id)

    # Attempt to terminate the task with success, but make PubSub call fail.
    self.mock_pub_sub(publish_successful=False)
    first_attempt = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((False, False), first_attempt)

    # Bot retries bot_update, now PubSub works and notification is sent.
    pub_sub_calls = self.mock_pub_sub(publish_successful=True)
    second_attempt = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), second_attempt)
    self.assertEqual(1, len(pub_sub_calls))  # notification is sent
def _bot_update_timeouts(self, hard, io):
    """Reaps a task, reports it with the given flags and expects TIMED_OUT.

    `hard` and `io` are forwarded positionally to
    task_scheduler.bot_update_task(); presumably they are the hard-timeout and
    I/O-timeout flags -- confirm against bot_update_task()'s signature.
    """
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    _, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, hard, io, 0.1)
    self.assertEqual((True, True), update_result)

    expected_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.key.get().to_dict())

    expected_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'cost_usd': 0.1,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': self.now,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
def _quick_reap():
    """Schedules a Windows-3.1.1 request, reaps it as bot 'localhost' and
    returns the run result from bot_reap_task()."""
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    task_scheduler.schedule_request(request)
    # Only the second element of the reaped pair is of interest here.
    reaped = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    _, run_result = reaped
    return run_result
def _quick_reap():
    """Schedules a Windows-3.1.1 request, reaps it as bot 'localhost' and
    returns the run result from bot_reap_task()."""
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    task_scheduler.schedule_request(request)
    # Only the second element of the reaped pair is of interest here.
    reaped = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    _, run_result = reaped
    return run_result
def _bot_update_timeouts(self, hard, io):
    """Reaps a task, reports it with the given flags and expects TIMED_OUT.

    `hard` and `io` are forwarded positionally to
    task_scheduler.bot_update_task(); presumably they are the hard-timeout and
    I/O-timeout flags -- confirm against bot_update_task()'s signature.
    """
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    _, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'hi', 0, 0, 0.1, hard, io, 0.1)
    self.assertEqual((True, True), update_result)

    expected_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.key.get().to_dict())

    expected_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'cost_usd': 0.1,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': True,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': self.now,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.TIMED_OUT,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
def test_cron_abort_expired_task_to_run_retry(self):
    # A task whose first try died and whose retry then expires keeps the
    # BOT_DIED state in its result summary.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created = utils.utcnow()
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            created_ts=created,
            expiration_ts=created + datetime.timedelta(seconds=600)),
        True)
    result_summary = task_scheduler.schedule_request(request)

    # Fake first try bot died.
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())
    self.assertEqual(task_result.State.BOT_DIED, run_result.key.get().state)
    summary_state = run_result.result_summary_key.get().state
    self.assertEqual(task_result.State.PENDING, summary_state)

    # BOT_DIED is kept instead of EXPIRED.
    abandoned_ts = self.mock_now(self.now, request.expiration_secs + 1)
    self.assertEqual(1, task_scheduler.cron_abort_expired_task_to_run())
    run_results = task_result.TaskRunResult.query().fetch()
    self.assertEqual(1, len(run_results))
    expected = {
        'abandoned_ts': abandoned_ts,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': True,
        'modified_ts': abandoned_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected, result_summary.key.get().to_dict())
def test_cancel_task_running(self):
    # cancel_task() on a task already reaped by a bot returns
    # (False, True) and the summary stays RUNNING.
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    ok, was_running = task_scheduler.cancel_task(result_summary.key)
    self.assertEqual(False, ok)
    self.assertEqual(True, was_running)
    # Re-fetch the entity to observe the stored state.
    refreshed = result_summary.key.get()
    self.assertEqual(task_result.State.RUNNING, refreshed.state)
def test_cancel_task_running(self):
    # cancel_task() on a task already reaped by a bot returns
    # (False, True) and the summary stays RUNNING.
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    result_summary = task_scheduler.schedule_request(request)
    task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    ok, was_running = task_scheduler.cancel_task(result_summary.key)
    self.assertEqual(False, ok)
    self.assertEqual(True, was_running)
    # Re-fetch the entity to observe the stored state.
    refreshed = result_summary.key.get()
    self.assertEqual(task_result.State.RUNNING, refreshed.state)
def _task_deduped(self, new_ts, deduped_from, task_id='1d8dc670a0008810',
                  now=None):
    """Schedules an idempotent request and checks that it gets deduped.

    Args:
      new_ts: expected 'created_ts' and 'modified_ts' of the deduped summary.
      deduped_from: expected 'deduped_from' value of the deduped summary.
      task_id: expected 'id' of the deduped summary.
      now: expected 'completed_ts' and 'started_ts'; self.now is used when
          falsy.
    """
    # The request user must match the u'Raoul' asserted below; it was
    # previously the masked literal '******', which cannot satisfy it.
    data = _gen_request_data(
        name='yay',
        user='Raoul',
        properties=dict(
            dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True))
    request = task_request.make_request(data)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # The bot gets no request back from bot_reap_task().
    actual_request_2, _ = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(None, actual_request_2)

    result_summary_duped, run_results_duped = get_results(request.key)
    previous_ts = now or self.now
    expected = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': previous_ts,
        'costs_usd': [],
        'cost_saved_usd': 0.1,
        'created_ts': new_ts,
        'deduped_from': deduped_from,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': task_id,
        'internal_failure': False,
        # Only this value is updated to 'now', the rest uses the previous run
        # timestamps.
        'modified_ts': new_ts,
        'name': u'yay',
        # A deduped task cannot be deduped against.
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': previous_ts,
        'state': State.COMPLETED,
        'try_number': 0,
        'user': u'Raoul',
    }
    self.assertEqual(expected, result_summary_duped.to_dict())
    self.assertEqual([], run_results_duped)
def test_bot_kill_task_wrong_bot(self):
    # bot_kill_task() called with a bot id other than the one that reaped
    # the task returns an error message instead of None.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    task_scheduler.schedule_request(request)
    _, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    expected = (
        'Bot bot1 sent task kill for task 1d69b9f088008811 owned by bot '
        'localhost')
    actual = task_scheduler.bot_kill_task(run_result.key, 'bot1')
    self.assertEqual(expected, actual)
def test_bot_kill_task_wrong_bot(self):
    # bot_kill_task() called with a bot id other than the one that reaped
    # the task returns an error message instead of None.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'})))
    task_scheduler.schedule_request(request)
    _, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    expected = (
        'Bot bot1 sent task kill for task 1d69b9f088008811 owned by bot '
        'localhost')
    actual = task_scheduler.bot_kill_task(run_result.key, 'bot1')
    self.assertEqual(expected, actual)
def _task_deduped(
        self, new_ts, deduped_from, task_id='1d8dc670a0008810', now=None):
    """Schedules an idempotent request and checks that it gets deduped.

    Args:
      new_ts: expected 'created_ts' and 'modified_ts' of the deduped summary.
      deduped_from: expected 'deduped_from' value of the deduped summary.
      task_id: expected 'id' of the deduped summary.
      now: expected 'completed_ts' and 'started_ts'; self.now is used when
          falsy.
    """
    # The request user must match the u'Raoul' / u'user:Raoul' asserted
    # below; it was previously the masked literal '******', which cannot
    # satisfy them.
    data = _gen_request(
        name='yay',
        user='Raoul',
        properties=dict(
            dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True))
    request = task_request.make_request(data, True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # The bot gets no request back from bot_reap_task().
    actual_request_2, _ = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(None, actual_request_2)

    result_summary_duped, run_results_duped = get_results(request.key)
    previous_ts = now or self.now
    expected = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': previous_ts,
        'costs_usd': [],
        'cost_saved_usd': 0.1,
        'created_ts': new_ts,
        'deduped_from': deduped_from,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': task_id,
        'internal_failure': False,
        # Only this value is updated to 'now', the rest uses the previous run
        # timestamps.
        'modified_ts': new_ts,
        'name': u'yay',
        'outputs_ref': None,
        # A deduped task cannot be deduped against.
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': previous_ts,
        'state': State.COMPLETED,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Raoul',
        ],
        'try_number': 0,
        'user': u'Raoul',
    }
    self.assertEqual(expected, result_summary_duped.to_dict())
    self.assertEqual([], run_results_duped)
def test_bot_reap_task(self):
    # A matching bot reaps the scheduled request; afterwards the TaskToRun
    # entity has no queue_number.
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    to_run = task_to_run.TaskToRun.query().get()
    self.assertEqual(None, to_run.queue_number)
def test_cancel_task_running(self):
    # cancel_task() on a task already reaped by a bot returns
    # (False, True), leaves the summary RUNNING and sends no PubSub
    # notification.
    data = _gen_request(
        properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
        pubsub_topic='projects/abc/topics/def')
    pub_sub_calls = self.mock_pub_sub()
    request = task_request.make_request(data, True)
    result_summary = task_scheduler.schedule_request(request)
    task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')

    ok, was_running = task_scheduler.cancel_task(result_summary.key)
    self.assertEqual(False, ok)
    self.assertEqual(True, was_running)
    # Re-fetch the entity to observe the stored state.
    refreshed = result_summary.key.get()
    self.assertEqual(task_result.State.RUNNING, refreshed.state)
    self.assertEqual(0, len(pub_sub_calls))  # no notifications
def test_task_parent_isolated(self):
    # A request scheduled with parent_task_id gets listed in
    # children_task_ids of both the parent's run result and result summary.
    parent_properties = {
        'commands': None,
        'dimensions': {u'OS': u'Windows-3.1.1'},
        'inputs_ref': {
            'isolated': '1' * 40,
            'isolatedserver': 'http://localhost:1',
            'namespace': 'default-gzip',
        },
    }
    parent_request = task_request.make_request(
        _gen_request(properties=parent_properties), True)
    task_scheduler.schedule_request(parent_request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(parent_request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    self.assertEqual(None, task_to_run.TaskToRun.query().get().queue_number)

    # It's important to terminate the task with success.
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), update_result)

    # Schedule the child against the finished parent run.
    parent_id = run_result.task_id
    child_request = task_request.make_request(
        _gen_request(
            parent_task_id=parent_id,
            properties={'dimensions': {u'OS': u'Windows-3.1.1'}}),
        True)
    result_summary = task_scheduler.schedule_request(child_request)
    self.assertEqual([], result_summary.children_task_ids)
    self.assertEqual(parent_id, child_request.parent_task_id)

    parent_run_result_key = task_pack.unpack_run_result_key(parent_id)
    parent_res_summary_key = task_pack.run_result_key_to_result_summary_key(
        parent_run_result_key)
    expected = [result_summary.task_id]
    self.assertEqual(expected, parent_run_result_key.get().children_task_ids)
    self.assertEqual(expected, parent_res_summary_key.get().children_task_ids)
def test_cron_handle_bot_died_ignored_expired(self):
    # Runs cron_handle_bot_died() once the clock is BOT_PING_TOLERANCE plus
    # 601s past the start, i.e. beyond the request's 600s scheduling
    # expiration, and checks the returned tuple.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            scheduling_expiration_secs=600))
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 601)
    cron_result = task_scheduler.cron_handle_bot_died()
    self.assertEqual((1, 0, 0), cron_result)
def test_cron_handle_bot_died_ignored_expired(self):
    # Runs cron_handle_bot_died() once the clock is BOT_PING_TOLERANCE plus
    # 601s past the start, i.e. beyond the request's 600s expiration, and
    # checks the returned tuple.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created = utils.utcnow()
    request = task_request.make_request(
        _gen_request(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            created_ts=created,
            expiration_ts=created + datetime.timedelta(seconds=600)),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 601)
    cron_result = task_scheduler.cron_handle_bot_died('f.local')
    self.assertEqual((['1d69b9f088008811'], 0, 0), cron_result)
def _task_ran_successfully(self):
    """Schedules an idempotent request, has bot 'localhost' run it to a
    successful end and returns the run result's task_id as unicode."""
    request = task_request.make_request(
        _gen_request(
            properties=dict(
                dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    to_run = task_to_run.TaskToRun.query().get()
    self.assertEqual(None, to_run.queue_number)

    # It's important to terminate the task with success.
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), update_result)
    return unicode(run_result.task_id)
def _task_ran_successfully(self):
    """Schedules an idempotent request, has bot 'localhost' run it to a
    successful end and returns the run result's key_packed as unicode."""
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(
                dimensions={u'OS': u'Windows-3.1.1'}, idempotent=True)))
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    actual_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, actual_request)
    self.assertEqual('localhost', run_result.bot_id)
    to_run = task_to_run.TaskToRun.query().get()
    self.assertEqual(None, to_run.queue_number)

    # It's important to terminate the task with success.
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((True, True), update_result)
    return unicode(run_result.key_packed)
def test_cron_handle_bot_died(self):
    # Test first retry, then success.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request_data(
            properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
            scheduling_expiration_secs=600))
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)
    died_ts = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())

    # Refresh and compare: the first run is BOT_DIED while the summary is
    # back to PENDING.
    expected_run = {
        'abandoned_ts': died_ts,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': died_ts,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected_run, run_result.key.get().to_dict())
    expected_pending = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': died_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(
        expected_pending, run_result.result_summary_key.get().to_dict())

    # Task was retried.
    retry_ts = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    _, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
    self.assertEqual(2, run_result.try_number)
    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost-second', 'Foo1', 0, 0, 0.1, False, False,
        0.1)
    self.assertEqual((True, True), update_result)
    expected_completed = {
        'abandoned_ts': None,
        'bot_id': u'localhost-second',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': retry_ts,
        'costs_usd': [0., 0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': retry_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': retry_ts,
        'state': task_result.State.COMPLETED,
        'try_number': 2,
        'user': u'Jesus',
    }
    self.assertEqual(
        expected_completed, run_result.result_summary_key.get().to_dict())
    self.assertEqual(0.1, run_result.key.get().cost_usd)
def test_exit_code_failure(self):
    # A task reported with exit code 1 ends COMPLETED with failure=True in
    # both the result summary and the run result.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {'OS': 'Windows-3.1.1'}
    reaped_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)

    update_result = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 1, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), update_result)

    result_summary, run_results = get_results(request.key)
    expected_summary = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': self.now,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [1],
        'failure': True,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': self.now,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': State.COMPLETED,
        'try_number': 1,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'user': u'Jesus',
    }
    self.assertEqual(expected_summary, result_summary.to_dict())

    expected_runs = [
        {
            'abandoned_ts': None,
            'bot_dimensions': bot_dimensions,
            'bot_id': u'localhost',
            'bot_version': u'abc',
            'children_task_ids': [],
            'completed_ts': self.now,
            'cost_usd': 0.1,
            'durations': [0.1],
            'exit_codes': [1],
            'failure': True,
            'id': '1d69b9f088008811',
            'internal_failure': False,
            'modified_ts': self.now,
            'outputs_ref': None,
            'server_versions': [u'v1a'],
            'started_ts': self.now,
            'state': State.COMPLETED,
            'try_number': 1,
        },
    ]
    self.assertEqual(expected_runs, [t.to_dict() for t in run_results])
def test_get_results(self):
    # Checks what get_results() returns at three stages of a task: right
    # after scheduling, after a bot reaped it, and after the bot completed it.
    # TODO(maruel): Split in more focused tests.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created_ts = self.now
    self.mock_now(created_ts)
    request = task_request.make_request(
        _gen_request(properties=dict(dimensions={u'OS': u'Windows-3.1.1'})),
        True)
    task_scheduler.schedule_request(request)

    # The TaskRequest was enqueued, the TaskResultSummary was created but no
    # TaskRunResult exist yet since the task was not scheduled on any bot.
    result_summary, run_results = get_results(request.key)
    pending_summary = {
        'abandoned_ts': None,
        'bot_dimensions': None,
        'bot_id': None,
        'bot_version': None,
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [],
        'cost_saved_usd': None,
        'created_ts': created_ts,  # Time the TaskRequest was created.
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': created_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [],
        'started_ts': None,
        'state': State.PENDING,
        'tags': [
            u'OS:Windows-3.1.1',
            u'priority:50',
            u'tag:1',
            u'user:Jesus',
        ],
        'try_number': None,
        'user': u'Jesus',
    }
    self.assertEqual(pending_summary, result_summary.to_dict())
    self.assertEqual([], run_results)

    # A bot reaps the TaskToRun.
    reaped_ts = self.now + datetime.timedelta(seconds=60)
    self.mock_now(reaped_ts)
    bot_dimensions = {u'OS': u'Windows-3.1.1'}
    reaped_request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)
    self.assertTrue(run_result)
    result_summary, run_results = get_results(request.key)
    # Only the fields that changed since the previous stage are overlaid.
    running_summary = dict(
        pending_summary,
        bot_dimensions=bot_dimensions,
        bot_id=u'localhost',
        bot_version=u'abc',
        costs_usd=[0.],
        modified_ts=reaped_ts,
        server_versions=[u'v1a'],
        started_ts=reaped_ts,
        state=State.RUNNING,
        try_number=1)
    self.assertEqual(running_summary, result_summary.to_dict())
    running_run = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': reaped_ts,
        'outputs_ref': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.RUNNING,
        'try_number': 1,
    }
    self.assertEqual([running_run], [i.to_dict() for i in run_results])

    # The bot completes the task.
    done_ts = self.now + datetime.timedelta(seconds=120)
    self.mock_now(done_ts)
    first_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1,
        None)
    self.assertEqual((True, True), first_update)
    # A second update after completion is expected to return (True, False).
    second_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Bar22', 0, 0, 0.2, False, False, 0.1,
        None)
    self.assertEqual((True, False), second_update)
    result_summary, run_results = get_results(request.key)
    completed_summary = dict(
        running_summary,
        completed_ts=done_ts,
        costs_usd=[0.1],
        durations=[0.1],
        exit_codes=[0],
        modified_ts=done_ts,
        state=State.COMPLETED)
    self.assertEqual(completed_summary, result_summary.to_dict())
    completed_run = dict(
        running_run,
        completed_ts=done_ts,
        cost_usd=0.1,
        durations=[0.1],
        exit_codes=[0],
        modified_ts=done_ts,
        state=State.COMPLETED)
    self.assertEqual([completed_run], [t.to_dict() for t in run_results])
def test_cron_handle_bot_died_same_bot_denied(self):
    # The first try is flagged BOT_DIED by the cron job and the summary goes
    # back to PENDING. When the very same bot polls again, it must not be
    # handed the retry.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    data = _gen_request_data(
        properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
        scheduling_expiration_secs=600)
    request = task_request.make_request(data)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    # Move the clock past the ping tolerance so the cron job acts on the run.
    now_1 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual((0, 1, 0), task_scheduler.cron_handle_bot_died())

    # Refresh the run result from the datastore and compare.
    expected = {
        'abandoned_ts': now_1,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': now_1,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected, run_result.key.get().to_dict())

    # The summary is expected to be PENDING again.
    expected = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_1,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected, run_result.result_summary_key.get().to_dict())

    # Task was retried but the same bot polls again, it's denied the task.
    # The mocked timestamp itself is not needed, only the clock change.
    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    denied_request, denied_run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertIsNone(denied_request)
    self.assertIsNone(denied_run_result)
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
def test_cron_handle_bot_died(self):
    # First try dies (BOT_DIED), the task is retried on a second bot and
    # completes there.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    utc_now = utils.utcnow()
    request_data = _gen_request(
        properties={'dimensions': {u'OS': u'Windows-3.1.1'}},
        created_ts=utc_now,
        expiration_ts=utc_now + datetime.timedelta(seconds=600))
    request = task_request.make_request(request_data, True)
    _result_summary = task_scheduler.schedule_request(request)

    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    died_ts = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    cron_outcome = task_scheduler.cron_handle_bot_died()
    self.assertEqual((0, 1, 0), cron_outcome)

    # Values shared by the summary expectations below.
    summary_id = '1d69b9f088008810'
    summary_tags = [
        u'OS:Windows-3.1.1',
        u'priority:50',
        u'tag:1',
        u'user:Jesus',
    ]

    # Reload the first run result: it must be flagged as BOT_DIED.
    expected_first_run = {
        'abandoned_ts': died_ts,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': died_ts,
        'outputs_ref': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    first_run = run_result.key.get().to_dict()
    self.assertEqual(expected_first_run, first_run)

    # The summary is back to PENDING, waiting for another bot.
    expected_pending = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': summary_id,
        'internal_failure': False,
        'modified_ts': died_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'tags': summary_tags,
        'try_number': 1,
        'user': u'Jesus',
    }
    pending_summary = run_result.result_summary_key.get().to_dict()
    self.assertEqual(expected_pending, pending_summary)

    # A different bot picks up the retry and finishes it.
    retry_ts = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    _request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost-second', 'abc')
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
    self.assertEqual(2, run_result.try_number)
    update_outcome = task_scheduler.bot_update_task(
        run_result.key, 'localhost-second', 'Foo1', 0, 0, 0.1, False, False,
        0.1, None)
    self.assertEqual((True, True), update_outcome)

    expected_completed = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost-second',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': retry_ts,
        'costs_usd': [0., 0.1],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [0.1],
        'exit_codes': [0],
        'failure': False,
        'id': summary_id,
        'internal_failure': False,
        'modified_ts': retry_ts,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': retry_ts,
        'state': task_result.State.COMPLETED,
        'tags': summary_tags,
        'try_number': 2,
        'user': u'Jesus',
    }
    completed_summary = run_result.result_summary_key.get().to_dict()
    self.assertEqual(expected_completed, completed_summary)
    self.assertEqual(0.1, run_result.key.get().cost_usd)
def post(self):
    """Answers a bot poll with its next command.

    Depending on the bot's report, the reply is one of: update, sleep,
    restart, terminate or run a task.

    Missing values are tolerated on purpose. They can be caused by errors on
    the bot, and *we don't want to deny them the capacity to update*: that is
    how a broken bot eventually self-updates to working code, which makes
    recovery of the fleet after a catastrophic failure much easier.
    """
    _request, bot_id, version, state, dimensions, quarantined_msg = (
        self._process())
    sleep_streak = state.get('sleep_streak', 0)
    quarantined = bool(quarantined_msg)

    # The bot is noted at two places: the stats entry right here, and the
    # bot event recorded on every branch below.
    if quarantined:
        action = 'bot_inactive'
    else:
        action = 'bot_active'
    stats.add_entry(action=action, bot_id=bot_id, dimensions=dimensions)

    def record_event(event_type, task_id=None, task_name=None):
        # Forwards everything known about this poll to bot_management.
        bot_management.bot_event(
            event_type=event_type,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=dimensions,
            state=state,
            version=version,
            quarantined=quarantined,
            task_id=task_id,
            task_name=task_name,
            message=quarantined_msg)

    # The expected bot version depends on the host, since the host URL is
    # embedded in swarming_bot.zip.
    expected_version = bot_code.get_bot_version(self.request.host_url)
    if version != expected_version:
        record_event('request_update')
        self._cmd_update(expected_version)
        return

    if quarantined:
        record_event('request_sleep')
        self._cmd_sleep(sleep_streak, quarantined)
        return

    # From here on the bot runs the right version, so it should be in
    # relatively good shape. Invalid code may still have been pushed to the
    # server, so stay careful.
    #
    # A bot that has been running for too long may need a reboot. Quarantined
    # bots never get here, so they are not rebooted.
    needs_restart, restart_message = bot_management.should_restart_bot(
        bot_id, state)
    if needs_restart:
        record_event('request_restart')
        self._cmd_restart(restart_message)
        return

    # Healthy bot: try to hand it a task.
    try:
        # Reaping is a fairly complex call; exceptions are expected.
        request, run_result = task_scheduler.bot_reap_task(
            dimensions, bot_id, version)
        if not request:
            # Nothing to run, let the bot sleep a bit.
            record_event('request_sleep')
            self._cmd_sleep(sleep_streak, quarantined)
            return

        try:
            # Tricky part: a transaction intentionally runs right after
            # another one.
            if not request.properties.is_terminate:
                record_event(
                    'request_task',
                    task_id=run_result.task_id,
                    task_name=request.name)
                self._cmd_run(request, run_result.key, bot_id)
            else:
                record_event('bot_terminate', task_id=run_result.task_id)
                self._cmd_terminate(run_result.task_id)
        except:
            # Log anything raised once a task was reaped, then propagate it.
            logging.exception('Dang, exception after reaping')
            raise
    except runtime.DeadlineExceededError:
        # A timeout before a task was assigned is harmless. A timeout after
        # a task was assigned means the bot never got the details required
        # to run it, so that task will time out (BOT_DIED) and will
        # automatically get retried (TODO) at that point.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, 'Deadline')
def test_cron_handle_bot_died_same_bot_denied(self):
    # The first try is flagged BOT_DIED by the cron job and the summary goes
    # back to PENDING. When the very same bot polls again, it must not be
    # handed the retry.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    now = utils.utcnow()
    data = _gen_request(
        properties=dict(dimensions={u'OS': u'Windows-3.1.1'}),
        created_ts=now,
        expiration_ts=now + datetime.timedelta(seconds=600))
    request = task_request.make_request(data, True)
    task_scheduler.schedule_request(request)
    bot_dimensions = {
        u'OS': [u'Windows', u'Windows-3.1.1'],
        u'hostname': u'localhost',
        u'foo': u'bar',
    }
    _request, run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertEqual(1, run_result.try_number)
    self.assertEqual(task_result.State.RUNNING, run_result.state)

    # Move the clock past the ping tolerance so the cron job acts on the run.
    now_1 = self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 1)
    self.assertEqual(
        ([], 1, 0), task_scheduler.cron_handle_bot_died('f.local'))

    # Refresh the run result from the datastore and compare.
    expected = {
        'abandoned_ts': now_1,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': True,
        'modified_ts': now_1,
        'outputs_ref': None,
        'server_versions': [u'v1a'],
        'started_ts': self.now,
        'state': task_result.State.BOT_DIED,
        'try_number': 1,
    }
    self.assertEqual(expected, run_result.key.get().to_dict())

    # The summary is expected to be PENDING again.
    expected = {
        'abandoned_ts': None,
        'bot_dimensions': bot_dimensions,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        'created_ts': self.now,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': now_1,
        'name': u'Request name',
        'outputs_ref': None,
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': None,
        'state': task_result.State.PENDING,
        'tags': [u'OS:Windows-3.1.1', u'priority:50', u'tag:1', u'user:Jesus'],
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(expected, run_result.result_summary_key.get().to_dict())

    # Task was retried but the same bot polls again, it's denied the task.
    # The mocked timestamp itself is not needed, only the clock change.
    self.mock_now(self.now + task_result.BOT_PING_TOLERANCE, 2)
    denied_request, denied_run_result = task_scheduler.bot_reap_task(
        bot_dimensions, 'localhost', 'abc')
    self.assertIsNone(denied_request)
    self.assertIsNone(denied_run_result)
    logging.info('%s', [t.to_dict() for t in task_to_run.TaskToRun.query()])
def test_get_results(self):
    # Follows one task through PENDING, RUNNING and COMPLETED, checking what
    # get_results() returns at each step.
    # TODO(maruel): Split in more focused tests.
    self.mock(random, 'getrandbits', lambda _: 0x88)
    created_ts = self.now
    self.mock_now(created_ts)
    request_data = _gen_request_data(
        properties={'dimensions': {u'OS': u'Windows-3.1.1'}})
    request = task_request.make_request(request_data)
    _result_summary = task_scheduler.schedule_request(request)

    # Step 1: the TaskRequest is enqueued and the TaskResultSummary exists,
    # but there is no TaskRunResult yet because no bot took the task.
    result_summary, run_results = get_results(request.key)
    pending_summary = {
        'abandoned_ts': None,
        'bot_id': None,
        'bot_version': None,
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': created_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [],
        'started_ts': None,
        'state': State.PENDING,
        'try_number': None,
        'user': u'Jesus',
    }
    self.assertEqual(pending_summary, result_summary.to_dict())
    self.assertEqual([], run_results)

    # Step 2: a bot reaps the TaskToRun.
    reaped_ts = self.now + datetime.timedelta(seconds=60)
    self.mock_now(reaped_ts)
    reaped_request, run_result = task_scheduler.bot_reap_task(
        {'OS': 'Windows-3.1.1'}, 'localhost', 'abc')
    self.assertEqual(request, reaped_request)
    self.assertTrue(run_result)
    result_summary, run_results = get_results(request.key)
    running_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'costs_usd': [0.],
        'cost_saved_usd': None,
        # Time the TaskRequest was created.
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': reaped_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.RUNNING,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(running_summary, result_summary.to_dict())
    running_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': None,
        'cost_usd': 0.,
        'durations': [],
        'exit_codes': [],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': reaped_ts,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.RUNNING,
        'try_number': 1,
    }
    self.assertEqual(
        [running_run], [entry.to_dict() for entry in run_results])

    # Step 3: the bot sends two updates and the task ends up COMPLETED.
    done_ts = self.now + datetime.timedelta(seconds=120)
    self.mock_now(done_ts)
    first_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Foo1', 0, 0, 0.1, False, False, 0.1)
    self.assertEqual((True, True), first_update)
    second_update = task_scheduler.bot_update_task(
        run_result.key, 'localhost', 'Bar22', 0, 0, 0.2, False, False, 0.1)
    self.assertEqual((True, False), second_update)
    result_summary, run_results = get_results(request.key)
    completed_summary = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': done_ts,
        'costs_usd': [0.1],
        'cost_saved_usd': None,
        'created_ts': created_ts,
        'deduped_from': None,
        'durations': [0.1, 0.2],
        'exit_codes': [0, 0],
        'failure': False,
        'id': '1d69b9f088008810',
        'internal_failure': False,
        'modified_ts': done_ts,
        'name': u'Request name',
        'properties_hash': None,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.COMPLETED,
        'try_number': 1,
        'user': u'Jesus',
    }
    self.assertEqual(completed_summary, result_summary.to_dict())
    completed_run = {
        'abandoned_ts': None,
        'bot_id': u'localhost',
        'bot_version': u'abc',
        'children_task_ids': [],
        'completed_ts': done_ts,
        'cost_usd': 0.1,
        'durations': [0.1, 0.2],
        'exit_codes': [0, 0],
        'failure': False,
        'id': '1d69b9f088008811',
        'internal_failure': False,
        'modified_ts': done_ts,
        'server_versions': [u'v1a'],
        'started_ts': reaped_ts,
        'state': State.COMPLETED,
        'try_number': 1,
    }
    self.assertEqual(
        [completed_run], [entry.to_dict() for entry in run_results])
def post(self):
    """Answers a bot poll with its next command.

    Depending on the server settings and the bot's report, the reply is one
    of: sleep, update, restart, terminate or run a task.

    Missing values are tolerated on purpose. They can be caused by errors on
    the bot, and *we don't want to deny them the capacity to update*: that is
    how a broken bot eventually self-updates to working code, which makes
    recovery of the fleet after a catastrophic failure much easier.
    """
    logging.debug('Request started')
    if config.settings().force_bots_to_sleep_and_not_run_task:
        # Skip all processing and just put the bot to sleep. It is told it
        # is quarantined so it knows it won't run anything anyway; the large
        # streak makes it sleep for 60s.
        self._cmd_sleep(1000, True)
        return

    poll = self._process()
    sleep_streak = poll.state.get('sleep_streak', 0)
    quarantined = bool(poll.quarantined_msg)

    def record_event(event_type, task_id=None, task_name=None):
        # Forwards everything known about this poll to bot_management.
        bot_management.bot_event(
            event_type=event_type,
            bot_id=poll.bot_id,
            external_ip=self.request.remote_addr,
            authenticated_as=auth.get_peer_identity().to_bytes(),
            dimensions=poll.dimensions,
            state=poll.state,
            version=poll.version,
            quarantined=quarantined,
            maintenance_msg=poll.maintenance_msg,
            task_id=task_id,
            task_name=task_name,
            message=poll.quarantined_msg)

    # The expected bot version depends on the host, since the host URL is
    # embedded in swarming_bot.zip.
    logging.debug('Fetching bot code version')
    expected_version, _ = bot_code.get_bot_version(self.request.host_url)
    if poll.version != expected_version:
        record_event('request_update')
        self._cmd_update(expected_version)
        return

    if quarantined:
        record_event('request_sleep')
        self._cmd_sleep(sleep_streak, quarantined)
        return

    # When the server-side per-bot config has changed, this bot must restart
    # so it picks up the new config in /handshake. Only bots that already
    # know about server-side per-bot configs are checked; such bots send the
    # 'bot_group_cfg_version' state attribute.
    reported_cfg_version = poll.state.get('bot_group_cfg_version')
    if (reported_cfg_version and
            reported_cfg_version != poll.bot_group_cfg.version):
        record_event('request_restart')
        self._cmd_bot_restart('Restarting to pick up new bots.cfg config')
        return

    # From here on the bot runs the right version, so it should be in
    # relatively good shape. Invalid code may still have been pushed to the
    # server, so stay careful.
    #
    # A bot advertising a 'maintenance' key in its state gets no task until
    # the key is removed.
    #
    # This is a hack: the DB does not list it as a separate state, which
    # hinders system monitoring. See bot_management.BotInfo. In practice,
    # ts_mon_metrics.py can look at BotInfo.get('maintenance') to tell
    # whether a bot is in maintenance or idle.
    if poll.state.get('maintenance'):
        record_event('request_sleep')
        # The bot is told it is considered quarantined.
        self._cmd_sleep(sleep_streak, True)
        return

    # Healthy bot: try to hand it a task.
    try:
        # Reaping is a fairly complex call; exceptions are expected.
        request, secret_bytes, run_result = task_scheduler.bot_reap_task(
            poll.dimensions, poll.version, poll.lease_expiration_ts)
        if not request:
            # Nothing to run, let the bot sleep a bit.
            record_event('request_sleep')
            self._cmd_sleep(sleep_streak, quarantined)
            return

        try:
            # Tricky part: a transaction intentionally runs right after
            # another one.
            current_slice = request.task_slice(run_result.current_task_slice)
            if not current_slice.properties.is_terminate:
                record_event(
                    'request_task',
                    task_id=run_result.task_id,
                    task_name=request.name)
                self._cmd_run(
                    request, secret_bytes, run_result, poll.bot_id,
                    poll.bot_group_cfg)
            else:
                record_event('bot_terminate', task_id=run_result.task_id)
                self._cmd_terminate(run_result.task_id)
        except:
            # Log anything raised once a task was reaped, then propagate it.
            logging.exception('Dang, exception after reaping')
            raise
    except runtime.DeadlineExceededError:
        # A timeout before a task was assigned is harmless. A timeout after
        # a task was assigned means the bot never got the details required
        # to run it, so that task will time out (BOT_DIED) and will
        # automatically get retried (TODO) at that point.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, 'Deadline')
def post(self):
    """Answers a bot poll with its next command.

    Depending on the bot's report, the reply is one of: update, sleep,
    restart, terminate or run a task.

    Missing values are tolerated on purpose. They can be caused by errors on
    the bot, and *we don't want to deny them the capacity to update*: that is
    how a broken bot eventually self-updates to working code, which makes
    recovery of the fleet after a catastrophic failure much easier.
    """
    _request, bot_id, version, state, dimensions, quarantined_msg = (
        self._process()
    )
    sleep_streak = state.get("sleep_streak", 0)
    quarantined = bool(quarantined_msg)

    # The bot is noted at two places: the stats entry right here, and the
    # bot event recorded on every branch below.
    if quarantined:
        action = "bot_inactive"
    else:
        action = "bot_active"
    stats.add_entry(action=action, bot_id=bot_id, dimensions=dimensions)

    def record_event(event_type, task_id=None, task_name=None):
        # Forwards everything known about this poll to bot_management.
        bot_management.bot_event(
            event_type=event_type,
            bot_id=bot_id,
            external_ip=self.request.remote_addr,
            dimensions=dimensions,
            state=state,
            version=version,
            quarantined=quarantined,
            task_id=task_id,
            task_name=task_name,
            message=quarantined_msg,
        )

    # The expected bot version depends on the host, since the host URL is
    # embedded in swarming_bot.zip.
    expected_version = bot_code.get_bot_version(self.request.host_url)
    if version != expected_version:
        record_event("request_update")
        self._cmd_update(expected_version)
        return

    if quarantined:
        record_event("request_sleep")
        self._cmd_sleep(sleep_streak, quarantined)
        return

    # From here on the bot runs the right version, so it should be in
    # relatively good shape. Invalid code may still have been pushed to the
    # server, so stay careful.
    #
    # A bot that has been running for too long may need a reboot. Quarantined
    # bots never get here, so they are not rebooted.
    needs_restart, restart_message = bot_management.should_restart_bot(
        bot_id, state
    )
    if needs_restart:
        record_event("request_restart")
        self._cmd_restart(restart_message)
        return

    # Healthy bot: try to hand it a task.
    try:
        # Reaping is a fairly complex call; exceptions are expected.
        request, run_result = task_scheduler.bot_reap_task(
            dimensions, bot_id, version
        )
        if not request:
            # Nothing to run, let the bot sleep a bit.
            record_event("request_sleep")
            self._cmd_sleep(sleep_streak, quarantined)
            return

        try:
            # Tricky part: a transaction intentionally runs right after
            # another one.
            if not request.properties.is_terminate:
                record_event(
                    "request_task",
                    task_id=run_result.task_id,
                    task_name=request.name,
                )
                self._cmd_run(request, run_result.key, bot_id)
            else:
                record_event("bot_terminate", task_id=run_result.task_id)
                self._cmd_terminate(run_result.task_id)
        except:
            # Log anything raised once a task was reaped, then propagate it.
            logging.exception("Dang, exception after reaping")
            raise
    except runtime.DeadlineExceededError:
        # A timeout before a task was assigned is harmless. A timeout after
        # a task was assigned means the bot never got the details required
        # to run it, so that task will time out (BOT_DIED) and will
        # automatically get retried (TODO) at that point.
        # TODO(maruel): Note the task if possible and hand it out on next
        # poll.
        # https://code.google.com/p/swarming/issues/detail?id=130
        self.abort(500, "Deadline")