def save_problem_ci_results(ci_error, db, error, eval_data, gist, problem_ci,
                            results, should_merge):
    if not should_merge:
        # If problem_ci fails, don't save to aggregate bot scores collection
        if ci_error:
            log.error('Problem CI failed, not saving to bots '
                      'official scores as this is likely an issue '
                      'with the new version of the problem.')
            problem_ci.status = PROBLEM_CI_STATUS_FAILED
            problem_ci.error = ci_error
            update_pr_status_problem_ci(ci_error, problem_ci, eval_data)
        else:
            log.info('Problem CI not yet finished')
    else:
        # Aggregate data from bot evals now that they're done
        gists = BoxList()
        for bot_eval_key in problem_ci.bot_eval_keys:
            bot_eval = db.get(get_eval_db_key(bot_eval_key))
            save_to_bot_scores(
                bot_eval, bot_eval.eval_key,
                Box(score=bot_eval.results.score,
                    eval_key=bot_eval.eval_key))
            gists.append(bot_eval.gist)
        problem_ci.gists = gists
        update_pr_status_problem_ci(error, problem_ci, eval_data)
        problem_ci.status = PROBLEM_CI_STATUS_PASSED
    db.set(problem_ci.id, problem_ci)

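# update_pr_status_problem_ci is assumed above, not shown here. A hedged
# sketch using PyGithub, under the assumption that eval_data carries the pull
# request's repo full name and head commit sha (those field names are
# hypothetical):
import os

from github import Github


def update_pr_status_problem_ci(error, problem_ci, eval_data):
    commit = Github(os.environ['GITHUB_TOKEN']) \
        .get_repo(eval_data.repo_full_name) \
        .get_commit(eval_data.head_commit_sha)
    commit.create_status(
        state='failure' if error else 'success',
        description=str(error) if error else 'Problem CI passed',
        context='botleague/problem-ci')
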
def reduce():
    result = dbox(problem_ci)
    # Refetch all bots in case scores came in after initial request
    for bot_eval_key in problem_ci.bot_eval_keys:
        bot_eval = db.get(get_eval_db_key(bot_eval_key))
        past_bot_scores = get_past_bot_scores(bot_eval)
        bot_eval_no_eval_key = deepcopy(bot_eval)
        del bot_eval_no_eval_key['eval_key']
        log.info(f'Checking confidence interval for bot_eval '
                 f'{box2json(bot_eval)}\n'
                 f'past scores: {box2json(past_bot_scores)}')
        if bot_eval.results.errors:
            result.error = str(bot_eval.results.errors)
            log.error(result.error + ': bot details '
                      f'{box2json(bot_eval_no_eval_key)}')
            return result
        in_interval, interval_info = score_within_confidence_interval(
            bot_eval, past_bot_scores)
        if not in_interval:
            result.error = f'Score for bot {bot_eval.results.score}' \
                           f' not within confidence interval ' \
                           f'{interval_info.low} to {interval_info.high}, ' \
                           f'mean: {interval_info.mean} ' \
                           f'problem CI failed'
            log.error(result.error + ': bot details '
                      f'{box2json(bot_eval_no_eval_key)}')
            return result
        else:
            log.success('Score for bot within confidence interval, '
                        'problem CI successful!')
    return result

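# score_within_confidence_interval and get_past_bot_scores are assumed above.
# A minimal sketch of the interval check, assuming past_bot_scores is a list
# of the Box(score=..., eval_key=...) records that save_to_bot_scores writes;
# the 2-sigma width and the too-few-samples fallback are assumptions:
from statistics import mean, stdev


def score_within_confidence_interval(bot_eval, past_bot_scores):
    scores = [s.score for s in past_bot_scores]
    if len(scores) < 2:
        # Not enough history to form an interval; treat the score as passing.
        return True, Box(low=float('-inf'), high=float('inf'),
                         mean=scores[0] if scores else bot_eval.results.score)
    mu, sigma = mean(scores), stdev(scores)
    info = Box(low=mu - 2 * sigma, high=mu + 2 * sigma, mean=mu)
    return info.low <= bot_eval.results.score <= info.high, info
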
def get_eval_data(eval_key, db: DB) -> Box:
    db_key = get_eval_db_key(eval_key)
    # eval_key is secret, do not make public anywhere!
    eval_data = Box(db.get(db_key))
    if eval_data and eval_data.eval_key != eval_key:
        raise RuntimeError(INVALID_DB_KEY_STATE_MESSAGE)
    return eval_data

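# get_eval_db_key is assumed here and throughout this module. A minimal
# sketch of one plausible shape: a deterministic DB key derived from the
# secret eval_key, so that reads and writes agree. The prefix is an
# assumption.
def get_eval_db_key(eval_key: str) -> str:
    return f'eval_data_{eval_key}'
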
def test_results_handler_already_complete():
    payload = Mockable.read_test_box('results_success.json')
    db = get_liaison_db_store()
    db_key = get_eval_db_key(payload.eval_key)
    eval_data = get_test_eval_data()
    db.set(db_key, eval_data)
    error, results, eval_data, gist, _ = process_results(payload, db)
    assert error
    assert error.http_status_code == 400
    assert 'finished' in results

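# get_test_eval_data is assumed by these tests. A minimal sketch, assuming a
# fixture whose eval_key matches the one in the request/results payloads (the
# fixture filename here is hypothetical):
def get_test_eval_data() -> Box:
    return Mockable.read_test_box('eval_data.json')
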
def bots_done():
    for bot_eval_key in bot_eval_keys:
        bot = db.get(get_eval_db_key(bot_eval_key))
        log.info(f'Checking if bot is done... bot: {box2json(bot)}')
        if bot.status != constants.EVAL_STATUS_COMPLETE:
            log.info('Bot not done')
            return False
    log.info('All bots done!')
    return True

def test_confirm_handler():
    payload = Mockable.read_test_box('request.json')
    db = get_liaison_db_store()
    db_key = get_eval_db_key(payload.eval_key)
    eval_data = get_test_eval_data()
    db.set(db_key, eval_data)
    error, resp = process_confirm(payload, db)
    eval_data = get_eval_data(payload.eval_key, db)
    assert not error
    assert resp.confirmed
    assert eval_data.status == constants.EVAL_STATUS_CONFIRMED

def test_db_invalid_key_handler():
    payload = Mockable.read_test_box('request.json')
    db = get_liaison_db_store()
    db_key = get_eval_db_key(payload.eval_key)
    eval_data = get_test_eval_data()
    db.set(db_key, eval_data)
    try:
        error, results, eval_data, gist, _ = process_results(payload, db)
    except RuntimeError as e:
        assert INVALID_DB_KEY_STATE_MESSAGE == str(e)
    else:
        raise RuntimeError('Expected exception')

def test_results_handler():
    payload = Mockable.read_test_box('results_success.json')
    db = get_liaison_db_store()
    db_key = get_eval_db_key(payload.eval_key)
    eval_data = get_test_eval_data()
    db.set(db_key, eval_data)
    error, results, eval_data, gist, _ = process_results(payload, db)
    assert not error
    assert 'finished' in results
    assert 'started' in results
    assert results.started < results.finished
    assert results.username == 'crizcraig'
    assert results.botname == 'forward-agent'
    assert results.problem == 'deepdrive/domain_randomization'

def collect_bot_scores(
        docker_tag='deepdriveio/deepdrive:bot_domain_randomization'):
    """
    Catches up bot scores using deepdrive_jobs. This is a violation of data
    boundaries across deepdrive and botleague, and won't be possible for
    future independent problem providers. We are now storing results in the
    bot_eval data as well, to avoid such problems in the future.
    Alternatively, we could have just downloaded all results from
    gist/botleague-results, which is a source of truth, but this was easier.
    """
    job_db = get_db('deepdrive_jobs')
    ldb = get_liaison_db_store()
    for job in job_db.where('eval_spec.docker_tag', '==', docker_tag):
        eval_key = job.eval_spec.eval_key
        eval_data = ldb.get(get_eval_db_key(eval_key))
        score = Box(score=job.results.score, eval_key=eval_key)
        save_to_bot_scores(eval_data, eval_key, score)

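# save_to_bot_scores is assumed above. A minimal sketch, assuming scores live
# in a per-bot document keyed by username and botname and holding the
# Box(score=..., eval_key=...) records built by the callers; the key format
# and document shape are assumptions:
def save_to_bot_scores(eval_data, eval_key, score: Box):
    db = get_liaison_db_store()
    key = f'bot_scores_{eval_data.username}_{eval_data.botname}'
    bot_scores = db.get(key) or Box(scores=BoxList())
    # Skip duplicates so catch-up jobs like collect_bot_scores are idempotent.
    if not any(s.eval_key == eval_key for s in bot_scores.scores):
        bot_scores.scores.append(score)
        db.set(key, bot_scores)
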
def trigger_single_eval(self, bot_def, problem_def, problem_id,
                        problem_ci_replace_sim_url=None,
                        container_postfix=None) -> PrResponse:
    endpoint = problem_def.endpoint
    if problem_ci_replace_sim_url:
        problem_def.problem_ci_replace_sim_url = problem_ci_replace_sim_url
    if container_postfix:
        problem_def.container_postfix = container_postfix
    eval_key = generate_rand_alphanumeric(25)
    eval_id = generate_rand_alphanumeric(25)
    eval_data = self.get_eval_data(eval_id, eval_key, problem_id, bot_def,
                                   problem_def)
    db = get_liaison_db_store()
    db_key = get_eval_db_key(eval_data.eval_key)
    db.set(db_key, eval_data)
    eval_data = db.get(db_key)  # Resolve timestamp
    resp = self.request_eval(endpoint, eval_data)
    return resp

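# generate_rand_alphanumeric is assumed above. Since eval_key doubles as a
# secret, a minimal sketch that draws from the stdlib CSPRNG (using `secrets`
# rather than `random` is an assumption):
import secrets
import string


def generate_rand_alphanumeric(num_chars: int) -> str:
    alphabet = string.ascii_letters + string.digits
    return ''.join(secrets.choice(alphabet) for _ in range(num_chars))
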
def save_eval_data(eval_data: Box, db: DB):
    db_key = get_eval_db_key(eval_data.eval_key)
    # eval_key is secret, do not make public anywhere!
    db.set(db_key, eval_data)