def test_failure(self):
    # Not enough elements for the placeholders.
    self.assertEqual(format_status_text(["%s"]), "N/A")
    self.assertEqual(format_status_text(["%s"], self._tr), "N/E")
    # No elements at all.
    self.assertEqual(format_status_text([]), "N/A")
    self.assertEqual(format_status_text([], self._tr), "N/E")
def test_insuccess(self):
    # Not enough elements for the placeholders.
    self.assertEqual(format_status_text(["%s"]), "N/A")
    self.assertEqual(format_status_text(["%s"], self._tr), "N/E")
    # No elements at all.
    self.assertEqual(format_status_text([]), "N/A")
    self.assertEqual(format_status_text([], self._tr), "N/E")
def test_success_with_translator(self):
    self.assertEqual(format_status_text([""], self._tr), "")
    self.assertEqual(format_status_text(["ASD"], self._tr), "ESD")
    # Translation is applied before formatting.
    self.assertEqual(format_status_text(["A%s", "ASD"], self._tr), "EASD")
    self.assertEqual(
        format_status_text(["AAA %s\n%s", "AAA", "AE"], self._tr),
        "EEE AAA\nAE")
def test_testcases(base_dir, solution, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend in order to avoid filling
    # the database with junk.
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    cmscontrib.loaders.italy_yaml.logger = NullLogger()

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = cmscontrib.loaders.italy_yaml.YamlLoader(
            base_dir, file_cacher)
        task = loader.get_task(get_statement=False)

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, solution),
        "Solution %s for task %s" % (solution, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [
        (t, EvaluationJob(
            operation=ESOperation(
                ESOperation.EVALUATION,
                None,
                dataset.id,
                dataset.testcases[t].codename).to_dict(),
            language=language,
            task_type=dataset.task_type,
            task_type_parameters=json.loads(dataset.task_type_parameters),
            managers=dict(dataset.managers),
            executables=executables,
            input=dataset.testcases[t].input,
            output=dataset.testcases[t].output,
            time_limit=dataset.time_limit,
            memory_limit=dataset.memory_limit))
        for t in dataset.testcases]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0])
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            move_cursor(directions.UP, erase=True)
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus.get("exit_status")
        info.append((job.plus.get("execution_time"),
                     job.plus.get("execution_memory")))
        points.append(float(job.outcome))

        # Avoid printing unneeded newline
        job.text = [t.rstrip() for t in job.text]
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print("Want to stop and consider everything to timeout? [y/N] ",
                  end='')
            sys.stdout.flush()

            if assume is not None:
                tmp = assume
                print(tmp)
            else:
                # User input with a timeout of 5 seconds, at the end of
                # which we automatically say "n". ready will be a list of
                # input ready for reading, or an empty list if the timeout
                # expired.
                # See: http://stackoverflow.com/a/2904057
                ready, _, _ = select.select([sys.stdin], [], [], 5)
                if ready:
                    tmp = sys.stdin.readline().strip().lower()
                else:
                    tmp = 'n'
                    print(tmp)

            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False
            print()
        move_cursor(directions.UP, erase=True)

    # Subtasks scoring
    subtasks = json.loads(dataset.score_type_parameters)
    if not isinstance(subtasks, list) or len(subtasks) == 0:
        subtasks = [[100, len(info)]]

    if dataset.score_type == 'GroupMin':
        scoreFun = min
    else:
        if dataset.score_type != 'Sum':
            logger.warning("Score type %s not yet supported! Using Sum"
                           % dataset.score_type)

        def scoreFun(x):
            return sum(x) / len(x)

    pos = 0
    sts = []

    # For each subtask generate a list of the testcases it owns, the score
    # gained and the highest time and memory usage.
    for i in subtasks:
        stscores = []
        stsdata = []
        worst = [0, 0]
        try:
            for _ in xrange(i[1]):
                stscores.append(points[pos])
                stsdata.append((tcnames[pos], points[pos],
                                comments[pos], info[pos]))
                if info[pos][0] > worst[0]:
                    worst[0] = info[pos][0]
                if info[pos][1] > worst[1]:
                    worst[1] = info[pos][1]
                pos += 1
            sts.append((scoreFun(stscores) * i[0], i[0], stsdata, worst))
        except:
            sts.append((0, i[0], stsdata, [0, 0]))

    # Result pretty printing
    # Strips sol/ and _EVAL from the solution's name
    solution = solution[4:-5]
    print()
    clen = max(len(c) for c in comments)
    for st, d in enumerate(sts):
        print(
            "Subtask %d:" % st,
            add_color_to_string(
                "%5.2f/%d" % (d[0], d[1]),
                colors.RED if abs(d[0] - d[1]) > 0.01 else colors.GREEN,
                bold=True))
        for (i, p, c, w) in d[2]:
            print(
                "%s)" % i,
                add_color_to_string(
                    "%5.2lf" % p,
                    colors.RED if abs(p - 1) > 0.01 else colors.BLACK),
                "--- %s [Time:" % c.ljust(clen),
                add_color_to_string(
                    ("%5.3f" % w[0]) if w[0] is not None else "N/A",
                    colors.BLUE if w[0] is not None
                    and w[0] >= 0.95 * d[3][0] else colors.BLACK),
                "Memory:",
                add_color_to_string(
                    "%5s" % mem_human(w[1]) if w[1] is not None else "N/A",
                    colors.BLUE if w[1] is not None
                    and w[1] >= 0.95 * d[3][1] else colors.BLACK),
                end="]")
            move_cursor(directions.RIGHT, 1000)
            move_cursor(directions.LEFT, len(solution) - 1)
            print(add_color_to_string(solution, colors.BLACK, bold=True))
    print()

    sols.append((solution, sum([st[0] for st in sts])))

    global tested_something
    if not tested_something:
        tested_something = True
        atexit.register(print_at_exit)

    return zip(points, comments, info)
def wrapped_format_status_text(ctx, status_text):
    translation = ctx.get("translation", DEFAULT_TRANSLATION)
    return format_status_text(status_text, translation=translation)
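A hypothetical call site for wrapped_format_status_text, shown only to illustrate the ctx contract. The toy PrefixTranslation class and the concrete status list are invented for this example; DEFAULT_TRANSLATION and format_status_text are assumed to be the ones defined in the surrounding snippets.

class PrefixTranslation(object):
    # Toy stand-in for a CMS translation object: all it needs is gettext().
    def gettext(self, msgid):
        return "[xx] " + msgid

status = ["Execution killed with signal %s.", "11"]

# Without a "translation" entry the wrapper falls back to DEFAULT_TRANSLATION.
print(wrapped_format_status_text({}, status))
# With one, the format string is translated before the %-interpolation
# (the tests below confirm that ordering).
print(wrapped_format_status_text({"translation": PrefixTranslation()}, status))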
def get(self, submission_id):
    """Retrieve a single submission.

    Query the database for the submission with the given ID, and
    the dataset given as query parameter (or the active one).

    submission_id (int): the ID of a submission.

    """
    # If it's not an integer we will ignore it. But if it's an
    # integer of a dataset that doesn't exist we'll raise a 404.
    dataset_id = local.request.args.get("dataset_id", type=int)

    with SessionGen() as local.session:
        # Load the submission, and check for existence.
        submission = Submission.get_from_id(submission_id, local.session)

        if submission is None:
            raise NotFound()

        # Load the dataset.
        if dataset_id is not None:
            dataset = Dataset.get_from_id(dataset_id, local.session)
            if dataset is None:
                raise NotFound()
        else:
            q = local.session.query(Dataset)
            q = q.join(Task, Dataset.id == Task.active_dataset_id)
            q = q.filter(Task.id == submission.task_id)
            dataset = q.one()

        # Get the result (will fire a query).
        submission_result = submission.get_result(dataset)

        # Get the ScoreType (will fire a query for testcases).
        score_type = get_score_type(dataset=dataset)

        # Produce the data structure.
        s = submission
        sr = submission_result

        result = {
            '_ref': "%s" % s.id,
            'dataset': '%s' % dataset.id,
            'user': "%s" % s.user_id,
            'task': "%s" % s.task_id,
            'timestamp': make_timestamp(s.timestamp),
            'language': s.language,
            # No files, no token: AWS doesn't need them.
        }

        if sr is not None:
            result.update({
                'compilation_outcome':
                    {"ok": True, "fail": False}.get(sr.compilation_outcome),
                'compilation_text':
                    format_status_text(sr.compilation_text),
                'compilation_tries': sr.compilation_tries,
                'compilation_stdout': sr.compilation_stdout,
                'compilation_stderr': sr.compilation_stderr,
                'compilation_time': sr.compilation_time,
                'compilation_wall_clock_time':
                    sr.compilation_wall_clock_time,
                'compilation_memory': sr.compilation_memory,
                'compilation_shard': sr.compilation_shard,
                'compilation_sandbox': sr.compilation_sandbox,
                'evaluation_outcome':
                    {"ok": True}.get(sr.evaluation_outcome),
                'evaluation_tries': sr.evaluation_tries,
                'evaluations': dict((ev.codename, {
                    'codename': ev.codename,
                    'outcome': ev.outcome,
                    'text': format_status_text(ev.text),
                    'execution_time': ev.execution_time,
                    'execution_wall_clock_time':
                        ev.execution_wall_clock_time,
                    'execution_memory': ev.execution_memory,
                    'evaluation_shard': ev.evaluation_shard,
                    'evaluation_sandbox': ev.evaluation_sandbox,
                }) for ev in sr.evaluations),
                'score': sr.score,
                'max_score': score_type.max_score,
                'score_details':
                    score_type.get_html_details(sr.score_details)
                    if sr.score is not None else None,
            })
        else:
            # Just copy all fields with None.
            result.update({
                'compilation_outcome': None,
                'compilation_text': None,
                'compilation_tries': 0,
                'compilation_stdout': None,
                'compilation_stderr': None,
                'compilation_time': None,
                'compilation_wall_clock_time': None,
                'compilation_memory': None,
                'compilation_shard': None,
                'compilation_sandbox': None,
                'evaluation_outcome': None,
                'evaluation_tries': 0,
                'evaluations': {},
                'score': None,
                'max_score': score_type.max_score,
                'score_details': None,
            })

    # Encode and send.
    local.response.mimetype = "application/json"
    local.response.data = json.dumps(result)
def compute_unit_test_score(self, submission_result, submission_info):
    """Compute the score of a unit test.

    Format of the returned details:
        unit_test: True/False
        subtasks:
            name: name of the subtask
            status: (0, "okay")
            groups:
                verdict: (42, "")
                cases:
                    line: (,)                        case_line()
                    verdict: (42, "No expl. exp.")   judge_case() if len(mandatory) != 0
                    time: 0.412
                    memory: 33659290                 in bytes

    """
    if submission_info is None:
        return {"unit_test": True, "verdict": (-1, "Not a Unit Test")}

    submission_info = json.loads(submission_info)

    expected_sample_score = submission_info["expected_sample_score"]
    expected_partial_feedback_score = \
        submission_info["expected_partial_feedback_score"]
    expected_final_score = submission_info["expected_final_score"]
    expected_sample_score_info = \
        submission_info["expected_sample_score_info"]
    expected_partial_feedback_score_info = \
        submission_info["expected_partial_feedback_score_info"]
    expected_final_score_info = \
        submission_info["expected_final_score_info"]

    expectations = {tuple(json.loads(key)): val
                    for key, val in iteritems(submission_info["expected"])}
    case_expectations = submission_info["expected_case"]

    possible_task = expectations[()]

    # Actually, this means it didn't even compile
    if not submission_result.evaluated():
        subtasks_failed = True
        subtasks = []
    else:
        evaluations = dict((ev.codename, ev)
                           for ev in submission_result.evaluations)
        subtasks = []
        subtasks_failed = False

        for subtask in self.parameters["subtasks"]:
            subtasks.append({"name": subtask["name"], "groups": []})
            possible_subtask = expectations[tuple(subtask["key"])]

            worst_group = (1, "okay")
            group_status = []

            for i, g in enumerate(subtask["groups"]):
                possible_group = expectations[tuple(g["key"])]
                possible = possible_task + possible_subtask + possible_group

                subtasks[-1]["groups"].append({"verdict": (42, ""),
                                               "cases": []})

                min_f = 1.0  # Minimum "score" of a test case in this group
                cases_failed = False

                # List of all results of all test cases in this group
                case_results = []
                extended_results = []

                for testcase in g["testcases"]:
                    idx = testcase["codename"]
                    r = UnitTest.get_result(submission_info["limits"],
                                            evaluations[idx])
                    min_f = min(min_f, float(evaluations[idx].outcome))

                    mandatory = case_expectations[idx]

                    l = UnitTest.case_line(r, mandatory, possible)
                    v = (42, "No case-specific expectations.")

                    # Test case expectations
                    if len(mandatory) != 0:
                        accepted, desc = v = \
                            UnitTest.judge_case(r, mandatory, possible)
                        if accepted <= 0:
                            cases_failed = True
                        extended_results += r
                        case_results += \
                            [x for x in r
                             if not UnitTest.ignore(x, mandatory)
                             and x not in mandatory]
                    else:
                        case_results += r

                    v = (v[0], v[1] + "\nGrader output: " +
                         format_status_text((evaluations[idx].text)).strip())

                    subtasks[-1]["groups"][-1]["cases"].\
                        append({"line": l, "verdict": v,
                                "time": evaluations[idx].execution_time,
                                "memory": evaluations[idx].execution_memory})

                status, short, desc = \
                    UnitTest.judge_group(case_results, extended_results,
                                         possible, [])

                if cases_failed:
                    if status > 0:
                        desc = ""
                    else:
                        desc += "\n\n"
                    status = -1
                    desc += "At least one testcase did not behave as " \
                            "expected, cf. the \"test verdict\" column."
                    short = "failed"

                subtasks[-1]["groups"][-1]["verdict"] = (status, desc)

                worst_group = min(worst_group, (status, short))
                group_status.append(status)

            subtasks[-1]["status"] = (worst_group[0], worst_group[1].upper())

            if all(s == 1337 for s in group_status):
                subtasks[-1]["status"] = (1337, "IGNORED")
            elif subtasks[-1]["status"][0] > 0 and \
                    any(s == 1337 for s in group_status):
                subtasks[-1]["status"] = (1337, "PARTIALLY IGNORED")
            if len(group_status) == 0:
                subtasks[-1]["status"] = (1337, "EMPTY")

            if subtasks[-1]["status"][0] <= 0:
                subtasks_failed = True

    def is_in(x, l):
        return l[0] <= x <= l[1]

    sample_score = self._compute_score(submission_result, "sample")[0]
    partial_feedback_score = \
        self._compute_score(submission_result, "partial")[0]
    final_score = self._compute_score(submission_result, "final")[0]

    sample_score_okay = is_in(sample_score, expected_sample_score)
    partial_feedback_score_okay = is_in(partial_feedback_score,
                                        expected_partial_feedback_score)
    final_score_okay = is_in(final_score, expected_final_score)

    partial_feedback_enabled = self.parameters["feedback"] == "partial"

    okay = not subtasks_failed \
        and sample_score_okay \
        and (partial_feedback_score_okay or not partial_feedback_enabled) \
        and final_score_okay

    details = {
        "unit_test": True,
        "subtasks": subtasks,
        "verdict": (1, "Okay") if okay else (0, "Failed"),

        "sample_score_okay": sample_score_okay,
        "sample_score": sample_score,
        "expected_sample_score": expected_sample_score_info,

        "partial_feedback_enabled": partial_feedback_enabled,
        "partial_feedback_score_okay": partial_feedback_score_okay,
        "partial_feedback_score": partial_feedback_score,
        "expected_partial_feedback_score":
            expected_partial_feedback_score_info,

        "final_score_okay": final_score_okay,
        "final_score": final_score,
        "expected_final_score": expected_final_score_info,
    }

    return details
def test_success_with_placeholders(self):
    self.assertEqual(format_status_text(["%s", "ASD"]), "ASD")
    self.assertEqual(format_status_text(["ASD%s\n%s", "QWE", "123"]),
                     "ASDQWE\n123")
def test_success_no_placeholders(self):
    self.assertEqual(format_status_text([]), "N/A")
    self.assertEqual(format_status_text([""]), "")
    self.assertEqual(format_status_text(["ASD"]), "ASD")
    self.assertEqual(format_status_text(["你好"]), "你好")
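Taken together, the test variants above pin down the contract of format_status_text: the first list element is a %-format string, the remaining elements are its arguments, the format string (and the "N/A" fallback) pass through the optional translation, and any malformed input degrades to "N/A". The following is a minimal sketch consistent with that behaviour, not the actual CMS implementation (which also logs errors); the gettext-style translation handling is an assumption based on the wrapped_format_status_text snippet, and the _sketch suffix is used to avoid presenting this as the real function.

def format_status_text_sketch(status, translation=None):
    """Translate status[0], then %-interpolate status[1:]; on any
    problem return the (translated) "N/A" placeholder."""
    _ = translation.gettext if translation is not None else (lambda s: s)
    try:
        if not isinstance(status, list) or len(status) == 0:
            raise ValueError("empty or malformed status")
        # gettext("") would return the catalog header, so keep "" as-is.
        text = _(status[0]) if status[0] != "" else ""
        return text % tuple(status[1:])
    except (ValueError, TypeError):
        return _("N/A")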
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend in order to avoid filling
    # the database with junk.
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, soluzione),
        "Solution %s for task %s" % (soluzione, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        language=language,
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input,
        output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit))
        for t in dataset.testcases]

    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus["exit_status"]
        info.append("Time: %5.3f Wall: %5.3f Memory: %s" %
                    (job.plus["execution_time"],
                     job.plus["execution_wall_clock_time"],
                     mem_human(job.plus["execution_memory"])))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen),
                                          b.center(ilen)))

    return zip(points, comments, info)
def compute_unit_test_score(self, submission_result, submission_info):
    """Compute the score of a unit test.

    Format of the returned details:
        unit_test: True/False
        subtasks:
            name: name of the subtask
            status: (0, "okay")
            groups:
                verdict: (42, "")
                max_runtime: 0.412
                max_memory: 33659290             in bytes
                cases:
                    line: (,)                        case_line()
                    verdict: (42, "No expl. exp.")   judge_case() if len(mandatory) != 0
                    time: 0.412
                    memory: 33659290                 in bytes

    """
    eps = SubtaskGroup.EPS
    threshold_lax = SubtaskGroup.THRESHOLD_LAX
    threshold_strict = SubtaskGroup.THRESHOLD_STRICT
    threshold_very_strict = SubtaskGroup.THRESHOLD_VERY_STRICT

    if submission_info is None:
        return {"unit_test": True, "verdict": (-1, "Not a Unit Test")}

    submission_info = json.loads(submission_info)

    expected_sample_score = submission_info["expected_sample_score"]
    expected_partial_feedback_score = submission_info[
        "expected_partial_feedback_score"]
    expected_final_score = submission_info["expected_final_score"]
    expected_sample_score_info = submission_info[
        "expected_sample_score_info"]
    expected_partial_feedback_score_info = submission_info[
        "expected_partial_feedback_score_info"]
    expected_final_score_info = submission_info[
        "expected_final_score_info"]

    expectations = {
        tuple(json.loads(key)): val
        for key, val in iteritems(submission_info["expected"])
    }
    case_expectations = submission_info["expected_case"]

    possible_task = expectations[()]

    useful = set()
    essential = set()
    all_cases = {
        c["codename"]
        for s in self.parameters["subtasks"]
        for g in s["groups"]
        for c in g["testcases"]
    }
    dominated = {d: {c for c in all_cases if c != d} for d in all_cases}

    # Actually, this means it didn't even compile
    if not submission_result.evaluated():
        subtasks_failed = True
        subtasks = []
    else:
        evaluations = dict(
            (ev.codename, ev) for ev in submission_result.evaluations)
        subtasks = []
        subtasks_failed = False

        for subtask in self.parameters["subtasks"]:
            subtasks.append({"name": subtask["name"], "groups": []})
            possible_subtask = expectations[tuple(subtask["key"])]

            worst_group = (1, "okay")
            group_status = []
            curr_group_dict = {}

            for i, g in enumerate(subtask["groups"]):
                """ Check unit test """
                possible_group = expectations[tuple(g["key"])]
                possible = possible_task + possible_subtask + possible_group

                subtasks[-1]["groups"].append({
                    "verdict": (42, ""),
                    "cases": []
                })

                min_f = 1.0  # Minimum "score" of a test case in this group
                cases_failed = False

                # List of all results of all test cases in this group
                case_results = []
                extended_results = []

                for testcase in g["testcases"]:
                    idx = testcase["codename"]
                    r = UnitTest.get_result(submission_info["limits"],
                                            evaluations[idx])
                    this_score = float(evaluations[idx].outcome)
                    curr_group_dict[idx] = this_score
                    min_f = min(min_f, this_score)

                    mandatory = case_expectations[idx]

                    l = UnitTest.case_line(r, mandatory, possible)
                    v = (42, "No case-specific expectations.")

                    # Test case expectations
                    if len(mandatory) != 0:
                        accepted, desc = v = \
                            UnitTest.judge_case(r, mandatory, possible)
                        if accepted <= 0:
                            cases_failed = True
                        extended_results += r
                        case_results += \
                            [x for x in r
                             if not UnitTest.ignore(x, mandatory)
                             and x not in mandatory]
                    else:
                        case_results += r

                    v = (v[0], v[1] + "\nGrader output: " +
                         format_status_text(
                             (evaluations[idx].text)).strip())

                    subtasks[-1]["groups"][-1]["cases"].\
                        append({"line": l, "verdict": v,
                                "time": evaluations[idx].execution_time,
                                "memory": evaluations[idx].execution_memory,
                                "codename": idx})

                status, short, desc = \
                    UnitTest.judge_group(case_results, extended_results,
                                         possible, [])

                if cases_failed:
                    if status > 0:
                        desc = ""
                    else:
                        desc += "\n\n"
                    status = -1
                    desc += "At least one testcase did not behave as " \
                            "expected, cf. the \"test verdict\" column."
                    short = "failed"

                subtasks[-1]["groups"][-1]["verdict"] = (status, desc)
                subtasks[-1]["groups"][-1]["max_runtime"] = \
                    max((c["time"]
                         for c in subtasks[-1]["groups"][-1]["cases"]),
                        default=None)
                subtasks[-1]["groups"][-1]["max_memory"] = \
                    max((c["memory"]
                         for c in subtasks[-1]["groups"][-1]["cases"]),
                        default=None)

                worst_group = min(worst_group, (status, short))
                group_status.append(status)

                """ Check testcase utility """
                if len(g["testcases"]) == 0 or subtask["sample"]:
                    continue

                group_scores = sorted(curr_group_dict.values())

                for c in g["testcases"]:
                    id = c["codename"]
                    s = curr_group_dict[id]

                    is_useful = (s <= min(
                        (1 + threshold_lax) * group_scores[0] + eps,
                        1 - eps))
                    is_essential = (
                        len(group_scores) == 1 or
                        (1 + threshold_strict) * s + eps < group_scores[1])

                    if is_essential:
                        essential.add(id)
                    elif is_useful:
                        useful.add(id)

                    dominated[id] &= {
                        c for c, p in curr_group_dict.items()
                        if p <= (1 + threshold_very_strict) * s + eps
                    }

            subtasks[-1]["status"] = (worst_group[0], worst_group[1].upper())

            if all(s == 1337 for s in group_status):
                subtasks[-1]["status"] = (1337, "IGNORED")
            elif subtasks[-1]["status"][0] > 0 and any(
                    s == 1337 for s in group_status):
                subtasks[-1]["status"] = (1337, "PARTIALLY IGNORED")
            if len(group_status) == 0:
                subtasks[-1]["status"] = (1337, "EMPTY")

            if subtasks[-1]["status"][0] <= 0:
                subtasks_failed = True

    score_precision = submission_info["score_precision"]

    def is_in(x, l):
        return round(l[0], score_precision) <= round(x, score_precision) \
            <= round(l[1], score_precision)

    sample_score = self._compute_score(submission_result, "sample")[0]
    partial_feedback_score = self._compute_score(submission_result,
                                                 "partial")[0]
    final_score = self._compute_score(submission_result, "final")[0]

    sample_score_okay = is_in(sample_score, expected_sample_score)
    partial_feedback_score_okay = is_in(partial_feedback_score,
                                        expected_partial_feedback_score)
    final_score_okay = is_in(final_score, expected_final_score)

    partial_feedback_enabled = self.parameters["feedback"] == "partial"

    okay = not subtasks_failed \
        and sample_score_okay \
        and (partial_feedback_score_okay or not partial_feedback_enabled) \
        and final_score_okay

    details = {
        "unit_test": True,
        "subtasks": subtasks,
        "verdict": (1, "Okay") if okay else (0, "Failed"),

        "sample_score_okay": sample_score_okay,
        "sample_score": sample_score,
        "expected_sample_score": expected_sample_score_info,

        "partial_feedback_enabled": partial_feedback_enabled,
        "partial_feedback_score_okay": partial_feedback_score_okay,
        "partial_feedback_score": partial_feedback_score,
        "expected_partial_feedback_score":
            expected_partial_feedback_score_info,

        "final_score_okay": final_score_okay,
        "final_score": final_score,
        "expected_final_score": expected_final_score_info,

        "dominated": dominated,
        "essential": essential,
        "useful": useful
    }

    return details
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a disabled FileCacher with a FSBackend in order to avoid filling
    # the database with junk and to save space.
    if file_cacher is None:
        file_cacher = FileCacher(path=os.path.join(config.cache_dir,
                                                   'cmsMake'),
                                 enabled=False)

    # Load the task
    if task is None:
        loader = YamlLoader(os.path.realpath(os.path.join(base_dir, "..")),
                            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    if dataset.task_type != "OutputOnly":
        digest = file_cacher.put_file_from_path(
            os.path.join(base_dir, soluzione),
            "Solution %s for task %s" % (soluzione, task.name))
        executables = {
            task.name: Executable(filename=task.name, digest=digest)
        }
        jobs = [(t, EvaluationJob(language=language,
                                  task_type=dataset.task_type,
                                  task_type_parameters=json.loads(
                                      dataset.task_type_parameters),
                                  managers=dict(dataset.managers),
                                  executables=executables,
                                  input=dataset.testcases[t].input,
                                  output=dataset.testcases[t].output,
                                  time_limit=dataset.time_limit,
                                  memory_limit=dataset.memory_limit))
                for t in dataset.testcases]
        tasktype = get_task_type(dataset=dataset)
    else:
        print("Generating outputs...", end='')
        files = {}
        for t in sorted(dataset.testcases.keys()):
            with file_cacher.get_file(dataset.testcases[t].input) as fin:
                with TemporaryFile() as fout:
                    print(str(t), end='')
                    call(soluzione, stdin=fin, stdout=fout, cwd=base_dir)
                    fout.seek(0)
                    digest = file_cacher.put_file_from_fobj(fout)
            outname = "output_%s.txt" % t
            files[outname] = File(filename=outname, digest=digest)
        jobs = [(t, EvaluationJob(task_type=dataset.task_type,
                                  task_type_parameters=json.loads(
                                      dataset.task_type_parameters),
                                  managers=dict(dataset.managers),
                                  files=files,
                                  input=dataset.testcases[t].input,
                                  output=dataset.testcases[t].output,
                                  time_limit=dataset.time_limit,
                                  memory_limit=dataset.memory_limit))
                for t in dataset.testcases]
        for k, job in jobs:
            job._key = k
        tasktype = get_task_type(dataset=dataset)
        print()

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        if dataset.task_type != "OutputOnly":
            status = job.plus["exit_status"]
            info.append("Time: %5.3f Wall: %5.3f Memory: %s" %
                        (job.plus["execution_time"],
                         job.plus["execution_wall_clock_time"],
                         mem_human(job.plus["execution_memory"])))
        else:
            status = "ok"
            info.append("N/A")
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen),
                                          b.center(ilen)))

    return zip(points, comments, info)
def test_testcases(base_dir, solution, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend in order to avoid filling
    # the database with junk.
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    cmscontrib.loaders.italy_yaml.logger = NullLogger()

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = cmscontrib.loaders.italy_yaml.YamlLoader(base_dir,
                                                          file_cacher)
        task = loader.get_task(get_statement=False)

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, solution),
        "Solution %s for task %s" % (solution, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        language=language,
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input,
        output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit))
        for t in dataset.testcases]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0])
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            move_cursor(directions.UP, erase=True)
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus.get("exit_status")
        info.append((job.plus.get("execution_time"),
                     job.plus.get("execution_memory")))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False
            print()
        move_cursor(directions.UP, erase=True)

    # Subtasks scoring
    try:
        subtasks = json.loads(dataset.score_type_parameters)
        subtasks[0]
    except:
        subtasks = [[100, len(info)]]

    if dataset.score_type == 'GroupMin':
        scoreFun = min
    else:
        if dataset.score_type != 'Sum':
            logger.warning("Score type %s not yet supported! Using Sum"
                           % dataset.score_type)

        def scoreFun(x):
            return sum(x) / len(x)

    pos = 0
    sts = []

    # For each subtask generate a list of the testcases it owns, the score
    # gained and the highest time and memory usage.
    for i in subtasks:
        stscores = []
        stsdata = []
        worst = [0, 0]
        try:
            for _ in xrange(i[1]):
                stscores.append(points[pos])
                stsdata.append((tcnames[pos], points[pos],
                                comments[pos], info[pos]))
                if info[pos][0] > worst[0]:
                    worst[0] = info[pos][0]
                if info[pos][1] > worst[1]:
                    worst[1] = info[pos][1]
                pos += 1
            sts.append((scoreFun(stscores) * i[0], i[0], stsdata, worst))
        except:
            sts.append((0, i[0], stsdata, [0, 0]))

    # Result pretty printing
    # Strips sol/ and _EVAL from the solution's name
    solution = solution[4:-5]
    print()
    clen = max(len(c) for c in comments)
    for st, d in enumerate(sts):
        print(
            "Subtask %d:" % st,
            add_color_to_string(
                "%5.2f/%d" % (d[0], d[1]),
                colors.RED if abs(d[0] - d[1]) > 0.01 else colors.GREEN,
                bold=True))
        for (i, p, c, w) in d[2]:
            print(
                "%s)" % i,
                add_color_to_string(
                    "%5.2lf" % p,
                    colors.RED if abs(p - 1) > 0.01 else colors.BLACK),
                "--- %s [Time:" % c.ljust(clen),
                add_color_to_string(
                    ("%5.3f" % w[0]) if w[0] is not None else "N/A",
                    colors.BLUE if w[0] is not None
                    and w[0] >= 0.95 * d[3][0] else colors.BLACK),
                "Memory:",
                add_color_to_string(
                    "%5s" % mem_human(w[1]) if w[1] is not None else "N/A",
                    colors.BLUE if w[1] is not None
                    and w[1] >= 0.95 * d[3][1] else colors.BLACK),
                end="]")
            move_cursor(directions.RIGHT, 1000)
            move_cursor(directions.LEFT, len(solution) - 1)
            print(add_color_to_string(solution, colors.BLACK, bold=True))
    print()

    sols.append((solution, sum([st[0] for st in sts])))

    global tested_something
    if not tested_something:
        tested_something = True
        atexit.register(print_at_exit)

    return zip(points, comments, info)
def test_testcases(base_dir, soluzione, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend in order to avoid filling
    # the database with junk.
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, soluzione),
        "Solution %s for task %s" % (soluzione, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input,
        output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit))
        for t in dataset.testcases]

    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print jobinfo[0],
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus["exit_status"]
        info.append("Time: %5.3f Wall: %5.3f Memory: %s" %
                    (job.plus["execution_time"],
                     job.plus["execution_wall_clock_time"],
                     mem_human(job.plus["execution_memory"])))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print
            print "Want to stop and consider everything to timeout? [y/N]",
            if assume is not None:
                print assume
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print "%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen),
                                          b.center(ilen))

    return zip(points, comments, info)
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a disabled FileCacher with a FSBackend in order to avoid filling
    # the database with junk and to save space.
    if file_cacher is None:
        file_cacher = FileCacher(path=os.path.join(config.cache_dir,
                                                   'cmsMake'),
                                 enabled=False)

    # Load the task
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    if dataset.task_type != "OutputOnly":
        digest = file_cacher.put_file_from_path(
            os.path.join(base_dir, soluzione),
            "Solution %s for task %s" % (soluzione, task.name))
        executables = {task.name: Executable(filename=task.name,
                                             digest=digest)}
        jobs = [(t, EvaluationJob(
            language=language,
            task_type=dataset.task_type,
            task_type_parameters=json.loads(dataset.task_type_parameters),
            managers=dict(dataset.managers),
            executables=executables,
            input=dataset.testcases[t].input,
            output=dataset.testcases[t].output,
            time_limit=dataset.time_limit,
            memory_limit=dataset.memory_limit))
            for t in dataset.testcases]
        tasktype = get_task_type(dataset=dataset)
    else:
        print("Generating outputs...", end='')
        files = {}
        for t in sorted(dataset.testcases.keys()):
            with file_cacher.get_file(dataset.testcases[t].input) as fin:
                with TemporaryFile() as fout:
                    print(str(t), end='')
                    call(soluzione, stdin=fin, stdout=fout, cwd=base_dir)
                    fout.seek(0)
                    digest = file_cacher.put_file_from_fobj(fout)
            outname = "output_%s.txt" % t
            files[outname] = File(filename=outname, digest=digest)
        jobs = [(t, EvaluationJob(
            task_type=dataset.task_type,
            task_type_parameters=json.loads(dataset.task_type_parameters),
            managers=dict(dataset.managers),
            files=files,
            input=dataset.testcases[t].input,
            output=dataset.testcases[t].output,
            time_limit=dataset.time_limit,
            memory_limit=dataset.memory_limit))
            for t in dataset.testcases]
        for k, job in jobs:
            job._key = k
        tasktype = get_task_type(dataset=dataset)
        print()

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        if dataset.task_type != "OutputOnly":
            status = job.plus["exit_status"]
            info.append("Time: %5.3f Wall: %5.3f Memory: %s" %
                        (job.plus["execution_time"],
                         job.plus["execution_wall_clock_time"],
                         mem_human(job.plus["execution_memory"])))
        else:
            status = "ok"
            info.append("N/A")
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen),
                                          b.center(ilen)))

    return zip(points, comments, info)