Example #1
 def test_failure(self):
     # Not enough elements for the placeholders.
     self.assertEqual(format_status_text(["%s"]), "N/A")
     self.assertEqual(format_status_text(["%s"], self._tr), "N/E")
     # No elements at all.
     self.assertEqual(format_status_text([]), "N/A")
     self.assertEqual(format_status_text([], self._tr), "N/E")
Example #2
 def test_insuccess(self):
     # Not enough elements for the placeholders.
     self.assertEqual(format_status_text(["%s"]), "N/A")
     self.assertEqual(format_status_text(["%s"], self._tr), "N/E")
     # No elements at all.
     self.assertEqual(format_status_text([]), "N/A")
     self.assertEqual(format_status_text([], self._tr), "N/E")
Example #3
 def test_success_with_translator(self):
     self.assertEqual(format_status_text([""], self._tr), "")
     self.assertEqual(format_status_text(["ASD"], self._tr), "ESD")
     # Translation is applied before formatting.
     self.assertEqual(format_status_text(["A%s", "ASD"], self._tr), "EASD")
     self.assertEqual(
         format_status_text(["AAA %s\n%s", "AAA", "AE"], self._tr),
         "EEE AAA\nAE")
Example #4
def test_testcases(base_dir, solution, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend to avoid filling the
    # database with junk
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    cmscontrib.loaders.italy_yaml.logger = NullLogger()
    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = cmscontrib.loaders.italy_yaml.YamlLoader(
            base_dir, file_cacher)
        task = loader.get_task(get_statement=False)

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, solution),
        "Solution %s for task %s" % (solution, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [
        (t,
         EvaluationJob(
             operation=ESOperation(ESOperation.EVALUATION, None, dataset.id,
                                   dataset.testcases[t].codename).to_dict(),
             language=language,
             task_type=dataset.task_type,
             task_type_parameters=json.loads(dataset.task_type_parameters),
             managers=dict(dataset.managers),
             executables=executables,
             input=dataset.testcases[t].input,
             output=dataset.testcases[t].output,
             time_limit=dataset.time_limit,
             memory_limit=dataset.memory_limit)) for t in dataset.testcases
    ]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0])
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            move_cursor(directions.UP, erase=True)
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus.get("exit_status")
        info.append(
            (job.plus.get("execution_time"), job.plus.get("execution_memory")))
        points.append(float(job.outcome))

        # Avoid printing unneeded newline
        job.text = [t.rstrip() for t in job.text]

        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print("Want to stop and consider everything to timeout? [y/N] ",
                  end='')
            sys.stdout.flush()

            if assume is not None:
                tmp = assume
                print(tmp)
            else:
                # User input with a timeout of 5 seconds, at the end of which
                # we automatically say "n". ready will be a list of input ready
                # for reading, or an empty list if the timeout expired.
                # See: http://stackoverflow.com/a/2904057
                ready, _, _ = select.select([sys.stdin], [], [], 5)
                if ready:
                    tmp = sys.stdin.readline().strip().lower()
                else:
                    tmp = 'n'
                    print(tmp)

            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False
            print()
        move_cursor(directions.UP, erase=True)

    # Subtasks scoring
    subtasks = json.loads(dataset.score_type_parameters)
    if not isinstance(subtasks, list) or len(subtasks) == 0:
        subtasks = [[100, len(info)]]

    if dataset.score_type == 'GroupMin':
        scoreFun = min
    else:
        if dataset.score_type != 'Sum':
            logger.warning("Score type %s not yet supported! Using Sum" %
                           dataset.score_type)

        def scoreFun(x):
            return sum(x) / len(x)

    pos = 0
    sts = []

    # For each subtask, generate a list of the testcases it owns, the score
    # gained, and the highest time and memory usage.
    for i in subtasks:
        stscores = []
        stsdata = []
        worst = [0, 0]
        try:
            for _ in xrange(i[1]):
                stscores.append(points[pos])
                stsdata.append(
                    (tcnames[pos], points[pos], comments[pos], info[pos]))
                if info[pos][0] > worst[0]:
                    worst[0] = info[pos][0]
                if info[pos][1] > worst[1]:
                    worst[1] = info[pos][1]
                pos += 1
            sts.append((scoreFun(stscores) * i[0], i[0], stsdata, worst))
        except:
            sts.append((0, i[0], stsdata, [0, 0]))

    # Result pretty printing
    # Strips sol/ and _EVAL from the solution's name
    solution = solution[4:-5]
    print()
    clen = max(len(c) for c in comments)
    for st, d in enumerate(sts):
        print(
            "Subtask %d:" % st,
            add_color_to_string(
                "%5.2f/%d" % (d[0], d[1]),
                colors.RED if abs(d[0] - d[1]) > 0.01 else colors.GREEN,
                bold=True))
        for (i, p, c, w) in d[2]:
            print("%s)" % i,
                  add_color_to_string(
                      "%5.2lf" % p,
                      colors.RED if abs(p - 1) > 0.01 else colors.BLACK),
                  "--- %s [Time:" % c.ljust(clen),
                  add_color_to_string(
                      ("%5.3f" % w[0]) if w[0] is not None else "N/A",
                      colors.BLUE if w[0] is not None
                      and w[0] >= 0.95 * d[3][0] else colors.BLACK),
                  "Memory:",
                  add_color_to_string(
                      "%5s" % mem_human(w[1]) if w[1] is not None else "N/A",
                      colors.BLUE if w[1] is not None
                      and w[1] >= 0.95 * d[3][1] else colors.BLACK,
                  ),
                  end="]")
            move_cursor(directions.RIGHT, 1000)
            move_cursor(directions.LEFT, len(solution) - 1)
            print(add_color_to_string(solution, colors.BLACK, bold=True))
    print()

    sols.append((solution, sum([st[0] for st in sts])))

    global tested_something
    if not tested_something:
        tested_something = True
        atexit.register(print_at_exit)

    return zip(points, comments, info)
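A plausible way to call this function, inferred from the slicing near the end (solution[4:-5] strips a leading "sol/" and a trailing "_EVAL"); the path, file name, language string and assume value below are made up, and the function also relies on the module-level globals task, file_cacher, sols and tested_something:

results = test_testcases("/path/to/task", "sol/solution.cpp_EVAL", "C++",
                         assume="n")
for points, comment, info_entry in results:
    print(points, comment, info_entry)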
Example #5
def wrapped_format_status_text(ctx, status_text):
    translation = ctx.get("translation", DEFAULT_TRANSLATION)
    return format_status_text(status_text, translation=translation)
Example #6
    def get(self, submission_id):
        """Retrieve a single submission.

        Query the database for the submission with the given ID, and
        the dataset given as query parameter (or the active one).

        submission_id (int): the ID of a submission.

        """
        # If it's not an integer we will ignore it. But if it's an
        # integer of a dataset that doesn't exist we'll raise a 404.
        dataset_id = local.request.args.get("dataset_id", type=int)

        with SessionGen() as local.session:
            # Load the submission, and check for existence.
            submission = Submission.get_from_id(submission_id, local.session)

            if submission is None:
                raise NotFound()

            # Load the dataset.
            if dataset_id is not None:
                dataset = Dataset.get_from_id(dataset_id, local.session)
                if dataset is None:
                    raise NotFound()
            else:
                q = local.session.query(Dataset)
                q = q.join(Task, Dataset.id == Task.active_dataset_id)
                q = q.filter(Task.id == submission.task_id)
                dataset = q.one()

            # Get the result (will fire a query).
            submission_result = submission.get_result(dataset)

            # Get the ScoreType (will fire a query for testcases).
            score_type = get_score_type(dataset=dataset)

            # Produce the data structure.
            s = submission
            sr = submission_result

            result = {
                '_ref': "%s" % s.id,
                'dataset': '%s' % dataset.id,
                'user': "******" % s.user_id,
                'task': "%s" % s.task_id,
                'timestamp': make_timestamp(s.timestamp),
                'language': s.language,
                # No files, no token: AWS doesn't need them.
            }

            if sr is not None:
                result.update({
                    'compilation_outcome':
                        {"ok": True,
                         "fail": False}.get(sr.compilation_outcome),
                    'compilation_text':
                        format_status_text(sr.compilation_text),
                    'compilation_tries': sr.compilation_tries,
                    'compilation_stdout': sr.compilation_stdout,
                    'compilation_stderr': sr.compilation_stderr,
                    'compilation_time': sr.compilation_time,
                    'compilation_wall_clock_time':
                        sr.compilation_wall_clock_time,
                    'compilation_memory': sr.compilation_memory,
                    'compilation_shard': sr.compilation_shard,
                    'compilation_sandbox': sr.compilation_sandbox,
                    'evaluation_outcome':
                        {"ok": True}.get(sr.evaluation_outcome),
                    'evaluation_tries': sr.evaluation_tries,
                    'evaluations': dict((ev.codename, {
                        'codename': ev.codename,
                        'outcome': ev.outcome,
                        'text': format_status_text(ev.text),
                        'execution_time': ev.execution_time,
                        'execution_wall_clock_time':
                            ev.execution_wall_clock_time,
                        'execution_memory': ev.execution_memory,
                        'evaluation_shard': ev.evaluation_shard,
                        'evaluation_sandbox': ev.evaluation_sandbox,
                    }) for ev in sr.evaluations),
                    'score': sr.score,
                    'max_score': score_type.max_score,
                    'score_details':
                        score_type.get_html_details(sr.score_details)
                        if sr.score is not None else None,
                })
            else:
                # Just copy all fields with None.
                result.update({
                    'compilation_outcome': None,
                    'compilation_text': None,
                    'compilation_tries': 0,
                    'compilation_stdout': None,
                    'compilation_stderr': None,
                    'compilation_time': None,
                    'compilation_wall_clock_time': None,
                    'compilation_memory': None,
                    'compilation_shard': None,
                    'compilation_sandbox': None,
                    'evaluation_outcome': None,
                    'evaluation_tries': 0,
                    'evaluations': {},
                    'score': None,
                    'max_score': score_type.max_score,
                    'score_details': None,
                })

        # Encode and send.
        local.response.mimetype = "application/json"
        local.response.data = json.dumps(result)
Example #7
    def compute_unit_test_score(self, submission_result,
                                submission_info):
        """Compute the score of a unit test.

        Format of the returned details:
            unit_test: True/False
            subtasks:
                name: name of the subtask
                status: (0, "okay")
                groups:
                    verdict: (42, "")
                    cases:
                        line: (,)                             case_line()
                        verdict: (42, "No expl. exp.")        judge_case()
                                                        if len(mandatory) != 0
                        time: 0.412
                        memory: 33659290                      in bytes

        """
        if submission_info is None:
            return {"unit_test": True,
                    "verdict": (-1, "Not a Unit Test")}

        submission_info = json.loads(submission_info)

        expected_sample_score = submission_info["expected_sample_score"]
        expected_partial_feedback_score = submission_info["expected_partial_feedback_score"]
        expected_final_score = submission_info["expected_final_score"]
        expected_sample_score_info = submission_info["expected_sample_score_info"]
        expected_partial_feedback_score_info = submission_info["expected_partial_feedback_score_info"]
        expected_final_score_info = submission_info["expected_final_score_info"]

        expectations = {tuple(json.loads(key)): val for key, val
                        in iteritems(submission_info["expected"])}
        case_expectations = submission_info["expected_case"]
        possible_task = expectations[()]

        # Actually, this means it didn't even compile
        if not submission_result.evaluated():
            subtasks_failed = True
            subtasks = []
        else:
            evaluations = dict((ev.codename, ev)
                            for ev in submission_result.evaluations)

            subtasks = []
            subtasks_failed = False

            for subtask in self.parameters["subtasks"]:
                subtasks.append({
                    "name": subtask["name"],
                    "groups": []
                    })

                possible_subtask = expectations[tuple(subtask["key"])]

                worst_group = (1, "okay")
                group_status = []

                for i, g in enumerate(subtask["groups"]):
                    possible_group = expectations[tuple(g["key"])]

                    possible = possible_task + possible_subtask + possible_group

                    subtasks[-1]["groups"].append({"verdict": (42, ""),
                                                "cases": []})
                    min_f = 1.0  # Minimum "score" of a test case in this group

                    cases_failed = False

                    # List of all results of all test cases in this group
                    case_results = []
                    extended_results = []

                    for testcase in g["testcases"]:
                        idx = testcase["codename"]
                        r = UnitTest.get_result(submission_info["limits"],
                                                evaluations[idx])
                        min_f = min(min_f, float(evaluations[idx].outcome))

                        mandatory = case_expectations[idx]

                        l = UnitTest.case_line(r, mandatory, possible)
                        v = (42, "No case-specific expectations.")

                        # Test case expectations
                        if len(mandatory) != 0:
                            accepted, desc = v = \
                                UnitTest.judge_case(r, mandatory, possible)
                            if accepted <= 0:
                                cases_failed = True
                            extended_results += r
                            case_results += \
                                [x for x in r if not UnitTest.ignore(x, mandatory)
                                and x not in mandatory]
                        else:
                            case_results += r

                        v = (v[0],
                            v[1] + "\nGrader output: " +
                            format_status_text((evaluations[idx].text)).strip())

                        subtasks[-1]["groups"][-1]["cases"].\
                            append({"line": l, "verdict": v,
                                    "time": evaluations[idx].execution_time,
                                    "memory": evaluations[idx].execution_memory})

                    status, short, desc = \
                        UnitTest.judge_group(case_results, extended_results,
                                            possible, [])

                    if cases_failed:
                        if status > 0:
                            desc = ""
                        else:
                            desc += "\n\n"

                        status = -1
                        desc += "At least one testcase did not behave as " \
                                "expected, cf. the \"test verdict\" column."
                        short = "failed"

                    subtasks[-1]["groups"][-1]["verdict"] = (status, desc)
                    worst_group = min(worst_group, (status, short))
                    group_status.append(status)

                subtasks[-1]["status"] = (worst_group[0], worst_group[1].upper())

                if all(s == 1337 for s in group_status):
                    subtasks[-1]["status"] = (1337, "IGNORED")
                elif subtasks[-1]["status"][0] > 0 and any(s == 1337
                                                        for s in group_status):
                    subtasks[-1]["status"] = (1337, "PARTIALLY IGNORED")

                if len(group_status) == 0:
                    subtasks[-1]["status"] = (1337, "EMPTY")

                if subtasks[-1]["status"][0] <= 0:
                    subtasks_failed = True

        def is_in(x, l):
            return l[0] <= x <= l[1]

        sample_score = self._compute_score(submission_result, "sample")[0]
        partial_feedback_score = self._compute_score(submission_result, "partial")[0]
        final_score = self._compute_score(submission_result, "final")[0]

        sample_score_okay = is_in(sample_score, expected_sample_score)
        partial_feedback_score_okay = is_in(partial_feedback_score, expected_partial_feedback_score)
        final_score_okay = is_in(final_score, expected_final_score)

        partial_feedback_enabled = self.parameters["feedback"] == "partial"

        okay = not subtasks_failed \
            and sample_score_okay \
            and (partial_feedback_score_okay or not partial_feedback_enabled) \
            and final_score_okay \

        details = {
            "unit_test": True,
            "subtasks": subtasks,
            "verdict": (1, "Okay") if okay else (0, "Failed"),

            "sample_score_okay": sample_score_okay,
            "sample_score": sample_score,
            "expected_sample_score": expected_sample_score_info,

            "partial_feedback_enabled": partial_feedback_enabled,
            "partial_feedback_score_okay": partial_feedback_score_okay,
            "partial_feedback_score": partial_feedback_score,
            "expected_partial_feedback_score": expected_partial_feedback_score_info,

            "final_score_okay": final_score_okay,
            "final_score": final_score,
            "expected_final_score": expected_final_score_info,
            }

        return details
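To make the docstring's layout concrete, here is a hypothetical instance of the details dictionary returned for a passing unit test; every value is invented, only the keys and the nesting mirror the code above:

details = {
    "unit_test": True,
    "verdict": (1, "Okay"),
    "subtasks": [{
        "name": "subtask1",
        "status": (1, "OKAY"),
        "groups": [{
            "verdict": (42, ""),
            "cases": [{
                "line": (),        # whatever UnitTest.case_line() returned
                "verdict": (42, "No case-specific expectations."),
                "time": 0.412,
                "memory": 33659290,          # bytes
            }],
        }],
    }],
    "sample_score_okay": True,
    "sample_score": 100.0,
    "expected_sample_score": None,           # expected_sample_score_info
    "partial_feedback_enabled": False,
    "partial_feedback_score_okay": True,
    "partial_feedback_score": 100.0,
    "expected_partial_feedback_score": None,
    "final_score_okay": True,
    "final_score": 100.0,
    "expected_final_score": None,
}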
Example #8
 def test_success_with_placeholders(self):
     self.assertEqual(format_status_text(["%s", "ASD"]), "ASD")
     self.assertEqual(format_status_text(["ASD%s\n%s", "QWE", "123"]),
                      "ASDQWE\n123")
Example #9
 def test_success_no_placeholders(self):
     self.assertEqual(format_status_text([]), "N/A")
     self.assertEqual(format_status_text([""]), "")
     self.assertEqual(format_status_text(["ASD"]), "ASD")
     self.assertEqual(format_status_text(["你好"]), "你好")
Example #10
File: Test.py Project: ldct/cms
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend to avoid filling the
    # database with junk
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, soluzione),
        "Solution %s for task %s" % (soluzione, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        language=language,
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input, output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit)) for t in dataset.testcases]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus["exit_status"]
        info.append("Time: %5.3f   Wall: %5.3f   Memory: %s" %
                   (job.plus["execution_time"],
                    job.plus["execution_wall_clock_time"],
                    mem_human(job.plus["execution_memory"])))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen), b.center(ilen)))

    return zip(points, comments, info)
Example #11
    def compute_unit_test_score(self, submission_result, submission_info):
        """Compute the score of a unit test.

        Format of the returned details:
            unit_test: True/False
            subtasks:
                name: name of the subtask
                status: (0, "okay")
                groups:
                    verdict: (42, "")
                    max_runtime: 0.412
                    max_memory: 33659290                      in bytes
                    cases:
                        line: (,)                             case_line()
                        verdict: (42, "No expl. exp.")        judge_case()
                                                        if len(mandatory) != 0
                        time: 0.412
                        memory: 33659290                      in bytes

        """
        eps = SubtaskGroup.EPS
        threshold_lax = SubtaskGroup.THRESHOLD_LAX
        threshold_strict = SubtaskGroup.THRESHOLD_STRICT
        threshold_very_strict = SubtaskGroup.THRESHOLD_VERY_STRICT

        if submission_info is None:
            return {"unit_test": True, "verdict": (-1, "Not a Unit Test")}

        submission_info = json.loads(submission_info)

        expected_sample_score = submission_info["expected_sample_score"]
        expected_partial_feedback_score = submission_info[
            "expected_partial_feedback_score"]
        expected_final_score = submission_info["expected_final_score"]
        expected_sample_score_info = submission_info[
            "expected_sample_score_info"]
        expected_partial_feedback_score_info = submission_info[
            "expected_partial_feedback_score_info"]
        expected_final_score_info = submission_info[
            "expected_final_score_info"]

        expectations = {
            tuple(json.loads(key)): val
            for key, val in iteritems(submission_info["expected"])
        }
        case_expectations = submission_info["expected_case"]
        possible_task = expectations[()]

        useful = set()
        essential = set()
        all_cases = {
            c["codename"]
            for s in self.parameters["subtasks"] for g in s["groups"]
            for c in g["testcases"]
        }
        dominated = {d: {c for c in all_cases if c != d} for d in all_cases}

        # Actually, this means it didn't even compile
        if not submission_result.evaluated():
            subtasks_failed = True
            subtasks = []
        else:
            evaluations = dict(
                (ev.codename, ev) for ev in submission_result.evaluations)

            subtasks = []
            subtasks_failed = False

            for subtask in self.parameters["subtasks"]:
                subtasks.append({"name": subtask["name"], "groups": []})

                possible_subtask = expectations[tuple(subtask["key"])]

                worst_group = (1, "okay")
                group_status = []
                curr_group_dict = {}

                for i, g in enumerate(subtask["groups"]):
                    """
                    Check unit test
                    """
                    possible_group = expectations[tuple(g["key"])]

                    possible = possible_task + possible_subtask + possible_group

                    subtasks[-1]["groups"].append({
                        "verdict": (42, ""),
                        "cases": []
                    })
                    min_f = 1.0  # Minimum "score" of a test case in this group

                    cases_failed = False

                    # List of all results of all test cases in this group
                    case_results = []
                    extended_results = []

                    for testcase in g["testcases"]:
                        idx = testcase["codename"]
                        r = UnitTest.get_result(submission_info["limits"],
                                                evaluations[idx])
                        this_score = float(evaluations[idx].outcome)
                        curr_group_dict[idx] = this_score
                        min_f = min(min_f, this_score)

                        mandatory = case_expectations[idx]

                        l = UnitTest.case_line(r, mandatory, possible)
                        v = (42, "No case-specific expectations.")

                        # Test case expectations
                        if len(mandatory) != 0:
                            accepted, desc = v = \
                                UnitTest.judge_case(r, mandatory, possible)
                            if accepted <= 0:
                                cases_failed = True
                            extended_results += r
                            case_results += \
                                [x for x in r if not UnitTest.ignore(x, mandatory)
                                and x not in mandatory]
                        else:
                            case_results += r

                        v = (v[0],
                             v[1] + "\nGrader output: " + format_status_text(
                                 (evaluations[idx].text)).strip())

                        subtasks[-1]["groups"][-1]["cases"].\
                            append({"line": l, "verdict": v,
                                    "time": evaluations[idx].execution_time,
                                    "memory": evaluations[idx].execution_memory,
                                    "codename": idx})

                    status, short, desc = \
                        UnitTest.judge_group(case_results, extended_results,
                                            possible, [])

                    if cases_failed:
                        if status > 0:
                            desc = ""
                        else:
                            desc += "\n\n"

                        status = -1
                        desc += "At least one testcase did not behave as " \
                                "expected, cf. the \"test verdict\" column."
                        short = "failed"

                    subtasks[-1]["groups"][-1]["verdict"] = (status, desc)
                    subtasks[-1]["groups"][-1]["max_runtime"] = \
                        max((c["time"]
                             for c in subtasks[-1]["groups"][-1]["cases"]),
                            default=None)
                    subtasks[-1]["groups"][-1]["max_memory"] = \
                        max((c["memory"]
                             for c in subtasks[-1]["groups"][-1]["cases"]),
                            default=None)
                    worst_group = min(worst_group, (status, short))
                    group_status.append(status)
                    """
                    Check testcase utility
                    """
                    if len(g["testcases"]) == 0 or subtask["sample"]:
                        continue

                    group_scores = sorted(curr_group_dict.values())

                    for c in g["testcases"]:
                        id = c["codename"]
                        s = curr_group_dict[id]
                        is_useful = (s <= min(
                            (1 + threshold_lax) * group_scores[0] + eps,
                            1 - eps))
                        is_essential = (
                            len(group_scores) == 1 or
                            (1 + threshold_strict) * s + eps < group_scores[1])

                        if is_essential:
                            essential.add(id)

                        elif is_useful:
                            useful.add(id)

                        dominated[id] &= {
                            c
                            for c, p in curr_group_dict.items()
                            if p <= (1 + threshold_very_strict) * s + eps
                        }

                subtasks[-1]["status"] = (worst_group[0],
                                          worst_group[1].upper())

                if all(s == 1337 for s in group_status):
                    subtasks[-1]["status"] = (1337, "IGNORED")
                elif subtasks[-1]["status"][0] > 0 and any(
                        s == 1337 for s in group_status):
                    subtasks[-1]["status"] = (1337, "PARTIALLY IGNORED")

                if len(group_status) == 0:
                    subtasks[-1]["status"] = (1337, "EMPTY")

                if subtasks[-1]["status"][0] <= 0:
                    subtasks_failed = True

        score_precision = submission_info["score_precision"]

        def is_in(x, l):
            return round(l[0], score_precision) <= round(x, score_precision) \
                                                <= round(l[1], score_precision)

        sample_score = self._compute_score(submission_result, "sample")[0]
        partial_feedback_score = self._compute_score(submission_result,
                                                     "partial")[0]
        final_score = self._compute_score(submission_result, "final")[0]

        sample_score_okay = is_in(sample_score, expected_sample_score)
        partial_feedback_score_okay = is_in(partial_feedback_score,
                                            expected_partial_feedback_score)
        final_score_okay = is_in(final_score, expected_final_score)

        partial_feedback_enabled = self.parameters["feedback"] == "partial"

        okay = not subtasks_failed \
            and sample_score_okay \
            and (partial_feedback_score_okay or not partial_feedback_enabled) \
            and final_score_okay \

        details = {
            "unit_test": True,
            "subtasks": subtasks,
            "verdict": (1, "Okay") if okay else (0, "Failed"),
            "sample_score_okay": sample_score_okay,
            "sample_score": sample_score,
            "expected_sample_score": expected_sample_score_info,
            "partial_feedback_enabled": partial_feedback_enabled,
            "partial_feedback_score_okay": partial_feedback_score_okay,
            "partial_feedback_score": partial_feedback_score,
            "expected_partial_feedback_score":
            expected_partial_feedback_score_info,
            "final_score_okay": final_score_okay,
            "final_score": final_score,
            "expected_final_score": expected_final_score_info,
            "dominated": dominated,
            "essential": essential,
            "useful": useful
        }

        return details
Example #12
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a disabled FileCacher with an FSBackend to avoid filling the
    # database with junk and to save space.
    if file_cacher is None:
        file_cacher = FileCacher(path=os.path.join(config.cache_dir,
                                                   'cmsMake'),
                                 enabled=False)

    # Load the task
    if task is None:
        loader = YamlLoader(os.path.realpath(os.path.join(base_dir, "..")),
                            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    if dataset.task_type != "OutputOnly":
        digest = file_cacher.put_file_from_path(
            os.path.join(base_dir, soluzione),
            "Solution %s for task %s" % (soluzione, task.name))
        executables = {
            task.name: Executable(filename=task.name, digest=digest)
        }
        jobs = [(t,
                 EvaluationJob(language=language,
                               task_type=dataset.task_type,
                               task_type_parameters=json.loads(
                                   dataset.task_type_parameters),
                               managers=dict(dataset.managers),
                               executables=executables,
                               input=dataset.testcases[t].input,
                               output=dataset.testcases[t].output,
                               time_limit=dataset.time_limit,
                               memory_limit=dataset.memory_limit))
                for t in dataset.testcases]
        tasktype = get_task_type(dataset=dataset)
    else:
        print("Generating outputs...", end='')
        files = {}
        for t in sorted(dataset.testcases.keys()):
            with file_cacher.get_file(dataset.testcases[t].input) as fin:
                with TemporaryFile() as fout:
                    print(str(t), end='')
                    call(soluzione, stdin=fin, stdout=fout, cwd=base_dir)
                    fout.seek(0)
                    digest = file_cacher.put_file_from_fobj(fout)
                    outname = "output_%s.txt" % t
                    files[outname] = File(filename=outname, digest=digest)
        jobs = [(t,
                 EvaluationJob(task_type=dataset.task_type,
                               task_type_parameters=json.loads(
                                   dataset.task_type_parameters),
                               managers=dict(dataset.managers),
                               files=files,
                               input=dataset.testcases[t].input,
                               output=dataset.testcases[t].output,
                               time_limit=dataset.time_limit,
                               memory_limit=dataset.memory_limit))
                for t in dataset.testcases]
        for k, job in jobs:
            job._key = k
        tasktype = get_task_type(dataset=dataset)
        print()

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        if dataset.task_type != "OutputOnly":
            status = job.plus["exit_status"]
            info.append("Time: %5.3f   Wall: %5.3f   Memory: %s" %
                        (job.plus["execution_time"],
                         job.plus["execution_wall_clock_time"],
                         mem_human(job.plus["execution_memory"])))
        else:
            status = "ok"
            info.append("N/A")
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen), b.center(ilen)))

    return zip(points, comments, info)
Example #13
def test_testcases(base_dir, solution, language, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend to avoid filling the
    # database with junk
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    cmscontrib.loaders.italy_yaml.logger = NullLogger()
    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = cmscontrib.loaders.italy_yaml.YamlLoader(base_dir,
                                                          file_cacher)
        task = loader.get_task(get_statement=False)

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, solution),
        "Solution %s for task %s" % (solution, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        language=language,
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input, output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit)) for t in dataset.testcases]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0])
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            move_cursor(directions.UP, erase=True)
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus.get("exit_status")
        info.append((job.plus.get("execution_time"),
                     job.plus.get("execution_memory")))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False
            print()
        move_cursor(directions.UP, erase=True)

    # Subtasks scoring
    try:
        subtasks = json.loads(dataset.score_type_parameters)
        subtasks[0]
    except:
        subtasks = [[100, len(info)]]

    if dataset.score_type == 'GroupMin':
        scoreFun = min
    else:
        if dataset.score_type != 'Sum':
            logger.warning("Score type %s not yet supported! Using Sum"
                           % dataset.score_type)

        def scoreFun(x):
            return sum(x) / len(x)

    pos = 0
    sts = []

    # For each subtask, generate a list of the testcases it owns, the score
    # gained, and the highest time and memory usage.
    for i in subtasks:
        stscores = []
        stsdata = []
        worst = [0, 0]
        try:
            for _ in xrange(i[1]):
                stscores.append(points[pos])
                stsdata.append((tcnames[pos], points[pos],
                                comments[pos], info[pos]))
                if info[pos][0] > worst[0]:
                    worst[0] = info[pos][0]
                if info[pos][1] > worst[1]:
                    worst[1] = info[pos][1]
                pos += 1
            sts.append((scoreFun(stscores) * i[0], i[0], stsdata, worst))
        except:
            sts.append((0, i[0], stsdata, [0, 0]))

    # Result pretty printing
    # Strips sol/ and _EVAL from the solution's name
    solution = solution[4:-5]
    print()
    clen = max(len(c) for c in comments)
    for st, d in enumerate(sts):
        print(
            "Subtask %d:" % st,
            add_color_to_string(
                "%5.2f/%d" % (d[0], d[1]),
                colors.RED if abs(d[0] - d[1]) > 0.01 else colors.GREEN,
                bold=True
            )
        )
        for (i, p, c, w) in d[2]:
            print(
                "%s)" % i,
                add_color_to_string(
                    "%5.2lf" % p,
                    colors.RED if abs(p - 1) > 0.01 else colors.BLACK
                ),
                "--- %s [Time:" % c.ljust(clen),
                add_color_to_string(
                    ("%5.3f" % w[0]) if w[0] is not None else "N/A",
                    colors.BLUE if w[0] is not None and w[0] >= 0.95 * d[3][0]
                    else colors.BLACK
                ),
                "Memory:",
                add_color_to_string(
                    "%5s" % mem_human(w[1]) if w[1] is not None else "N/A",
                    colors.BLUE if w[1] is not None and w[1] >= 0.95 * d[3][1]
                    else colors.BLACK,
                ),
                end="]"
            )
            move_cursor(directions.RIGHT, 1000)
            move_cursor(directions.LEFT, len(solution) - 1)
            print(add_color_to_string(solution, colors.BLACK, bold=True))
    print()

    sols.append((solution, sum([st[0] for st in sts])))

    global tested_something
    if not tested_something:
        tested_something = True
        atexit.register(print_at_exit)

    return zip(points, comments, info)
Example #14
def test_testcases(base_dir, soluzione, assume=None):
    global task, file_cacher

    # Use a FileCacher with a NullBackend to avoid filling the
    # database with junk
    if file_cacher is None:
        file_cacher = FileCacher(null=True)

    # Load the task
    # TODO - This implies copying a lot of data to the FileCacher,
    # which is annoying if you have to do it continuously; it would be
    # better to use a persistent cache (although local, possibly
    # filesystem-based instead of database-based) and somehow detect
    # when the task has already been loaded
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    digest = file_cacher.put_file_from_path(
        os.path.join(base_dir, soluzione),
        "Solution %s for task %s" % (soluzione, task.name))
    executables = {task.name: Executable(filename=task.name, digest=digest)}
    jobs = [(t, EvaluationJob(
        task_type=dataset.task_type,
        task_type_parameters=json.loads(dataset.task_type_parameters),
        managers=dict(dataset.managers),
        executables=executables,
        input=dataset.testcases[t].input, output=dataset.testcases[t].output,
        time_limit=dataset.time_limit,
        memory_limit=dataset.memory_limit)) for t in dataset.testcases]
    tasktype = get_task_type(dataset=dataset)

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print jobinfo[0],
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        status = job.plus["exit_status"]
        info.append("Time: %5.3f   Wall: %5.3f   Memory: %s" %
                   (job.plus["execution_time"],
                    job.plus["execution_wall_clock_time"],
                    mem_human(job.plus["execution_memory"])))
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print
            print "Want to stop and consider everything to timeout? [y/N]",
            if assume is not None:
                print assume
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print "%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen), b.center(ilen))

    return zip(points, comments, info)
Example #15
def test_testcases(base_dir, soluzione, language, assume=None):
    global task, file_cacher

    # Use a disabled FileCacher with an FSBackend to avoid filling the
    # database with junk and to save space.
    if file_cacher is None:
        file_cacher = FileCacher(path=os.path.join(config.cache_dir,
                                                   'cmsMake'),
                                 enabled=False)

    # Load the task
    if task is None:
        loader = YamlLoader(
            os.path.realpath(os.path.join(base_dir, "..")),
            file_cacher)
        # Normally we should import the contest before, but YamlLoader
        # accepts get_task() even without previous get_contest() calls
        task = loader.get_task(os.path.split(os.path.realpath(base_dir))[1])

    # Prepare the EvaluationJob
    dataset = task.active_dataset
    if dataset.task_type != "OutputOnly":
        digest = file_cacher.put_file_from_path(
            os.path.join(base_dir, soluzione),
            "Solution %s for task %s" % (soluzione, task.name))
        executables = {task.name: Executable(filename=task.name,
                                             digest=digest)}
        jobs = [(t, EvaluationJob(
            language=language,
            task_type=dataset.task_type,
            task_type_parameters=json.loads(dataset.task_type_parameters),
            managers=dict(dataset.managers),
            executables=executables,
            input=dataset.testcases[t].input,
            output=dataset.testcases[t].output,
            time_limit=dataset.time_limit,
            memory_limit=dataset.memory_limit)) for t in dataset.testcases]
        tasktype = get_task_type(dataset=dataset)
    else:
        print("Generating outputs...", end='')
        files = {}
        for t in sorted(dataset.testcases.keys()):
            with file_cacher.get_file(dataset.testcases[t].input) as fin:
                with TemporaryFile() as fout:
                    print(str(t), end='')
                    call(soluzione, stdin=fin, stdout=fout, cwd=base_dir)
                    fout.seek(0)
                    digest = file_cacher.put_file_from_fobj(fout)
                    outname = "output_%s.txt" % t
                    files[outname] = File(filename=outname, digest=digest)
        jobs = [(t, EvaluationJob(
            task_type=dataset.task_type,
            task_type_parameters=json.loads(dataset.task_type_parameters),
            managers=dict(dataset.managers),
            files=files,
            input=dataset.testcases[t].input,
            output=dataset.testcases[t].output,
            time_limit=dataset.time_limit,
            memory_limit=dataset.memory_limit)) for t in dataset.testcases]
        for k, job in jobs:
            job._key = k
        tasktype = get_task_type(dataset=dataset)
        print()

    ask_again = True
    last_status = "ok"
    status = "ok"
    stop = False
    info = []
    points = []
    comments = []
    tcnames = []
    for jobinfo in sorted(jobs):
        print(jobinfo[0], end='')
        sys.stdout.flush()
        job = jobinfo[1]
        # Skip the testcase if we decide to consider everything to
        # timeout
        if stop:
            info.append("Time limit exceeded")
            points.append(0.0)
            comments.append("Timeout.")
            continue

        # Evaluate testcase
        last_status = status
        tasktype.evaluate(job, file_cacher)
        if dataset.task_type != "OutputOnly":
            status = job.plus["exit_status"]
            info.append("Time: %5.3f   Wall: %5.3f   Memory: %s" %
                       (job.plus["execution_time"],
                        job.plus["execution_wall_clock_time"],
                        mem_human(job.plus["execution_memory"])))
        else:
            status = "ok"
            info.append("N/A")
        points.append(float(job.outcome))
        comments.append(format_status_text(job.text))
        tcnames.append(jobinfo[0])

        # If we saw two consecutive timeouts, ask whether we want to
        # consider everything to timeout
        if ask_again and status == "timeout" and last_status == "timeout":
            print()
            print("Want to stop and consider everything to timeout? [y/N]",
                  end='')
            if assume is not None:
                print(assume)
                tmp = assume
            else:
                tmp = raw_input().lower()
            if tmp in ['y', 'yes']:
                stop = True
            else:
                ask_again = False

    # Result pretty printing
    print()
    clen = max(len(c) for c in comments)
    ilen = max(len(i) for i in info)
    for (i, p, c, b) in zip(tcnames, points, comments, info):
        print("%s) %5.2lf --- %s [%s]" % (i, p, c.ljust(clen), b.center(ilen)))

    return zip(points, comments, info)