Example #1
def test_learners(
    description,
    group,
    max_tests,
    needed_wins,
    row_limits,
    col_limits,
    seed,
    grader,
):
    """Test data generation methods beat given learner.  		  	   		   	 			  		 			     			  	  		 	  	 		 			  		  			
  		  	   		   	 			  		 			     			  	  		 	  	 		 			  		  			
    Requires test description, test case group, and a grader fixture.  		  	   		   	 			  		 			     			  	  		 	  	 		 			  		  			
    """

    points_earned = 0.0  # initialize points for this test case
    incorrect = True
    msgs = []
    try:
        data_x, data_y = None, None
        same_data_x, same_data_y = None, None
        diff_data_x, diff_data_y = None, None
        better_learner, worse_learner = None, None
        if group == "author":
            try:
                from gen_data import author

                auth_string = run_with_timeout(author, seconds_per_test_case,
                                               (), {})
                if auth_string == "tb34":
                    incorrect = True
                    msgs.append("   Incorrect author name (tb34)")
                    points_earned = -10
                elif auth_string == "":
                    incorrect = True
                    msgs.append("   Empty author name")
                    points_earned = -10
                else:
                    incorrect = False
            except Exception as e:
                incorrect = True
                msgs.append(
                    "   Exception occured when calling author() method: {}".
                    format(e))
                points_earned = -10
        else:
            if group == "best4dt":
                from gen_data import best_4_dt

                data_x, data_y = run_with_timeout(best_4_dt,
                                                  seconds_per_test_case, (),
                                                  {"seed": seed})
                same_data_x, same_data_y = run_with_timeout(
                    best_4_dt, seconds_per_test_case, (), {"seed": seed})
                diff_data_x, diff_data_y = run_with_timeout(
                    best_4_dt, seconds_per_test_case, (), {"seed": seed + 1})
                better_learner = DTLearner
                worse_learner = LinRegLearner
            elif group == "best4lr":
                from gen_data import best_4_lin_reg

                data_x, data_y = run_with_timeout(best_4_lin_reg,
                                                  seconds_per_test_case, (),
                                                  {"seed": seed})
                same_data_x, same_data_y = run_with_timeout(
                    best_4_lin_reg, seconds_per_test_case, (), {"seed": seed})
                diff_data_x, diff_data_y = run_with_timeout(
                    best_4_lin_reg, seconds_per_test_case, (),
                    {"seed": seed + 1})
                better_learner = LinRegLearner
                worse_learner = DTLearner

            num_samples = data_x.shape[0]
            cutoff = int(num_samples * 0.6)
            worse_better_err = []
            for run in range(max_tests):
                permutation = np.random.permutation(num_samples)
                train_x, train_y = (
                    data_x[permutation[:cutoff]],
                    data_y[permutation[:cutoff]],
                )
                test_x, test_y = (
                    data_x[permutation[cutoff:]],
                    data_y[permutation[cutoff:]],
                )
                better = better_learner()
                worse = worse_learner()
                better.add_evidence(train_x, train_y)
                worse.add_evidence(train_x, train_y)
                better_pred = better.query(test_x)
                worse_pred = worse.query(test_x)
                better_err = np.linalg.norm(test_y - better_pred)
                worse_err = np.linalg.norm(test_y - worse_pred)
                worse_better_err.append((worse_err, better_err))
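            # Rank the trials so that runs where the better learner beat the
            # worse one by the largest error margin come first.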
            worse_better_err.sort(key=functools.cmp_to_key(lambda a, b: int(
                (b[0] - b[1]) - (a[0] - a[1]))))
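            # A trial counts as a win when the better learner's error is below
            # 90% of the worse learner's; each win earns 5 points, up to needed_wins.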
            better_wins_count = 0
            for worse_err, better_err in worse_better_err:
                if better_err < 0.9 * worse_err:
                    better_wins_count = better_wins_count + 1
                    points_earned += 5.0
                if better_wins_count >= needed_wins:
                    break
            incorrect = False
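            # Validate the generated data: row/column limits, required win count,
            # and seed reproducibility.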
            if (data_x.shape[0] < row_limits[0]) or (data_x.shape[0] >
                                                     row_limits[1]):
                incorrect = True
                msgs.append("    Invalid number of rows. Should be between {},"
                            " found {}".format(row_limits, data_x.shape[0]))
                points_earned = max(0, points_earned - 20)
            if (data_x.shape[1] < col_limits[0]) or (data_x.shape[1] >
                                                     col_limits[1]):
                incorrect = True
                msgs.append(
                    "    Invalid number of columns. Should be between {},"
                    " found {}".format(col_limits, data_x.shape[1]))
                points_earned = max(0, points_earned - 20)
            if better_wins_count < needed_wins:
                incorrect = True
                msgs.append(
                    "    Better learner did not exceed worse learner. Expected"
                    " {}, found {}".format(needed_wins, better_wins_count))
            if not (np.array_equal(same_data_y, data_y)) or not (
                    np.array_equal(same_data_x, data_x)):
                incorrect = True
                msgs.append(
                    "    Did not produce the same data with the same seed.\n" +
                    "      First data_x:\n{}\n".format(data_x) +
                    "      Second data_x:\n{}\n".format(same_data_x) +
                    "      First data_y:\n{}\n".format(data_y) +
                    "      Second data_y:\n{}\n".format(same_data_y))
                points_earned = max(0, points_earned - 20)
            if np.array_equal(diff_data_y, data_y) and np.array_equal(
                    diff_data_x, data_x):
                incorrect = True
                msgs.append("    Did not produce different data with different"
                            " seeds.\n" +
                            "      First data_x:\n{}\n".format(data_x) +
                            "      Second data_x:\n{}\n".format(diff_data_x) +
                            "      First data_y:\n{}\n".format(data_y) +
                            "      Second data_y:\n{}\n".format(diff_data_y))
                points_earned = max(0, points_earned - 20)
        if incorrect:
            if group == "author":
                raise IncorrectOutput(
                    "Test failed on one or more criteria.\n  {}".format(
                        "\n".join(msgs)))
            else:
                inputs_str = "    Residuals: {}".format(worse_better_err)
                raise IncorrectOutput(
                    "Test failed on one or more output criteria.\n "
                    " Inputs:\n{}\n  Failures:\n{}".format(
                        inputs_str, "\n".join(msgs)))
        else:
            if group != "author":
                avg_ratio = 0.0
                worse_better_err.sort(key=functools.cmp_to_key(
                    lambda a, b: int(np.sign((b[0] - b[1]) - (a[0] - a[1])))))
                for we, be in worse_better_err[:10]:
                    avg_ratio += float(we) - float(be)
                avg_ratio = avg_ratio / 10.0
                if group == "best4dt":
                    grader.add_performance(np.array([avg_ratio, 0]))
                else:
                    grader.add_performance(np.array([0, avg_ratio]))
    except Exception as e:
        # Test result: failed
        msg = "Description: {} (group: {})\n".format(description, group)
        # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
        tb_list = tb.extract_tb(sys.exc_info()[2])
        for i in range(len(tb_list)):
            row = tb_list[i]
            tb_list[i] = (
                os.path.basename(row[0]),
                row[1],
                row[2],
                row[3],
            )  # show only filename instead of long absolute path
        tb_list = [row for row in tb_list if (row[0] == "gen_data.py")]
        if tb_list:
            msg += "Traceback:\n"
            msg += "".join(tb.format_list(tb_list))  # contains newlines
        elif "grading_traceback" in dir(e):
            msg += "Traceback:\n"
            msg += "".join(tb.format_list(e.grading_traceback))
        msg += "{}: {}".format(e.__class__.__name__, str(e))

        # Report failure result to grader, with stacktrace
        grader.add_result(
            GradeResult(outcome="failed", points=points_earned, msg=msg))
        raise
    else:
        # Test result: passed (no exceptions)
        grader.add_result(
            GradeResult(outcome="passed", points=points_earned, msg=None))
def test_learners(description, group, max_tests, needed_wins, row_limits,
                  col_limits, seed, grader):
    """Test data generation methods beat given learner.

    Requires test description, test case group, and a grader fixture.
    """

    points_earned = 0.0  # initialize points for this test case
    incorrect = True
    try:
        # Try to import KNNLearner (only once)
        # if not 'KNNLearner' in globals():
        #     from KNNLearner import KNNLearner
        dataX, dataY = None, None
        same_dataX, same_dataY = None, None
        diff_dataX, diff_dataY = None, None
        betterLearner, worseLearner = None, None
        if group == "best4rt":
            from gen_data import best4RT
            dataX, dataY = run_with_timeout(best4RT, seconds_per_test_case, (),
                                            {'seed': seed})
            same_dataX, same_dataY = run_with_timeout(best4RT,
                                                      seconds_per_test_case,
                                                      (), {'seed': seed})
            diff_dataX, diff_dataY = run_with_timeout(best4RT,
                                                      seconds_per_test_case,
                                                      (), {'seed': seed + 1})
            betterLearner = RTLearner
            worseLearner = LinRegLearner
        else:
            from gen_data import best4LinReg
            dataX, dataY = run_with_timeout(best4LinReg, seconds_per_test_case,
                                            (), {'seed': seed})
            same_dataX, same_dataY = run_with_timeout(best4LinReg,
                                                      seconds_per_test_case,
                                                      (), {'seed': seed})
            diff_dataX, diff_dataY = run_with_timeout(best4LinReg,
                                                      seconds_per_test_case,
                                                      (), {'seed': seed + 1})
            betterLearner = LinRegLearner
            worseLearner = RTLearner

        num_samples = dataX.shape[0]
        cutoff = int(num_samples * 0.6)
        worse_better_err = []
        for run in range(max_tests):
            permutation = np.random.permutation(num_samples)
            train_X, train_Y = dataX[permutation[:cutoff]], dataY[
                permutation[:cutoff]]
            test_X, test_Y = dataX[permutation[cutoff:]], dataY[
                permutation[cutoff:]]
            better = betterLearner()
            worse = worseLearner()
            better.addEvidence(train_X, train_Y)
            worse.addEvidence(train_X, train_Y)
            better_pred = better.query(test_X)
            worse_pred = worse.query(test_X)
            better_err = np.linalg.norm(test_Y - better_pred)
            worse_err = np.linalg.norm(test_Y - worse_pred)
            worse_better_err.append((worse_err, better_err))
        worse_better_err.sort(key=functools.cmp_to_key(
            lambda a, b: int((b[0] - b[1]) - (a[0] - a[1]))))
        better_wins_count = 0
        for worse_err, better_err in worse_better_err:
            if better_err < 0.9 * worse_err:
                better_wins_count = better_wins_count + 1
                points_earned += 5.0
            if better_wins_count >= needed_wins:
                break
        incorrect = False
        msgs = []
        if (dataX.shape[0] < row_limits[0]) or (dataX.shape[0] >
                                                row_limits[1]):
            incorrect = True
            msgs.append(
                "    Invalid number of rows. Should be between {}, found {}".
                format(row_limits, dataX.shape[0]))
            points_earned = max(0, points_earned - 20)
        if (dataX.shape[1] < col_limits[0]) or (dataX.shape[1] >
                                                col_limits[1]):
            incorrect = True
            msgs.append(
                "    Invalid number of columns. Should be between {}, found {}"
                .format(col_limits, dataX.shape[1]))
            points_earned = max(0, points_earned - 20)
        if better_wins_count < needed_wins:
            incorrect = True
            msgs.append(
                "    Better learner did not exceed worse learner. Expected {}, found {}"
                .format(needed_wins, better_wins_count))
        if not (np.array_equal(same_dataY, dataY)) or not (np.array_equal(
                same_dataX, dataX)):
            incorrect = True
            msgs.append("    Did not produce the same data with the same seed.\n"+\
                        "      First dataX:\n{}\n".format(dataX)+\
                        "      Second dataX:\n{}\n".format(same_dataX)+\
                        "      First dataY:\n{}\n".format(dataY)+\
                        "      Second dataY:\n{}\n".format(same_dataY))
            points_earned = max(0, points_earned - 20)
        if np.array_equal(diff_dataY, dataY) and np.array_equal(
                diff_dataX, dataX):
            incorrect = True
            msgs.append("    Did not produce different data with different seeds.\n"+\
                        "      First dataX:\n{}\n".format(dataX)+\
                        "      Second dataX:\n{}\n".format(diff_dataX)+\
                        "      First dataY:\n{}\n".format(dataY)+\
                        "      Second dataY:\n{}\n".format(diff_dataY))
            points_earned = max(0, points_earned - 20)
        if incorrect:
            inputs_str = "    Residuals: {}".format(worse_better_err)
            raise IncorrectOutput(
                "Test failed on one or more output criteria.\n "
                " Inputs:\n{}\n  Failures:\n{}".format(
                    inputs_str, "\n".join(msgs)))
        else:
            avg_ratio = 0.0
            worse_better_err.sort(key=functools.cmp_to_key(
                lambda a, b: int(np.sign((b[0] - b[1]) - (a[0] - a[1])))))
            for we, be in worse_better_err[:10]:
                avg_ratio += (float(we) - float(be))
            avg_ratio = avg_ratio / 10.0
            if group == "best4rt":
                grader.add_performance(np.array([avg_ratio, 0]))
            else:
                grader.add_performance(np.array([0, avg_ratio]))
    except Exception as e:
        # Test result: failed
        msg = "Description: {} (group: {})\n".format(description, group)

        # Generate a filtered stacktrace, only showing erroneous lines in student file(s)
        tb_list = tb.extract_tb(sys.exc_info()[2])
        for i in range(len(tb_list)):
            row = tb_list[i]
            tb_list[i] = (os.path.basename(row[0]), row[1], row[2], row[3]
                          )  # show only filename instead of long absolute path
        tb_list = [row for row in tb_list if (row[0] == 'gen_data.py')]
        if tb_list:
            msg += "Traceback:\n"
            msg += ''.join(tb.format_list(tb_list))  # contains newlines
        elif 'grading_traceback' in dir(e):
            msg += "Traceback:\n"
            msg += ''.join(tb.format_list(e.grading_traceback))
        msg += "{}: {}".format(e.__class__.__name__, e.message)

        # Report failure result to grader, with stacktrace
        grader.add_result(
            GradeResult(outcome='failed', points=points_earned, msg=msg))
        raise
    else:
        # Test result: passed (no exceptions)
        grader.add_result(
            GradeResult(outcome='passed', points=points_earned, msg=None))
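
For context, below is a minimal sketch of a gen_data module that Example #1 above would exercise. The function names (author, best_4_lin_reg, best_4_dt), the seed keyword argument, and the (data_x, data_y) return pairs come from the grader's calls; the array dimensions (100 rows, 4 feature columns), the default seed, and the data-generating rules are illustrative assumptions, not the course's reference solution.

import numpy as np


def author():
    # Hypothetical placeholder username; the grader rejects the template
    # value "tb34" and empty strings.
    return "student123"


def best_4_lin_reg(seed=5):
    """Data on which linear regression should beat a decision tree.

    y is an exact linear combination of the features plus an intercept,
    so a linear model fits it almost perfectly while a tree can only
    approximate it piecewise.
    """
    rng = np.random.default_rng(seed)                # same seed -> same data
    data_x = rng.uniform(0.0, 10.0, size=(100, 4))   # assumed 100 rows, 4 columns
    data_y = data_x @ np.array([2.0, -3.0, 0.5, 1.0]) + 7.0
    return data_x, data_y


def best_4_dt(seed=5):
    """Data on which a decision tree should beat linear regression.

    y follows a step rule driven by the signs of the first two features,
    a relationship a linear model cannot represent but axis-aligned
    splits capture easily.
    """
    rng = np.random.default_rng(seed)
    data_x = rng.uniform(-1.0, 1.0, size=(100, 4))
    data_y = np.where(data_x[:, 0] * data_x[:, 1] > 0.0, 10.0, -10.0)
    return data_x, data_y

Basing best_4_dt on an interaction between feature signs keeps the target learnable with a few axis-aligned splits while leaving ordinary least squares with essentially no linear signal to exploit, which is what the 90% error criterion above rewards.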