def write_predictions(filename, ind_vars, tree):
    """
    Write the tree's prediction for every row of data/filename.csv.

    Rows are read from data/filename.csv with get_data, and one
    prediction per row is written to predictions/filename.csv below a
    'PassengerId,<target variable name>' header line.

    Arguments:
        filename: base name (no directory, no '.csv' suffix) shared by
            the input and the output file.
        ind_vars: variable specification handed to get_data and to
            simplify_var_dict. Presumably it has to include the
            'PassengerId' variable, because that column is looked up
            for every row -- confirm against the callers.
        tree: object whose predict(datum, var_dict) result is handed to
            write_prediction.

    Note: target_var is not a parameter of this function; it is looked
    up in the enclosing (module) scope at call time.
    """
    # Get the data set for prediction.  The with-block closes the file
    # even when get_data raises.
    with open('data/%s.csv' % filename, 'r') as in_file:
        data = get_data(in_file, ind_vars)
    var_dict = simplify_var_dict(ind_vars, None)

    # Output target variable predictions to csv.  The with-block makes
    # sure the rows written so far are flushed and the handle is
    # released if predicting or writing a row fails part-way through.
    with open('predictions/%s.csv' % filename, 'w') as out_file:
        out_file.write('PassengerId,%s\n' % target_var[0])
        for datum in data:
            distribution = tree.predict(datum, var_dict)
            write_prediction(
                    out_file,
                    distribution,
                    target_var,
                    int(datum[var_dict['PassengerId']]))
def calc_performance(filename, target_var):
    """
    Score the predictions of the target variable against actual values.

    Note: actual values must be at data/filename.csv, and
    predicted values must be at predictions/filename.csv.  Rows of the
    two files are compared by position.

    Arguments:
        filename: base name (no directory, no '.csv' suffix) of both
            files.
        target_var: sequence whose first item is the target variable's
            name and whose second item is its type ('categorical' or
            'continuous'); the remaining items are stored as that
            variable's entry in the specification given to get_data.

    Returns:
        For a 'categorical' target, the number of rows whose predicted
        value differs from the actual value.  For a 'continuous'
        target, the sum of squared errors (SSE).  None for any other
        target type.
    """
    targ_var_name = target_var[0]
    targ_var_type = target_var[1]

    variables = {('PassengerId', 'continuous'): None}
    variables[(targ_var_name, targ_var_type)] = target_var[2:]
    var_dict = simplify_var_dict(variables, None)

    # Get actual values.  The with-block closes the file even when
    # get_data raises.
    with open('data/%s.csv' % filename, 'r') as actual_file:
        data = get_data(actual_file, variables)

    # Get predicted values
    with open('predictions/%s.csv' % filename, 'r') as predicted_file:
        predictions = get_data(predicted_file, variables)

    # predictions is indexed by position (rather than zipped with data)
    # so that a predictions file with fewer rows than the data file
    # still fails loudly instead of being silently truncated.
    if targ_var_type == 'categorical':
        wrong = 0
        for ind, actual_row in enumerate(data):
            actual_value = actual_row[var_dict[targ_var_name]]
            predicted_value = predictions[ind][var_dict[targ_var_name]]
            if actual_value != predicted_value:
                wrong += 1
        return wrong
    elif targ_var_type == 'continuous':
        sse = 0
        for ind, actual_row in enumerate(data):
            actual_value = actual_row[var_dict[targ_var_name]]
            predicted_value = predictions[ind][var_dict[targ_var_name]]
            sse += (actual_value - predicted_value)**2
        return sse
    # Unknown target types are not scored.
    return None
# Example #3
# 0
        # Build tree to predict age:
        f = file("data/%s_age.csv" % filename, "r")
        max_depth = 6
        root = construct(f, ind_vars, target_var, max_depth)
        f.close()

        f = file("trees/age.tree", "w")
        write_tree(root, f)
        f.close()

        # Trim tree: TODO

    # Use tree to predict age:
    # Compile list of independent variables used to predict target variable
    tree = read_tree("trees/age.tree")
    f = file("data/%s_no_age.csv" % filename, "r")
    ind_vars[("PassengerId", "continuous")] = None
    data = get_data(f, ind_vars)
    var_dict = simplify_var_dict(ind_vars, None)

    # Output target variable predictions to csv.
    f = file("predictions/ages.csv", "w")
    f.write("PassengerId,%s\n" % target_var[0])
    for datum in data:
        distribution = tree.predict(datum, var_dict)
        write_prediction(f, distribution, target_var, int(datum[var_dict["PassengerId"]]))
    f.close()

    # Write a new data file with age values.
    write_age_file(filename)