def calc_performance(filename, target_var):
    """
    Measures how far the predictions of the target variable are from the
    actual values.

    Note: actual values must be at data/filename.csv, and predicted values
    must be at predictions/filename.csv. Rows of the two files are paired by
    position, not by PassengerId.

    Args:
        filename: Base name (no directory, no extension) shared by the data
            file and the predictions file.
        target_var: Sequence whose first item is the target variable's name,
            whose second item is its type ('categorical' or 'continuous'),
            and whose remaining items are stored as that variable's entry in
            the variable specification handed to get_data.

    Returns:
        For a 'categorical' target: the number of rows whose predicted value
        differs from the actual value.
        For a 'continuous' target: the sum of squared errors (SSE).
        For any other target type: None.
    """
    targ_var_name = target_var[0]
    targ_var_type = target_var[1]

    # Only the passenger id and the target column are needed from each file.
    variables = {('PassengerId', 'continuous'): None}
    variables[(targ_var_name, targ_var_type)] = target_var[2:]
    var_dict = simplify_var_dict(variables, None)

    # Get actual values. `with` guarantees the handle is released even if
    # get_data raises part-way through the file.
    with open('data/%s.csv' % filename, 'r') as f:
        data = get_data(f, variables)

    # Get predicted values.
    with open('predictions/%s.csv' % filename, 'r') as f:
        predictions = get_data(f, variables)

    if targ_var_type not in ('categorical', 'continuous'):
        # Unknown target type: nothing to score.
        return None

    # Position of the target variable inside each row; loop-invariant.
    targ_col = var_dict[targ_var_name]

    # Predictions are indexed by position (rather than zipped with the data)
    # so that a predictions file with fewer rows than the data is not
    # silently truncated.
    if targ_var_type == 'categorical':
        wrong = 0
        for ind, actual_row in enumerate(data):
            if actual_row[targ_col] != predictions[ind][targ_col]:
                wrong += 1
        return wrong

    sse = 0
    for ind, actual_row in enumerate(data):
        error = actual_row[targ_col] - predictions[ind][targ_col]
        sse += error ** 2
    return sse
def write_predictions(filename, ind_vars, tree):
    """
    Given data from data/filename.csv, the predicted target value for each
    passenger ID is written to predictions/filename.csv.

    Args:
        filename: Base name (no directory, no extension) of the input data
            file; the output file in predictions/ uses the same name.
        ind_vars: Variable specification passed to get_data and
            simplify_var_dict. The simplified dictionary is looked up with
            the key 'PassengerId', so presumably ind_vars must include the
            PassengerId variable -- confirm against the caller.
        tree: Object exposing predict(datum, var_dict); its result is handed
            to write_prediction unchanged.

    Note:
        `target_var` is NOT a parameter of this function. It is resolved
        from the enclosing (module) scope at call time, so it must be
        defined there before this function is called.
    """
    # Get the data set for prediction. `with` releases the handle even if
    # get_data raises.
    with open('data/%s.csv' % filename, 'r') as f:
        data = get_data(f, ind_vars)
    var_dict = simplify_var_dict(ind_vars, None)

    # Output target variable predictions to csv. The context manager makes
    # sure whatever was written is flushed and the file closed even if a
    # prediction fails part-way through.
    with open('predictions/%s.csv' % filename, 'w') as f:
        f.write('PassengerId,%s\n' % target_var[0])
        for datum in data:
            distribution = tree.predict(datum, var_dict)
            write_prediction(
                f, distribution, target_var,
                int(datum[var_dict['PassengerId']]))
# Build tree to predict age:
# (`filename`, `ind_vars` and `target_var` are expected to be defined
# earlier in the file.)
max_depth = 6
with open("data/%s_age.csv" % filename, "r") as f:
    root = construct(f, ind_vars, target_var, max_depth)

# Persist the tree so the prediction step below works from the saved copy.
with open("trees/age.tree", "w") as f:
    write_tree(root, f)

# Trim tree: TODO

# Use tree to predict age:
tree = read_tree("trees/age.tree")

# Compile list of independent variables used to predict target variable.
# PassengerId is added only now (after the tree was built) because it is
# needed to label each output row.
with open("data/%s_no_age.csv" % filename, "r") as f:
    ind_vars[("PassengerId", "continuous")] = None
    data = get_data(f, ind_vars)
var_dict = simplify_var_dict(ind_vars, None)

# Output target variable predictions to csv.
with open("predictions/ages.csv", "w") as f:
    f.write("PassengerId,%s\n" % target_var[0])
    for datum in data:
        distribution = tree.predict(datum, var_dict)
        write_prediction(
            f, distribution, target_var,
            int(datum[var_dict["PassengerId"]]))

# Write a new data file with age values.
write_age_file(filename)