Ejemplo n.º 1
0
def eval_tree_once(filename, train_size, test_size, replacement,
        ind_vars, target_var, max_depth):
    """
    Evaluate a tree by sampling the training and testing data sets once.

    Steps: partition the data, build a tree from 'data/my_train.csv', dump
    the tree to 'trees/my_tree.tree', write predictions for the 'my_test'
    and 'my_train' sets, then score both sets.

    Args:
        filename, train_size, test_size, replacement: forwarded unchanged
            to partition_data(). Presumably that call produces the
            'my_train' / 'my_test' data sets read below -- confirm against
            partition_data.
        ind_vars: mapping of independent variables keyed by tuples such as
            ('PassengerId', 'continuous'). This function does not add or
            remove keys on the caller's mapping; the PassengerId entry
            needed for prediction is added to a copy.
        target_var: target variable description, passed to construct() and
            calc_performance().
        max_depth: depth limit passed to construct().

    Returns:
        A (cross_val_error, res_error) tuple: the error for cross
        validation (SSE with respect to the test set) and the error for
        evaluating residuals (SSE with respect to the train set).
    """

    partition_data(filename, train_size, test_size, replacement)

    # Build tree from train data. The context manager closes the file even
    # if construct() raises.
    with open('data/my_train.csv', 'r') as train_file:
        tree = construct(train_file, ind_vars, target_var, max_depth)

    # Write tree, just to see what the tree looks like.
    with open('trees/my_tree.tree', 'w') as tree_file:
        write_tree(tree, tree_file)

    # PassengerId is needed as a variable for prediction purposes only.
    # Add it to a copy so the caller's mapping is never left modified,
    # even when a prediction step fails part-way through.
    prediction_vars = dict(ind_vars)
    prediction_vars[('PassengerId', 'continuous')] = None

    # Make predictions on both test and train data sets.
    write_predictions('my_test', prediction_vars, tree)
    write_predictions('my_train', prediction_vars, tree)

    # Calculate errors for both test and train data sets.
    cross_val_error = calc_performance('my_test', target_var)
    res_error = calc_performance('my_train', target_var)

    return cross_val_error, res_error
Ejemplo n.º 2
0
        ('Pclass', 'categorical'): ['1', '2', '3'],
        ('Embarked', 'categorical'): ['C', 'S', 'Q'],
        ('Title', 'categorical'): titles,
        ('Ticket_Code', 'categorical'): ticket_codes,
    }
    target_var = ['Survived', 'categorical', '0', '1']
    max_depth = 100

    # Build tree!
    f = file('data/%s.csv' % filename_train, 'r')
    tree = construct(f, ind_vars, target_var, max_depth)
    f.close()

    # Write tree to file. Not necessary, but nice to have.
    f = file('trees/%s.tree' % filename_tree, 'w')
    write_tree(tree, f)
    f.close()

    # Append PassengerId for prediction purposes.
    f = file('data/%s.csv' % filename_test, 'r')
    ind_vars[('PassengerId', 'continuous')] = None
    data = get_data(f, ind_vars)
    var_dict = simplify_var_dict(ind_vars, None)
    f.close()

    # Output target variable predictions to csv.
    f = file('predictions/%s.csv' % filename_predictions, 'w')
    f.write('PassengerId,%s\n' % target_var[0])
    for datum in data:
        distribution = tree.predict(datum, var_dict)
        write_prediction(