コード例 #1
0
def eval_tree_once(filename, train_size, test_size, replacement,
        ind_vars, target_var, max_depth):
    """
    Evaluate a tree by sampling the training and testing data sets once.

    The data is partitioned via ``partition_data``, a tree is built from
    'data/my_train.csv', the tree is written to 'trees/my_tree.tree' for
    inspection, and predictions are then written and scored for both the
    'my_test' and 'my_train' data sets.

    Parameters:
        filename, train_size, test_size, replacement: forwarded unchanged
            to ``partition_data``.
        ind_vars: dict of independent variables keyed by (name, kind)
            tuples. A ('PassengerId', 'continuous') entry is added for the
            duration of the prediction step and removed again afterwards,
            even if writing predictions fails.
        target_var: the target variable, forwarded to ``construct`` and
            ``calc_performance``.
        max_depth: maximum tree depth, forwarded to ``construct``.

    Returns:
        A ``(cross_val_error, res_error)`` tuple: the error for cross
        validation (SSE with respect to test set) and the error for
        evaluating residuals (SSE with respect to train set).
    """
    # NOTE(review): presumably this is what (re)creates data/my_train.csv
    # and the matching test file -- confirm against partition_data.
    partition_data(filename, train_size, test_size, replacement)

    # Build tree from train data. `open` + `with` replaces the Python 2-only
    # `file` builtin and guarantees the handle is closed if construct raises.
    with open('data/my_train.csv', 'r') as train_file:
        tree = construct(train_file, ind_vars, target_var, max_depth)

    # Write tree, just to see what the tree looks like.
    with open('trees/my_tree.tree', 'w') as tree_file:
        write_tree(tree, tree_file)

    # Append PassengerId as a variable for prediction purposes. The caller's
    # dict is mutated in place, so the removal lives in `finally` to avoid
    # leaking the extra key to the caller when a prediction step raises.
    passenger_key = ('PassengerId', 'continuous')
    ind_vars[passenger_key] = None
    try:
        # Make predictions on both test and train data sets.
        write_predictions('my_test', ind_vars, tree)
        write_predictions('my_train', ind_vars, tree)
    finally:
        # Remove PassengerId from ind_vars.
        ind_vars.pop(passenger_key, None)

    # Calculate errors for both test and train data sets.
    cross_val_error = calc_performance('my_test', target_var)
    res_error = calc_performance('my_train', target_var)

    return cross_val_error, res_error
コード例 #2
0
ファイル: add_age.py プロジェクト: kendricktang/titanic
        ("Embarked", "categorical"): ["S", "C", "Q"],
        ("SibSp", "continuous"): None,
        ("Parch", "continuous"): None,
        ("Fare", "continuous"): None,
        # ('Title', 'categorical'): titles,
        # ('Ticket_Code', 'categorical'): ticket_codes,
        # ('Ticket_Val', 'continuous'): None,
    }
    target_var = ["Age", "continuous"]

    # Only build a tree if necessary.
    if build_tree:
        # Build tree to predict age:
        f = file("data/%s_age.csv" % filename, "r")
        max_depth = 6
        root = construct(f, ind_vars, target_var, max_depth)
        f.close()

        f = file("trees/age.tree", "w")
        write_tree(root, f)
        f.close()

        # Trim tree: TODO

    # Use tree to predict age:
    # Compile list of independent variables used to predict target variable
    tree = read_tree("trees/age.tree")
    f = file("data/%s_no_age.csv" % filename, "r")
    ind_vars[("PassengerId", "continuous")] = None
    data = get_data(f, ind_vars)
    var_dict = simplify_var_dict(ind_vars, None)