def cross_validate(matrix, identifiers, features, id2name, model, n_folds=3, seed=None):
    """Cross-validate `model` with shuffled k-fold splits and print rank taus.

    Feature columns are every entry of `features` whose delta is non-zero;
    the objective column is the ('fantasy_points', 0) entry. For each fold a
    fresh imputer and scaler are fitted on the training rows only, the model
    is fitted on the training rows and asked to predict the held-out rows.
    Held-out identifiers, scores and predictions from all folds are pooled,
    turned into per-position rankings and compared with `compute_taus`.

    Args:
        matrix: 2-D array of rows (one per identifier) by feature columns.
        identifiers: sequence indexed by matrix row.
        features: list of (feature_name, delta) pairs, one per matrix column.
        id2name: mapping handed through to `position_ranking_lists`.
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is refitted
            in place on every fold.
        n_folds: number of folds passed to `KFold`.
        seed: random state passed to `KFold` for the shuffle.

    Returns:
        None. Results are printed, one "<key> <tau>" line per entry of the
        taus mapping, ordered by the key's second element, then its first.

    Raises:
        ValueError: if ('fantasy_points', 0) is not present in `features`.
    """
    input_cols = [col for col, (_, delta) in enumerate(features) if delta != 0]
    target_col = features.index(('fantasy_points', 0))

    pooled_ids = []
    pooled_scores = []
    pooled_preds = []

    splitter = KFold(n=matrix.shape[0], n_folds=n_folds, shuffle=True,
                     random_state=seed)
    for train_rows, test_rows in splitter:
        imputer = Imputer()
        # Standardisation is needed by scale-sensitive models such as SVR.
        scaler = StandardScaler()

        train_block = matrix[train_rows, :]
        test_block = matrix[test_rows, :]

        # Imputer and scaler statistics come from the training rows only.
        imputer.fit(train_block)
        train_ready = scaler.fit_transform(imputer.transform(train_block))
        test_ready = scaler.transform(imputer.transform(test_block))

        # The objective column goes through the same imputing/scaling, so
        # scores and predictions are both in standardised units.
        model.fit(train_ready[:, input_cols], train_ready[:, target_col])
        fold_scores = test_ready[:, target_col]
        fold_preds = model.predict(test_ready[:, input_cols])

        pooled_ids.extend([identifiers[row] for row in test_rows])
        pooled_scores.extend(fold_scores)
        pooled_preds.extend(fold_preds)

    ranks_true = position_ranking_lists(pooled_ids, pooled_scores, id2name)
    ranks_pred = position_ranking_lists(pooled_ids, pooled_preds, id2name)
    taus = compute_taus(ranks_true, ranks_pred)

    ordered_keys = sorted(taus, key=lambda key: (key[1], key[0]))
    for key in ordered_keys:
        # Single pre-formatted string: prints "<key> <tau>" exactly like the
        # two-item print statement would.
        print('%s %s' % (key, taus[key]))
    return
def main():
    """Compare several regressors by cross-validation, then dump predictions.

    Steps:
      1. Load the per-year stat files 'fant2008.csv' .. 'fant2012.csv'.
      2. Build the feature matrix and a lookup from player id to a
         (Name, Tm, FantasyFantPos) tuple.
      3. Run 10-fold `cross_validate` for each candidate model, all sharing
         one randomly drawn seed so every model sees the same fold split.
      4. Predict the current year with a random forest, keep only players
         that have stats for BASE_YEAR - 1, rank them per position and hand
         the rankings to `dump_predictions`.
    """
    stat_files = {year: 'fant%d.csv' % year for year in range(2008, 2013)}
    id2year2stats = load_files(stat_files, SPECIAL_CASE_TRADES)

    def id_to_useful_name(player_id):
        # Name/team/position are read from whichever year comes first in
        # the player's stats dict.
        year2stats = id2year2stats[player_id]
        first_year = list(year2stats)[0]
        stats = year2stats[first_year]
        return (stats['Name'], stats['Tm'], stats['FantasyFantPos'])

    # Players that have an entry for the season right before BASE_YEAR.
    current_players = set(
        player_id for player_id in id2year2stats
        if BASE_YEAR - 1 in id2year2stats[player_id]
    )

    matrix, identifiers, features = construct_feature_matrix(id2year2stats)
    id2name = {}
    for ident in identifiers:
        id2name[ident[ID]] = id_to_useful_name(ident[ID])

    from sklearn import linear_model
    from sklearn import ensemble
    from sklearn import svm

    # One seed for all models so the fold assignment is identical.
    seed = randint(0, 2**32 - 1)
    candidate_models = [
        linear_model.LinearRegression(),
        linear_model.Ridge(),
        ensemble.RandomForestRegressor(),
        ensemble.ExtraTreesRegressor(),
        ensemble.AdaBoostRegressor(),
        ensemble.GradientBoostingRegressor(),
        svm.SVR(),
        svm.NuSVR(),
    ]
    for model in candidate_models:
        # Class name only: everything before the first '(' of the repr.
        print(str(model).partition('(')[0])
        cross_validate(matrix, identifiers, features, id2name, model,
                       n_folds=10, seed=seed)
        print('')

    model = ensemble.RandomForestRegressor()
    current_predictions, current_ids = predict_current_year(
        matrix, identifiers, features, id2name, model)

    # Drop predictions for players without stats in the previous season.
    kept_pairs = [
        (pred, ident)
        for pred, ident in zip(current_predictions, current_ids)
        if ident[ID] in current_players
    ]
    current_predictions, current_ids = zip(*kept_pairs)

    current_predicted_ranks = position_ranking_lists(
        current_ids, current_predictions, id2name)
    dump_predictions(current_predicted_ranks)
    return
def cross_validate(matrix, identifiers, features, id2name, model, n_folds=3, seed=None):
    """Evaluate `model` by shuffled k-fold cross-validation and print taus.

    Columns of `matrix` line up with `features`, a list of (name, delta)
    pairs. Columns with a non-zero delta are the model inputs and the
    ("fantasy_points", 0) column is the value to predict. Per fold, missing
    values are imputed and all columns standardised using statistics from the
    training rows only; the model is fitted on the training rows and used to
    predict the held-out rows. The held-out results of all folds are then
    ranked per position (true vs. predicted) and compared via `compute_taus`.

    Args:
        matrix: 2-D array, one row per entry of `identifiers`.
        identifiers: row identifiers, indexable by row number.
        features: list of (feature_name, delta) pairs describing the columns.
        id2name: mapping forwarded to `position_ranking_lists`.
        model: estimator with `fit` and `predict`; refitted on each fold.
        n_folds: fold count given to `KFold`.
        seed: `random_state` given to `KFold`.

    Returns:
        None; one "<key> <tau>" line is printed per key of the taus mapping,
        sorted by (key[1], key[0]).

    Raises:
        ValueError: if `features` has no ("fantasy_points", 0) entry.
    """
    predictor_columns = []
    for position, (_name, delta) in enumerate(features):
        if delta != 0:
            predictor_columns.append(position)
    objective_column = features.index(("fantasy_points", 0))

    def split_xy(prepared):
        """Return (inputs, objective) column slices of a prepared matrix."""
        return prepared[:, predictor_columns], prepared[:, objective_column]

    def tau_sort_key(key):
        """Order taus keys by their second element, then their first."""
        return (key[1], key[0])

    all_test_ids, all_test_scores, all_test_preds = [], [], []

    n_rows = matrix.shape[0]
    for train_index, test_index in KFold(
            n=n_rows, n_folds=n_folds, shuffle=True, random_state=seed):
        fold_imputer = Imputer()
        fold_scaler = StandardScaler()  # SVR-style models need scaled input

        raw_train = matrix[train_index, :]
        raw_test = matrix[test_index, :]

        # Fit preprocessing on the training rows, then apply it to both sets.
        fold_imputer.fit(raw_train)
        filled_train = fold_imputer.transform(raw_train)
        prepared_train = fold_scaler.fit_transform(filled_train)
        filled_test = fold_imputer.transform(raw_test)
        prepared_test = fold_scaler.transform(filled_test)

        train_inputs, train_target = split_xy(prepared_train)
        model.fit(train_inputs, train_target)

        # The held-out target comes from the scaled matrix, so it is in the
        # same standardised units as the predictions.
        test_inputs, test_target = split_xy(prepared_test)
        predicted = model.predict(test_inputs)

        for row in test_index:
            all_test_ids.append(identifiers[row])
        all_test_scores.extend(test_target)
        all_test_preds.extend(predicted)

    true_rankings = position_ranking_lists(all_test_ids, all_test_scores, id2name)
    predicted_rankings = position_ranking_lists(all_test_ids, all_test_preds, id2name)
    taus = compute_taus(true_rankings, predicted_rankings)

    for deltapos in sorted(taus, key=tau_sort_key):
        # Formatted as one string so the line reads "<key> <tau>".
        print('%s %s' % (deltapos, taus[deltapos]))
    return
def main():
    """Run the full pipeline: load stats, compare models, dump rankings.

    Loads 'fant<year>.csv' for 2008 through 2012, builds the feature matrix,
    prints 10-fold cross-validation results for eight scikit-learn regressors
    (all using the same random seed), and finally writes out per-position
    rankings predicted by a random forest for players who have stats in
    BASE_YEAR - 1.
    """
    files_by_year = {}
    for year in range(2008, 2013):
        files_by_year[year] = 'fant%d.csv' % year
    id2year2stats = load_files(files_by_year, SPECIAL_CASE_TRADES)

    def id_to_useful_name(pid):
        # Use the first year listed for this player as the source of the
        # display fields.
        per_year = id2year2stats[pid]
        sample = per_year[list(per_year)[0]]
        return (sample['Name'], sample['Tm'], sample['FantasyFantPos'])

    previous_year = BASE_YEAR - 1
    current_players = set()
    for pid in id2year2stats:
        if previous_year in id2year2stats[pid]:
            current_players.add(pid)

    matrix, identifiers, features = construct_feature_matrix(id2year2stats)
    id2name = {
        ident[ID]: id_to_useful_name(ident[ID]) for ident in identifiers
    }

    from sklearn import linear_model, ensemble, svm

    # Shared across models so each is scored on the same fold split.
    seed = randint(0, 2**32 - 1)
    models_to_compare = [
        linear_model.LinearRegression(),
        linear_model.Ridge(),
        ensemble.RandomForestRegressor(),
        ensemble.ExtraTreesRegressor(),
        ensemble.AdaBoostRegressor(),
        ensemble.GradientBoostingRegressor(),
        svm.SVR(),
        svm.NuSVR(),
    ]
    for model in models_to_compare:
        # Header line: the estimator repr up to its first '('.
        header = str(model).split('(')[0]
        print(header)
        cross_validate(
            matrix, identifiers, features, id2name, model,
            n_folds=10, seed=seed)
        print('')  # blank separator line between models

    model = ensemble.RandomForestRegressor()
    current_predictions, current_ids = predict_current_year(
        matrix, identifiers, features, id2name, model)

    # Keep only (prediction, identifier) pairs for current players, then
    # transpose the pairs back into two parallel tuples.
    surviving = []
    for pred, ident in zip(current_predictions, current_ids):
        if ident[ID] in current_players:
            surviving.append((pred, ident))
    current_predictions, current_ids = zip(*surviving)

    current_predicted_ranks = position_ranking_lists(
        current_ids, current_predictions, id2name)
    dump_predictions(current_predicted_ranks)
    return