def main(dist_file, pred_file, scoring_type):
    """Merge scored predictions into a distribution JSON file, in place.

    Parameters
    ----------
    dist_file : str
        Path to the distribution JSON (e.g. ``'2015_10.json'``), a dict
        keyed by player id. The file is rewritten with the merged data.
    pred_file : str
        Path to a JSON list of prediction dicts, each carrying a
        ``"player_id"`` key.
    scoring_type : str
        Base scoring type handed to ``make_scorer`` (e.g. ``'standard'``).
    """
    # Load dist data file
    with open(dist_file) as f:
        distdata = json.load(f)
    # Load prediction file, re-keyed by player id for O(1) lookup below.
    with open(pred_file) as f:
        preddata = {d["player_id"]: d for d in json.load(f)}
    # Build the scorer once and score every prediction dict with it.
    # (iteritems() was Python-2-only; items() works on both the scorer
    # and the loops below.)
    scorer = make_scorer(base_type=scoring_type)
    preddata = {k: score_stats_dict(v, scorer=scorer) for k, v in preddata.items()}
    # Attach prediction info to each matching distribution entry;
    # print any player id that has no prediction so gaps are visible.
    for k in distdata:
        if k in preddata:
            distdata[k]["player_info"] = preddata[k]
        else:
            print(k)
    # Save updated dist data back over the input file.
    with open(dist_file, "w") as f:
        json.dump(distdata, f)
def score_stats_dict(stat_dict, scorer=None):
    """Add a weighted fantasy score to a stat dict, mutating it in place.

    Parameters
    ----------
    stat_dict : dict
        Mapping of stat name -> stat value. Gains a ``"standard_score"``
        key equal to the weighted sum of the stats the scorer knows about.
    scorer : dict, optional
        Mapping of stat name -> point weight. Defaults to the standard
        scorer, built lazily here instead of in the signature — the
        original ``scorer=make_scorer(...)`` default ran ``make_scorer``
        once at import time and shared the result across all calls.

    Returns
    -------
    dict
        The same ``stat_dict`` object, for chaining.
    """
    if scorer is None:
        scorer = make_scorer(base_type="standard")
    # Generator (no throwaway list) over the scorer; stats missing from
    # stat_dict simply contribute nothing, as before. Empty sum is 0.
    stat_dict[u"standard_score"] = sum(
        weight * stat_dict[stat]
        for stat, weight in scorer.items()
        if stat in stat_dict
    )
    return stat_dict
def main():
    """Fit a KNN model over player feature vectors, find each predicted
    player's nearest historical neighbors, plot their point
    distributions, and save the plot data as JSON under result_path.

    NOTE(review): this is a second ``main()`` in the same source dump;
    if both live in one module this definition shadows the earlier one —
    confirm they belong to separate files.
    """
    ################################
    ### CONFIGURE
    pred_week = 14 #None
    db = nfldb.connect()
    result_path='../results'
    ### LOAD DATA
    # load train data
    # presumably full_train is a pandas DataFrame of per-player-week rows
    # (it is indexed with .iloc and boolean columns below) — TODO confirm
    full_train, pipe, stats = load_feature_set(db)
    # picks columns to model
    # feature names are derived per stat: lagged value and running mean
    lag_cols = [stat + '_lag' for stat in stats]
    mean_cols = [stat + '_mean' for stat in stats]
    other_cols = ['same_year_lag', 'played_lag']
    # identifying (non-feature) columns kept alongside the features
    infoColumns = ExtractColumns(like=[], exact=['year','week','time','player_id','full_name'])
    row_info = infoColumns.fit_transform(X=full_train)
    # load prediction data
    pred_data, predict_i, pred_info, pred_yr_wk = prediction_feature_set(db, pipe, infoColumns, pred_week=pred_week)
    ##################################
    ### PREPARE DATA FOR TRAIN AND PREDICT
    # train data with all columns
    X_all = full_train
    # prediction data with all columns
    pred_all = pred_data.iloc[predict_i]
    # which rows did players play
    played_bool = full_train['played'] == 1
    played_index = [i for i in range(X_all.shape[0]) if played_bool[i]]
    # random split train and test
    # NOTE(review): train_index/test_index are computed but never used below
    train_index, test_index = train_test_split_index(X_all.shape[0], test_size=0.1, seed=0)
    # restrict both train and prediction frames to the feature columns
    feature_cols = lag_cols + mean_cols + other_cols
    XColumns = ExtractColumns(like=feature_cols)
    X = XColumns.fit_transform(X=X_all)
    X_pred = XColumns.fit_transform(X=pred_all)
    ##################################
    ### SET UP & TRAIN KNN
    # fit k nearest neighbors
    k = 100
    # played_only=True fits the KNN only on rows where the player played
    played_only = True
    i_knn = played_index if played_only else range(X.shape[0])
    #nn = NearestNeighbors(n_neighbors=k).fit(X.iloc[i_knn])
    # regularization
    # CoefScaler appears to rescale features by ridge-regression
    # coefficients against standard fantasy points, so that KNN distance
    # weights features by predictive importance — TODO confirm against
    # CoefScaler's definition
    reg = CoefScaler(linear_model=Ridge())
    reg = reg.fit(X=X.iloc[i_knn], y = score_stats(X_all, make_scorer(base_type='standard')).iloc[i_knn])
    X_reg = reg.transform(X.iloc[i_knn])
    nn = NearestNeighbors(n_neighbors=k).fit(X_reg)
    # returns tuple of (distances, indices of neighbors)
    # for prediction set
    #distance, neighbor = nn.kneighbors(X=X_pred)
    # prediction features must pass through the same scaling as the fit data
    X_reg_pred = reg.transform(X=X_pred)
    distance, neighbor = nn.kneighbors(X=X_reg_pred)
    ##################################
    ### READ AND PLOT KNN RESULTS
    nn_dict = {}
    for check_i in range(pred_all.shape[0]):
        # check neighbors
        # check_nn is a data frame where the first row is the player
        # and the rest of the rows are the nearest neighbors
        # (neighbor indices are positions within the i_knn subset, hence
        # the chained .iloc)
        check_nn = pred_all.iloc[[check_i],:].append(X_all.iloc[i_knn].iloc[neighbor[check_i,:]])
        # score each row under both standard and PPR rules for plotting
        check_nn['StandardPoints'] = score_stats(check_nn, make_scorer(base_type='standard'))
        check_nn['PPRPoints'] = score_stats(check_nn, make_scorer(base_type='ppr'))
        # plot_knn saves an image and returns a dict of plot data,
        # accumulated across all predicted players — TODO confirm
        nn_i = plot_knn(check_nn, save_image=True, plot_stat='StandardPoints', pred_yr_wk=pred_yr_wk, result_path=plot_image_path(result_path, pred_yr_wk), n_bins=25, bandwidth=2.5)
        nn_dict.update(nn_i)
    # persist the accumulated plot data for all players as one JSON file
    save_plot_data_json(nn_dict, result_path, pred_yr_wk)