def logistic_GAM(x_tr, y_tr, x_tst, y_tst):
    # Fit a LogisticGAM and report the test confusion matrix plus train/test accuracy
    classifier = LogisticGAM()
    classifier.fit(x_tr, y_tr)
    tr_pred = classifier.predict(x_tr)
    y_pred = classifier.predict(x_tst)
    confusion_matrix = metrics.confusion_matrix(y_tst, y_pred)
    print(confusion_matrix)
    print('Accuracy of logistic GAM classifier on test set: {:.2f}'
          .format(metrics.accuracy_score(y_tst, y_pred)))
    print('Accuracy of logistic GAM classifier on train set: {:.2f}'
          .format(metrics.accuracy_score(y_tr, tr_pred)))
class AdaptiveLogisticGAM(BaseEstimator, RegressorMixin):
    def __init__(self, param_grid=None, gam_params=None):
        # create the underlying GAM
        if gam_params is None:
            gam_params = {}
        self.model = LogisticGAM(**gam_params)
        # set grid-search parameters
        if param_grid is None:
            param_grid = GAM_GRID_BASE
        self.param_grid = param_grid

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        # fit using grid search over the supplied parameter grid
        self.model.gridsearch(X, y, progress=False, **self.param_grid)
        return self

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict(X)

    def predict_proba(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict_proba(X)
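# A minimal usage sketch of the wrapper above, assuming a pandas DataFrame X and
# a binary target y are already loaded and GAM_GRID_BASE is defined elsewhere;
# the names X, y and the lambda grid here are illustrative, not from the original.
import numpy as np

ada_gam = AdaptiveLogisticGAM(param_grid={'lam': np.logspace(-3, 3, 11)})
ada_gam.fit(X, y)
probs = ada_gam.predict_proba(X)   # per-sample probability of the positive class
labels = ada_gam.predict(X)        # hard 0/1 predictions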
def simulation(No_T, n, p, box_plot=True):
    err = []
    for i in range(No_T):
        # generate independent train and test data
        X_train, Y_train = generate_data(n, p)
        X_test, Y_test = generate_data(n, p)
        logit_gam = LogisticGAM()
        logit_gam.gridsearch(X_train, Y_train)
        # calculate the test error
        test_err = sum(logit_gam.predict(X_test) != Y_test) / n
        err.append(test_err)
    if box_plot:
        plt.figure(num=None, figsize=(8, 6), dpi=80)
        plt.boxplot(err)
        plt.text(1.1, 0.15, "Mean: {:.2f}".format(np.mean(err)))
        plt.text(1.1, 0.14, "Var: {:.3f}".format(np.var(err)))
        plt.title("LogisticGAM")
        plt.ylabel("Test Error")
        plt.show()
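# A minimal invocation sketch (values are illustrative; generate_data is defined
# elsewhere in the original script): 20 Monte Carlo repetitions with n = 500
# observations and p = 4 features, plus the box plot of the resulting test errors.
simulation(No_T=20, n=500, p=4, box_plot=True)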
# `lams` is assumed to be initialized above this excerpt (e.g. as random draws in
# [0, 1]); `aa`, `w`, `trainX`/`trainy`, `testX`/`testy`, and `names1` are defined
# elsewhere in the original script.
n_splines = [5, 10, 15, 20, 25]
lams = lams * 6       # scale to [0, 6]
lams = lams - 3       # shift values to [-3, 3]
lams = np.exp(lams)   # map to the lambda scale [1e-3, 1e3]
cons = [
    'convex', 'concave', 'monotonic_inc',
    'monotonic_dec', 'circular', 'none'
]

random = LogisticGAM(aa).gridsearch(trainX, trainy, weights=w,
                                    lam=lams, n_splines=n_splines)
random = random.gridsearch(trainX, trainy, constraints=cons)
print(random.lam)
print(random.n_splines)
print(random.constraints)
print(random.accuracy(testX, testy))

from sklearn.metrics import confusion_matrix
preds = random.predict(testX)
print(confusion_matrix(testy, preds))

# partial dependence plot (with 95% interval) for every non-intercept term
for i, term in enumerate(random.terms):
    if term.isintercept:
        continue
    XX = random.generate_X_grid(term=i)
    pdep, confi = random.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(names1[i])
    plt.show()
gam_pred_prob = gam_model.predict_proba(X_test)
gam_preds, complete_gam_dat = top_15_predictions(entire_test_data, gam_pred_prob)
gam_performance = all_nba_test_report(complete_gam_dat)
players_missed(complete_gam_dat)

gam_predict_probs_2020 = gam_model.predict_proba(features_2020)
gam_predict_binary_2020 = gam_model.predict(features_2020)
gam_predictions_2020 = predict_2020(
    positions=position_encoder.inverse_transform(features_2020['All_NBA_Pos']),
    player_names=current_dat['Player'],
    binary_prediction=gam_predict_binary_2020,
    probability_predictions=gam_predict_probs_2020)
gam_predictions_2020.to_csv("gam_predictions.csv")

##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- #####

# Model 1.4 - KNN
# --> Standardize all continuous features
# NOTE: We standardize the train set.
#       To standardize the test set, we reuse the mean and sd obtained from the
#       train set (no information from the test set may leak into the scaler).
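# A minimal sketch of the standardization described in the NOTE above, assuming the
# continuous feature columns are listed in `cont_cols` and the splits are named
# X_train / X_test (illustrative names, not from the original script): the scaler
# learns mean and sd from the train set only and reuses them on the test set.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[cont_cols] = scaler.fit_transform(X_train[cont_cols])  # mean/sd estimated on train only
X_test[cont_cols] = scaler.transform(X_test[cont_cols])        # same mean/sd applied to test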
# Generalizing a GAM
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# We can split the data just like we usually would:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

gam4 = LogisticGAM().gridsearch(X_train, y_train)
predictions = gam4.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, predictions)))
probas = gam4.predict_proba(X_test)
print("Log Loss: {}".format(log_loss(y_test, probas)))
# Accuracy: 0.925531914893617
# Log Loss: 0.15704862623168236

lambda_ = np.logspace(-3, 3, 3)
n_splines = [2, 5, 10, 20, 50]
constraints = [None, 'monotonic_inc', 'monotonic_dec']
# full list: ['convex', 'concave', 'monotonic_inc', 'monotonic_dec', 'circular', 'none']

gam5 = LogisticGAM().gridsearch(
    X_train, y_train,
    lam=lambda_,
    n_splines=n_splines,
    constraints=constraints)
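# A hedged follow-up sketch: gam5 can be evaluated the same way as gam4 above
# (accuracy on hard predictions, log loss on probabilities), and summary() prints
# the smoothing terms and effective degrees of freedom chosen by the search.
predictions5 = gam5.predict(X_test)
print("Accuracy: {}".format(accuracy_score(y_test, predictions5)))
print("Log Loss: {}".format(log_loss(y_test, gam5.predict_proba(X_test))))
gam5.summary()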
class EpidemicModels:

    # Sequential 6 layer neural network
    def returnSequential6(self):
        model = Sequential()
        model.add(Dense(50, input_dim=20, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Sequential 9 layer neural network
    def returnSequential9(self):
        model = Sequential()
        model.add(Dense(80, input_dim=20, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Simple recurrent network
    def RNN(self):
        model = Sequential()
        model.add(SimpleRNN(2, input_dim=20))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Recurrent network with dense layers on top
    def multi_RNN(self):
        model = Sequential()
        model.add(SimpleRNN(2, input_dim=20))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Single hidden layer baseline
    def baseline(self):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=20, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Single LSTM layer
    def lstm(self):
        model = Sequential()
        model.add(LSTM(10, input_dim=20))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    # Stacked LSTM layers
    def multi_lstm(self):
        model = Sequential()
        model.add(LSTM(4, input_dim=20, return_sequences=True))
        model.add(LSTM(4, input_dim=20))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    # Sequential 3 layer neural network
    def returnSequential2(self):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=20))
        model.add(Dense(units=7, activation='relu'))
        model.add(Dense(units=1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def __init__(self, m=1):
        if m == 0:
            self.model = self.baseline()
            self.type = 0
        elif m == 1:
            self.model = self.returnSequential2()
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential6()
            self.type = 2
        elif m == 3:
            self.model = self.RNN()
            self.type = 1
        elif m == 4:
            self.model = self.multi_RNN()
            self.type = 1
        elif m == 5:
            self.model = self.lstm()
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm()
            self.type = 1
        elif m == 7:
            self.model = LogisticGAM()
            self.type = 3
        elif m == 8:
            self.model = self.returnSequential9()
            self.type = 2

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X, y)
        else:
            self.model.fit(X, y, batch_size=bs, epochs=epochs, shuffle=True)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval(self, X, y, bs=10, ep=100, k=5):
        scores = []
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)  # random_state dropped: it has no effect without shuffling
            scores = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 2:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                print(score)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 3:
            kf = KFold(n_splits=k, shuffle=False)  # random_state dropped: it has no effect without shuffling
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                print(y_pre)
                scores.append(f1_score(y_test, y_pre))
            return sum(scores) / len(scores)
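# A minimal usage sketch for the class above (illustrative only): X is assumed to
# be a float numpy array with 20 feature columns and y a binary 0/1 vector. m=1
# selects the small fully connected network; type-specific reshaping and the GAM
# grid search are handled inside train()/cross_eval().
em = EpidemicModels(m=1)
em.train(X, y, bs=10, epochs=50)
y_hat = em.prediction(X)
print("Mean 5-fold CV score:", em.cross_eval(X, y, bs=10, ep=50, k=5))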
for i in dates:
    # Fit the model to the previous 90 days of data (lagged window)
    mask = (called_pitches['game_date'] <= i) & \
           (called_pitches['game_date'] >= (i - dt.timedelta(days=90)))
    set = called_pitches.loc[mask]
    df = pd.DataFrame(set)
    target_df = pd.Series(set.strikeCall)
    X = df[['plate_x', 'plate_z', 'sz_top', 'sz_bot']]  # 'bsCount'
    y = target_df
    gam_fit = LogisticGAM().fit(X, y)

    # Predict each day of games using the 90-day model
    games = called_pitches.loc[called_pitches['game_date'] == i]
    x_games = games[['plate_x', 'plate_z', 'sz_top', 'sz_bot']]
    results = gam_fit.predict(x_games)
    called_pitches.loc[called_pitches['game_date'] == i, 'pred_zone_last90'] = results

    # Track progress and reset the model
    print("Inserted..." + str(i))
    gam_fit = ""

called_pitches['zone_dev_last90'] = called_pitches['strikeCall'] - called_pitches['pred_zone_last90']

# Create a unique umpire-date ID; then average zone deviation for each umpire-game
# and count the number of called pitches
called_pitches['umpire_date'] = called_pitches['umpire'] + called_pitches['game_date'].map(str)
umpDev90 = called_pitches[['game_date', 'umpire_date', 'umpire', 'game_pk']]
umpDev90 = pd.DataFrame(umpDev90)
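# A hedged sketch of the aggregation the comment above describes (the result and
# column names here are assumptions, not from the original script): average zone
# deviation and number of called pitches per umpire-game.
ump_game_stats = (called_pitches
                  .groupby('umpire_date')
                  .agg(avg_zone_dev=('zone_dev_last90', 'mean'),
                       n_called_pitches=('zone_dev_last90', 'size'))
                  .reset_index())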