import numpy as np
import pandas as pd
from pygam import LogisticGAM
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler


def fit_gam(X, y, comment, use_x_normalization):
    print("------------------------------")
    print(comment)
    print("------------------------------")
    np.random.seed(0)
    if use_x_normalization:
        # Note: this scales the full matrix before the CV split; a fold-safe
        # variant is sketched below.
        X = StandardScaler().fit_transform(X)
    train_scores = np.array([])
    val_scores = np.array([])
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        clf = LogisticGAM()
        clf.fit(X_train, y_train)
        train_scores = np.append(train_scores,
                                 clf.accuracy(X_train, y_train) * 100)
        val_scores = np.append(val_scores,
                               clf.accuracy(X_val, y_val) * 100)
    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
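# A minimal fold-safe sketch (not part of the original script, assuming the
# same LogisticGAM defaults): the StandardScaler is fit on each training fold
# only and reused to transform the held-out fold, so validation rows never
# influence the scaling statistics.
def fit_gam_fold_safe(X, y, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    val_scores = []
    for train_index, val_index in kf.split(X):
        scaler = StandardScaler().fit(X[train_index])  # training rows only
        clf = LogisticGAM().fit(scaler.transform(X[train_index]),
                                y[train_index])
        val_scores.append(
            clf.accuracy(scaler.transform(X[val_index]), y[val_index]) * 100)
    return np.mean(val_scores)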
def main():
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv('./dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(-1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization is better, so refit on the normalized training set. The
    # scaler is fit on the training data only and reused on the test data;
    # fitting a second scaler on the test set would leak its statistics.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)
    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100
    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()


if __name__ == '__main__':
    main()
from sklearn import metrics
from pygam import LogisticGAM


def logistic_GAM(x_tr, y_tr, x_tst, y_tst):
    classifier = LogisticGAM()
    classifier.fit(x_tr, y_tr)
    tr_pred = classifier.predict(x_tr)
    y_pred = classifier.predict(x_tst)
    confusion_matrix = metrics.confusion_matrix(y_tst, y_pred)
    print(confusion_matrix)
    print('Accuracy of logistic GAM classifier on test set: {:.2f}'
          .format(metrics.accuracy_score(y_tst, y_pred)))
    print('Accuracy of logistic GAM classifier on train set: {:.2f}'
          .format(metrics.accuracy_score(y_tr, tr_pred)))
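# A minimal usage sketch (not from the original file): driving logistic_GAM
# with a synthetic binary-classification split. make_classification and the
# 80/20 split are assumptions for the demo.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, n_features=6,
                                     random_state=0)
x_tr, x_tst, y_tr, y_tst = train_test_split(X_demo, y_demo,
                                            test_size=0.2, random_state=0)
logistic_GAM(x_tr, y_tr, x_tst, y_tst)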
names1 = ["Silent"] for i in range(1, 37): #if(not(i==4 or i==6 or i==10 or i==11)): if (not (i == 4 or i == 6 or i == 10 or i == 11)): aa += s(i) names1.append(names[i]) #print("yeh karre") gam1 = LogisticGAM(aa) #print(gam1.lam) w = [] for y in trainy: if (y == 0): w.append(1) else: w.append(10) gam1 = gam1.fit(trainX, trainy, weights=w) import numpy as np lams = np.random.rand(10, 33) # random points on [0, 1], with shape (100, 3) n_splines = [5, 10, 15, 20, 25] lams = lams * 6 # shift values to -3, 3 lams = lams - 3 lams = np.exp(lams) cons = [ 'convex', 'concave', 'monotonic_inc', 'monotonic_dec', 'circular', 'none' ] random = LogisticGAM(aa).gridsearch(trainX, trainy, weights=w, lam=lams, n_splines=n_splines) random = random.gridsearch(trainX, trainy, constraints=cons)
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)

    import os
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    from collections import Counter
    import pygam
    from pygam import LinearGAM, LogisticGAM
    import matplotlib.pyplot as plt

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up the model: a logistic GAM for classification,
    # a linear GAM for regression
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        clf = LogisticGAM(terms="auto", lam=self.params["lam"],
                          max_iter=self.params["max_iter"])
        self.is_classifier = True
    else:
        clf = LinearGAM(terms="auto", lam=self.params["lam"],
                        max_iter=self.params["max_iter"])
        self.is_classifier = False

    X = self.basic_impute(X)

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Change all float32 columns to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [
        orig_cols[col_count] for col_count in range(len(orig_cols))
        if (X_datatypes[col_count] == 'category')
        or (X_datatypes[col_count] == 'object')
    ]
    self.X_numeric = [
        item for item in orig_cols if item not in self.X_categorical
    ]

    # Find the levels and mode for each categorical feature,
    # for use on the test set
    self.train_levels = {}
    self.train_mode = {}
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One-hot encode the categorical features
    # and replace missing values with a "Missing" category
    if len(self.X_categorical) > 0:
        X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
            "Missing").copy()
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(
            self.enc.get_feature_names(input_features=self.X_categorical))
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([
            X[self.X_numeric],
            pd.DataFrame(X_enc, columns=self.encoded_categories)
        ], axis=1)

    # Replace missing numeric values with the training median
    self.median_train = {}
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            self.median_train[colname] = X[colname].quantile(0.5)
            X.loc[:, colname] = X[colname].fillna(
                self.median_train[colname]).copy()

    try:
        clf.fit(X, y)
    except np.linalg.LinAlgError as e:
        raise IgnoreError("np.linalg.LinAlgError") from e
    except pygam.utils.OptimizationError as e:
        raise IgnoreError("pygam.utils.OptimizationError") from e
    except ValueError as e:
        if 'On entry to DLASCL parameter number' in str(e):
            raise IgnoreError('On entry to DLASCL parameter number') from e
        raise

    p_values = np.array(clf.statistics_['p_values'])

    # Plot the partial dependence of each feature, with a 95% confidence band
    for ii in range(X.shape[1]):
        XX = clf.generate_X_grid(term=ii)
        plt.figure()
        plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
        plt.plot(XX[:, ii],
                 clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                 c='r', ls='--')
        plt.title("Partial Dependence " + str(ii), fontdict={'fontsize': 10})
        # Save before show(): show() flushes the figure, so saving afterwards
        # would write a blank image
        plt.savefig(os.path.join(
            tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
            bbox_inches="tight")
        plt.show()

    # Derive feature importances from the term p-values: smaller p-values map
    # to larger importances, rescaled to a max of 1 (the last entry is the
    # intercept term and is dropped)
    if max(p_values[0:(len(p_values) - 1)]) > 0:
        importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))
        importances = list(importances / max(importances))
    else:
        importances = [1] * (len(p_values) - 1)

    self.mean_target = np.array(sum(y) / len(y))

    self.set_model_properties(model=clf,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
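# Illustrative aside (not from the recipe): a toy check of the p-value
# transform above, with assumed p-values.
import numpy as np

p = np.array([1e-8, 0.01, 0.5])
imp = -np.log(p + 10**(-16))
print(imp / imp.max())  # ~[1.0, 0.25, 0.04]: smaller p-value, larger importance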
    player_names=current_dat['Player'],
    binary_prediction=rfc__2020,
    probability_predictions=rfc_probs_2020[:, 1])
rfc_predictions_2020
rfc_predictions_2020.to_csv("rfc_predictions.csv")

##### ----- ##### ----- ##### ----- ##### -----
##### ----- ##### ----- ##### ----- ##### -----

# Model 1.3 - Generalized Additive Models
from pygam import LogisticGAM

# Fit a GAM model with the default parameters
gam_model = LogisticGAM()
gam_model.fit(X_train, y_train)

gam_pred_prob = gam_model.predict_proba(X_test)
gam_preds, complete_gam_dat = top_15_predictions(entire_test_data,
                                                 gam_pred_prob)
gam_performance = all_nba_test_report(complete_gam_dat)
players_missed(complete_gam_dat)
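# A minimal sketch of what a top-15 selector might look like. The real
# top_15_predictions helper is defined elsewhere in this project; the function
# name, column names, and return shape below are assumptions for illustration.
import numpy as np
import pandas as pd

def top_k_predictions(test_data, pred_prob, k=15):
    out = test_data.copy()
    out['prob'] = np.asarray(pred_prob)      # P(class = 1) per player
    out['predicted'] = 0
    top_idx = out['prob'].nlargest(k).index  # k highest-probability rows
    out.loc[top_idx, 'predicted'] = 1
    return out.loc[top_idx], out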
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM
from pygam import LogisticGAM
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score


class EpidemicModels:

    # Sequential 6-layer neural network
    def returnSequential6(self):
        model = Sequential()
        model.add(Dense(50, input_dim=20, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    # Sequential 9-layer neural network
    def returnSequential9(self):
        model = Sequential()
        model.add(Dense(80, input_dim=20, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        # Sigmoid output: binary cross-entropy expects probabilities in [0, 1]
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    def RNN(self):
        model = Sequential()
        # Inputs are reshaped to (n_samples, 1, n_features) before training
        model.add(SimpleRNN(2, input_shape=(1, 20)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    def multi_RNN(self):
        model = Sequential()
        model.add(SimpleRNN(2, input_shape=(1, 20)))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    # Single-hidden-layer baseline network
    def baseline(self):
        model = Sequential()
        model.add(Dense(20, input_dim=20, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    def lstm(self):
        model = Sequential()
        model.add(LSTM(10, input_shape=(1, 20)))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self):
        model = Sequential()
        model.add(LSTM(4, input_shape=(1, 20), return_sequences=True))
        model.add(LSTM(4))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    # Sequential 3-layer neural network
    def returnSequential2(self):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=20))
        model.add(Dense(units=7, activation='relu'))
        model.add(Dense(units=1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    def __init__(self, m=1):
        if m == 0:
            self.model = self.baseline()
            self.type = 0
        elif m == 1:
            self.model = self.returnSequential2()
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential6()
            self.type = 2
        elif m == 3:
            self.model = self.RNN()
            self.type = 1
        elif m == 4:
            self.model = self.multi_RNN()
            self.type = 1
        elif m == 5:
            self.model = self.lstm()
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm()
            self.type = 1
        elif m == 7:
            self.model = LogisticGAM()
            self.type = 3
        elif m == 8:
            self.model = self.returnSequential9()
            self.type = 2

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            # Recurrent models expect (n_samples, timesteps, n_features) input
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X, y)
        else:
            self.model.fit(X, y, batch_size=bs, epochs=epochs, shuffle=True)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval(self, X, y, bs=10, ep=100, k=5):
        scores = []
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 1:
            # random_state is only valid with shuffle=True, so it is omitted
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train,
                                     (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test,
                                    (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                score = self.model.evaluate(X_test, y_test, verbose=0)
                if isinstance(score, list):  # [loss, acc] when metrics are set
                    score = score[-1]
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 2:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep,
                               verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                print(score)
                scores.append(score)
            return sum(scores) / len(scores)
        elif self.type == 3:
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                print(y_pre)
                # f1_score takes (y_true, y_pred)
                scores.append(f1_score(y_test, y_pre))
            return sum(scores) / len(scores)
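# A minimal usage sketch (illustrative; the data is synthetic and every
# hyperparameter here is an assumption) showing how EpidemicModels is driven:
X_demo = np.random.rand(200, 20)            # 20 features, as the layers expect
y_demo = np.random.randint(0, 2, size=200)  # binary labels

em = EpidemicModels(m=1)                    # 3-layer dense network, type 2
em.train(X_demo, y_demo, bs=10, epochs=5)
print(em.prediction(X_demo)[:5])
print(em.cross_eval(X_demo, y_demo, bs=10, ep=5, k=5))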