def __init__(self, m=1):
    if m == 0:
        self.model = self.baseline()
        self.type = 0
    elif m == 1:
        self.model = self.returnSequential2()
        self.type = 2
    elif m == 2:
        self.model = self.returnSequential6()
        self.type = 2
    elif m == 3:
        self.model = self.RNN()
        self.type = 1
    elif m == 4:
        self.model = self.multi_RNN()
        self.type = 1
    elif m == 5:
        self.model = self.lstm()
        self.type = 1
    elif m == 6:
        self.model = self.multi_lstm()
        self.type = 1
    elif m == 7:
        self.model = LogisticGAM()
        self.type = 3
    elif m == 8:
        self.model = self.returnSequential9()
        self.type = 2
class AdaptiveLogisticGAM(BaseEstimator, RegressorMixin):
    def __init__(self, param_grid=None, gam_params=None):
        # create GAM
        if gam_params is None:
            gam_params = {}
        self.model = LogisticGAM(**gam_params)

        # set grid search parameters
        if param_grid is None:
            param_grid = GAM_GRID_BASE
        self.param_grid = param_grid

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values
        # fit using grid-search
        self.model.gridsearch(X, y, progress=False, **self.param_grid)
        return self  # sklearn convention: fit returns self

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict(X)

    def predict_proba(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict_proba(X)
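# A minimal usage sketch (not from the original source): exercises the wrapper
# on synthetic data. Passing param_grid explicitly avoids relying on the
# module-level GAM_GRID_BASE default; the grid values here are assumptions.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 3)))
y_demo = (X_demo.iloc[:, 0] + 0.5 * rng.normal(size=200) > 0).astype(int)

model = AdaptiveLogisticGAM(param_grid={'lam': np.logspace(-3, 3, 7)})
model.fit(X_demo, y_demo)            # grid-searches lam internally
probs = model.predict_proba(X_demo)  # class-1 probabilities from the best GAM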
def test_exceptions():
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = list(range(100))
    test = list(range(100, 200))

    with pytest.raises(ValueError):
        # pass incorrect type of estimator
        fit_and_score_te_oracle(LinearGAM(), X, y, w, p, ite,
                                train=train, test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True)

    with pytest.raises(ValueError):
        # fit should throw an error
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p, ite,
                                train=train, test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True,
                                error_score='raise')

    with pytest.raises(ValueError):
        # fit should throw an error because error_score is invalid
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p, ite,
                                train=train, test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=False,
                                error_score='asdfad')

    # assert we get the error score otherwise
    score = fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()), X, y, w, p, ite,
                                    train=train, test=test,
                                    scorer='neg_mean_squared_error',
                                    return_test_score_only=True,
                                    error_score=np.nan)
    assert math.isnan(score)
def _plot_logodd(self):
    # Handle missing values in the GAM
    lignes_completes = np.invert(
        np.isnan(self.predictors_cont).sum(axis=1).astype(bool))

    # Fit the GAM on all observations
    gam = LogisticGAM(dtype=['numerical' for _ in range(self.d_cont)]
                      + ['categorical' for _ in range(self.d_qual)]).fit(
        pd.concat(
            [pd.DataFrame(self.predictors_cont[lignes_completes, :]).apply(
                lambda x: x.astype('float')),
             pd.DataFrame(self.predictors_qual[lignes_completes, :]).apply(
                lambda x: x.astype('category'))],
            axis=1),
        self.labels[lignes_completes])

    # Whatever the values of predictors_cont_number and
    # predictors_qual_number, plot everything for now
    plt.figure()
    fig, axs = plt.subplots(1, self.d_cont + self.d_qual)
    plt.rcParams['figure.figsize'] = (28, 8)
    for i, ax in enumerate(axs):
        try:
            XX = gam.generate_X_grid(term=i)
            ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
            ax.plot(XX[:, i],
                    gam.partial_dependence(term=i, X=XX, width=.95)[1],
                    c='r', ls='--')
        except ValueError:  # pragma: no cover
            continue
    plt.show(block=False)
def GAM2(self): """GAM of splines, where we perform variable selection to find the best model.""" from pygam import LogisticGAM, s, l, f terms = s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) gam = LogisticGAM(terms=terms, fit_intercept=False) mod = gam.gridsearch(self.Xtrain.values, self.ytrain, \ lam=np.logspace(-3, 3, 11)) # Generate the model mod.summary() # Pseudo-R2: 0.6449 ypred = mod.predict(self.Xtest) MSE1 = np.mean((self.ytest - ypred.reshape(-1, 1))**2).values if self.plot: plt.plot(range(len(ypred.reshape(-1,1))),\ ypred.reshape(-1,1)-0.5,"r.", label='GAM model') plt.plot(range(len(self.ytest)), self.ytest, "b.", label='Testing Data') plt.legend() plt.title("GAM model with linear terms. Prediction data is\n"\ + "scaled downwards by 0.5 for visual purposes.") plt.ylabel("FFVC score") plt.xlabel("Sample no.") plt.show()
def spline_calibration(X, y):
    # monotonic spline calibration; documentation of LogisticGAM:
    # https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc')).gridsearch(X, y)
    # return the calibrated probabilities
    y_ = gam.predict_proba(X)
    return y_
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    # gam = LogisticGAM(s(0)).gridsearch(X, y)
    # documentation of LogisticGAM:
    # https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)

    # plot the calibration curve with its 95% confidence band
    # XX = gam.generate_X_grid(term=0)
    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')

    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)
    ax.text(0.05, 0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f'
            % (ece, mce, brier, acc, mse),
            size=6, ha='left', va='center',
            bbox={'facecolor': 'green', 'alpha': 0.5, 'pad': 4})
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    confi = gam.confidence_intervals(X_eval, width=0.95)
    gam.summary()  # summary() prints directly and returns None
    return ece, mce, brier, acc, mse, ax, confi
def calibrate_propensities(propensities, treatment):
    """Post-hoc calibration of propensity scores given the true treatments

    Args:
        propensities: propensity scores
        treatment: treatment indicator

    Returns:
        p: calibrated version of the propensities given
    """
    gam = LogisticGAM(s(0)).fit(propensities, treatment)
    return gam.predict_proba(propensities)
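# A small sketch (not from the original source) of how this calibrator can be
# exercised: deliberately distorted scores are mapped back toward the truth by
# the one-term logistic GAM. All data here is synthetic.
import numpy as np

rng = np.random.default_rng(0)
true_p = rng.uniform(0.1, 0.9, size=500)         # true propensities
treatment = rng.binomial(1, true_p)              # realized treatments
raw_scores = np.clip(true_p**2, 1e-3, 1 - 1e-3)  # miscalibrated scores

calibrated = calibrate_propensities(raw_scores, treatment)
# calibrated should track true_p more closely than raw_scores does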
def superlearnersetup(var_type, K=5):
    """Super Learner setup for binary and continuous variables"""
    if var_type == 'binary':
        # Binary variable
        log_b = LogisticRegression(penalty='none', solver='lbfgs',
                                   max_iter=1000)
        rdf_b = RandomForestClassifier(
            n_estimators=500,
            min_samples_leaf=20)  # max_features defaults to sqrt(n_features)
        gam1_b = LogisticGAM(n_splines=4, lam=0.6)
        gam2_b = LogisticGAM(n_splines=6, lam=0.6)
        nn1_b = MLPClassifier(hidden_layer_sizes=(4,), activation='relu',
                              solver='lbfgs', max_iter=2000)
        emp_b = EmpiricalMean()
        lib = [log_b, gam1_b, gam2_b, rdf_b, nn1_b, emp_b]
        libnames = ["Logit", "GAM1", "GAM2", "Random Forest",
                    "Neural-Net", "Mean"]
        sl = SuperLearner(lib, libnames, loss="nloglik", K=K,
                          print_results=False)

    elif var_type == 'continuous':
        # Continuous variable
        lin_c = LinearRegression()
        rdf_c = RandomForestRegressor(n_estimators=500, min_samples_leaf=20)
        gam1_c = GAM(link='identity', n_splines=4, lam=0.6)
        gam2_c = GAM(link='identity', n_splines=6, lam=0.6)
        nn1_c = MLPRegressor(hidden_layer_sizes=(4,), activation='relu',
                             solver='lbfgs', max_iter=2000)
        emp_c = EmpiricalMean()
        lib = [lin_c, gam1_c, gam2_c, rdf_c, nn1_c, emp_c]
        libnames = ["Linear", "GAM1", "GAM2", "Random Forest",
                    "Neural-Net", "Mean"]
        sl = SuperLearner(lib, libnames, K=K, print_results=False)

    else:
        raise ValueError("Not Supported")

    return sl
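# A small usage sketch (not from the original source): superlearnersetup only
# assembles the candidate library; how the returned SuperLearner is fit
# depends on the implementation imported by the surrounding module.
sl_binary = superlearnersetup('binary', K=10)    # nloglik loss, 10 folds
sl_continuous = superlearnersetup('continuous')  # default loss, K=5
# any other var_type raises ValueError("Not Supported")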
def spline_classification(X, y, X_eval, y_eval, gam_ref):
    # gam = LogisticGAM(s(0)).gridsearch(X, y)
    # documentation of LogisticGAM:
    # https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)

    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)

    # compute the confidence intervals on the datapoints of X_eval
    confi = gam.confidence_intervals(X_eval, width=0.95)
    return ece, mce, brier, acc, mse, confi
def calibrate(ps, treatment):
    """Calibrate propensity scores with logistic GAM.

    Ref: https://pygam.readthedocs.io/en/latest/api/logisticgam.html

    Args:
        ps (numpy.array): a propensity score vector
        treatment (numpy.array): a binary treatment vector (0: control, 1: treated)

    Returns:
        (numpy.array): a calibrated propensity score vector
    """
    gam = LogisticGAM(s(0)).fit(ps, treatment)
    return gam.predict_proba(ps)
def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))
    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = 1
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(
        X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42,
                             fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting po when base model not fitted should not be possible
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # a warning should be raised if only one fold is used
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200,
                                       baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(),
                             te_estimator=LinearGAM(), binary_y=True,
                             setting=RR_NAME, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
def simulation(No_T, n, p, box_plot=True):
    err = []
    for i in range(No_T):
        # generate the training and test data
        X_train, Y_train = generate_data(n, p)
        X_test, Y_test = generate_data(n, p)
        logit_gam = LogisticGAM()
        logit_gam.gridsearch(X_train, Y_train)
        # calculate the test error
        test_err = sum(logit_gam.predict(X_test) != Y_test) / n
        err.append(test_err)
    if box_plot:
        plt.figure(num=None, figsize=(8, 6), dpi=80)
        plt.boxplot(err)
        plt.text(1.1, 0.15, "Mean:{:.2f}".format(np.mean(err)))
        plt.text(1.1, 0.14, "Var:{:.3f}".format(np.var(err)))
        plt.title("logisticGAM")
        plt.ylabel("Test Error")
        plt.show()
def main():
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(-1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization is better; fit the scaler on the training set only and
    # reuse it on the test set to avoid leaking test-set statistics
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)
    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100

    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()
def fit_gam(X, y, comment, use_x_normalization):
    print("------------------------------")
    print(comment)
    print("------------------------------")
    np.random.seed(0)
    if use_x_normalization:
        X = StandardScaler().fit_transform(X)

    train_scores = np.array([])
    val_scores = np.array([])
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        clf = LogisticGAM()
        clf.fit(X_train, y_train)
        train_scores = np.append(train_scores,
                                 clf.accuracy(X_train, y_train) * 100)
        val_scores = np.append(val_scores, clf.accuracy(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
def get_gam_model(self, features: list[Field], model_type=TYPE_LINEAR):
    # factor term for categorical fields, spline term for numeric ones
    model_spec = f(0) if features[0].is_factor() else s(
        0, n_splines=self.num_splines)
    for i in range(1, len(features)):
        model_spec += f(i) if features[i].is_factor() else s(
            i, n_splines=self.num_splines)
    if model_type == TYPE_LINEAR:
        return LinearGAM(model_spec)
    if model_type == TYPE_LOGISTIC:
        return LogisticGAM(model_spec)
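# A standalone sketch (not from the original source) of the term-building
# pattern above: factor terms f(i) for categorical columns, spline terms
# s(i, n_splines=...) for numeric ones, summed into a single model spec.
import numpy as np
from pygam import LinearGAM, s, f

rng = np.random.default_rng(0)
X = np.column_stack([
    rng.integers(0, 3, size=200),  # factor-like column -> f(0)
    rng.normal(size=200),          # numeric column -> s(1)
])
y = 0.5 * X[:, 0] + np.sin(X[:, 1]) + rng.normal(scale=0.1, size=200)

model_spec = f(0) + s(1, n_splines=10)  # mirrors the loop in get_gam_model
gam = LinearGAM(model_spec).fit(X, y)
gam.summary()  # prints per-term EDoF and significance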
def logistic_GAM(x_tr, y_tr, x_tst, y_tst):
    classifier = LogisticGAM()
    classifier.fit(x_tr, y_tr)
    tr_pred = classifier.predict(x_tr)
    y_pred = classifier.predict(x_tst)
    confusion_matrix = metrics.confusion_matrix(y_tst, y_pred)
    print(confusion_matrix)
    print('Accuracy of logistic GAM classifier on test set: {:.2f}'
          .format(metrics.accuracy_score(y_tst, y_pred)))
    print('Accuracy of logistic GAM classifier on train set: {:.2f}'
          .format(metrics.accuracy_score(y_tr, tr_pred)))
def fit(self):
    # build the term list: splines for numerical features, factors otherwise
    S = s(0) if self.feature_names[0] in self.numerical_features else f(0)
    for i in range(1, len(self.feature_names)):
        if self.feature_names[i] in self.numerical_features:
            S += s(i)
        else:
            S += f(i)

    if self.mode == 'regression':
        gam = LinearGAM(S)
        gam.gridsearch(self.X_train, self.y_train)
        self._is_fitted = True
        self.explainer = gam
    elif self.mode == 'classification':
        gam = LogisticGAM(S)
        gam.gridsearch(np.array(self.X_train), self.y_train)
        self._is_fitted = True
        self.explainer = gam
    else:
        raise ValueError(
            'ERROR: mode should be regression or classification')
def create_rand_gam(number_of_searches, new_values, pred_y, y, pca_splines,
                    pca_lam, pred_splines, pred_lam, pred_factor):
    # random points on [0, 1], with shape (number_of_searches, n_terms)
    lams = np.random.rand(number_of_searches, new_values.shape[1] + 1)
    lams = lams * 8 - 4  # shift values to [-4, 4]
    lams = 10**lams      # transform values to [1e-4, 1e4]

    new_values = np.append(new_values, np.array(pred_y).reshape(-1, 1), axis=1)

    titles = []
    for i in range(new_values.shape[1] - 1):
        titles.append(str(i))
        if i == 0:
            x = s(i, n_splines=pca_splines, lam=pca_lam)
        else:
            x = x + s(i, n_splines=pca_splines, lam=pca_lam)
    # final term for the appended prediction column
    if pred_factor:
        x = x + pygam.terms.f(i + 1, lam=pred_lam)
    else:
        x = x + s(i + 1, n_splines=pred_splines, lam=pred_lam)

    rand_gam = LogisticGAM(x).gridsearch(new_values, y, lam=lams)
    return rand_gam, new_values, titles
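# A small sketch (not from the original source) illustrating the random
# lam-grid trick above: uniform draws on [0, 1] are mapped to a log-uniform
# grid on [1e-4, 1e4], which pygam's gridsearch accepts row-by-row for lam.
import numpy as np

rng = np.random.default_rng(0)
lams = rng.random((5, 3))  # 5 candidate points for a 3-term model
lams = lams * 8 - 4        # now uniform on [-4, 4]
lams = 10**lams            # now log-uniform on [1e-4, 1e4]
assert lams.min() >= 1e-4 and lams.max() <= 1e4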
def test_plugin_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogates
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # use plug-in learner
    p_model = PlugInTELearner(LinearGAM())
    p_model.fit(X, y, w, p)
    te, mu_0, mu_1 = p_model.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, mu_1_plug - mu_0_plug)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(p_model.predict(X), mu_1_plug - mu_0_plug)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200,
                                       baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    p_model = PlugInTELearner(LogisticGAM(), binary_y=True, setting=RR_NAME)
    p_model.fit(X, y, w, p)
    te, mu_0, mu_1 = p_model.predict(X, return_po=True)
#-----------------------------------------------------
# load the breast cancer data set
ds = load_breast_cancer()
X, y = ds.data, ds.target

# select the first 6 features only
X = X[:, 0:6]
selected_features = ds.feature_names[0:6]

#-----------------------------------------------------
# Fit a model with the default parameters
gam = LogisticGAM().fit(X, y)
gam.summary()
roc_auc_score(y, gam.predict_proba(X))  # 0.994173140954495
gam.accuracy(X, y)                      # 0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features
plt.ion()
plt.rcParams['figure.figsize'] = (28, 8)
fig, axs = plt.subplots(1, X.shape[1])

for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i, meshgrid=True)
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

# Add binary and categorical predictors to the GAM
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
d['h_bin'] = encoder.fit_transform(pd.cut(d['hp'], 5))
gam_model = LinearGAM().fit(d[['disp', 'wt', 'vs', 'h_bin']], d['mpg'])
gam_model.summary()  # summary() prints directly and returns None
gam_predictions = gam_model.predict(d[['disp', 'wt', 'vs', 'h_bin']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

# Performing classification (logistic regression) with the GAM
d['mpg_bin'] = encoder.fit_transform(pd.cut(d['mpg'], [0, 20, 100]))
gam_model = LogisticGAM().gridsearch(d[['disp', 'wt', 'vs', 'h_bin']],
                                     d['mpg_bin'])
gam_model.summary()
print('Classification Accuracy:',
      gam_model.accuracy(d[['disp', 'wt', 'vs', 'h_bin']], d['mpg_bin']))
# This models the conditional probabilities for mpg being < 20 and >= 20
# Note the y-axis of these plots is the logit
'''
-------------------------------------------------------------------------------
-------------------Classification and Regression Trees-------------------------
-------------------------------------------------------------------------------
'''
# Regression tree
reg_tree = DecisionTreeRegressor(criterion='mse', min_samples_split=20).fit(
    d.drop(['mpg', 'mpg_bin'], axis=1), d['mpg'])
            ls='--')
    if i == 0:
        ax.set_ylim(-30, 30)
    ax.set_title(titles[i])

######################################################
# classification

from pygam import LogisticGAM, s, f
from pygam.datasets import default

X, y = default(return_X_y=True)
X.shape
X

gam = LogisticGAM(f(0) + s(1) + s(2)).gridsearch(X, y)

fig, axs = plt.subplots(1, 3)
titles = ['student', 'balance', 'income']
for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(titles[i])

gam.accuracy(X, y)
######################################################
"Maximum Expression Value", "Copy Number -1", "Copy Number 0", "Copy Number 1" ] trainX, testX, trainy, testy = train_test_split(X, Y, test_size=0.4) from pygam import LogisticGAM, s, f #print("allocated") #print(trainX) aa = s(0) names1 = ["Silent"] for i in range(1, 37): #if(not(i==4 or i==6 or i==10 or i==11)): if (not (i == 4 or i == 6 or i == 10 or i == 11)): aa += s(i) names1.append(names[i]) #print("yeh karre") gam1 = LogisticGAM(aa) #print(gam1.lam) w = [] for y in trainy: if (y == 0): w.append(1) else: w.append(10) gam1 = gam1.fit(trainX, trainy, weights=w) import numpy as np lams = np.random.rand(10, 33) # random points on [0, 1], with shape (100, 3) n_splines = [5, 10, 15, 20, 25] lams = lams * 6 # shift values to -3, 3 lams = lams - 3 lams = np.exp(lams) cons = [
auc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10,
                           scoring='roc_auc').mean()
print("\n calculate cross-validated AUC (M2. X_train_scaled_poly):", auc_log3)

acc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10,
                           scoring='accuracy').mean()
print("\n calculate cross-validated accuracy (M2. X_train_scaled_poly):",
      acc_log3)

# sklearn.cross_validation was removed; cross_val_predict now lives in
# sklearn.model_selection
from sklearn.model_selection import cross_val_predict
acc_logs3 = cross_val_predict(logreg, X_train_scaled_poly, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, acc_logs3))
print(metrics.classification_report(Y_train, acc_logs3))
print(logreg.coef_)
print('\n ------------------------------------------------------------------')

# call predict_proba() to get the list of probabilities that the classifier
# assigned to each instance for each class:
###############################################################################
# GAM
import pandas as pd
from pygam import LogisticGAM

# Fit a model with the default parameters
gam = LogisticGAM().fit(X_train_scaled, Y_train)
gam.summary()
print('gam.accuracy(X_train_scaled, Y_train):',
      gam.accuracy(X_train_scaled, Y_train))
print('gam.accuracy(X_test_scaled, Y_test):',
      gam.accuracy(X_test_scaled, Y_test))

acc_loggamc = cross_val_score(gam, X_train_scaled, Y_train, cv=10,
                              scoring='accuracy').mean()
print('acc_loggam_cross-validation, train_scaled', acc_loggamc)

# make predictions for the testing set
Y_scaler_pred_class = logreg.predict(X_test_scaled)

# calculate testing accuracy
from sklearn import metrics
print('\n ------------------------------------------------------------------')
print("\n calculate testing accuracy (M1. X_train_scaled):",
      metrics.accuracy_score(Y_test, Y_scaler_pred_class))
print('\n ------------------------------------------------------------------')
def score_notes_from_network(csv_dir, name=False, pca=False, load=False):
    dataset = pd.read_csv(csv_dir)
    y = dataset['truth']
    del dataset['truth']
    if not name:
        name = str(uuid.uuid4())
    if not pca:
        pca = PCA(0.95)
        pca.fit(dataset)
    new_values = pca.transform(dataset)

    if not load:
        # fit a fresh one-vs-rest ensemble of logistic GAMs on the PCA scores
        base_estimator = LogisticGAM(s(0) + s(1) + s(2) + s(3))
        rand_gam = OneVsRestClassifier(base_estimator, n_jobs=1)
        rand_gam.fit(new_values, y)
    else:
        # load a previously pickled GAM
        with open(load, 'rb') as f:
            rand_gam = pickle.load(f)

    if hasattr(rand_gam, 'summary'):
        rand_gam.summary()  # pygam GAMs print a fit summary; OneVsRestClassifier has none
    scores = rand_gam.predict_proba(new_values)
    if scores.ndim == 2:
        scores = scores[:, 1]  # keep the positive-class probability
    if hasattr(rand_gam, 'accuracy'):
        rand_gam.accuracy(new_values, y)
    scr_order = np.argsort(scores)

    with open('./GAM_model/models/' + name + '.pkl', 'wb') as f:
        pickle.dump(rand_gam, f)

    w = 480
    h = 480
    columns = 21
    rows = 1

    dirList = ['./pl_train_station/output_hed_images/',
               './pl_train_station/pretty_good_set_input/']
    ordered_list = [os.listdir(dirList[0])[i] for i in scr_order]
    ordered_labels = [y[i] for i in scr_order]

    # rebuild the output folders from scratch
    shutil.rmtree('./GAM_model/scored/rgb/')
    shutil.rmtree('./GAM_model/scored/edge/')
    os.makedirs('./GAM_model/scored/rgb/')
    os.makedirs('./GAM_model/scored/edge/')
    for i, img in enumerate(ordered_list):
        res_img = cv2.resize(cv2.imread(dirList[0] + img), (w, h))
        cv2.imwrite('./GAM_model/scored/edge/' + str(i) + '_'
                    + str(ordered_labels[i]) + '_'
                    + str(scores[scr_order[i]]) + '.bmp', res_img)
    for i, img in enumerate(ordered_list):
        img = img[0:-10] + '.bmp'
        res_img = cv2.imread(dirList[1] + img)
        cv2.imwrite('./GAM_model/scored/rgb/' + str(i) + '_'
                    + str(ordered_labels[i]) + '_'
                    + str(scores[scr_order[i]]) + '.bmp', res_img)
tumors.head()

# In[21]:

# Drop the identification column from the data.
tumors = tumors.drop(['id'], axis=1)
tumors.loc[tumors['diagnosis'] == 'M', 'diagnosis'] = 1  # recode the label as binary
tumors.loc[tumors['diagnosis'] == 'B', 'diagnosis'] = 0
#
tumors_X = tumors.iloc[:, :11].drop(
    ['diagnosis'], axis=1).values  # split off the independent variables
tumors_y = tumors['diagnosis']     # split off the dependent variable

# In[22]:

gam = LogisticGAM(n_splines=20).gridsearch(tumors_X, tumors_y)
gam.summary()

# In[23]:

print('The accuracy of the model is:',
      round(gam.accuracy(tumors_X, tumors_y) * 100, 2), "%")

# In[24]:

titles = tumors.columns[1:11]
plt.figure()
fig, axs = plt.subplots(1, 10, figsize=(40, 8))
for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder
    from collections import Counter
    import pygam
    from pygam import LinearGAM, LogisticGAM
    import matplotlib.pyplot as plt

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # Set up model
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        clf = LogisticGAM(terms="auto", lam=self.params["lam"],
                          max_iter=self.params["max_iter"])
        self.is_classifier = True
    else:
        clf = LinearGAM(terms="auto", lam=self.params["lam"],
                        max_iter=self.params["max_iter"])
        self.is_classifier = False

    X = self.basic_impute(X)

    # Find the datatypes
    X = X.to_pandas()
    X.columns = orig_cols
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # Change all float32 values to float64
    for ii in range(len(X_datatypes)):
        if X_datatypes[ii] == 'float32':
            X = X.astype({orig_cols[ii]: np.float64})
    X_datatypes = [str(item) for item in list(X.dtypes)]

    # List the categorical and numerical features
    self.X_categorical = [
        orig_cols[col_count] for col_count in range(len(orig_cols))
        if (X_datatypes[col_count] == 'category')
        or (X_datatypes[col_count] == 'object')
    ]
    self.X_numeric = [
        item for item in orig_cols if item not in self.X_categorical
    ]

    # Find the levels and mode for each categorical feature
    # for use in the test set
    self.train_levels = {}
    self.train_mode = {}
    for item in self.X_categorical:
        self.train_levels[item] = list(set(X[item]))
        self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

    # One hot encode the categorical features
    # and replace missing values with a Missing category
    if len(self.X_categorical) > 0:
        X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
            "Missing").copy()
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(X[self.X_categorical])
        self.encoded_categories = list(
            self.enc.get_feature_names(input_features=self.X_categorical))
        X_enc = self.enc.transform(X[self.X_categorical]).toarray()
        X = pd.concat([
            X[self.X_numeric],
            pd.DataFrame(X_enc, columns=self.encoded_categories)
        ], axis=1)

    # Replace missing numeric values with the training median
    self.median_train = {}
    if len(self.X_numeric) > 0:
        for colname in self.X_numeric:
            self.median_train[colname] = X[colname].quantile(0.5)
            X.loc[:, colname] = X[colname].fillna(
                self.median_train[colname]).copy()

    try:
        clf.fit(X, y)
    except np.linalg.LinAlgError as e:
        raise IgnoreError("np.linalg.LinAlgError") from e
    except pygam.utils.OptimizationError as e:
        raise IgnoreError("pygam.utils.OptimizationError") from e
    except ValueError as e:
        if 'On entry to DLASCL parameter number' in str(e):
            raise IgnoreError('On entry to DLASCL parameter number') from e
        raise

    p_values = np.array(clf.statistics_['p_values'])

    # Plot the partial dependence plots for each feature
    for ii in range(X.shape[1]):
        XX = clf.generate_X_grid(term=ii)
        plt.figure()
        plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
        plt.plot(XX[:, ii],
                 clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                 c='r', ls='--')
        plt.title("Partial Dependence " + str(ii), fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(
            tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
            bbox_inches="tight")

    # Derive feature importances from the per-term p-values
    if max(p_values[0:(len(p_values) - 1)]) > 0:
        importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))
        importances = list(importances / max(importances))
    else:
        importances = [1] * (len(p_values) - 1)

    self.mean_target = np.array(sum(y) / len(y))

    self.set_model_properties(model=clf,
                              features=list(X.columns),
                              importances=importances,
                              iterations=self.params['n_estimators'])
rfc_predictions_2020 = predict_2020(
    positions=position_encoder.inverse_transform(features_2020['All_NBA_Pos']),
    player_names=current_dat['Player'],
    binary_prediction=rfc__2020,
    probability_predictions=rfc_probs_2020[:, 1])
rfc_predictions_2020
rfc_predictions_2020.to_csv("rfc_predictions.csv")

##### ----- ##### ----- ##### ----- ##### ----- #####
##### ----- ##### ----- ##### ----- ##### ----- #####

# Model 1.3 - Generalized Additive Models
from pygam import LogisticGAM

# Fit a GAM model with the default parameters
gam_model = LogisticGAM()
gam_model.fit(X_train, y_train)

gam_pred_prob = gam_model.predict_proba(X_test)
gam_preds, complete_gam_dat = top_15_predictions(entire_test_data,
                                                 gam_pred_prob)
gam_performance = all_nba_test_report(complete_gam_dat)
players_missed(complete_gam_dat)