Example #1
def __init__(self, m=1):
    if m == 0:
        self.model = self.baseline()
        self.type = 0
    elif m == 1:
        self.model = self.returnSequential2()
        self.type = 2
    elif m == 2:
        self.model = self.returnSequential6()
        self.type = 2
    elif m == 3:
        self.model = self.RNN()
        self.type = 1
    elif m == 4:
        self.model = self.multi_RNN()
        self.type = 1
    elif m == 5:
        self.model = self.lstm()
        self.type = 1
    elif m == 6:
        self.model = self.multi_lstm()
        self.type = 1
    elif m == 7:
        self.model = LogisticGAM()
        self.type = 3
    elif m == 8:
        self.model = self.returnSequential9()
        self.type = 2
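The elif chain above is a fixed mapping from m to a (model, type) pair; a hedged sketch of the same dispatch written as a lookup table (method names copied from the snippet, assuming they exist on the class):

def __init__(self, m=1):
    # (model builder, type code) for each value of m
    dispatch = {
        0: (self.baseline, 0),
        1: (self.returnSequential2, 2),
        2: (self.returnSequential6, 2),
        3: (self.RNN, 1),
        4: (self.multi_RNN, 1),
        5: (self.lstm, 1),
        6: (self.multi_lstm, 1),
        7: (LogisticGAM, 3),
        8: (self.returnSequential9, 2),
    }
    builder, self.type = dispatch[m]
    self.model = builder()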
Example #2
class AdaptiveLogisticGAM(BaseEstimator, RegressorMixin):
    def __init__(self, param_grid=None, gam_params=None):
        # create GAM
        if gam_params is None:
            gam_params = {}
        self.model = LogisticGAM(**gam_params)

        # set grid search parameters
        if param_grid is None:
            param_grid = GAM_GRID_BASE
        self.param_grid = param_grid

    def fit(self, X, y):
        if isinstance(X, pd.DataFrame):
            X = X.values

        # fit using grid-search
        self.model.gridsearch(X, y, progress=False, **self.param_grid)
        return self  # return self for sklearn-style chaining

    def predict(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict(X)

    def predict_proba(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        return self.model.predict_proba(X)
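A minimal usage sketch for the wrapper above; the toy data and the explicit lam grid are illustrative stand-ins (the real GAM_GRID_BASE default is defined elsewhere in the source module):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

# explicit grid instead of the module-level GAM_GRID_BASE
model = AdaptiveLogisticGAM(param_grid={'lam': np.logspace(-3, 3, 5)})
model.fit(X, y)
probs = model.predict_proba(X)  # class-1 probabilities from the tuned GAM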
Example #3
def test_exceptions():
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = [i for i in range(100)]
    test = [i for i in range(100, 200)]

    with pytest.raises(ValueError):
        # pass incorrect type of estimator
        fit_and_score_te_oracle(LinearGAM(),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True)

    with pytest.raises(ValueError):
        # fit should throw an error
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True,
                                error_score='raise')

    with pytest.raises(ValueError):
        # fit should throw an error because error score is incorrect
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=False,
                                error_score='asdfad')

    # assert we get error score otherwise
    score = fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                    X,
                                    y,
                                    w,
                                    p,
                                    ite,
                                    train=train,
                                    test=test,
                                    scorer='neg_mean_squared_error',
                                    return_test_score_only=True,
                                    error_score=np.nan)
    assert math.isnan(score)
Example #4
def _plot_logodd(self):
    # Handle missing values in the GAM
    lignes_completes = np.invert(
        np.isnan(self.predictors_cont).sum(axis=1).astype(bool))

    # Fit the GAM on all complete cases
    gam = LogisticGAM(dtype=['numerical' for _ in range(self.d_cont)] + ['categorical' for _ in range(
        self.d_qual)]).fit(
        pd.concat([pd.DataFrame(self.predictors_cont[lignes_completes, :]).apply(
            lambda x: x.astype('float')),
            pd.DataFrame(self.predictors_qual[lignes_completes, :]).apply(
                lambda x: x.astype('category'))], axis=1), self.labels[lignes_completes])

    # Whatever the values of predictors_cont_number and
    # predictors_qual_number, plot everything for now
    fig, axs = plt.subplots(1, self.d_cont + self.d_qual)
    plt.rcParams['figure.figsize'] = (28, 8)
    for i, ax in enumerate(axs):
        try:
            XX = gam.generate_X_grid(term=i)
            ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
            ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1], c='r', ls='--')
        except ValueError:  # pragma: no cover
            continue
    plt.show(block=False)
Example #5
    def GAM2(self):
        """GAM of splines, where we perform variable selection
        to find the best model."""
        from pygam import LogisticGAM, s, l, f
        terms = s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7)

        gam = LogisticGAM(terms=terms, fit_intercept=False)
        mod = gam.gridsearch(self.Xtrain.values, self.ytrain,
                             lam=np.logspace(-3, 3, 11))  # Generate the model
        mod.summary()  # Pseudo-R2: 0.6449
        ypred = mod.predict(self.Xtest)
        MSE1 = np.mean((self.ytest - ypred.reshape(-1, 1))**2).values

        if self.plot:
            plt.plot(range(len(ypred.reshape(-1, 1))),
                     ypred.reshape(-1, 1) - 0.5, "r.", label='GAM model')
            plt.plot(range(len(self.ytest)),
                     self.ytest,
                     "b.",
                     label='Testing Data')
            plt.legend()
            plt.title("GAM model with linear terms. Prediction data is\n"\
                + "scaled downwards by 0.5 for visual purposes.")
            plt.ylabel("FFVC score")
            plt.xlabel("Sample no.")
            plt.show()
Example #6
def spline_calibration(X, y):
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc')).gridsearch(
        X, y)  # monotonic increasing spline on the single feature
    # return the calibrated probabilities
    y_ = gam.predict_proba(X)
    return y_
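A hedged sketch of how spline_calibration might be used to recalibrate raw classifier scores; the synthetic, deliberately miscalibrated scores below are illustrative:

import numpy as np

rng = np.random.default_rng(1)
raw_scores = rng.uniform(size=500)  # uncalibrated scores in [0, 1]
labels = (rng.uniform(size=500) < raw_scores**2).astype(int)  # miscalibrated by design

calibrated = spline_calibration(raw_scores.reshape(-1, 1), labels)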
Example #8
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    # gam = LogisticGAM(s(0)).gridsearch(X, y)
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)  # monotonic increasing spline
    #XX = gam.generate_X_grid(term=0)
    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')
    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)
    ax.text(0.05,
            0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f' %
            (ece, mce, brier, acc, mse),
            size=6,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    confi = gam.confidence_intervals(X_eval, width=0.95)
    gam.summary()  # summary() prints its report directly
    return ece, mce, brier, acc, mse, ax, confi
Example #9
def calibrate_propensities(propensities, treatment):
    """Post-hoc calibration of propensity scores given the true treatments

    Args:
        propensities: propensity scores
        treatment: treatment indicator

    Returns:
        p: calibrated version of the propensities given
    """
    gam = LogisticGAM(s(0)).fit(propensities, treatment)
    return gam.predict_proba(propensities)
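A short usage sketch for calibrate_propensities under assumed inputs; both arrays below are synthetic:

import numpy as np

rng = np.random.default_rng(2)
p_hat = np.clip(rng.beta(2, 5, size=1000), 0.01, 0.99)  # noisy propensity estimates
w = rng.binomial(1, p_hat)                              # realized treatment assignments

p_cal = calibrate_propensities(p_hat, w)  # recalibrated scores in [0, 1]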
Example #10
def superlearnersetup(var_type, K=5):
    """Super Learner setup for binary and continuous variables"""
    if var_type == 'binary':
        # Binary variable
        log_b = LogisticRegression(penalty='none',
                                   solver='lbfgs',
                                   max_iter=1000)
        rdf_b = RandomForestClassifier(
            n_estimators=500,
            min_samples_leaf=20)  # max features is sqrt(n_features)
        gam1_b = LogisticGAM(n_splines=4, lam=0.6)
        gam2_b = LogisticGAM(n_splines=6, lam=0.6)
        nn1_b = MLPClassifier(hidden_layer_sizes=(4, ),
                              activation='relu',
                              solver='lbfgs',
                              max_iter=2000)
        emp_b = EmpiricalMean()

        lib = [log_b, gam1_b, gam2_b, rdf_b, nn1_b, emp_b]
        libnames = [
            "Logit", "GAM1", "GAM2", "Random Forest", "Neural-Net", "Mean"
        ]
        sl = SuperLearner(lib,
                          libnames,
                          loss="nloglik",
                          K=K,
                          print_results=False)

    elif var_type == 'continuous':
        # Continuous variable
        lin_c = LinearRegression()
        rdf_c = RandomForestRegressor(n_estimators=500, min_samples_leaf=20)
        gam1_c = GAM(link='identity', n_splines=4, lam=0.6)
        gam2_c = GAM(link='identity', n_splines=6, lam=0.6)
        nn1_c = MLPRegressor(hidden_layer_sizes=(4, ),
                             activation='relu',
                             solver='lbfgs',
                             max_iter=2000)
        emp_c = EmpiricalMean()

        lib = [lin_c, gam1_c, gam2_c, rdf_c, nn1_c, emp_c]
        libnames = [
            "Linear", "GAM1", "GAM2", "Random Forest", "Neural-Net", "Mean"
        ]
        sl = SuperLearner(lib, libnames, K=K, print_results=False)

    else:
        raise ValueError("Not Supported")

    return sl
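Assuming this SuperLearner implementation exposes sklearn-style fit/predict (the class itself is defined elsewhere), usage might look like the following, with X a feature matrix and y a binary outcome:

sl = superlearnersetup('binary', K=5)
sl.fit(X, y)           # cross-validated stacking over the six base learners
y_hat = sl.predict(X)  # ensemble predictions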
Example #11
def spline_classification(X, y, X_eval, y_eval, gam_ref):
    # gam = LogisticGAM(s(0)).gridsearch(X, y)
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)  # monotonic increasing spline
    #XX = gam.generate_X_grid(term=0)
    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)
    # compute the confidence on datapoints of X_eval
    confi = gam.confidence_intervals(X_eval, width=0.95)
    return ece, mce, brier, acc, mse, confi
Example #12
def calibrate(ps, treatment):
    """Calibrate propensity scores with logistic GAM.

    Ref: https://pygam.readthedocs.io/en/latest/api/logisticgam.html

    Args:
        ps (numpy.array): a propensity score vector
        treatment (numpy.array): a binary treatment vector (0: control, 1: treated)

    Returns:
        (numpy.array): a calibrated propensity score vector
    """

    gam = LogisticGAM(s(0)).fit(ps, treatment)

    return gam.predict_proba(ps)
Example #13
def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True,
                               random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))

    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = 1
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting po when base model not fitted should not be possible
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # a warning should be raised when only one fold is used
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200, baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(), te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
Example #14
def simulation(No_T, n, p, box_plot=True):
    err = []
    for i in range(No_T):
        # generate the train and test data
        X_train, Y_train = generate_data(n, p)
        X_test, Y_test = generate_data(n, p)

        logit_gam = LogisticGAM()
        logit_gam.gridsearch(X_train, Y_train)

        # calculate the test error
        test_err = sum(logit_gam.predict(X_test) != Y_test) / n
        err.append(test_err)
    if box_plot:
        plt.figure(num=None, figsize=(8, 6), dpi=80)
        plt.boxplot(err)
        plt.text(1.1, 0.15, "Mean:{:.2f}".format(np.mean(err)))
        plt.text(1.1, 0.14, "Var:{:.3f}".format(np.var(err)))
        plt.title("LogisticGAM")
        plt.ylabel("Test Error")
        plt.show()
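Assuming generate_data(n, p) is defined alongside this function, a run over 20 replications might look like:

simulation(No_T=20, n=500, p=5, box_plot=True)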
Example #15
def main():
    X = pd.read_csv(
        './dataset/gradcafe/cs_preprocessed_X.csv',
        usecols=[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]).values
    # X = pd.read_csv('./dataset/gradcafe/pnp_x.csv', header=None).values
    y = pd.read_csv('./dataset/gradcafe/cs_preprocessed_Y.csv').values.reshape(
        -1)

    np.random.seed(0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)

    fit_gam(X_train, y_train, "Without normalization on X", False)
    fit_gam(X_train, y_train, "With normalization on X", True)

    # Normalization is better; fit the scaler on the training set only
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    np.random.seed(0)
    clf = LogisticGAM()
    clf.fit(X_train, y_train)

    training_accuracy = clf.accuracy(X_train, y_train) * 100
    testing_accuracy = clf.accuracy(X_test, y_test) * 100

    print("------------------------------")
    print("Results with normalization on testing set")
    print("------------------------------")
    print('Training accuracy: {:.2f}%'.format(training_accuracy))
    print('Testing accuracy: {:.2f}%'.format(testing_accuracy))
    print()
Example #16
def fit_gam(X, y, comment, use_x_normalization):
    print("------------------------------")
    print(comment)
    print("------------------------------")
    np.random.seed(0)
    if use_x_normalization:
        X = StandardScaler().fit_transform(X)

    train_scores = np.array([])
    val_scores = np.array([])

    kf = KFold(n_splits=10, shuffle=True)
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        clf = LogisticGAM()
        clf.fit(X_train, y_train)

        train_scores = np.append(train_scores,
                                 clf.accuracy(X_train, y_train) * 100)
        val_scores = np.append(val_scores, clf.accuracy(X_val, y_val) * 100)

    print('Training accuracy: {:.2f}%'.format(np.mean(train_scores)))
    print('Validation accuracy: {:.2f}%'.format(np.mean(val_scores)))
    print()
Example #17
    def get_gam_model(self, features: [Field], model_type=TYPE_LINEAR):

        model_spec = f(0) if features[0].is_factor() else s(
            0, n_splines=self.num_splines)

        for i in range(1, len(features)):
            model_spec += f(i) if features[i].is_factor() else s(
                i, n_splines=self.num_splines)

        if model_type == TYPE_LINEAR:
            return LinearGAM(model_spec)

        if model_type == TYPE_LOGISTIC:
            return LogisticGAM(model_spec)
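A standalone sketch of the same term-building pattern without the surrounding class; feature_is_factor below is an illustrative stand-in for the Field.is_factor() calls:

from pygam import LinearGAM, LogisticGAM, s, f

def build_terms(feature_is_factor, num_splines=10):
    # factor features get f(i), numeric features get a spline s(i)
    terms = f(0) if feature_is_factor[0] else s(0, n_splines=num_splines)
    for i in range(1, len(feature_is_factor)):
        terms += f(i) if feature_is_factor[i] else s(i, n_splines=num_splines)
    return terms

gam = LogisticGAM(build_terms([True, False, False]))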
Example #18
def logistic_GAM(x_tr, y_tr, x_tst, y_tst):
    classifier = LogisticGAM()
    classifier.fit(x_tr, y_tr)
    tr_pred = classifier.predict(x_tr)
    y_pred = classifier.predict(x_tst)
    confusion_matrix = metrics.confusion_matrix(y_tst, y_pred)
    print(confusion_matrix)
    print('Accuracy of the LogisticGAM classifier on the test set: {:.2f}'
          .format(metrics.accuracy_score(y_tst, y_pred)))
    print('Accuracy of the LogisticGAM classifier on the train set: {:.2f}'
          .format(metrics.accuracy_score(y_tr, tr_pred)))
Example #19
    def fit(self):
        S = s(0) if self.feature_names[0] in self.numerical_features else f(0)
        for i in range(1, len(self.feature_names)):
            if self.feature_names[i] in self.numerical_features:
                S += s(i)
            else:
                S += f(i)

        if self.mode == 'regression':
            gam = LinearGAM(S)
            gam.gridsearch(self.X_train, self.y_train)
            self._is_fitted = True
            self.explainer = gam
        elif self.mode == 'classification':
            gam = LogisticGAM(S)
            gam.gridsearch(np.array(self.X_train), self.y_train)
            self._is_fitted = True
            self.explainer = gam
        else:
            raise NameError(
                'ERROR: mode should be regression or classification')
Example #20
def create_rand_gam(number_of_searches, new_values, pred_y, y, pca_splines,
                    pca_lam, pred_splines, pred_lam, pred_factor):
    lams = np.random.rand(number_of_searches, new_values.shape[1] +
                          1)  # random points on [0, 1], one lam per model term
    lams = lams * 8 - 4  # shift values to -4, 4
    lams = 10**lams  # transforms values to 1e-4, 1e4
    new_values = np.append(new_values, np.array(pred_y).reshape(-1, 1), axis=1)

    titles = []
    for i in range(new_values.shape[1] - 1):
        titles.append(str(i))
        if i == 0:
            x = s(i, n_splines=pca_splines, lam=pca_lam)
        else:
            x = x + s(i, n_splines=pca_splines, lam=pca_lam)
    if pred_factor:
        x = x + pygam.terms.f(i + 1, lam=pred_lam)
    else:
        x = x + s(i + 1, n_splines=pred_splines, lam=pred_lam)

    rand_gam = LogisticGAM(x).gridsearch(new_values, y, lam=lams)
    return rand_gam, new_values, titles
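A hedged call sketch for create_rand_gam; every input below is an illustrative placeholder (pca_scores standing for PCA-transformed features, clf_scores for a base model's scores, labels for the binary target):

rand_gam, values_with_pred, titles = create_rand_gam(
    number_of_searches=50,
    new_values=pca_scores,   # PCA-transformed features, shape (n, k)
    pred_y=clf_scores,       # base-model scores, appended as the final term
    y=labels,
    pca_splines=10, pca_lam=0.6,
    pred_splines=10, pred_lam=0.6,
    pred_factor=False)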
Example #21
def test_plugin_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogates
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # use plug in learner
    p_model = PlugInTELearner(LinearGAM())
    p_model.fit(X, y, w, p)
    te, mu_0, mu_1 = p_model.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, mu_1_plug - mu_0_plug)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(p_model.predict(X), mu_1_plug - mu_0_plug)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200, baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    p_model = PlugInTELearner(LogisticGAM(), binary_y=True, setting=RR_NAME)
    p_model.fit(X, y, w, p)
    te, mu_0, mu_1 = p_model.predict(X, return_po=True)
Example #22
#-----------------------------------------------------
#load the breast cancer data set

ds = load_breast_cancer()

X, y = ds.data, ds.target

#select first 6 features only
X = X[:, 0:6]

selected_features = ds.feature_names[0:6]

#-----------------------------------------------------
#Fit a model with the default parameters
gam = LogisticGAM().fit(X, y)
gam.summary()

roc_auc_score(y, gam.predict_proba(X))  #0.994173140954495
gam.accuracy(X, y)  #0.9560632688927944

#-----------------------------------------------------
# Explore and interpret individual features

plt.ion()
plt.rcParams['figure.figsize'] = (28, 8)

fig, axs = plt.subplots(1, X.shape[1])

for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i, meshgrid=True)
Example #23
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

#Add binary and categorical predictors to the GAM
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
d['h_bin'] = encoder.fit_transform(pd.cut(d['hp'], 5))
gam_model = LinearGAM().fit(d[['disp', 'wt', 'vs', 'h_bin']], d['mpg'])
gam_model.summary()  # summary() prints its report directly
gam_predictions = gam_model.predict(d[['disp', 'wt', 'vs', 'h_bin']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

#Performing classification (logistic regression) with the GAM
d['mpg_bin'] = encoder.fit_transform(pd.cut(d['mpg'], [0, 20, 100]))
gam_model = LogisticGAM().gridsearch(d[['disp', 'wt', 'vs', 'h_bin']],
                                     d['mpg_bin'])
gam_model.summary()
print('Classification Accuracy:',
      gam_model.accuracy(d[['disp', 'wt', 'vs', 'h_bin']], d['mpg_bin']))

#This models the conditional probabilities for mpg being < 20 and >=20
#Note the y-axis of these plots is the logit
'''
-------------------------------------------------------------------------------
-------------------Classification and Regression Trees-------------------------
-------------------------------------------------------------------------------
'''

#Regression tree
reg_tree = DecisionTreeRegressor(criterion='mse', min_samples_split=20).fit(
    d.drop(['mpg', 'mpg_bin'], axis=1), d['mpg'])
Example #24
            ls='--')
    if i == 0:
        ax.set_ylim(-30, 30)
    ax.set_title(titles[i])

######################################################
# classification

from pygam import LogisticGAM, s, f
from pygam.datasets import default

X, y = default(return_X_y=True)
X.shape
X

gam = LogisticGAM(f(0) + s(1) + s(2)).gridsearch(X, y)

fig, axs = plt.subplots(1, 3)
titles = ['student', 'balance', 'income']

for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)

    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(titles[i])

gam.accuracy(X, y)

######################################################
Example #25
    "Maximum Expression Value", "Copy Number -1", "Copy Number 0",
    "Copy Number 1"
]
trainX, testX, trainy, testy = train_test_split(X, Y, test_size=0.4)
from pygam import LogisticGAM, s, f
#print("allocated")
#print(trainX)
aa = s(0)
names1 = ["Silent"]
for i in range(1, 37):
    if not (i == 4 or i == 6 or i == 10 or i == 11):
        aa += s(i)
        names1.append(names[i])
#print("yeh karre")
gam1 = LogisticGAM(aa)
#print(gam1.lam)
w = []
for y in trainy:
    if (y == 0):
        w.append(1)
    else:
        w.append(10)
gam1 = gam1.fit(trainX, trainy, weights=w)
import numpy as np
lams = np.random.rand(10, 33)  # random points on [0, 1], with shape (10, 33)
n_splines = [5, 10, 15, 20, 25]
lams = lams * 6 - 3  # shift values to [-3, 3]
lams = np.exp(lams)  # transform values to [e**-3, e**3]
cons = [
Example #26
auc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10, scoring='roc_auc').mean()
print("\n calculate cross-validated AUC  (M2. X_train_scaled_poly):", auc_log2)
acc_log3 = cross_val_score(logreg, X_train_scaled_poly, Y_train, cv=10, scoring='accuracy').mean()
print("\n calculate cross-validated accurancy  (M2. X_train_scaled_poly):", acc_log2)
acc_logs3 = cross_validation.cross_val_predict(logreg, X_train_scaled_poly, Y_train, cv=10)
print(metrics.accuracy_score(Y_train, acc_logs2))
print(metrics.classification_report(Y_train, acc_logs3))
print(logreg.coef_)
print('\n ------------------------------------------------------------------')
# call predict_proba() to get the list of probabilities that the classifier assigned to each instance for each class:
###############################################################################################################################
# GAM
import pandas as pd
from pygam import LogisticGAM
# Fit a model with the default parameters
gam = LogisticGAM().fit(X_train_scaled, Y_train)
gam.summary()
print('gam.accuracy(X_train_scaled, Y_train):',gam.accuracy(X_train_scaled, Y_train))
print('gam.accuracy(X_test_scaled, Y_test):',gam.accuracy(X_test_scaled, Y_test))
acc_loggamc = cross_val_score(gam, X_train_scaled, Y_train, cv=10, scoring='accuracy').mean()
print('acc_loggam_cross-validation, train_scaled',acc_loggamc)


# make predictions for testing set
Y_scaler_pred_class = logreg.predict(X_test_scaled)
# calculate testing accuracy
from sklearn import metrics

print('\n ------------------------------------------------------------------')
print("\n calculate testing accuracy (M1. X_train_scaled):", metrics.accuracy_score(Y_test, Y_scaler_pred_class))
print('\n ------------------------------------------------------------------')
Example #27
def score_notes_from_network(csv_dir, name=False, pca=False, load=False):
    dataset = pd.read_csv(csv_dir)
    y = dataset['truth']
    del dataset['truth']

    if not name:
        name = str(uuid.uuid4())
    if not pca:
        pca = PCA(0.95)
        pca.fit(dataset)
    new_values = pca.transform(dataset)
    new_values, y = datasets.load_iris(return_X_y=True)
    if not load:
        from sklearn.linear_model import LogisticRegression
        X, y = datasets.load_iris(return_X_y=True)
        clf = LogisticRegression(random_state=0, multi_class='multinomial', max_iter=1000).fit(X, y)
        aaa = clf.predict_proba(X)



        base_estimator = LogisticGAM(s(0) + s(1) + s(2) + s(3))
        # fit the one-vs-rest ensemble, then score the transformed data
        ensemble = OneVsRestClassifier(base_estimator, n_jobs=1)
        ensemble.fit(new_values, y)
        ensemble.predict_proba(new_values)

    else:
        with open(load, 'rb') as f:
            rand_gam = pickle.load(f)

    rand_gam.summary()
    scores = rand_gam.predict_proba(new_values)
    rand_gam.accuracy(new_values, y)
    scr_order = np.argsort(scores)
    filename = name + '.pkl'

    with open('./GAM_model/models/' + name, 'wb') as f:
        pickle.dump(rand_gam, f)

    w = 480
    h = 480

    columns = 21
    rows = 1

    # ax enables access to manipulate each of subplots
    dirList =['./pl_train_station/output_hed_images/', './pl_train_station/pretty_good_set_input/']
    ordered_list = [[os.listdir('./pl_train_station/output_hed_images/')[i] for i in scr_order]][0]
    ordered_labels = [[y[i] for i in scr_order]][0]

    shutil.rmtree('./GAM_model/scored/rgb/')
    shutil.rmtree('./GAM_model/scored/edge/')
    os.makedirs('./GAM_model/scored/rgb/')
    os.makedirs('./GAM_model/scored/edge/')

    for i, img in enumerate(ordered_list):
        res_img = cv2.resize(cv2.imread(dirList[0] + img), (w, h))
        cv2.imwrite('./GAM_model/scored/edge/' + str(i) + '_' + str(ordered_labels[i]) + '_' + str(scores[scr_order[i]]) + '.bmp',
                    res_img)

    for i, img in enumerate(ordered_list):
        img = img[0:-10] + '.bmp'
        res_img = cv2.imread(dirList[1] + img)
        cv2.imwrite('./GAM_model/scored/rgb/' + str(i) + '_' + str(ordered_labels[i]) + '_' + str(scores[scr_order[i]]) + '.bmp', res_img)
Example #28
tumors.head()

# In[21]:

# Drop the ID column from the data.
tumors = tumors.drop(['id'], axis=1)
tumors.loc[tumors['diagnosis'] == 'M',
           'diagnosis'] = 1  # Recode the diagnosis as binary.
tumors.loc[tumors['diagnosis'] == 'B', 'diagnosis'] = 0
tumors_X = tumors.iloc[:, :11].drop(
    ['diagnosis'], axis=1).values  # Separate the independent variables
tumors_y = tumors['diagnosis']  # Separate the dependent variable.

# In[22]:

gam = LogisticGAM(n_splines=20).gridsearch(tumors_X, tumors_y)
gam.summary()

# In[23]:

print('The model accuracy is:',
      round(gam.accuracy(tumors_X, tumors_y) * 100, 2), "%")

# In[24]:

titles = tumors.columns[1:11]
fig, axs = plt.subplots(1, 10, figsize=(40, 8))
for i, ax in enumerate(axs):
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
Example #29
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter
        import pygam
        from pygam import LinearGAM, LogisticGAM
        import matplotlib.pyplot as plt

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            clf = LogisticGAM(terms="auto",
                              lam=self.params["lam"],
                              max_iter=self.params["max_iter"])
            self.is_classifier = True

        else:
            clf = LinearGAM(terms="auto",
                            lam=self.params["lam"],
                            max_iter=self.params["max_iter"])
            self.is_classifier = False

        X = self.basic_impute(X)
        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Record the datatype of each column
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [
            orig_cols[col_count] for col_count in range(len(orig_cols))
            if (X_datatypes[col_count] == 'category') or (
                X_datatypes[col_count] == 'object')
        ]
        self.X_numeric = [
            item for item in orig_cols if item not in self.X_categorical
        ]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One-hot encode the categorical features
        # and replace missing values with a "Missing" category
        if len(self.X_categorical) > 0:
            X.loc[:, self.X_categorical] = X[self.X_categorical].fillna(
                "Missing").copy()
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(
                self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([
                X[self.X_numeric],
                pd.DataFrame(X_enc, columns=self.encoded_categories)
            ],
                          axis=1)

        # Replace missing numeric values with the training median
        self.median_train = {}

        if len(self.X_numeric) > 0:
            for colname in self.X_numeric:
                self.median_train[colname] = X[colname].quantile(0.5)
                X.loc[:, colname] = X[colname].fillna(
                    self.median_train[colname]).copy()

        try:
            clf.fit(X, y)
        except np.linalg.LinAlgError as e:
            raise IgnoreError("np.linalg.LinAlgError") from e
        except pygam.utils.OptimizationError as e:
            raise IgnoreError("pygam.utils.OptimizationError") from e
        except ValueError as e:
            if 'On entry to DLASCL parameter number' in str(e):
                raise IgnoreError('On entry to DLASCL parameter number') from e
            raise

        p_values = np.array(clf.statistics_['p_values'])

        # Plot the partial dependence plots for each feature
        for ii in range(X.shape[1]):
            XX = clf.generate_X_grid(term=ii)
            plt.figure()
            plt.plot(XX[:, ii], clf.partial_dependence(term=ii, X=XX))
            plt.plot(XX[:, ii],
                     clf.partial_dependence(term=ii, X=XX, width=.95)[1],
                     c='r',
                     ls='--')
            plt.title("Partial Dependence " + str(ii),
                      fontdict={'fontsize': 10})
            plt.savefig(os.path.join(
                tmp_folder, 'Feature_partial_dependence_' + str(ii) + '.png'),
                        bbox_inches="tight")
            plt.show()

        if max(p_values[0:(len(p_values) - 1)]) > 0:
            importances = -np.log(p_values[0:(len(p_values) - 1)] + 10**(-16))

            importances = list(importances / max(importances))
        else:
            importances = [1] * (len(p_values) - 1)

        self.mean_target = np.array(sum(y) / len(y))

        self.set_model_properties(model=clf,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
Example #30
rfc_predictions_2020 = predict_2020(positions=position_encoder.inverse_transform(features_2020['All_NBA_Pos']),
            player_names=current_dat['Player'],
            binary_prediction=rfc__2020,
            probability_predictions= rfc_probs_2020[:,1])

rfc_predictions_2020
rfc_predictions_2020.to_csv("rfc_predictions.csv")

##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- ##### ----- #####
# Model 1.3 - Generalized Additive Models

from pygam import LogisticGAM

# Fit a GAM model with the default parameters
gam_model = LogisticGAM()
gam_model.fit(X_train, y_train)

gam_pred_prob = gam_model.predict_proba(X_test)

gam_preds, complete_gam_dat = top_15_predictions(entire_test_data, gam_pred_prob)

gam_performance = all_nba_test_report(complete_gam_dat)

players_missed(complete_gam_dat)