def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM: one spline term per column of X
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])  # sanity-check fit only; per-target models are rebuilt below
    
    # run full model
    GAM_results = {}
    for name, y in Y.items():  # .iteritems() was removed in pandas 2.0
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train,:]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test,:]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:    
                # get importances, defined as the predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k,v in importance_out.items():
                    importances[k].append(v)
                    
        cv_scores = [{'r': np.corrcoef(y,pred)[0,1],
                      'R2': np.corrcoef(y,pred)[0,1]**2,
                      'MAE': mean_absolute_error(y,pred)}]
        
        
        # insample
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y,in_pred)[0,1],
                          'R2': np.corrcoef(y,in_pred)[0,1]**2,
                          'MAE': mean_absolute_error(y,in_pred)}]
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
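
A hedged usage sketch for run_GAM: BalancedKFold is a project-specific splitter (here sklearn's KFold stands in for it) and get_importances is the helper shown in Example 24.

import numpy as np
import pandas as pd
from pygam import LinearGAM, s
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

BalancedKFold = KFold  # stand-in; the original splitter balances the target distribution

X = pd.DataFrame(np.random.normal(size=(100, 3)), columns=['a', 'b', 'c'])
Y = pd.DataFrame({'target': 2.0 * X['a'] + np.random.normal(size=100)})
results = run_GAM(X, Y, folds=5)
print(results['target']['scores_cv'])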
Example 2
def smoother_linearGAM(x,y,X,**kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x,list):
        x = np.array(x)
    x = x.reshape(len(x),1)
    if isinstance(y,list):
        y = np.array(y)
    if isinstance(X,list):
        X = np.array(X)
    if X is None:
        X = x.reshape(len(x),1)
    else:
        X = X.reshape(len(X),1)
    #if 'n_splines' in kwargs.keys():
    #    n_splines = kwargs['n_splines']
    #else:
    #    # This is because the automatic approach is too smooth
    #    n_splines = int(len(y)/5)
    #gam = LinearGAM(n_splines=n_splines,\
    #                terms=s(0,basis='ps')\
    #                ).gridsearch(x, y)
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)
    # sample on the input grid
    means = gam.predict(X)
    return means
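
A minimal usage sketch (values are illustrative): smooth a noisy sine and evaluate the smoother on the input grid by passing X=None.

import numpy as np
x = np.linspace(0, 10, 200)
y = np.sin(x) + np.random.normal(scale=0.2, size=x.size)
means = smoother_linearGAM(x, y, None)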
Example 3
def feature_selection_single(x, y, x_test, y_test):
    timestart = time.time()
    cols = list(deepcopy(x.columns))
    best_result = 0
    selected_cols = []
    continue_selection = True

    while continue_selection:
        iterationresult = {}  # reset each round so stale scores cannot win the max() below
        for col in tqdm(cols, leave=False):
            testcols = selected_cols + [col]

            model = LinearGAM().gridsearch(x[testcols].values,
                                           y,
                                           progress=False)
            iterationresult[col] = model._estimate_r2(
                x_test[testcols].values, y_test)['explained_deviance']
            #iterationresult[col] = r2_score(model.predict(x_test[testcols].values), y_test)

        key = max(iterationresult.keys(),
                  key=(lambda key: iterationresult[key]))
        if (iterationresult[key] > best_result) & check_significance(
                x, y, x_test, selected_cols, key):
            best_result = iterationresult[key]
            selected_cols.append(key)
            cols.remove(key)
        else:
            continue_selection = False

    logging.info("{}: {}".format(selected_cols, best_result))
    return best_result, selected_cols, time.time() - timestart
Example 4
def BAM(X, y):
    # model implementation by PYGAM
    gam = LinearGAM(s(0, spline_order=3) + s(1, spline_order=3) + te(0, 1))
    gam.gridsearch(X, y)
    # print(gam.gridsearch(X, y).summary())

    return gam
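
A hedged usage sketch for BAM: the terms s(0), s(1) and te(0, 1) are hard-coded, so X must have exactly two feature columns (synthetic data below is an assumption).

import numpy as np
X = np.random.uniform(0, 1, size=(300, 2))
y = np.sin(4 * X[:, 0]) + X[:, 1] ** 2 + np.random.normal(scale=0.1, size=300)
model = BAM(X, y)
model.summary()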
Example 5
    def tsSSE(self, model='linear'):

        sse = 0

        for i in range(self.m):

            index = [
                item for sublist in np.where(self.dataLabel == i)
                for item in sublist
            ]
            Xfit = self.Xall[index, :]
            Afit = self.Aall[index]
            Bfit = self.Ball[index]

            Af = Afit * self.model.decision_function(Xfit)
            Xmat = np.column_stack((Xfit, Af))

            if model == 'linear':
                ## linear regression model for B
                Xmat = sm.add_constant(Xmat)
                BModel = sm.OLS(Bfit, Xmat)
                res = BModel.fit()
                pred = res.predict()
            elif model == 'GAM':
                BModel = LinearGAM(fit_intercept=True)
                res = BModel.fit(
                    Xmat, Bfit)  # the GAM model can be specified differently
                pred = res.predict(Xmat)
            else:
                raise ValueError("model must be 'linear' or 'GAM'")

            sse += np.sum((Bfit - pred) ** 2)

        return sse
Example 6
    def __init__(self, m, idim=20):
        if m == 0:
            self.model = self.base(idim)
            self.type = 2
        elif m == 1:
            self.model = self.base2(idim)
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential4(idim)
            self.type = 2
        elif m == 3:
            self.model = self.returnSequential8(idim)
            self.type = 2
        elif m == 4:
            self.model = self.returnSequential15_regularized(idim)
            self.type = 2
        elif m == 5:
            self.model = self.multi_RNN(idim)
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm(idim)
            self.type = 1
        elif m == 7:
            self.model = LinearGAM()
            self.type = 3
        elif m == 8:
            self.model = self.RNN(idim)
            self.type = 1
        elif m == 9:
            self.model = self.lstm(idim)
            self.type = 1
Example 7

def GAM(X, Y):

    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)  # fit on the training set, then transform the test set

    gam_input = None
    for n in range(X_train.shape[1]):
        if gam_input is not None:
            gam_input += GAM_spline(n)
        else:
            gam_input = GAM_spline(n)

    gam = LinearGAM(gam_input).fit(X_train, Y_train)
    Y_predict = gam.predict(X_test)
    Y_predict[Y_predict >= 0.5] = 1
    Y_predict[Y_predict < 0.5] = 0
    accuracy = (Y_predict.squeeze() == Y_test.squeeze()).astype(int)
    accuracy = np.sum(accuracy)/accuracy.shape[0]
    return accuracy
Example 8
def smooth_gam(x, y, n_splines=100, lam=10):
    from pygam import ExpectileGAM, LinearGAM, s, f
    gam = LinearGAM(s(0, n_splines=n_splines), lam=lam).fit(x, y)
    # gam = ExpectileGAM(s(0, n_splines=n_splines), expectile=0.5, lam=lam).gridsearch(x.values.reshape((-1,1)), y)
    XX = gam.generate_X_grid(term=0)
    confi = gam.confidence_intervals(XX)
    # confi = gam.prediction_intervals(XX)
    ym = gam.predict_mu(XX)
    return XX[:, 0], ym, confi
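
A quick usage sketch with synthetic data: the function returns the evaluation grid, the smoothed mean, and the 95% confidence band.

import numpy as np
x = np.linspace(0, 4 * np.pi, 400).reshape(-1, 1)
y = np.sin(x[:, 0]) + np.random.normal(scale=0.3, size=400)
grid, mean, ci = smooth_gam(x, y)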
Example 9

def test_model_constructors():
    # test that the right errors are thrown when models cannot be constructed
    with pytest.raises(TypeError):
        BaseTEModel()

    with pytest.raises(ValueError):
        IFLearnerTE(None)

    # test other configurations of base learners
    if_learner1 = IFLearnerTE(None, base_estimator=LinearGAM())
    if_learner2 = IFLearnerTE(te_estimator=LinearGAM(), base_estimator=None)
Example 10
def interp_gam(data):
    valid = np.isfinite(data.stream_dist.values[:, 0])
    if np.sum(valid) == 0:
        return np.nan
    sample_xy = data.sample_xy.values[valid]
    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]

    gam = LinearGAM(
        s(0, n_splines=4) + s(1, n_splines=5) +
        te(0, 1, n_splines=4)).gridsearch(sample_st, sample_z)
    z_pred = gam.predict(np.array([[0, 0]]))[0]
    return z_pred
Example 11
    def updateEmpTauX(self, bFit=True, mask=None):
        if mask is None:
            mask = np.ones((self.V, self.S))

        square_diff_matrix = self.exp_square_diff_matrix()

        # masked, flattened observations and squared differences on a log scale
        mXFit = np.ma.masked_where(mask == 0, self.X)
        X1DFit = np.ma.compressed(mXFit)
        logX1DFit = np.log(0.5 + X1DFit)

        mSDMFit = np.ma.masked_where(mask == 0, square_diff_matrix)
        mFitFit = np.ma.compressed(mSDMFit)
        logMFitFit = np.log(mFitFit + NMF_VB.minVar)

        if bFit:
            try:
                # monotonic spline linking log-observations to log squared error
                self.gam = LinearGAM(
                    s(0, n_splines=5,
                      constraints='monotonic_inc')).fit(logX1DFit, logMFitFit)
            except ValueError:
                print("Performing fixed tau")
                self.updateFixedTau(mask)
                return

        mX = np.ma.masked_where(mask == 0, self.X)
        X1D = np.ma.compressed(mX)
        logX1D = np.log(0.5 + X1D)
        yest_sm = self.gam.predict(logX1D)

        # update the tau posterior parameters wherever the mask is active
        mBetaTau = self.beta * (X1D + 0.5) + 0.5 * np.exp(yest_sm)
        np.place(self.betaTau, mask == 1, mBetaTau)

        mExpTau = (self.alpha + 0.5) / mBetaTau
        np.place(self.expTau, mask == 1, mExpTau)

        mLogTau = digamma(self.alpha + 0.5) - np.log(mBetaTau)
        np.place(self.expLogTau, mask == 1, mLogTau)
Example 12
def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True,
                               random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))

    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = 1
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting po when base model not fitted should not be possible
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # a warning should be raised when only one fold is used
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200, baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(), te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
Example 13
def test_scores():
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = [i for i in range(100)]
    test = [i for i in range(100, 200)]

    # test that score is correct by pre-training IFLearner outside of scorer
    # split data
    X_train, y_train, w_train, p_train = _safe_indexing(X, train), _safe_indexing(y, train), \
                                         _safe_indexing(w, train), _safe_indexing(p, train)
    X_test, t_test = _safe_indexing(X, test), _safe_indexing(ite, test)

    # fit if-learner and get predictions on test set
    if_learner = IFLearnerTE(LinearGAM())
    if_learner.fit(X_train, y_train, w_train, p_train)
    t_pred = if_learner.predict(X_test)
    neg_mse = -mean_squared_error(t_test, t_pred)

    # score output
    score = fit_and_score_te_oracle(IFLearnerTE(LinearGAM()),
                                    X,
                                    y,
                                    w,
                                    p,
                                    ite,
                                    train=train,
                                    test=test,
                                    scorer='neg_mean_squared_error',
                                    return_test_score_only=True,
                                    error_score=np.nan)

    np.testing.assert_almost_equal(score, neg_mse)

    # smoke test some other capabilities
    # test that we can pass parameters too
    score = fit_and_score_te_oracle(IFLearnerTE(LinearGAM()),
                                    X,
                                    y,
                                    w,
                                    p,
                                    ite,
                                    train=train,
                                    test=test,
                                    parameters={'te_estimator': LinearGAM()},
                                    scorer='neg_mean_squared_error',
                                    return_test_score_only=True,
                                    error_score=np.nan)
    np.testing.assert_almost_equal(score, neg_mse)
Example 14
def test_exceptions():
    # get data
    X, y, w, ite, p, bs = make_te_data(n=200)
    train = [i for i in range(100)]
    test = [i for i in range(100, 200)]

    with pytest.raises(ValueError):
        # pass incorrect type of estimator
        fit_and_score_te_oracle(LinearGAM(),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True)

    with pytest.raises(ValueError):
        # fit should throw an error
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=True,
                                error_score='raise')

    with pytest.raises(ValueError):
        # fit should throw an error because error score is incorrect
        fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                X,
                                y,
                                w,
                                p,
                                ite,
                                train=train,
                                test=test,
                                scorer='neg_mean_squared_error',
                                return_test_score_only=False,
                                error_score='asdfad')

    # assert we get error score otherwise
    score = fit_and_score_te_oracle(IFLearnerTE(LogisticGAM()),
                                    X,
                                    y,
                                    w,
                                    p,
                                    ite,
                                    train=train,
                                    test=test,
                                    scorer='neg_mean_squared_error',
                                    return_test_score_only=True,
                                    error_score=np.nan)
    assert math.isnan(score)
Example 15
def spline_fit(windspeed_column, power_column, n_splines=20):
    """
    Use the pyGAM package to fit a wind speed and power curve using spline fitting

    Args:
        windspeed_column (:obj:`pandas.Series`): feature column
        power_column (:obj:`pandas.Series`): response column
        n_splines (:obj:`int`): number of splines to use in the fit

    Returns:
        :obj:`function`: Python function of type (Array[float] -> Array[float]) implementing the power curve.

    """

    # Fit the data
    x = windspeed_column.values.reshape((windspeed_column.size, 1))
    y = power_column.values

    s = LinearGAM(n_splines=n_splines).gridsearch(x, y)

    # Create a closure over the spline fit which computes the power curve value for arbitrary array-like input
    def pc_spline(xx):
        P = s.predict(xx)
        return P

    return pc_spline
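
A usage sketch with synthetic SCADA-like data (the logistic power curve below is an assumption, not part of the source):

import numpy as np
import pandas as pd
ws = pd.Series(np.random.uniform(0, 25, 1000))
pw = pd.Series(1500 / (1 + np.exp(-(ws - 12))) + np.random.normal(scale=30, size=1000))
power_curve = spline_fit(ws, pw, n_splines=20)
grid = np.linspace(0, 25, 100).reshape(-1, 1)
print(power_curve(grid)[:5])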
Example 16
def get_surrogate_predictions(X, y, w, pred_mask=None):
    if pred_mask is None:
        pred_mask = np.ones(len(y), dtype=bool)
        fit_mask = pred_mask
    else:
        fit_mask = ~pred_mask
    # get surrogates
    model_1 = LinearGAM()
    model_1.fit(X[fit_mask & (w == 1), :], y[fit_mask & (w == 1)])
    mu_1_plug = model_1.predict(X[pred_mask, :])

    model_0 = LinearGAM()
    model_0.fit(X[fit_mask & (w == 0), :], y[fit_mask & (w == 0)])
    mu_0_plug = model_0.predict(X[pred_mask, :])

    return mu_0_plug, mu_1_plug
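
A sketch of the intended plug-in use with synthetic treatment data (names and effect size are illustrative):

import numpy as np
X = np.random.normal(size=(200, 3))
w = np.random.binomial(1, 0.5, size=200)
y = X[:, 0] + 2.0 * w + np.random.normal(scale=0.1, size=200)
mu_0, mu_1 = get_surrogate_predictions(X, y, w)
print(np.mean(mu_1 - mu_0))  # crude plug-in ATE estimate, should land near 2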
Example 17
def gam_3param(windspeed_column, winddir_column, airdens_column, power_column, n_splines=20):
    """
    Use a generalized additive model to fit power to wind speed, wind direction and air density.

    Args:
        windspeed_column (:obj:`pandas.Series`): Wind speed feature column
        winddir_column (:obj:`pandas.Series`): Wind direction feature column
        airdens_column (:obj:`pandas.Series`): Air density feature column
        power_column (:obj:`pandas.Series`): Power response column
        n_splines (:obj:`int`): number of splines to use in the fit

    Returns:
        :obj:`function`: Python function of type (Array[float] -> Array[float]) implementing the power curve.

    """
    # create dataframe input to LinearGAM
    X = pd.DataFrame({"ws": windspeed_column, "wd": winddir_column, "dens": airdens_column})

    # Set response
    y = power_column.values

    # Fit the model
    s = LinearGAM(n_splines=n_splines).fit(X, y)

    # Wrap the prediction function in a closure to pack input variables
    def predict(windspeed_column, winddir_column, airdens_column):
        X = pd.DataFrame({"ws": windspeed_column, "wd": winddir_column, "dens": airdens_column})
        return s.predict(X)

    return predict
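
A hedged usage sketch with synthetic series (the data-generating process is an assumption):

import numpy as np
import pandas as pd
n = 500
ws = pd.Series(np.random.uniform(3, 20, n))
wd = pd.Series(np.random.uniform(0, 360, n))
dens = pd.Series(np.random.normal(1.225, 0.02, n))
pw = pd.Series(2000 / (1 + np.exp(-(ws - 10))) + np.random.normal(scale=50, size=n))
curve = gam_3param(ws, wd, dens, pw)
print(curve(ws[:5], wd[:5], dens[:5]))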
Example 18
    def _fit_final_gam(self):
        """We now regress the original treatment values against the pseudo-outcome values
        """

        return LinearGAM(s(0, n_splines=30, spline_order=3),
                         max_iter=500,
                         lam=self.bandwidth).fit(self.t_data,
                                                 y=self.pseudo_out)
Example 19

def fit_pygam_model(X_train: pandas.core.frame.DataFrame,
                    X_test: pandas.core.frame.DataFrame,
                    y_train: pandas.core.frame.DataFrame,
                    y_test: pandas.core.frame.DataFrame):
    '''
    Creates a general additive model LinearGAM (normally distributed errors)
    with grid search. Returns the best model with given hyperparameters.
    hyperparameters: n_splines and lam regularization parameter.
    '''
    # pandas must already be imported at module level for the type annotations
    import numpy as np
    import matplotlib.pyplot as plt
    from pygam import LinearGAM
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    gam = LinearGAM().gridsearch(X_train.values,
                                 y_train,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    gam.summary()  # summary() prints directly and returns None

    y_train_predicted = gam.predict(X_train)
    y_test_predicted = np.floor(gam.predict(X_test))

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    print("MAE of testing set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))
    '''
    Visualize the feature significance and confidence intervals
    '''
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)

    cnt = 1
    p_values = gam.statistics_['p_values']

    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, cnt)
        m = gam.generate_X_grid(term=i)
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i,
                                        X=m))  # the fitted partial dependence
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r',
                 ls='--')  # this plots the confidence intervals
        axs.set_title(X_train.columns[i] +
                      ('*' if p_values[i] < 0.05 else ''))  # p_values align with term index i
        cnt += 1
Example 20
    def __init__(self, model_path, **kwargs):
        super().__init__()
        print('Using GeneralizedAdditive model.')
        self.model_params = {'n_splines': 25}
        self.model_path = model_path
        if kwargs:
            for kw in kwargs:
                self.model_params[kw] = kwargs[kw]
        self.model = LinearGAM(**self.model_params)
Example 21
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    prediction = []
    actual_value = []
    n_splines_all = []
    lam_all = []

    # Outer loop: leave-one-out validation/testing.
    # Note: hyperparameters are tuned once on the full data set below; each
    # LOO fold then refits with those tuned parameters.
    gene_indexes = index_set
    y = cell_count_aa
    X = gene_expression[gene_expression.columns[gene_indexes]]
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    gam = LinearGAM()
    gam = gam.gridsearch(X,
                         y,
                         n_splines=np.arange(10, 50),
                         lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # refit with the already-tuned parameters on this LOO training split
        regr = gam.fit(X_train, y_train)
        #make a prediction on OUTER LOOP test set
        prediction_val = regr.predict(X_test)[0]
        # store predictions and actual values
        prediction.append(prediction_val)
        actual_value.append(y_test[0])
        # add optimal parameter values to arrays
        n_splines_all.append(regr.n_splines)
        lam_all.append(regr.lam)
        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))
    #calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)
    lams = np.array(lam_all)
    lams_mean = lams.mean()
    n_splines_all = np.array(n_splines_all)
    n_splines_mean = n_splines_all.mean()
    return lams_mean, n_splines_mean, rho, pval
Example 22
def GAM_model(df, feature_list):
    X_train = df[feature_list]
    y_train = df[['logerror']]
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(X_train)
    X_scaled = pd.DataFrame(scaler.transform(X_train),
                            columns=X_train.columns.values).set_index(
                                [X_train.index.values])
    X_scaled = X_scaled.to_numpy()
    y_train = y_train.to_numpy()
    from pygam import LinearGAM, s, f, te
    gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
    gam.gridsearch(X_scaled, y_train)
    y_pred = gam.predict(X_scaled)
    y_pred = pd.DataFrame(y_pred)
    y_pred['actual'] = y_train
    y_pred.columns = ['predicted', 'actual']
    RMSE = float('{:.3f}'.format(
        sqrt(mean_squared_error(y_pred.actual, y_pred.predicted))))
    R2 = float('{:.3f}'.format(r2_score(y_pred.actual, y_pred.predicted)))
    return RMSE, R2, gam
Example 23
    def _fit_gam(self):
        """Fits a GAM that predicts the outcome from the treatment and GPS
        """

        X = np.column_stack((self.T.values, self.gps))
        y = np.asarray(self.y)

        return LinearGAM(
            s(0, n_splines=self.n_splines, spline_order=self.spline_order) +
            s(1, n_splines=self.n_splines, spline_order=self.spline_order),
            max_iter=self.max_iter,
            lam=self.lambda_,
        ).fit(X, y)
Example 24
def get_importances(X, y, Xtest, ytest):
    importances = {}
    for predictor, vals in X.items():  # .iteritems() was removed in pandas 2.0
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.gridsearch(vals, y)  # gridsearch fits internally; a separate fit() is redundant
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
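
A minimal usage sketch: with one informative column, its single-factor R2 should dominate (synthetic data, illustrative only).

import numpy as np
import pandas as pd
X = pd.DataFrame(np.random.normal(size=(200, 3)), columns=['a', 'b', 'c'])
y = 2.0 * X['a'] + np.random.normal(scale=0.5, size=200)
print(get_importances(X.iloc[:150], y.iloc[:150], X.iloc[150:], y.iloc[150:]))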
Example 25
def fit_gam_plot_dependencies(df=None,
                              features=None,
                              target=None,
                              basis_1=s,
                              basis_2=False,
                              summary=False):
    X = df[features]
    y = df[target]

    if basis_1 and basis_2:
        gam = LinearGAM(basis_1(0, lam=60) + basis_2(1, lam=60),
                        fit_intercept=True).fit(X, y)

    elif basis_1:
        gam = LinearGAM(basis_1(0, lam=60), fit_intercept=True).fit(X, y)

    else:
        raise ValueError('no basis provided for features')

    if summary:
        gam.summary()  # summary() prints directly and returns None
    plot_gam_partial_dependencies(gam, features, target)
Example 26
    def train(self, X, y, base_pred):
        """Trains ensemble model based on data and base predictions.

        Adds value to class attribute "model_weight"

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            y: (np.ndarray)  Training labels, shape (N, 1)
            base_pred: (dict of np.ndarray) Dictionary of base model predictions
                With keys (str) being model name, and values (np.ndarray) being
                predictions corresponds to X and y.
        """
        # build features and GAM terms
        ens_feature, feature_terms = self._build_ensemble_feature(X, base_pred)

        # define model
        self.gam_model = LinearGAM(feature_terms)

        # additional fine-tuning
        lam_grid = self._build_lambda_grid(n_grid=100)
        self.gam_model.gridsearch(X=ens_feature, y=y, lam=lam_grid,
                                  progress=False)
Example 27
def GAM(X, Y, factor = False):

    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_test = poly.fit_transform(X_test)
    X_train = poly.fit_transform(X_train)

    linear = ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'y', 'n', 'y',
    'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']

    # for feature in X_train.T:
    #     unique = np.unique(feature)
    #     if len(unique) < 6:
    #         linear.append("n")
    #     else:
    #         idx = np.argsort(feature)
    #         plt.plot(feature[idx], Y.squeeze()[idx])
    #         plt.show()
    #         linear.append(input("Linear?\t"))

    linear = np.array(linear)
    linear[linear == "n"] = 0
    linear[linear == "y"] = 1
    linear = linear.astype(bool)

    gam_input = None
    for n,is_linear in enumerate(linear):
        if gam_input is not None:
            if is_linear:
                gam_input += GAM_line(n)
                if factor:
                    gam_input += GAM_factor(n)
            else:
                gam_input += GAM_spline(n)
        else:
            if is_linear:
                gam_input = GAM_line(n)
                if factor:
                    gam_input += GAM_factor(n)
            else:
                gam_input = GAM_spline(n)

    gam = LinearGAM(gam_input, fit_intercept=False, max_iter=int(1e5))
    gam.fit(X_train, Y_train)
    Y_predict_train = gam.predict(X_train)
    Y_predict_test = gam.predict(X_test)
    # squeeze guards against (n, 1) vs (n,) broadcasting, as in Example 7
    MSE_train = np.mean((Y_predict_train - Y_train.squeeze()) ** 2)
    MSE_test = np.mean((Y_predict_test - Y_test.squeeze()) ** 2)
    return MSE_train, MSE_test
Example 28
def cleaner_linearGAM(x,y,**kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x,list):
        x = np.array(x)
    if isinstance(y,list):
        y = np.array(y)
    X = x.reshape(len(x),1)
    #if 'n_splines' in kwargs.keys():
    #    n_splines = kwargs['n_splines']
    #else:
    #    # This is because the automatic approach is too smooth
    #    #n_splines = int(len(y)/5)
    #gam = LinearGAM(n_splines=n_splines,\
    #                terms=s(0,basis='ps')\
    #                ).gridsearch(X, y)
    gam = LinearGAM(terms=s(0,basis='ps')).gridsearch(X, y)
    #gam = LinearGAM(n_splines=n_splines,terms=s(0)).gridsearch(X, y)
    # sample on the input grid
    means = gam.predict(X)
    bounds = gam.prediction_intervals(X, width=.95)
    idx = [i for i in range(len(y)) \
            if (y[i]>bounds[i,1] or y[i]<bounds[i,0])]
    return idx
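
A usage sketch: inject a few outliers into a smooth signal and recover their indices from the 95% prediction band.

import numpy as np
x = np.linspace(0, 10, 300)
y = np.sin(x) + np.random.normal(scale=0.1, size=300)
y[::50] += 2.0  # planted outliers
print(cleaner_linearGAM(x, y))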
Example 29

    def get_gam_model(self, features: list[Field], model_type=TYPE_LINEAR):

        # factor terms for categorical fields, spline terms for continuous ones
        model_spec = f(0) if features[0].is_factor() else s(
            0, n_splines=self.num_splines)

        for i in range(1, len(features)):
            model_spec += f(i) if features[i].is_factor() else s(
                i, n_splines=self.num_splines)

        if model_type == TYPE_LINEAR:
            return LinearGAM(model_spec)

        if model_type == TYPE_LOGISTIC:
            return LogisticGAM(model_spec)
Example 30
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Perform grid search and train Linear GAM model and return predictions for the test set.
    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest:  X values for validation.
    :return: Predictions from Linear GAM model for test dataset
    """
    # Create an array of lambda values to search
    lams = np.logspace(-3, 20, 35)
    # GAM search requires numpy arrays
    Xtrain_np = np.array(Xtrain, dtype=np.float64)
    Ytrain_np = np.array(Ytrain, dtype=np.float64)

    # Linear Generalised Additive Model
    model = LinearGAM(
        s(99) + s(100) + l(3) + l(6) + l(8) + l(11) + l(7) + l(9) + l(12) +
        l(10) + l(14) + l(29) + l(15) + l(71) + l(17) + l(21) + l(107) +
        l(16) + l(68) + l(78) + l(61) + l(55) + l(31) + l(13) + l(37) + l(4) +
        l(5) + l(2) + te(4, 5) + te(68, 78)).gridsearch(Xtrain_np,
                                                        Ytrain_np,
                                                        lam=lams)
    return model.predict(Xtest)
Example 31

def predict_gam(ad_group, date):
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
    splines = [5, 7, 10, 20, 30, 40, 45]
    lams = np.logspace(-3, 3, 7)
    if ad_group in df['ad'].unique():
        df_ad_group_train = df[df['ad'] == ad_group]
        df_ad_group_train = df_ad_group_train.reset_index()
        df_ad_group_train['time_period'] = (df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
        X_train = df_ad_group_train[['time_period']].values
        y_train = df_ad_group_train['shown'].values
        #auto tuning
        gam = LinearGAM().gridsearch(X_train, y_train, lam=lams, n_splines=splines)
        predictions = gam.predict(X_train)
        print('==== Tuning for ad group %s - best generalized cross-validation %f ' % (ad_group, gam.statistics_['GCV']))
        tuning_result = (gam.lam[0][0], gam.n_splines[0], gam.statistics_['GCV'])
        predict_date = (pd.to_datetime(date) - df_ad_group_train['date'][0]).days
        print("Auto tuning result=",tuning_result)
        print("Prediction for number of ads Shown for",ad_group,"on ",date,"=",gam.predict([[predict_date]]))
        print("Regression/Lambda value = ",gam.lam)
        print("n_splines=",gam.n_splines)
    else:
        print("Ad group does not exist")
Example 33
import pandas as pd
import patsy as pt
import numpy as np
from pygam import LinearGAM, s
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

# Prep the dataset
data = pd.read_csv(
    "/home/dusty/Econ8310/DataSets/HappinessWorld.csv")

# Generate x and y matrices
eqn = """happiness ~ -1 + freedom + family + year + economy + health + trust"""
y,x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy',
          'health', 'trust']

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
  XX = gam.generate_X_grid(term=i)
  pdep, confi = gam.partial_dependence(term=i, width=.95)
  trace = go.Scatter(x=XX[:, i], y=pdep, mode='lines', name='Effect')
  ci1 = go.Scatter(x=XX[:, i], y=confi[:, 0], line=dict(dash='dash', color='grey'), name='95% CI')
  ci2 = go.Scatter(x=XX[:, i], y=confi[:, 1], line=dict(dash='dash', color='grey'), name='95% CI')
  # assumed completion (the original snippet ends above): attach the traces
  # to the subplot grid and render the figure
  fig.append_trace(trace, i // 3 + 1, i % 3 + 1)
  fig.append_trace(ci1, i // 3 + 1, i % 3 + 1)
  fig.append_trace(ci2, i // 3 + 1, i % 3 + 1)

py.plot(fig)
Example 34
import pandas as pd
import numpy as np
from pygam import LinearGAM
from pygam.utils import generate_X_grid  # legacy helper; newer pygam uses gam.generate_X_grid(term=i)
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot, row
import matplotlib.pyplot as plt

# Importing data from the web
path = 'http://www.stat.cmu.edu/~larry/' \
       'all-of-nonpar/=data/rock.dat'

data = pd.read_csv(path, sep=' *', engine='python')

X = data[['peri','shape','perm']]
y = data['area']

adjy = y - np.mean(y)

gam = LinearGAM(n_splines=10).gridsearch(X, y)
XX = generate_X_grid(gam)

# fig, axs = plt.subplots(1, 3)
titles = ['peri', 'shape', 'perm']

# for i, ax in enumerate(axs):
#     pdep, confi = gam.partial_dependence(XX, feature=i+1, width=.95)
    
#     ax.scatter(X[X.columns[i]], adjy, color='gray', edgecolors='none')
#     ax.plot(XX[:, i], pdep)
#     ax.plot(XX[:, i], confi[0], c='r', ls='--')
#     ax.set_title(titles[i])
    
    
# legacy pygam API: partial_dependence(XX, width=...) evaluated all terms at once
pdep, confi = gam.partial_dependence(XX, width=.95)