def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Cross-validate and fit a LinearGAM for every target column in ``Y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; one spline term ``s(i, n_splines)`` is built per column.
    Y : pandas.DataFrame
        Targets; each column is modelled separately.
    get_importance : bool, default False
        When true, ``get_importances`` is called on every fold and its
        per-predictor values are appended to the ``'importances'`` lists.
    n_splines : int, default 20
        Second positional argument of every ``s`` term.
    folds : int, default 10
        Passed to ``BalancedKFold``.

    Returns
    -------
    dict
        Maps each target name to a dict with the keys ``'scores_cv'``,
        ``'scores_insample'``, ``'pred_vars'``, ``'importances'`` and
        ``'model'``.
    """

    def _scores(truth, estimate):
        # Pearson r is computed once and reused for R2.
        r = np.corrcoef(truth, estimate)[0, 1]
        return [{'r': r,
                 'R2': r**2,
                 'MAE': mean_absolute_error(truth, estimate)}]

    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    # Preliminary fit on the full predictor matrix, kept from the original.
    # NOTE(review): presumably this initialises the shared term objects on
    # the complete data range -- confirm before removing.
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold predictions; keep only the first column if 2-D
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # importances are defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        cv_scores = _scores(y, pred)

        # insample: re-tune the model from the last fold on all the data
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = _scores(y, in_pred)
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
Esempio n. 2
0
def BAM(X, y):
    """Tune a two-feature additive model with an interaction term.

    The model has a cubic-order spline for feature 0, one for feature 1 and
    a tensor term over both. ``gridsearch`` is run on ``(X, y)`` and the
    model object is returned.
    """
    main_effects = s(0, spline_order=3) + s(1, spline_order=3)
    model = LinearGAM(main_effects + te(0, 1))
    model.gridsearch(X, y)
    return model
Esempio n. 3
0
def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Cross-validate and fit a LinearGAM for every target column in ``Y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; one spline term ``s(i, n_splines)`` is built per column.
    Y : pandas.DataFrame
        Targets; each column is modelled separately.
    get_importance : bool, default False
        When true, ``get_importances`` is called on every fold and its
        per-predictor values are appended to the ``'importances'`` lists.
    n_splines : int, default 20
        Second positional argument of every ``s`` term.
    folds : int, default 10
        Passed to ``BalancedKFold``.

    Returns
    -------
    dict
        Maps each target name to a dict with the keys ``'scores_cv'``,
        ``'scores_insample'``, ``'pred_vars'``, ``'importances'`` and
        ``'model'``.
    """

    def _scores(truth, estimate):
        # Pearson r is computed once and reused for R2.
        r = np.corrcoef(truth, estimate)[0, 1]
        return [{
            'r': r,
            'R2': r**2,
            'MAE': mean_absolute_error(truth, estimate)
        }]

    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    # Preliminary fit on the full predictor matrix, kept from the original.
    # NOTE(review): presumably this initialises the shared term objects on
    # the complete data range -- confirm before removing.
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold predictions; keep only the first column if 2-D
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # importances are defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        cv_scores = _scores(y, pred)

        # insample: re-tune the model from the last fold on all the data
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = _scores(y, in_pred)
        GAM_results[name] = {
            'scores_cv': cv_scores,
            'scores_insample': in_scores,
            'pred_vars': X.columns,
            'importances': importances,
            'model': gam
        }
    return GAM_results
Esempio n. 4
0
def pspline(time, flux, edge_cutoff, max_splines, return_nsplines, verbose):
    """Estimate a spline trend of ``flux`` over ``time`` with iterative clipping.

    Each iteration drops points whose normalised deviation from 1 exceeds
    ``constants.PSPLINES_STDEV_CUT`` standard deviations, grid-searches a
    one-term ``LinearGAM`` on the remainder and re-detrends. The loop stops
    after ``constants.PSPLINES_MAXITER`` iterations or as soon as an
    iteration rejects no points. A final model is fitted on the clipped data
    and evaluated on the full ``time`` array.

    Parameters
    ----------
    time, flux : array-like exposing ``.copy()``
        The time series to detrend.
    edge_cutoff : float
        When positive, trend values within this distance of either end of
        ``time`` are set to NaN.
    max_splines : int
        ``n_splines`` of the spline term.
    return_nsplines : bool
        Not read by this function; both values are always returned.
    verbose : bool
        Print per-iteration progress when true.

    Returns
    -------
    tuple
        ``(trend, nsplines)``, where ``nsplines`` is the ceiling of the final
        model's ``statistics_['edof']``.

    Raises
    ------
    ImportError
        If pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError:
        # Only import failures are translated; other errors now propagate.
        raise ImportError('Could not import pygam')

    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy() / np.nanmedian(newflux)

    for i in range(constants.PSPLINES_MAXITER):
        # Keep only the points inside the clipping band.
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) < constants.PSPLINES_STDEV_CUT *
            np.std(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=max_splines))
        search_gam = gam.gridsearch(newtime[:, np.newaxis],
                                    newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = np.std(detrended_flux)
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) > constants.PSPLINES_STDEV_CUT *
            stdev)
        n_rejected = len(mask_outliers[0])
        if verbose:
            print('Iteration:', i + 1, 'Rejected outliers:', n_rejected)

        # Check convergence independently of verbosity; previously the break
        # was nested under ``if verbose`` and silent runs never stopped early.
        if n_rejected == 0:
            if verbose:
                print('Converged.')
            break

    # Final iteration, applied to unclipped time series (interpolated over clipped values)
    mask_outliers = np.ma.where(
        np.abs(1 - detrended_flux) < constants.PSPLINES_STDEV_CUT * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=max_splines))
    search_gam = gam.gridsearch(newtime[:, np.newaxis],
                                newflux,
                                progress=False)
    trend = search_gam.predict(time)

    # Cut off edges
    if edge_cutoff > 0:
        low_index = np.argmax(time > (min(time) + edge_cutoff))
        hi_index = np.argmax(time > (max(time) - edge_cutoff))
        trend[:low_index] = np.nan
        trend[hi_index:] = np.nan

    nsplines = np.ceil(gam.statistics_['edof'])
    return trend, nsplines
Esempio n. 5
0
def GAM_linear(X, y):
    """Grid-search a LinearGAM with two spline terms and one factor term.

    ``X`` and ``y`` are converted with ``to_numpy()``. Returns a 3-tuple:
    the model, the value returned by ``model.summary()``, and a DataFrame
    holding the in-sample predictions (column ``0``), the observed values
    (``'actual'``) and their difference (``'residual'``).
    """
    features = X.to_numpy()
    target = y.to_numpy()
    from pygam import LinearGAM, s, f, te

    model = LinearGAM(s(0) + s(1) + f(2))
    model.gridsearch(features, target)

    frame = pd.DataFrame(model.predict(features))
    frame['actual'] = target
    frame['residual'] = frame.actual - frame[0]
    return model, model.summary(), frame
Esempio n. 6
0
def interp_gam(data):
    """Predict ``sample_z`` at stream coordinates ``(0, 0)`` with a small GAM.

    Rows whose first ``stream_dist`` column is not finite are discarded.
    If no rows remain, NaN is returned without fitting anything. Otherwise a
    LinearGAM with two spline terms and a tensor term is grid-searched on the
    remaining ``(stream_dist, sample_z)`` pairs and evaluated at ``[[0, 0]]``.

    Parameters
    ----------
    data : object
        Must expose ``stream_dist.values`` (2-D) and ``sample_z.values``.

    Returns
    -------
    float
        The prediction at the origin, or ``np.nan`` when there is no valid
        sample.
    """
    valid = np.isfinite(data.stream_dist.values[:, 0])
    # Guard first so no slicing or fitting is attempted on empty input.
    if not valid.any():
        return np.nan

    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]

    gam = LinearGAM(
        s(0, n_splines=4) + s(1, n_splines=5) +
        te(0, 1, n_splines=4)).gridsearch(sample_st, sample_z)
    return gam.predict(np.array([[0, 0]]))[0]
Esempio n. 7
0
    def _fit_gam(self):
        """Fit a LinearGAM of the outcome on the treatment and the GPS.

        Column 0 of the design matrix is ``self.T.values`` and column 1 is
        ``self.gps``; each gets its own spline term configured from
        ``self.n_splines`` and ``self.spline_order``. Returns the result of
        ``fit``.
        """
        design = np.column_stack((self.T.values, self.gps))
        outcome = np.asarray(self.y)

        spline_options = {
            "n_splines": self.n_splines,
            "spline_order": self.spline_order,
        }
        treatment_term = s(0, **spline_options)
        gps_term = s(1, **spline_options)

        model = LinearGAM(
            treatment_term + gps_term,
            max_iter=self.max_iter,
            lam=self.lambda_,
        )
        return model.fit(design, outcome)
    def get_gam_model(self, features: [Field], model_type=TYPE_LINEAR):
        """Build an unfitted GAM with one term per entry of ``features``.

        A feature whose ``is_factor()`` is true gets a factor term ``f(i)``;
        every other feature gets a spline term with ``self.num_splines``
        splines. Returns a ``LinearGAM`` for ``TYPE_LINEAR`` and a
        ``LogisticGAM`` for ``TYPE_LOGISTIC``; any other ``model_type`` falls
        through and yields ``None``.
        """

        def term_for(index):
            if features[index].is_factor():
                return f(index)
            return s(index, n_splines=self.num_splines)

        model_spec = term_for(0)
        for index in range(1, len(features)):
            model_spec += term_for(index)

        if model_type == TYPE_LINEAR:
            return LinearGAM(model_spec)

        if model_type == TYPE_LOGISTIC:
            return LogisticGAM(model_spec)
Esempio n. 9
0
    def _fit_gam(self):
        """Fit the outcome GAM on the treatment and the GPS.

        ``self.outcome_type`` selects the estimator: ``"continuous"`` uses
        ``LinearGAM`` and ``"binary"`` uses ``LogisticGAM``; any other value
        raises ``KeyError``. Returns the result of ``fit``.
        """
        design = np.column_stack((self.T.values, self.gps))
        outcome = np.asarray(self.y)

        estimators = {"continuous": LinearGAM, "binary": LogisticGAM}
        gam_cls = estimators[self.outcome_type]

        treatment_term = s(0,
                           n_splines=self.n_splines,
                           spline_order=self.spline_order)
        gps_term = s(1,
                     n_splines=self.n_splines,
                     spline_order=self.spline_order)

        model = gam_cls(
            treatment_term + gps_term,
            max_iter=self.max_iter,
            lam=self.lambda_,
        )
        return model.fit(design, outcome)
Esempio n. 10
0
    def GAM2(self):
        """Grid-search a LogisticGAM with eight spline terms and optionally plot it.

        The model has no intercept and is tuned over eleven log-spaced
        ``lam`` values on the training data. Its summary is printed and
        predictions are made for ``self.Xtest``. When ``self.plot`` is true
        the predictions, shifted down by 0.5, are drawn against
        ``self.ytest``. Nothing is returned.
        """
        from pygam import LogisticGAM, s, l, f

        terms = s(0)
        for column in range(1, 8):
            terms = terms + s(column)

        gam = LogisticGAM(terms=terms, fit_intercept=False)
        lam_grid = np.logspace(-3, 3, 11)
        mod = gam.gridsearch(self.Xtrain.values, self.ytrain,
                             lam=lam_grid)  # Generate the model
        mod.summary()  # Pseudo-R2: 0.6449 (figure recorded by the original author)
        ypred = mod.predict(self.Xtest)
        ypred_column = ypred.reshape(-1, 1)
        # Computed as in the original; the value is not used afterwards.
        MSE1 = np.mean((self.ytest - ypred_column)**2).values

        if not self.plot:
            return

        plt.plot(range(len(ypred_column)), ypred_column - 0.5, "r.",
                 label='GAM model')
        plt.plot(range(len(self.ytest)), self.ytest, "b.",
                 label='Testing Data')
        plt.legend()
        plt.title("GAM model with linear terms. Prediction data is\n"
                  "scaled downwards by 0.5 for visual purposes.")
        plt.ylabel("FFVC score")
        plt.xlabel("Sample no.")
        plt.show()
Esempio n. 11
0
def smoother_expectileGAM(x,y,X,**kwargs):
    """Smooth ``y`` over ``x`` with an ExpectileGAM and evaluate it at ``X``.

    Parameters
    ----------
    x, y : list or numpy.ndarray
        Training abscissae and values; lists are converted to arrays.
    X : list, numpy.ndarray or None
        Points at which to predict. ``None`` means "predict at ``x``".
        Lists are converted to arrays, matching ``smoother_linearGAM``
        (a list previously failed on ``.reshape``).
    **kwargs
        ``expectile`` (default ``.5``) selects the expectile to fit.

    Returns
    -------
    numpy.ndarray
        Predictions of the grid-searched model at ``X``.
    """
    from pygam import s, ExpectileGAM
    if isinstance(x,list):
        x = np.array(x)
    if isinstance(y,list):
        y = np.array(y)
    if X is None:
        X = deepcopy(x)
    elif isinstance(X,list):
        X = np.array(X)
    # Both inputs are reshaped to single-column 2-D arrays.
    x = x.reshape(len(x),1)
    X = X.reshape(len(X),1)
    # n_splines is deliberately left at the library default here; the
    # original author experimented with int(len(y)/5) and disabled it.
    expectile = kwargs.get('expectile', .5)
    gam50 = ExpectileGAM(expectile=expectile,terms=s(0)).gridsearch(x, y)
    return gam50.predict(X)
Esempio n. 12
0
def smoother_linearGAM(x,y,X,**kwargs):
    """Smooth ``y`` over ``x`` with a P-spline LinearGAM and evaluate it at ``X``.

    List inputs are converted to arrays and ``x`` is reshaped to a single
    column. ``X`` may be ``None``, in which case predictions are made at
    ``x``. ``kwargs`` is accepted but not read. Returns the predictions of
    the grid-searched model.
    """
    from pygam import LinearGAM, l, s

    def as_array(values):
        return np.array(values) if isinstance(values, list) else values

    x = as_array(x)
    x = x.reshape(len(x), 1)
    y = as_array(y)
    X = as_array(X)
    grid = x.reshape(len(x), 1) if X is None else X.reshape(len(X), 1)

    # n_splines is left at the library default (an explicit setting was
    # tried by the original author and disabled).
    model = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)
    # sample on the input grid
    return model.predict(grid)
Esempio n. 13
0
def fit_gam_with_fix_dof(X, Y, dof):  ##{{{
    """Bisect on ``lam`` until the fitted model's ``edof`` is within 0.01 of ``dof``.

    The model is a LinearGAM with a spline on column 0 and an unpenalised
    linear term on column 1. ``lam`` is searched in ``[1e-2, 1e2]``; every
    100 iterations the bracket is reset and one more spline is allowed.
    Returns the last fitted model.
    """
    lam_floor, lam_ceiling = 1e-2, 1e2
    tol = 1e-2

    lam_lo, lam_up = lam_floor, lam_ceiling
    n_splines = int(dof + 2)
    nit = 0
    while True:
        lam = (lam_up + lam_lo) / 2.

        spline_term = pg.s(0, n_splines=n_splines, penalties="auto", lam=lam)
        linear_term = pg.l(1, penalties=None)
        gam_model = pg.LinearGAM(spline_term + linear_term)
        gam_model.fit(X, Y)

        current_dof = gam_model.statistics_["edof"]
        # Too few effective dof: lower the upper bound on lam; otherwise
        # raise the lower bound.
        if current_dof < dof:
            lam_up = lam
        else:
            lam_lo = lam

        nit += 1
        if nit % 100 == 0:
            lam_lo, lam_up = lam_floor, lam_ceiling
            n_splines += 1

        # Written as "not >" so a NaN difference ends the loop, as before.
        if not np.abs(dof - current_dof) > tol:
            break
    return gam_model
Esempio n. 14
0
def run_gam_effective_r_from_empirical(state_data,
                                       n_splines=25,
                                       algo=GammaGAM,
                                       n_bootstrap=100):
    """Bootstrap a GAM over the empirical ratio of new to previous total cases.

    The series ``confirmed_new / confirmed_total.shift(1)`` is divided by
    ``RECOVERY_RATE`` and offset by 1 before fitting. ``n_bootstrap`` models
    of class ``algo`` are fitted with Dirichlet-distributed sample weights.
    The returned DataFrame holds the mean (``'ML'``) and the 5% and 95%
    quantiles (``'Low_90'``, ``'High_90'``) of the bootstrap predictions,
    with rows containing NaN dropped.
    """
    # for numerical stability
    epsilon = 1

    ratio = state_data['confirmed_new'] / state_data['confirmed_total'].shift(1)
    R_series = ratio.dropna() * 1 / RECOVERY_RATE

    n_obs = R_series.shape[0]
    X = np.arange(n_obs)
    y = R_series.values + epsilon

    def fit_weighted():
        # One bootstrap replicate: random weights, fresh model.
        weights = dirichlet([1] * n_obs).rvs(1)
        model = algo(s(0, n_splines) + l(0))
        model.fit(X, y, weights=weights[0])
        return model

    # running GAM in bootstrap
    bootstrap = [fit_weighted() for _ in range(n_bootstrap)]

    preds = pd.DataFrame([m.predict(X) - epsilon for m in bootstrap]).T

    estimate_rt = pd.DataFrame(index=R_series.index)
    estimate_rt['ML'] = preds.mean(axis=1).values
    estimate_rt['Low_90'] = preds.quantile(0.05, axis=1).values
    estimate_rt['High_90'] = preds.quantile(0.95, axis=1).values

    return estimate_rt.dropna()
Esempio n. 15
0
    def _build_ensemble_feature(self, X, base_pred):
        """Build the feature array and the matching GAM ``TermList``.

        Each base prediction becomes one column, with a spline term when
        ``self.nonlinear_ensemble`` is true and a linear term otherwise.
        When ``self.model_residual`` is true, the columns of ``X`` are
        appended, each with its own spline term, plus one tensor-product
        term across all of them if ``X`` has more than one column.

        Returns
        -------
        tuple
            ``(ens_feature, gam_feature_terms)``.
        """
        ens_feature = np.asarray(list(base_pred.values())).T
        n_base = ens_feature.shape[1]

        if self.nonlinear_ensemble:
            ensemble_term_func = s
        else:
            ensemble_term_func = l
        term_list = [ensemble_term_func(column) for column in range(n_base)]

        # optionally, add residual process
        if self.model_residual:
            n_extra = X.shape[1]
            # build gam terms
            term_list += [s(column) for column in range(n_base, n_base + n_extra)]
            if n_extra > 1:
                residual_columns = list(n_base + np.array(range(n_extra)))
                term_list += [te(*residual_columns)]

            # update features
            ens_feature = np.concatenate([ens_feature, X], axis=1)

        return ens_feature, TermList(*term_list)
Esempio n. 16
0
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone logistic spline calibrator, plot it and score it.

    A ``LogisticGAM`` with a single monotonically increasing 5-spline term is
    grid-searched on ``(X, y)``. Its probability curve and 95% confidence
    band over ``[0, 1]`` are drawn on ``ax``, and calibration metrics are
    computed on ``(X_eval, y_eval)`` and written into the plot.

    Parameters
    ----------
    ax : matplotlib axes
        Target axes; limits are set to ``[0, 1]`` on both axes.
    X, y : array-like
        Training scores and binary labels.
    X_eval, y_eval : array-like
        Evaluation scores and labels.
    gam_ref : object
        Reference model passed to ``MseEval``.

    Returns
    -------
    tuple
        ``(ece, mce, brier, acc, mse, ax, confi)``, where ``confi`` holds the
        95% confidence intervals at ``X_eval``.
    """
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    # Single spline term constrained to be monotonically increasing.
    gam = LogisticGAM(s(0, constraints='monotonic_inc',
                        n_splines=5)).gridsearch(X, y)
    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')
    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    # Two-column [P(class 0), P(class 1)] matrix shared by the metric helpers.
    two_class_proba = np.array([1 - y_, y_]).T
    ece = EceEval(two_class_proba, y_eval, num_bins=100)
    mce = MceEval(two_class_proba, y_eval, num_bins=100)
    brier = BrierEval(two_class_proba, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)
    ax.text(0.05,
            0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f' %
            (ece, mce, brier, acc, mse),
            size=6,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)
    confi = gam.confidence_intervals(X_eval, width=0.95)
    # Function-call form: the Python 2 print statement is a SyntaxError on
    # Python 3; the output is unchanged.
    print(gam.summary())
    return ece, mce, brier, acc, mse, ax, confi
Esempio n. 17
0
    def _fit_final_gam(self):
        """Regress the pseudo-outcome values on the original treatment values.

        Uses a LinearGAM with one cubic-order spline term of 30 splines,
        at most 500 iterations and ``lam`` set to ``self.bandwidth``.
        Returns the result of ``fit``.
        """
        treatment_term = s(0, n_splines=30, spline_order=3)
        model = LinearGAM(treatment_term, max_iter=500, lam=self.bandwidth)
        return model.fit(self.t_data, y=self.pseudo_out)
Esempio n. 18
0
def spline_calibration(X, y):
    """Calibrate scores ``X`` against labels ``y`` with a monotone logistic spline.

    A LogisticGAM with one monotonically increasing spline term is
    grid-searched on ``(X, y)``; the function returns its predicted
    probabilities for the same ``X``.
    """
    # documentation of LogisticGAM: https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    monotone_term = s(0, constraints='monotonic_inc')
    calibrator = LogisticGAM(monotone_term).gridsearch(X, y)
    return calibrator.predict_proba(X)
Esempio n. 19
0
def pspline(time, flux):
    """Estimate a spline trend of ``flux`` over ``time`` with iterative clipping.

    Each iteration keeps the points whose value of ``1 - detrended_flux`` is
    below ``constants.PSPLINES_STDEV_CUT`` standard deviations, grid-searches
    a one-term ``LinearGAM`` on them and re-detrends. The comparison is
    signed, so only one side of the distribution is clipped. The loop ends
    after ``constants.PSPLINES_MAXITER`` iterations or once an iteration
    rejects no points. A final model is fitted on the clipped data and
    evaluated on the full ``time`` array, which is returned.

    Raises
    ------
    ImportError
        If pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError:
        # Only import failures are translated; other errors now propagate.
        raise ImportError('Could not import pygam')

    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy()

    for i in range(constants.PSPLINES_MAXITER):
        mask_outliers = numpy.ma.where(
            1 - detrended_flux < constants.PSPLINES_STDEV_CUT *
            numpy.std(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
        search_gam = gam.gridsearch(newtime[:, numpy.newaxis],
                                    newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = numpy.std(detrended_flux)
        mask_outliers = numpy.ma.where(
            1 - detrended_flux > constants.PSPLINES_STDEV_CUT * stdev)
        n_rejected = len(mask_outliers[0])
        print('Iteration:', i + 1, 'Rejected outliers:', n_rejected)

        # Check convergence
        if n_rejected == 0:
            print('Converged.')
            break

    # Final iteration, applied to unclipped time series (interpolated over clipped values)
    mask_outliers = numpy.ma.where(
        1 - detrended_flux < constants.PSPLINES_STDEV_CUT * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
    search_gam = gam.gridsearch(newtime[:, numpy.newaxis],
                                newflux,
                                progress=False)
    trend = search_gam.predict(time)

    return trend
Esempio n. 20
0
def smooth_gam(x, y, n_splines=100, lam=10):
    """Fit a single-spline LinearGAM and evaluate it on a generated grid.

    Returns a 3-tuple: the first column of the grid produced by
    ``generate_X_grid(term=0)``, the predicted mean on that grid, and the
    confidence intervals on that grid.
    """
    from pygam import ExpectileGAM, LinearGAM, s, f

    spline = s(0, n_splines=n_splines)
    model = LinearGAM(spline, lam=lam).fit(x, y)
    grid = model.generate_X_grid(term=0)
    band = model.confidence_intervals(grid)
    mean_curve = model.predict_mu(grid)
    return grid[:, 0], mean_curve, band
Esempio n. 21
0
def BAM():
    """Grid-search a two-feature additive model, plot it and print error metrics.

    Reads ``X`` and ``y`` from the enclosing module scope. The model is a
    ``GAM`` with normal distribution and identity link, made of two concave
    spline terms and one tensor term. After the grid search the first 56
    samples are plotted against the prediction, and RMSE, MAE and MAPE of
    the in-sample prediction are printed. Nothing is returned.
    """

    gam = GAM(s(0, n_splines=25, spline_order=3, constraints='concave', penalties = 'auto', basis = 'cp', edge_knots=[147,147])
                    + s(1, n_splines=25, spline_order=3, constraints='concave', penalties = 'auto', basis = 'cp', edge_knots=[147,147])
                    + te(0, 1, dtype=['numerical', 'numerical']), distribution= 'normal', link = 'identity', fit_intercept=True)
    print(gam.gridsearch(X, y, n_splines=np.arange(50)).summary())
    plt.scatter(X[:, 0][0:56], y[0:56], s=3, linewidths=0.0001, label='data')
    plt.plot(X[:, 0][0:56], gam.predict(X[0:56]), color='red', linewidth=1, label='prediction')
    plt.legend()
    plt.title('Basic Additive Model')
    plt.show()

    # error calculation: predict on the full data once and reuse the result
    # for all three metrics instead of calling predict three times.
    full_pred = gam.predict(X)
    y_true = np.array(y)
    y_hat = np.array(full_pred)

    rmse_val = rmse(y_true, y_hat)
    print("RMSE is: " + str(rmse_val))
    mae = mean_absolute_error(y, full_pred)
    print("MAE is: " + str(mae))
    mape = mean_absolute_percentage_error(y_true, y_hat)
    print("MAPE is: " + str(mape))
Esempio n. 22
0
def cleaner_expectileGAM(x,y,**kwargs):
    """Return the indices of points lying outside an expectile envelope.

    A 0.5-expectile GAM is grid-searched first and its ``lam`` is reused for
    the upper and lower expectile models. Reusing ``lam`` is said by the
    original author to make the curves less likely to cross and faster to
    fit (see the pyGAM tour notebook).

    Parameters
    ----------
    x, y : list or numpy.ndarray
        Data; lists are converted to arrays.
    **kwargs
        ``expectile_ulim`` (default ``.95``) and ``expectile_llim``
        (default ``.05``) set the envelope.

    Returns
    -------
    list of int
        Positions ``i`` where ``y[i]`` is above the upper or below the lower
        expectile curve.
    """
    from pygam import s, ExpectileGAM

    x = np.array(x) if isinstance(x, list) else x
    y = np.array(y) if isinstance(y, list) else y
    X = x.reshape(len(x), 1)

    # https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html
    median_model = ExpectileGAM(expectile=.5, terms=s(0)).gridsearch(X, y)
    lam = median_model.lam

    upper_expectile = kwargs.get('expectile_ulim', .95)
    lower_expectile = kwargs.get('expectile_llim', .05)

    # now fit the envelope models with the smoothing copied from the median
    upper_model = ExpectileGAM(expectile=upper_expectile, lam=lam,
                               terms=s(0)).fit(X, y)
    lower_model = ExpectileGAM(expectile=lower_expectile, lam=lam,
                               terms=s(0)).fit(X, y)
    ulim = upper_model.predict(X)
    llim = lower_model.predict(X)

    idx = []
    for i in range(len(y)):
        if y[i] > ulim[i] or y[i] < llim[i]:
            idx.append(i)
    return idx
def get_importances(X, y, Xtest, ytest):
    """Score each predictor by the out-of-sample R2 of a single-term GAM.

    For every column of ``X`` a ``LinearGAM`` with one spline term and no
    intercept is fitted and then grid-searched on that column alone. The
    importance is the squared Pearson correlation between ``ytest`` and the
    model's prediction for the same column of ``Xtest``.

    Returns
    -------
    dict
        Maps each column name of ``X`` to its R2.
    """
    importances = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest,pred)[0,1]**2
        importances[predictor] = R2
    return importances
Esempio n. 24
0
    def _fit_gams(self, temp_t, temp_m, temp_y):
        """Fit the mediator model and the outcome model.

        The mediator model regresses ``temp_m`` on ``temp_t`` with a single
        spline term. The outcome model regresses ``temp_y`` on the
        column-wise concatenation of ``temp_t`` and ``temp_m`` with one
        spline term each. Both are LinearGAMs with an intercept and share
        ``self.max_iter`` and ``self.lambda_``.

        Returns
        -------
        tuple
            ``(mediator_model, outcome_model)``.
        """

        def spline(column):
            return s(column,
                     n_splines=self.n_splines,
                     spline_order=self.spline_order)

        shared = {
            "fit_intercept": True,
            "max_iter": self.max_iter,
            "lam": self.lambda_,
        }

        temp_mediator_model = LinearGAM(spline(0), **shared)
        temp_mediator_model.fit(temp_t, temp_m)

        temp_outcome_model = LinearGAM(spline(0) + spline(1), **shared)
        outcome_design = pd.concat([temp_t, temp_m], axis=1)
        temp_outcome_model.fit(outcome_design, temp_y)

        return temp_mediator_model, temp_outcome_model
Esempio n. 25
0
def get_importances(X, y, Xtest, ytest):
    """Score each predictor by the out-of-sample R2 of a single-term GAM.

    For every column of ``X`` a ``LinearGAM`` with one spline term and no
    intercept is fitted and then grid-searched on that column alone. The
    importance is the squared Pearson correlation between ``ytest`` and the
    model's prediction for the same column of ``Xtest``.

    Returns
    -------
    dict
        Maps each column name of ``X`` to its R2.
    """
    importances = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
Esempio n. 26
0
    def _fit(self, X, y, mylam=None, **kwargs):
        """Create ``self.model`` and fit it, optionally by grid search over ``lam``.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Features; a DataFrame is replaced by its ``.values``.
        y : array-like
            Targets.
        mylam : sequence, optional
            Candidate ``lam`` values for the grid search; defaults to
            ``self.search_lam``.
        **kwargs
            Forwarded to ``fit`` / ``gridsearch``.

        Returns
        -------
        object
            The result of ``self.model.fit`` when ``self.search`` is false,
            otherwise ``self``.

        Raises
        ------
        Exception
            ``'Training fails.'`` when the model has no ``statistics_``
            attribute after the grid search.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        if not self.fit_binary_feat_as_factor_term:
            # No explicit terms are passed; the model class picks its own.
            self.model = self.model_cls(max_iter=self.max_iter,
                                        n_splines=self.n_splines,
                                        **self.kwargs)
        else:
            # One term per feature: factor term for two distinct values,
            # spline term for more; features with fewer than two distinct
            # values get no term.
            formulas = []
            for idx, feat_name in enumerate(self.feature_names):
                num_unique_x = len(self.X_values_counts[feat_name])
                if num_unique_x < 2:
                    continue

                if num_unique_x == 2:
                    formulas.append(f(idx))
                else:
                    formulas.append(s(idx))

            # NOTE(review): raises IndexError if every feature was skipped
            # above -- confirm callers guarantee at least one usable feature.
            the_formula = formulas[0]
            for term in formulas[1:]:
                the_formula += term

            self.model = self.model_cls(the_formula,
                                        max_iter=self.max_iter,
                                        n_splines=self.n_splines,
                                        **self.kwargs)

        if not self.search:
            # Just fit the model with this lam
            return self.model.fit(X, y, **kwargs)

        if mylam is None:
            mylam = self.search_lam

        # do a grid search over here
        try:
            print('search range from %f to %f' % (mylam[0], mylam[-1]))
            self.model.gridsearch(X, y, lam=mylam, **kwargs)
        except (np.linalg.LinAlgError, pygam.utils.OptimizationError) as e:
            print('Get the following error:', str(e),
                  '\nRetry the grid search')
            if hasattr(self.model, 'coef_'):
                del self.model.coef_

            # Retry without the first lam candidate; the recursive call
            # rebuilds self.model from scratch.
            # NOTE(review): once mylam is exhausted, mylam[0] above raises
            # IndexError, which this except clause does not catch.
            self._fit(X, y, mylam=mylam[1:], **kwargs)

        if not hasattr(self.model,
                       'statistics_'):  # Does not finish the training
            raise Exception('Training fails.')

        return self
Esempio n. 27
0
    def _predict_gam(ds, conf, time, quantiles=None, size=None,
                     return_gam=False,  return_counts=False,
                     max_time_diff=200):
        """Fit a one-term LinearGAM to the ensemble and predict it at ``time``.

        The ensemble age and climate variables are named
        ``conf.age + '_ensemble'`` and ``conf.climate + '_ensemble'``. Pairs
        with a NaN in either are ignored; if nothing remains, ``None`` is
        returned.

        The result is a tuple that starts with the prediction and is
        extended, in this order, by: prediction intervals (when
        ``quantiles`` is given), posterior samples (when ``size`` is given),
        neighbour counts (when ``return_counts`` is truthy; its value is
        used as the query radius) and the fitted model (when ``return_gam``
        is true). Every array entry whose prediction time has fewer than 100
        ensemble ages within ``max_time_diff`` is set to NaN.
        """
        all_ages = ds[conf.age + '_ensemble'].values.ravel()
        all_climate = ds[conf.climate + '_ensemble'].values.ravel()

        usable = (~np.isnan(all_ages)) & (~np.isnan(all_climate))
        if not usable.any():
            return
        x = all_ages[usable]
        y = all_climate[usable]

        gam = pygam.LinearGAM(pygam.s(0)).gridsearch(
            x[:, np.newaxis], y, progress=False)

        time = np.asarray(time)
        query_points = time[:, np.newaxis]

        outputs = [gam.predict(time)]
        if quantiles is not None:
            outputs.append(
                gam.prediction_intervals(time, quantiles=quantiles))
        if size is not None:
            draws = gam.sample(
                x[:, np.newaxis], y, sample_at_X=time, n_draws=size)
            outputs.append(draws.T)

        # The tree is built on all ensemble ages, not only the usable ones.
        tree = BallTree(all_ages[:, np.newaxis])
        if return_counts:
            neighbour_counts = tree.query_radius(
                query_points, return_counts, count_only=True)
            outputs.append(neighbour_counts.astype(float))

        # look how many samples in the ensemble fall into the `max_time_diff`
        # time interval around the predicted time
        support = tree.query_radius(query_points, max_time_diff,
                                    count_only=True)
        poorly_supported = support < 100
        if poorly_supported.any():
            for arr in outputs:
                arr[poorly_supported] = np.nan

        if return_gam:
            outputs.append(gam)
        return tuple(outputs)
    def fit(self):
        """Grid-search a GAM explainer on the training data.

        Features listed in ``self.numerical_features`` get a spline term and
        all others a factor term. ``self.mode`` selects ``LinearGAM``
        (``'regression'``) or ``LogisticGAM`` (``'classification'``); any
        other mode raises ``NameError``. On success ``self._is_fitted`` is
        set and the model is stored in ``self.explainer``.
        """

        def term_for(position):
            if self.feature_names[position] in self.numerical_features:
                return s(position)
            return f(position)

        S = term_for(0)
        for position in range(1, len(self.feature_names)):
            S += term_for(position)

        if self.mode == 'regression':
            gam = LinearGAM(S)
            gam.gridsearch(self.X_train, self.y_train)
        elif self.mode == 'classification':
            gam = LogisticGAM(S)
            # Only this branch converts the training features to an array.
            gam.gridsearch(np.array(self.X_train), self.y_train)
        else:
            raise NameError(
                'ERROR: mode should be regression or classification')

        self._is_fitted = True
        self.explainer = gam
Esempio n. 29
0
def calibrate_propensities(propensities, treatment):
    """Calibrate propensity scores against the observed treatments.

    A LogisticGAM with a single spline term is fitted with the scores as the
    only feature and the treatment indicator as the target.

    Args:
        propensities: propensity scores
        treatment: treatment indicator

    Returns:
        The fitted model's predicted probabilities for ``propensities``.
    """
    calibrator = LogisticGAM(s(0))
    calibrator = calibrator.fit(propensities, treatment)
    return calibrator.predict_proba(propensities)
Esempio n. 30
0
def create_rand_gam(number_of_searches, new_values, pred_y, y, pca_splines,
                    pca_lam, pred_splines, pred_lam, pred_factor):
    """Random-search the ``lam`` values of a LogisticGAM.

    ``pred_y`` is appended to ``new_values`` as an extra last column. Each
    original column gets a spline term; the appended column gets a factor
    term when ``pred_factor`` is true and a spline term otherwise.
    ``number_of_searches`` candidate ``lam`` vectors are drawn
    log-uniformly from ``[1e-4, 1e4]``, one value per term.

    Returns
    -------
    tuple
        ``(rand_gam, new_values, titles)``: the grid-searched model, the
        extended feature matrix, and the original column indices as strings.
    """
    n_components = new_values.shape[1]

    # random exponents on [-4, 4), one per term (components + prediction)
    exponents = np.random.rand(number_of_searches, n_components + 1) * 8 - 4
    lams = 10**exponents

    new_values = np.append(new_values, np.array(pred_y).reshape(-1, 1), axis=1)

    titles = [str(column) for column in range(n_components)]
    component_terms = [
        s(column, n_splines=pca_splines, lam=pca_lam)
        for column in range(n_components)
    ]
    x = component_terms[0]
    for term in component_terms[1:]:
        x = x + term

    # The appended prediction column sits right after the components.
    pred_column = n_components
    if pred_factor:
        x = x + pygam.terms.f(pred_column, lam=pred_lam)
    else:
        x = x + s(pred_column, n_splines=pred_splines, lam=pred_lam)

    rand_gam = LogisticGAM(x).gridsearch(new_values, y, lam=lams)
    return rand_gam, new_values, titles
Esempio n. 31
0
    def updateEmpTauX(self, bFit=True, mask=None):
        """Update the tau statistics from a GAM fitted to the log squared differences.

        Parameters
        ----------
        bFit : bool, default True
            When true, refit ``self.gam`` (monotonically increasing spline of
            the log squared difference on ``log(X + 0.5)``). If the fit
            raises ``ValueError``, ``self.updateFixedTau(mask)`` is called
            instead and the method returns. When false, the existing
            ``self.gam`` is reused.
        mask : array, optional
            Entries equal to 1 are used and updated; defaults to all ones of
            shape ``(self.V, self.S)``.

        Writes ``self.betaTau``, ``self.expTau`` and ``self.expLogTau`` in
        place at the positions where ``mask == 1``. Returns ``None``.
        """
        if mask is None:
            mask = np.ones((self.V, self.S))

        square_diff_matrix = self.exp_square_diff_matrix()

        excluded = mask == 0
        selected = mask == 1

        # The selected entries of X feed both the fit and the prediction, so
        # they are extracted and log-transformed once instead of twice.
        X1D = np.ma.compressed(np.ma.masked_where(excluded, self.X))
        logX1D = np.log(0.5 + X1D)

        if bFit:
            sq_diff_1d = np.ma.compressed(
                np.ma.masked_where(excluded, square_diff_matrix))
            log_sq_diff = np.log(sq_diff_1d + NMF_VB.minVar)

            try:
                self.gam = LinearGAM(
                    s(0, n_splines=5,
                      constraints='monotonic_inc')).fit(logX1D, log_sq_diff)
            except ValueError:
                print("Performing fixed tau")
                self.updateFixedTau(mask)
                return

        yest_sm = self.gam.predict(logX1D)

        mBetaTau = self.beta * (X1D + 0.5) + 0.5 * np.exp(yest_sm)
        np.place(self.betaTau, selected, mBetaTau)

        mExpTau = (self.alpha + 0.5) / mBetaTau
        np.place(self.expTau, selected, mExpTau)

        mLogTau = digamma(self.alpha + 0.5) - np.log(mBetaTau)
        np.place(self.expLogTau, selected, mLogTau)
Esempio n. 32
0
    def mean(self, smooth=None, **kwargs):
        """Compute an estimate of the mean.

        Parameters
        ----------
        smooth: str, default=None
            Name of the smoothing method to use: ``'LocalLinear'``, ``'GAM'``
            or ``'SmoothingSpline'``. ``None`` returns the pointwise mean of
            the observations without smoothing.

        Keyword Args
        ------------
        points: default=0.5
            Passed to ``self.smooth`` for ``'LocalLinear'`` smoothing.
        neighborhood: int
            Passed to ``self.smooth`` for ``'LocalLinear'`` smoothing. The
            default is ``int(p * exp(-log(log(p))**2))`` with ``p`` the
            number of sampling points.
        n_basis: int, default=10
            Number of splines basis used for GAM smoothing.

        Returns
        -------
        obj: DenseFunctionalData object
            An estimate of the mean as a DenseFunctionalData object with the
            same argvals as `self` and one observation.

        Raises
        ------
        ValueError
            If smoothing is requested for data with more than one dimension.
        NotImplementedError
            If ``smooth`` is not one of the supported names.

        """
        mean_estim = self.values.mean(axis=0)

        if smooth is not None:
            argvals = self.argvals['input_dim_0']
            if self.n_dim > 1:
                raise ValueError('Only one dimensional data can be smoothed.')
            if smooth == 'LocalLinear':
                p = self.n_points['input_dim_0']
                points = kwargs.get('points', 0.5)
                if 'neighborhood' in kwargs:
                    neigh = kwargs['neighborhood']
                else:
                    # Builtin int replaces the np.int alias, which NumPy 1.24
                    # removed; the default is only computed when needed.
                    neigh = int(p * np.exp(-(np.log(np.log(p)))**2))
                data_smooth = self.smooth(points=points, neighborhood=neigh)
                mean_estim = data_smooth.values.mean(axis=0)
            elif smooth == 'GAM':
                n_basis = kwargs.get('n_basis', 10)
                mean_estim = pygam.LinearGAM(pygam.s(0, n_splines=n_basis)).\
                    fit(argvals, mean_estim).\
                    predict(argvals)
            elif smooth == 'SmoothingSpline':
                ss = SmoothingSpline()
                mean_estim = ss.fit_predict(argvals, mean_estim)
            else:
                raise NotImplementedError('Smoothing method not implemented.')
        # A leading axis is added so the result holds a single observation.
        return DenseFunctionalData(self.argvals, mean_estim[np.newaxis])
Esempio n. 33
0
import patsy as pt
import numpy as np
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

# Script: fit a six-term LinearGAM to the world-happiness data and prepare a
# 2x3 plotly figure for the per-term effects.
# NOTE(review): `pd`, `LinearGAM` and `s` are used below but are not imported
# in the visible lines -- confirm they are imported elsewhere.

# Prep the dataset (absolute path on the original author's machine)
data = pd.read_csv(
    "/home/dusty/Econ8310/DataSets/HappinessWorld.csv")

# Generate x and y matrices; the "-1" in the formula drops the intercept
# column from the design matrix
eqn = """happiness ~ -1 + freedom + family + year + economy + health + trust"""
y,x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model: one spline term per design-matrix column,
# tuned with gridsearch
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape: one subplot per predictor, in formula order
titles = ['freedom', 'family', 'year', 'economy',
          'health', 'trust']

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
  XX = gam.generate_X_grid(term=i)
  pdep, confi = gam.partial_dependence(term=i, width=.95)
  trace = go.Scatter(x=XX[:,i], y=pdep, mode='lines', name='Effect')
  ci1 = go.Scatter(x = XX[:,i], y=confi[:,0], line=dict(dash='dash', color='grey'), name='95% CI')
  ci2 = go.Scatter(x = XX[:,i], y=confi[:,1], line=dict(dash='dash', color='grey'), name='95% CI')