def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM formula: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])  # placeholder fit; the model is re-created and refit per fold below

    # run full model
    GAM_results = {}
    for name, y in Y.items():  # .iteritems() was removed in pandas 2.0
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out-of-fold predictions
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # get importances, defined as the predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        cv_scores = [{'r': np.corrcoef(y, pred)[0, 1],
                      'R2': np.corrcoef(y, pred)[0, 1] ** 2,
                      'MAE': mean_absolute_error(y, pred)}]

        # in-sample fit on all data
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y, in_pred)[0, 1],
                      'R2': np.corrcoef(y, in_pred)[0, 1] ** 2,
                      'MAE': mean_absolute_error(y, in_pred)}]
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
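# For reference, a minimal self-contained sketch of the term-building
# pattern used by run_GAM, on synthetic data (the BalancedKFold splitter
# and get_importances helper come from the surrounding repository and are
# not reproduced here):
import numpy as np
import pandas as pd
from pygam import LinearGAM, s

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
y_demo = np.sin(X_demo['a']) + 0.5 * X_demo['b'] + rng.normal(scale=0.1, size=100)

# one spline term per column, exactly as in run_GAM
formula = s(0, n_splines=10)
for i in range(1, X_demo.shape[1]):
    formula = formula + s(i, n_splines=10)

gam_demo = LinearGAM(formula).gridsearch(X_demo.values, y_demo.values)
print(gam_demo.statistics_['GCV'])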
Example #2
def GAM(X, Y, factor = False):

    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_test = poly.fit_transform(X_test)
    X_train = poly.fit_transform(X_train)

    linear = ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'y', 'n', 'y',
    'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']

    # for feature in X_train.T:
    #     unique = np.unique(feature)
    #     if len(unique) < 6:
    #         linear.append("n")
    #     else:
    #         idx = np.argsort(feature)
    #         plt.plot(feature[idx], Y.squeeze()[idx])
    #         plt.show()
    #         linear.append(input("Linear?\t"))

    linear = np.array(linear)
    linear[linear == "n"] = 0
    linear[linear == "y"] = 1
    linear = linear.astype(bool)

    gam_input = None
    for n, is_linear in enumerate(linear):
        # build the term for feature n, then append it to the formula
        if is_linear:
            term = GAM_line(n)
            if factor:
                term = term + GAM_factor(n)
        else:
            term = GAM_spline(n)
        gam_input = term if gam_input is None else gam_input + term

    gam = LinearGAM(gam_input, fit_intercept = False, max_iter = int(1E5))
    gam.fit(X_train, Y_train)
    Y_predict_train = gam.predict(X_train)
    Y_predict_test = gam.predict(X_test)
    MSE_train = np.mean((Y_predict_train - Y_train)**2)
    MSE_test = np.mean((Y_predict_test - Y_test)**2)
    return MSE_train, MSE_test
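# The GAM_line / GAM_spline / GAM_factor helpers used above are not shown
# in this snippet; assuming they are thin wrappers around pygam's term
# constructors, minimal hypothetical versions would be:
from pygam import l, s, f

def GAM_line(n):
    return l(n)    # linear term for feature n

def GAM_spline(n):
    return s(n)    # spline term for feature n

def GAM_factor(n):
    return f(n)    # factor (categorical) term for feature n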
Example #3
def fit_pygam_model(X_train: pandas.core.frame.DataFrame,
                    X_test: pandas.core.frame.DataFrame,
                    y_train: pandas.core.frame.DataFrame,
                    y_test: pandas.core.frame.DataFrame):
    '''
    Creates a general additive model LinearGAM (normally distributed errors)
    with grid search. Returns the best model with given hyperparameters.
    hyperparameters: n_splines and lam regularization parameter.
    '''
    from pygam import LinearGAM
    gam = LinearGAM().gridsearch(X_train.values,
                                 y_train,
                                 n_splines=np.arange(3, 20),
                                 lam=np.logspace(-3, 3, 11))
    gam.summary()  # summary() prints directly and returns None

    y_train_predicted = gam.predict(X_train)
    y_test_predicted = np.floor(gam.predict(X_test))

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    r2_train = r2_score(y_train, y_train_predicted)
    print("RMSE of training set is {}".format(rmse_train))
    print("MAE of testing set is {}".format(mae_train))
    print("R2 score of training set is {}\n".format(r2_train))

    if len(y_test) > 0:
        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        mae_test = mean_absolute_error(y_test, y_test_predicted)
        r2_test = r2_score(y_test, y_test_predicted)
        print("RMSE of testing set is {}".format(rmse_test))
        print("MAE of testing set is {}".format(mae_test))
        print("R2 score of testing set is {}\n".format(r2_test))
    '''
    Visualize the feature significance and confidence intervals
    '''
    num_features = len(X_train.columns)
    fig = plt.figure(figsize=(18, 12))
    fig.subplots_adjust(hspace=0.4)

    cnt = 1
    p_values = gam.statistics_['p_values']

    for i in range(num_features):
        axs = fig.add_subplot(num_features, 1, cnt)
        m = gam.generate_X_grid(term=i)
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i,
                                        X=m))  # the fitted partial dependence
        axs.plot(m[:, i],
                 gam.partial_dependence(term=i, X=m, width=.95)[1],
                 c='r',
                 ls='--')  # the 95% confidence intervals
        axs.set_title(X_train.columns[i] +
                      ('*' if p_values[i] < 0.05 else ''))  # p_values is indexed by term, so use i
        cnt += 1
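# A minimal usage sketch for fit_pygam_model on synthetic data; the column
# names and values below are illustrative only:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(1)
X_demo = pd.DataFrame({'x1': rng.uniform(size=200), 'x2': rng.uniform(size=200)})
y_demo = pd.DataFrame({'y': 3 * X_demo['x1'] ** 2 + rng.normal(scale=0.1, size=200)})

X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=0)
fit_pygam_model(X_tr, X_te, y_tr, y_te)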
Example #5
def check_significance(x, y, x_test, cols, col_add):
    if len(cols) == 0:
        return True
    model = LinearGAM().gridsearch(x[cols].values, y, progress=False)
    predictions1 = model.predict(x_test[cols].values)
    model = LinearGAM().gridsearch(x[cols + [col_add]].values,
                                   y,
                                   progress=False)
    predictions2 = model.predict(x_test[cols + [col_add]].values)
    test_stats = wilcoxon(predictions1, predictions2)

    return test_stats.pvalue < 0.05
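# A sketch of how check_significance could drive greedy forward selection;
# the loop below is illustrative and not part of the source (note that an
# empty starting set always accepts the first candidate):
selected = []
for candidate in x.columns:
    if check_significance(x, y, x_test, selected, candidate):
        selected.append(candidate)
print("Selected features:", selected)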
def get_surrogate_predictions(X, y, w, pred_mask=None):
    if pred_mask is None:
        pred_mask = np.ones(len(y), dtype=bool)
        fit_mask = pred_mask
    else:
        fit_mask = ~pred_mask
    # get surrogates
    model_1 = LinearGAM()
    model_1.fit(X[fit_mask & (w == 1), :], y[fit_mask & (w == 1)])
    mu_1_plug = model_1.predict(X[pred_mask, :])

    model_0 = LinearGAM()
    model_0.fit(X[fit_mask & (w == 0), :], y[fit_mask & (w == 0)])
    mu_0_plug = model_0.predict(X[pred_mask, :])

    return mu_0_plug, mu_1_plug
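# The two surrogate models yield a plug-in estimate of the treatment
# effect; a one-line follow-up using the same X, y, w as above:
mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)
tau_plug = mu_1_plug - mu_0_plug  # plug-in CATE estimate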
Example #7
def smoother_linearGAM(x, y, X, **kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x, list):
        x = np.array(x)
    x = x.reshape(len(x), 1)
    if isinstance(y, list):
        y = np.array(y)
    if isinstance(X, list):
        X = np.array(X)
    if X is None:
        X = x.reshape(len(x), 1)
    else:
        X = X.reshape(len(X), 1)
    # An n_splines heuristic (len(y) / 5) was tried here because the
    # automatic choice is too smooth:
    # n_splines = kwargs.get('n_splines', int(len(y) / 5))
    # gam = LinearGAM(n_splines=n_splines, terms=s(0, basis='ps')).gridsearch(x, y)
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)
    # sample on the input grid
    means = gam.predict(X)
    return means
def GAM(X, Y):

    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    poly = PolynomialFeatures(1)
    X_test = poly.fit_transform(X_test)
    X_train = poly.fit_transform(X_train)

    gam_input = None
    for n in range(X_train.shape[1]):
        if gam_input is not None:
            gam_input += GAM_spline(n)
        else:
            gam_input = GAM_spline(n)

    gam = LinearGAM(gam_input).fit(X_train, Y_train)
    Y_predict = gam.predict(X_test)
    Y_predict[Y_predict >= 0.5] = 1
    Y_predict[Y_predict < 0.5] = 0
    accuracy = (Y_predict.squeeze() == Y_test.squeeze()).astype(int)
    accuracy = np.sum(accuracy)/accuracy.shape[0]
    return accuracy
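# The function above thresholds a LinearGAM at 0.5 to classify; a sketch of
# the more direct alternative using pygam's LogisticGAM, assuming the same
# preprocessed arrays and term list:
from pygam import LogisticGAM

def GAM_logistic(X_train, Y_train, X_test, Y_test, gam_input):
    gam = LogisticGAM(gam_input).fit(X_train, Y_train)
    return gam.accuracy(X_test, Y_test)  # fraction classified correctly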
def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True,
                               random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))

    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = 1
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting po when base model not fitted should not be possible
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # a warning should be raised when only one fold is requested
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200, baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(), te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
Example #10
def get_importances(X, y, Xtest, ytest):
    importances = {}
    for predictor, vals in X.items():  # .iteritems() was removed in pandas 2.0
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
Example #12
def GAM_linear(X, y):
    X = X.to_numpy()
    y = y.to_numpy()
    from pygam import LinearGAM, s, f, te
    gam = LinearGAM(s(0) + s(1) + f(2))
    gam.gridsearch(X, y)
    y_pred = gam.predict(X)
    y_pred = pd.DataFrame(y_pred)
    y_pred['actual'] = y
    y_pred['residual'] = y_pred.actual - y_pred[0]
    # note: gam.summary() prints to stdout and returns None
    return gam, gam.summary(), y_pred
Example #13
def interp_gam(data):
    valid = np.isfinite(data.stream_dist.values[:, 0])
    sample_xy = data.sample_xy.values[valid]
    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]
    if np.sum(valid) == 0:
        return np.nan

    gam = LinearGAM(
        s(0, n_splines=4) + s(1, n_splines=5) +
        te(0, 1, n_splines=4)).gridsearch(sample_st, sample_z)
    z_pred = gam.predict(np.array([[0, 0]]))[0]
    return z_pred
Example #14
def gam_results(x, y, df, param, infection_time):
    gam = LinearGAM(s(0), lam=.5).fit(x, y)
    y_new = gam.predict(x)
    confi1 = gam.prediction_intervals(x, width=.95)
    pred = np.zeros(x.shape[0])
    for i in np.arange(x.shape[0]):
        if i == 0:
            pred[i] = np.mean(df[param].iloc[0:3])
        else:
            # the original pre-/post-infection branches were identical,
            # so a single growth update suffices
            pred[i] = pred[i - 1] * y_new[i] + pred[i - 1]

    if param == 'Positive':
        pred = pred + np.concatenate(
            (np.zeros(infection_time),
             pred[0:(pred.shape[0] - infection_time)]),
            axis=0)

    x_forecast = np.arange(np.max(x), np.max(x) + 10)
    y_forecast = gam.predict(x_forecast)
    confi = gam.prediction_intervals(x_forecast, width=.95)

    forecast = np.zeros(x_forecast.shape[0])
    forecast_L = np.zeros(x_forecast.shape[0])
    forecast_U = np.zeros(x_forecast.shape[0])
    for i in np.arange(x_forecast.shape[0]):
        if i == 0:
            forecast[i] = df[param].iloc[-1]
            forecast_L[i] = forecast[i]
            forecast_U[i] = forecast[i]
        else:
            forecast[i] = forecast[i - 1] * y_forecast[i - 1] + forecast[i - 1]
            forecast_L[i] = forecast_L[i - 1] * confi[i - 1, 0] + forecast_L[i - 1]
            forecast_U[i] = forecast_U[i - 1] * confi[i - 1, 1] + forecast_U[i - 1]
    return [pred, forecast, forecast_L, forecast_U, y_new, confi1]
Example #15
def AAM():

    gam = LinearGAM(s(0, n_splines=25, spline_order=3, constraints='concave', penalties = 'auto', basis = 'cp', edge_knots=[147, 147])
                        + l(3)  # the last travel time
                        + te(0, 1)  # distance and departure_time
                        + te(2, 0)  # distance and isWeekend
                        + l(2),  # isWeekend
                    fit_intercept=True)

    gam.gridsearch(X1, y1).summary()  # summary() prints directly and returns None
    # print(gam.gridsearch(X1, y1).get_params(deep=True))
    '''plt.scatter(X1[:,0][0:56], y1[0:56], s=3, linewidth=1, label = 'data')
    plt.plot(X1[:,0][0:56], gam.predict(X1[0:56]), color = 'red', linewidth = 1, label = 'prediction')
    plt.legend()
    plt.title('Extended Additive Model')
    plt.show()'''
    # error calculation
    rmse_val = rmse(np.array(y1), np.array(gam.predict(X1)))
    print("RMSE is: "+str(rmse_val))
    mae = mean_absolute_error(y1, gam.predict(X1))
    print("MAE is: "+str(mae))
    mape = mean_absolute_percentage_error(np.array(y1), np.array(gam.predict(X1)))
    print("MAPE is: "+ str(mape))
def predict_gam(ad_group, date):
    ads_file = 'data/ad_table.csv'
    df = pd.read_csv(ads_file, header=0, sep=',')
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
    splines = [5, 7, 10, 20, 30, 40, 45]
    lams = np.logspace(-3, 3, 7)
    if ad_group in df['ad'].unique():
        df_ad_group_train = df[df['ad'] == ad_group]
        df_ad_group_train = df_ad_group_train.reset_index()
        df_ad_group_train['time_period'] = (df_ad_group_train['date'] - df_ad_group_train['date'][0]).dt.days
        X_train = df_ad_group_train[['time_period']].values
        y_train = df_ad_group_train['shown'].values
        # auto tuning
        gam = LinearGAM().gridsearch(X_train, y_train, lam=lams, n_splines=splines)
        predictions = gam.predict(X_train)
        print('==== Tuning for ad group %s - best generalized cross-validation %f' % (ad_group, gam.statistics_['GCV']))
        tuning_result = (gam.lam[0][0], gam.n_splines[0], gam.statistics_['GCV'])
        predict_date = (pd.to_datetime(date) - df_ad_group_train['date'][0]).days
        print("Auto tuning result =", tuning_result)
        print("Prediction for number of ads shown for", ad_group, "on", date, "=", gam.predict([[predict_date]]))
        print("Regularization/Lambda value =", gam.lam)
        print("n_splines =", gam.n_splines)
    else:
        print("Ad group does not exist")
Example #17
def GAM_model(df, feature_list):
    X_train = df[feature_list]
    y_train = df[['logerror']]
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(X_train)
    X_scaled = pd.DataFrame(scaler.transform(X_train),
                            columns=X_train.columns.values).set_index(
                                [X_train.index.values])
    X_scaled = X_scaled.to_numpy()
    y_train = y_train.to_numpy()
    from pygam import LinearGAM, s, f, te
    gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
    gam.gridsearch(X_scaled, y_train)
    y_pred = gam.predict(X_scaled)
    y_pred = pd.DataFrame(y_pred)
    y_pred['actual'] = y_train
    y_pred.columns = ['predicted', 'actual']
    RMSE = float('{:.3f}'.format(
        sqrt(mean_squared_error(y_pred.actual, y_pred.predicted))))
    R2 = float('{:.3f}'.format(r2_score(y_pred.actual, y_pred.predicted)))
    return RMSE, R2, gam
Example #18
def GAMfitter(indir, dat_st, T0=None):
    fname = [i for i in os.listdir(indir) if dat_st in i]
    data = np.loadtxt(indir + fname[0])
    frequency = np.linspace(
        1e-3, 0.5, int(1e6))  # A range for frequencies (2 to 1000 day periods)
    power = LombScargle(data[:, 0],
                        data[:, 1]).power(frequency=frequency)  # Get spectrum
    ind = get_index_of_max(power)  # Best frequency

    if T0 is None:  # If we have no preset T0, try to get a minimum
        phs = get_phase_curve(data[:, 0], data[0, 0], 1 / frequency[ind])
        ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
        gam = LinearGAM(n_splines=30).gridsearch(ext_phs,
                                                 ext_mags)  # Fit a GAM

        XX = gam.generate_X_grid(term=0, n=500)
        fit = gam.predict(XX)  # This is the fit on the grid
        minimal_val = max(fit)  # Maximum magnitude (minimal brightness)
        min_ind = get_index_of_min(abs(data[:, 1] - minimal_val))
        T0 = data[min_ind, 0]

    phs = get_phase_curve(data[:, 0], T0, 1 / frequency[ind])
    ext_phs, ext_mags = phase_curve_extender(phs, data[:, 1])
    gam = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)

    pred_int_vls = gam.prediction_intervals(phs, width=.85)
    cond = (data[:, 1] > pred_int_vls[:, 0]) & (data[:, 1] < pred_int_vls[:, 1])

    filtered_data = data[cond]

    power_f = LombScargle(data[:, 0], data[:, 1]).power(frequency=frequency)
    ind_f = get_index_of_max(power_f)
    phs_f = get_phase_curve(filtered_data[:, 0], T0, 1 / frequency[ind_f])
    ext_phs, ext_mags = phase_curve_extender(phs_f, filtered_data[:, 1])
    gam_f = LinearGAM(n_splines=30).gridsearch(ext_phs, ext_mags)

    return filtered_data, gam_f, frequency[ind_f], T0
Example #19
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Perform grid search and train Linear GAM model and return predictions for the test set.
    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest:  X values for validation.
    :return: Predictions from Linear GAM model for test dataset
    """
    # Create an array of lambda values to search
    lams = np.logspace(-3, 20, 35)
    # GAM search requires numpy arrays
    Xtrain_np = np.array(Xtrain, dtype=np.float64)
    Ytrain_np = np.array(Ytrain, dtype=np.float64)

    # Linear Generalised Additive Model
    model = LinearGAM(
        s(99) + s(100) + l(3) + l(6) + l(8) + l(11) + l(7) + l(9) + l(12) +
        l(10) + l(14) + l(29) + l(15) + l(71) + l(17) + l(21) + l(107) +
        l(16) + l(68) + l(78) + l(61) + l(55) + l(31) + l(13) + l(37) + l(4) +
        l(5) + l(2) + te(4, 5) + te(68, 78)).gridsearch(Xtrain_np,
                                                        Ytrain_np,
                                                        lam=lams)
    return model.predict(Xtest)
Example #20
def cleaner_linearGAM(x, y, **kwargs):
    from pygam import LinearGAM, l, s
    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    X = x.reshape(len(x), 1)
    # An n_splines heuristic (len(y) / 5) was tried here because the
    # automatic choice is too smooth:
    # n_splines = kwargs.get('n_splines', int(len(y) / 5))
    # gam = LinearGAM(n_splines=n_splines, terms=s(0, basis='ps')).gridsearch(X, y)
    gam = LinearGAM(terms=s(0, basis='ps')).gridsearch(X, y)
    # sample on the input grid
    means = gam.predict(X)
    bounds = gam.prediction_intervals(X, width=.95)
    idx = [i for i in range(len(y))
           if y[i] > bounds[i, 1] or y[i] < bounds[i, 0]]
    return idx
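# A minimal usage sketch for cleaner_linearGAM with two injected outliers
# (synthetic data, illustrative values):
import numpy as np

x_demo = np.linspace(0, 10, 200)
y_demo = np.sin(x_demo) + np.random.normal(scale=0.1, size=200)
y_demo[50] += 3.0    # spikes that should fall outside the 95% interval
y_demo[150] -= 3.0
print(cleaner_linearGAM(x_demo, y_demo))  # expected to include 50 and 150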
# Based on "Elements of causal inference" code snippet 4.14


#https://pygam.readthedocs.io/en/latest/notebooks/quick_start.html#
import pygam
from pygam import LinearGAM, s

import numpy as np
np.random.seed(42)
N = 200
X = np.random.randn(N)
Y = np.power(X, 3) + np.random.randn(N)

gam_fwd = LinearGAM(s(0)).fit(X, Y)
Yhat = gam_fwd.predict(X)
residuals_fwd = Y - Yhat 
loglik_fwd = -(np.log(np.var(X)) + np.log(np.var(residuals_fwd)))
print(loglik_fwd)

gam_back = LinearGAM(s(0)).fit(Y, X)
Xhat = gam_back.predict(Y)  # predict with the backward model, not gam_fwd
residuals_back = X - Xhat
loglik_back = -(np.log(np.var(Y)) + np.log(np.var(residuals_back)))
print(loglik_back)
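# The snippet prints both scores but leaves the conclusion implicit; in the
# spirit of code snippet 4.14, the direction with the higher likelihood
# score is the inferred causal direction (follow-up not in the original):
if loglik_fwd > loglik_back:
    print("Inferred direction: X -> Y")
else:
    print("Inferred direction: Y -> X")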
Example #22
    def explain_instance_with_data(self,
                                   neighborhood_data,
                                   neighborhood_labels,
                                   distances,
                                   label,
                                   num_features,
                                   feature_selection='auto',
                                   model_regressor=None,
                                   gam_type=None):
        """Takes perturbed data, labels and distances, returns explanation.

        Args:
            neighborhood_data: perturbed data, 2d array. first element is
                               assumed to be the original data point.
            neighborhood_labels: corresponding perturbed labels. should have as
                                 many columns as the number of possible labels.
            distances: distances to original data point.
            label: label for which we want an explanation
            num_features: maximum number of features in explanation
            feature_selection: how to select num_features. options are:
                'forward_selection': iteratively add features to the model.
                    This is costly when num_features is high
                'highest_weights': selects the features that have the highest
                    product of absolute weight * original data point when
                    learning with all the features
                'lasso_path': chooses features based on the lasso
                    regularization path
                'none': uses all features, ignores num_features
                'auto': uses forward_selection if num_features <= 6, and
                    'highest_weights' otherwise.
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression if None. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter
                to model_regressor.fit()

        Returns:
            (intercept, exp, score):
            intercept is a float.
            exp is a sorted list of tuples, where each tuple (x,y) corresponds
            to the feature id (x) and the local weight (y). The list is sorted
            by decreasing absolute value of y.
            score is the R^2 value of the returned explanation
        """

        weights = self.kernel_fn(distances)
        labels_column = neighborhood_labels[:, label]
        used_features = self.feature_selection(neighborhood_data,
                                               labels_column, weights,
                                               num_features, feature_selection)

        X = neighborhood_data[:, used_features]
        y = neighborhood_labels[:, label]
        (X_train, X_test, y_train, y_test, train_weights,
         test_weights) = train_test_split(X, y, weights, test_size=0.2)

        linear_model = Ridge(alpha=1,
                             fit_intercept=True,
                             random_state=self.random_state)

        gam = LinearGAM()
        dt = DecisionTreeRegressor()

        linear_model.fit(X_train, y_train, sample_weight=train_weights)
        gam.fit(X_train, y_train, weights=train_weights)
        dt.fit(X_train, y_train, sample_weight=train_weights)

        # # plot
        # for i, term in enumerate(gam.terms):
        #     if term.isintercept:
        #         continue
        #     XX = gam.generate_X_grid(term=i)
        #     # pdep = gam.predict(XX)
        #     pdep = gam.partial_dependence(term=i, X=XX) + linear_model.intercept_
        #     # line = XX[:, term.feature] * linear_model.coef_[term.feature]
        #     line = linear_model.predict(XX)
        #     dect = dt.predict(XX)
        #     plt.figure()
        #     plt.plot(XX[:, term.feature], pdep)
        #     plt.plot(XX[:, term.feature], line)
        #     plt.plot(XX[:, term.feature], dect)
        #     plt.title(repr(term))
        #     plt.show()
        # exit()

        y_lr = linear_model.predict(X_test)
        y_gam = gam.predict(X_test)
        y_dt = dt.predict(X_test)

        # y_lr = linear_model.predict(X_train)
        # y_gam = gam.predict(X_train)
        # y_dt = dt.predict(X_train)

        # mse_lr = mean_squared_error(y_test, y_lr, sample_weight=test_weights)
        # mse_gam = mean_squared_error(y_test, y_gam, sample_weight=test_weights)
        # mse_dt = mean_squared_error(y_test, y_dt, sample_weight=test_weights)

        mse_lr = explained_variance_score(y_test,
                                          y_lr,
                                          sample_weight=test_weights)
        mse_gam = explained_variance_score(y_test,
                                           y_gam,
                                           sample_weight=test_weights)
        mse_dt = explained_variance_score(y_test,
                                          y_dt,
                                          sample_weight=test_weights)

        # mse_lr = explained_variance_score(y_train, y_lr, sample_weight=train_weights)
        # mse_gam = explained_variance_score(y_train, y_gam, sample_weight=train_weights)
        # mse_dt = explained_variance_score(y_train, y_dt, sample_weight=train_weights)

        metrics = (mse_lr, mse_gam, mse_dt)

        prediction_score = linear_model.score(neighborhood_data[:,
                                                                used_features],
                                              labels_column,
                                              sample_weight=weights)

        local_pred = linear_model.predict(
            neighborhood_data[0, used_features].reshape(1, -1))

        linear_exp = sorted(zip(used_features, linear_model.coef_),
                            key=lambda x: np.abs(x[1]),
                            reverse=True)
        gam_exp = []
        for i, term in enumerate(gam.terms):
            if term.isintercept:
                continue
            XX = gam.generate_X_grid(term=i)
            y = gam.partial_dependence(term=i, X=XX)
            x = XX[:, term.feature]
            gam_exp.append((used_features[i], x, y))

        if self.verbose:
            print('Intercept', linear_model.intercept_)
            print(
                'Prediction_local',
                local_pred,
            )
            print('Right:', neighborhood_labels[0, label])
        # return (linear_model.intercept_,
        #         sorted(zip(used_features, linear_model.coef_),
        #                key=lambda x: np.abs(x[1]), reverse=True),
        #         prediction_score, local_pred)
        return (metrics, linear_exp, gam_exp)
Example #23
plt.ylabel('mpg')
plt.title('LOESS Smoothing')
plt.show()
'''
-------------------------------------------------------------------------------
------------------------Generalized Additive Models----------------------------
-------------------------------------------------------------------------------
'''

#GAMs
#https://github.com/dswah/pyGAM
#https://codeburst.io/pygam-getting-started-with-generalized-additive-models-in-python-457df5b4705f
from pygam import LinearGAM, LogisticGAM
gam_model = LinearGAM().fit(d[['disp', 'wt']], d['mpg'])
gam_model.summary()  # summary() prints directly and returns None
gam_predictions = gam_model.predict(d[['disp', 'wt']])
gam_mse = np.mean((gam_predictions - d['mpg'])**2)
print('MSE:', gam_mse)

#Plot the predictions with confidence intervals
plt.plot(list(d.index), gam_predictions, 'r--')
plt.plot(list(d.index),
         gam_model.prediction_intervals(d[['disp', 'wt']], width=.95),
         color='b',
         ls='--')
plt.scatter(list(d.index), d['mpg'], facecolor='gray', edgecolors='none')
plt.xlabel('Row Index')
plt.ylabel('mpg')
plt.title('GAM Prediction with 95% Confidence Interval')
plt.show()
# * We fit the model to our training dataset:

# In[9]:


model.gridsearch(X_train, y_train)


# #### Prediction

# In[10]:


# Model prediction

y_pred_validation = model.predict(X_validation)
y_pred_validation


# #### Evaluating our model:

# In[11]:


# we define a function to evaluate the model (MAPE)

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


# In[12]:
Example #25
def main():
    
    f = open('results.txt', 'w')

    f.write("Preprocessing data...\n\n")
    # pre-process data
    train_X, train_Y, train_idx, _, test_X, test_idx = load_data(config.data_path, config.test_path)
    names = list(train_X)
    types = train_X.dtypes
    floats = (types == np.float64)

    new_X_GAM, new_test_GAM = construct_features(train_X, train_Y, test_X, have_poly=False)
    
    # feature selection
    f.write("Feature Selection\n")
    ridge_scores, ridge_X, ridge_test, ridge_names = select_features(train_X, train_Y, test_X, config.ridge_select, config.ridge_feats)
    lasso_scores, lasso_X, lasso_test, lasso_names = select_features(train_X, train_Y, test_X, config.lasso_select, config.lasso_feats)
    knn_scores, knn_X, knn_test, knn_names = select_features(train_X, train_Y, test_X, config.knn_select, config.knn_feats)
    rf_scores, rf_X, rf_test, rf_names = select_features(train_X, train_Y, test_X, config.rf_select, config.rf_feats)
    est_scores, est_X, est_test, est_names = select_features(train_X, train_Y, test_X, config.est_select, config.est_feats)
    write_selection_results(f, 'Ridge Regression', config.ridge_feats, ridge_scores, ridge_names)
    write_selection_results(f, 'LASSO Regression', config.lasso_feats, lasso_scores, lasso_names)
    write_selection_results(f, 'K-Nearest Neighbours', config.knn_feats, knn_scores, knn_names)
    write_selection_results(f, 'Random Forest', config.rf_feats, rf_scores, rf_names)
    write_selection_results(f, 'Gradient Boosting', config.est_feats, est_scores, est_names)
    f.write('\n#######################################\n\n')

    # model selection
    f.write("Model Selection\n")
    ridge_scores = cross_valid(config.ridge_models, ridge_X, train_Y)
    lasso_scores = cross_valid(config.lasso_models, lasso_X, train_Y)
    knn_scores = cross_valid(config.knn_models, knn_X, train_Y)
    rf_scores = cross_valid(config.rf_models, rf_X, train_Y)
    est_scores = cross_valid(config.est_models, est_X, train_Y)
    write_model_results(f, 'Ridge Regression', config.ridge_models, ridge_scores)
    write_model_results(f, 'LASSO Regression', config.lasso_models, lasso_scores)
    write_model_results(f, 'K-Nearest Neighbours', config.knn_models, knn_scores)
    write_model_results(f, 'Random Forest', config.rf_models, rf_scores)
    write_model_results(f, 'Gradient Boosting', config.est_models, est_scores)
    f.write('\n#######################################\n\n')

    best_reg = config.lasso3
    best_tree = config.est3
    best_reg.fit(lasso_X, train_Y)
    predictions_reg = best_reg.predict(lasso_test)
    best_tree.fit(est_X, train_Y)
    predictions_tree = best_tree.predict(est_test)
    write_test_file(predictions_reg, test_idx, 'results_reg.csv')
    write_test_file(predictions_tree, test_idx, 'results_tree.csv')

#    valid_X = new_X[:200]
    valid_Y = train_Y[:200] 
#    new_X = new_X[200:]
    train1_Y = train_Y[200:]

#    est.fit(new_X, train_Y)
#    preds = est.predict(new_test)
    err = []
    for i in range(90, 100, 10):
        sel = SelectPercentile(mutual_info_regression, percentile=i)
        new1_X = sel.fit_transform(new_X, train_Y)
        valid_X = new1_X[:200]
 
        train_X = new1_X[200:]

        est.fit(train_X, train1_Y)
        predictions = est.predict(valid_X)
#    preds = np.exp(predictions)
#    print(predictions)
#    print(preds)
#    write_test_file(preds, test_idx)
        err.append(np.sqrt(mean_squared_error(valid_Y, predictions)))
        print(explained_variance_score(valid_Y, predictions))
        print(r2_score(valid_Y, predictions))
        plt.scatter(valid_Y, predictions)
        x = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
        y = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
        plt.plot(x,y,'--')
        plt.ylabel("Predictions")
        plt.xlabel("Actual Y-values")
        plt.show()
#    plt.plot([10,20,30,40,50,60,70,80,90],err)
#    plt.xlabel("Percentage of Feature")
#    plt.ylabel("Validation MSE")
#    plt.show()
#    preds = np.exp(preds)
#    write_test_file(preds, test_idx)
#    new2_X = rfe2.fit_transform(new_X, train_Y)
#    print(new2_X.shape)
#    new3_X = rfe3.fit_transform(new_X, train_Y)
#    print(new3_X.shape)
#    new4_X = rfe4.fit_transform(new_X, train_Y)
#    print(new4_X.shape)
#    new5_X = rfe5.fit_transform(new_X, train_Y)
#    print(new5_X.shape)
#    new2_X = rfe2.fit_transform(new_X, train_Y)
#    new1_X = rfe1.fit_transform(new_X, train_Y)
#    new2_X = rfe2.fit_transform(new_X, train_Y)
#    new3_X = rfe3.fit_transform(new_X, train_Y)
#    new4_X = rfe4.fit_transform(new_X, train_Y)
#    pca1.fit(train_X, train_Y)
#    sel3.fit(train_X, train_Y)
#    new4_X = pca2.fit_transform(new_X, train_Y)

#    names1 = [new_names[i] for i in np.where(rfe1.support_ == True)[0]]
#    names2 = [new_names[i] for i in np.where(rfe2.support_ == True)[0]]

#    scores1 = cross_valid(models, new_X, train_Y)
#    scores2 = cross_valid([lasso2], new2_X, train_Y)
#    scores3 = cross_valid([lasso3], new3_X, train_Y)
#    scores4 = cross_valid([lasso4], new4_X, train_Y)
#    scores5 = cross_valid([lasso5], new5_X, train_Y)
#    scores5 = cross_valid(models, new3_X, train_Y)
#    scores5 = cross_valid(models, new3_X, train_Y)
#    scores6 = cross_valid(models, new4_X, train_Y)
#    print(sel_names)
#    print(new1_X.shape)
#    print(new2_X.shape)
#    print(new_X.shape)
#    print(scores1)
#    print(scores2)
#    print(scores3)
#    print(scores4)
#    print(scores5)
#    valid_X = new_X[:200]
    valid_Y = train_Y[:200] 
#    train_X = new_X[200:]
    train_Y = train_Y[200:]
#    new_train = sel3.transform(train_X)
#    new_valid = sel3.transform(valid_X)
#    print(new_valid.shape)
    err = []
    for i in range(80, 90, 10):
        pca = PCA(n_components=i)
        new1_X = pca.fit_transform(new_X, train_Y)
        valid_X = new1_X[:200]
 
        train_X = new1_X[200:]

        gam = LinearGAM(n_splines=8).gridsearch(train_X, train_Y)
        predictions = gam.predict(valid_X)
#    preds = np.exp(predictions)
#    print(predictions)
#    print(preds)
#    write_test_file(preds, test_idx)
        err.append(np.sqrt(mean_squared_error(valid_Y, predictions)))
        print(explained_variance_score(valid_Y, predictions))
        print(r2_score(valid_Y, predictions))
        plt.scatter(valid_Y, predictions)
        x = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
        y = [10.5, 11, 11.5, 12, 12.5, 13, 13.5]
        plt.plot(x,y,'--')
        plt.ylabel("Predictions")
        plt.xlabel("Actual Y-values")
        plt.show()
Example #26
        # Change the default axis colors from black to a slightly lighter black,
        # and a little thinner (0.5 instead of 1)
        plt.rcParams['axes.edgecolor'] = almost_black
        plt.rcParams['axes.labelcolor'] = almost_black

        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)

        if plot_type == "GAM":
            nsplines = 20

            lct_1D = np.tile(np.arange(8), 22)
            gam1 = LinearGAM(n_splines=nsplines).fit(
                lct_1D, soilmoist_rn1.reshape(8 * 22))
            x_pred = np.linspace(0, 7, num=100)
            y_pred1 = gam1.predict(x_pred)
            y_int1 = gam1.confidence_intervals(x_pred, width=.95)
            np.savetxt('soilmoist_rn1.out',
                       soilmoist_rn1.reshape(8 * 22),
                       delimiter=',')
            np.savetxt('soilmoist_rn2.out',
                       soilmoist_rn2.reshape(8 * 22),
                       delimiter=',')
            np.savetxt('soilmoist_tdr_rn2.out',
                       soilmoist_tdr_rn2.reshape(8 * 22),
                       delimiter=',')

            gam2 = LinearGAM(n_splines=nsplines).fit(
                lct_1D, soilmoist_rn2.reshape(8 * 22))
            y_pred2 = gam2.predict(x_pred)
            y_int2 = gam2.confidence_intervals(x_pred, width=.95)
Example #27
# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy',
          'health', 'trust']

fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

for i, title in enumerate(titles):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)
    trace = go.Scatter(x=XX[:, i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x=XX[:, i], y=confi[:, 0], line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x=XX[:, i], y=confi[:, 1], line=dict(dash='dash', color='grey'), name='95% CI')
    if i < 3:
        fig.append_trace(trace, 1, i + 1)
        fig.append_trace(ci1, 1, i + 1)
        fig.append_trace(ci2, 1, i + 1)
    else:
        fig.append_trace(trace, 2, i - 2)
        fig.append_trace(ci1, 2, i - 2)
        fig.append_trace(ci2, 2, i - 2)

py.plot(fig)


# Making a Forecast

# predicting the outcome of the UAE in 2015
gam.predict([[0.64, 1.13, 2015, 1.47, 0.81, 0.38]])
Example #28
def GAMf(df,
         in_var,
         ex_vars,
         city,
         cut,
         pred_end='one_month',
         train_duration='all'):
    """
    Parameters
    ----------
    df: 
        dataframe containing all variables of interest for the whole time of measurement
    in_var: 
        independent variable
    ex_vars: 
        list of explanatory variables
    city: 
        name of specific city
    cut: 
        string of the format '%m/%d/%Y' indicating the date where training set ends & test set starts
    pred_end:
        end of the prediction period
         if 'one_month' pred_end is set to one month after the cut
    train_duration:
        int, indicating the number of months that should be used for training
        defaults to 'all' -> all available data before the cut date will be used as training data
        
    Returns
    -------
    gam:
        fitted gam model instance
        
        
    model_statistics:
        vector containing the following information about the fitted model
        
        rmse:
            RMSE for test set
        r_squared:
            pseudo R-squared for the fitted GAM model
        fac2:
            fraction of predictions that lies between 50% and 200% of the corresponding measurements
        test_len:
            number of observations in the test set
        train_len:
            number of observations in the training set
        ratio:
            ratio of prediction to true values for test set
        avg_err:
            mean difference between measured and predicted values on the test set

    preds:
        a dataframe containing all explanatory variables, the independent variable, the predicted values & 
        the absolute error divided by the average value of the pollution variables in the training set
    """

    # drop rows with NaN values for explanatory variables
    df = df.dropna(subset=ex_vars)

    # subset dataset to given city
    df = df[df['city'] == city]

    # convert cut variable to datetime object
    cut = datetime.strptime(cut, '%m/%d/%Y')

    # if pred_end has the default value add one month to cut date to calculate end of the test dataset
    # else convert given string to datetime
    if (pred_end == 'one_month'):
        pred_end = cut + relativedelta(months=+1)
    else:
        pred_end = datetime.strptime(pred_end, '%m/%d/%Y')

    # determine subset of dataset used for training based on the given value for training duration
    if (train_duration == 'all'):
        df_train = df[df.index < cut]
    else:
        train_start = cut - relativedelta(months=+train_duration)
        df_train = df[df.index < cut]
        df_train = df_train[df_train.index > train_start]
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.dropna(subset=ex_vars)

    # determine subset of dataset used for test
    df_test = df[df.index > cut]
    df_test = df_test[df_test.index < pred_end]

    # extract values for independent and explanatory variables
    train_X = df_train[ex_vars].values
    train_y = np.log(df_train[in_var].values)
    test_X = df_test[ex_vars].values
    test_y = np.log(df_test[in_var].values)

    # check if test and training set contain sufficient observations
    if ((len(test_y) != 0) and (len(train_y) != 0)):

        # generate TermList for GAM
        string = str()
        if isinstance(ex_vars, str):
            length = 1
        else:
            length = len(ex_vars)
        for i in range(0, length):
            if (ex_vars[i] in [
                    'weekday', 'month', 'season', 'hour', 'new_year',
                    'daytime'
            ]) and (len(train_y) > 300):
                string = string + "+f(" + str(i) + ")"
            elif 'ws' in ex_vars[i]:
                string = string + '+l(' + str(i) + ')'
            else:
                string = string + '+s(' + str(i) + ", lam = 0.6, basis = 'ps')"

        string = string[1:]  # drop the leading '+'

        # specify and fit GAM model
        gam = LinearGAM(eval(string))
        gam.fit(train_X, train_y)
        y_pred = gam.predict(test_X)

        # get max observed value for y
        max_value = train_y.max()

        # cut prediction to not get higher than maximum value in the training dataset
        y_pred[y_pred > max_value] = max_value

        # calculate model statistics
        ratio = np.mean(y_pred / test_y)
        rmse = np.sqrt(
            metrics.mean_squared_error(np.exp(test_y), np.exp(y_pred)))
        avg_err = np.mean(np.exp(test_y) - np.exp(y_pred))
        r_squared = list(gam.statistics_['pseudo_r2'].items())[0][1]
        ratio_pm = y_pred / test_y
        fac2 = np.mean((ratio_pm >= 0.5) & (ratio_pm <= 2))  # within a factor of two, per the docstring

        # dataframe with independent & dependent variables, prediction and prediction error
        preds = df_test.copy()[ex_vars]
        preds['true'] = np.exp(test_y)
        preds['y_pred'] = np.exp(y_pred)
        preds['err'] = abs(preds['true'] - preds['y_pred']) / np.mean(train_y)

        confidence = gam.prediction_intervals(test_X)

        preds['lower'] = np.exp(confidence[:, 0])
        preds['upper'] = np.exp(confidence[:, 1])
    else:
        # return Nan and give a warning if the training set is very small
        print(
            'Problem with test and/or training data length for the station ' +
            city + ' in the month of ' + str(cut.month))
        print('Training Length: ' + str(len(train_y)) + ' Test Length: ' +
              str(len(test_y)))
        rmse = gam = ratio = preds = avg_err = r_squared = fac2 = float("NaN")

    # calculate length of test & training set
    test_len = len(test_X)
    train_len = len(train_X)
    model_statistics = [
        rmse, r_squared, fac2, test_len, train_len, ratio, avg_err
    ]

    return (gam, model_statistics, preds)
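# An illustrative call to GAMf, assuming df is a datetime-indexed dataframe
# with a 'city' column plus the pollution and weather variables; every name
# and date below is hypothetical:
gam, model_statistics, preds = GAMf(df,
                                    in_var='no2',
                                    ex_vars=['ws', 'temp', 'weekday', 'hour'],
                                    city='Berlin',
                                    cut='03/01/2020',
                                    train_duration=12)
rmse, r_squared, fac2, test_len, train_len, ratio, avg_err = model_statistics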
Example #29
from sklearn import datasets
from pygam import LinearGAM
import pandas as pd

boston = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2
X = boston.data
y = boston.target
features = boston.feature_names
boston_data = pd.DataFrame(X, columns=features)
gam = LinearGAM().fit(boston_data[boston.feature_names], y)
X_test_res = gam.predict(X)
print(X_test_res[:5])
Example #30
class GAMEnsemble(EnsembleModel):
    """Implements GAM ensemble in [1]."""

    def __init__(self, nonlinear_ensemble=False, residual_process=True):
        """
        Initializer.

        Args:
            nonlinear_ensemble: (bool) Whether use nonlinear term to transform base model.
            residual_process: (bool) Whether model residual process.
        """
        model_name = (
            "Generalized Additive Ensemble" if residual_process
            else "{} Stacking".format("Nonlinear" if nonlinear_ensemble else "Linear"))

        super().__init__(model_name)
        self.gam_model = None
        self.nonlinear_ensemble = nonlinear_ensemble
        self.model_residual = residual_process

    def train(self, X, y, base_pred):
        """Trains ensemble model based on data and base predictions.

        Adds value to class attribute "model_weight"

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            y: (np.ndarray)  Training labels, shape (N, 1)
            base_pred: (dict of np.ndarray) Dictionary of base model predictions
                With keys (str) being model name, and values (np.ndarray) being
                predictions corresponds to X and y.
        """
        # build feature and  gam terms
        ens_feature, feature_terms = self._build_ensemble_feature(X, base_pred)

        # define model
        self.gam_model = LinearGAM(feature_terms)

        # additional fine-tuning
        lam_grid = self._build_lambda_grid(n_grid=100)
        self.gam_model.gridsearch(X=ens_feature, y=y, lam=lam_grid,
                                  progress=False)

    def predict(self, X, base_pred):
        """Predicts label based on feature and base model.

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            base_pred: (dict of np.ndarray) Dictionary of base model predictions
                With keys (str) being model name, and values (np.ndarray) being
                predictions corresponds to X and y.

        Returns:
            (np.ndarray) ensemble prediction and variance

        Raises:
            (ValueError) If self.model_weight is empty.
        """
        if not self.gam_model:
            raise ValueError("Attribute gam_model empty."
                             "Model was not trained properly.")

        # build feature and  gam terms
        ens_feature, _ = self._build_ensemble_feature(X, base_pred)

        # prediction
        prediction = self.gam_model.predict(ens_feature)
        prediction_var = ((self.gam_model.prediction_intervals(
            ens_feature, width=.95)[:, 1] - prediction) / 2) ** 2

        return prediction, prediction_var

    def _build_ensemble_feature(self, X, base_pred):
        """Builds featurre array and corresponding GAM TermList.

        Terms corresponding to X will be summation of
            dimension-wise splines, plus a tensor-product term across all dimension.

        """
        ensemble_term_func = s if self.nonlinear_ensemble else l

        ens_feature = np.asarray(list(base_pred.values())).T
        term_list = [ensemble_term_func(dim_index) for dim_index in range(ens_feature.shape[1])]

        # optionally, add residual process
        if self.model_residual:
            # build gam terms
            term_list += [s(dim_index) for dim_index in
                          range(ens_feature.shape[1],
                                ens_feature.shape[1] + X.shape[1])]
            if X.shape[1] > 1:
                term_list += [te(*list(ens_feature.shape[1] +
                                       np.array(range(X.shape[1]))))]

            # update features
            ens_feature = np.concatenate([ens_feature, X], axis=1)

        gam_feature_terms = TermList(*term_list)

        return ens_feature, gam_feature_terms

    def _build_lambda_grid(self, n_grid=100):
        # count actual number of terms in each nonlinear term
        # (e.g. te(0, 1) will actually have two terms)
        n_terms = np.sum([len(model_term._terms) if model_term.istensor else 1
                          for model_term in self.gam_model.terms])
        lam = np.random.rand(n_grid, n_terms)
        # rescale to between (0, 1)
        lam_norm = (lam - np.min(lam)) / (np.max(lam) - np.min(lam))

        return np.exp((lam_norm - 0.5) * 6)
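# A minimal usage sketch for GAMEnsemble with two synthetic base models;
# the EnsembleModel base class is assumed importable from the surrounding
# module, and all data below is illustrative:
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(300, 2))
y_demo = X_demo[:, 0] ** 2 + rng.normal(scale=0.1, size=300)
base_pred = {"model_a": X_demo[:, 0] ** 2 + rng.normal(scale=0.3, size=300),
             "model_b": 0.8 * X_demo[:, 0] ** 2}

ensemble = GAMEnsemble(nonlinear_ensemble=False, residual_process=True)
ensemble.train(X_demo, y_demo, base_pred)
mean_pred, var_pred = ensemble.predict(X_demo, base_pred)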
Example #31
class DeepModels:

    # Sequential 6 layer neural network
    def returnSequential6(self, idim = 20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential6_regularized(self, idim = 20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential9(self, idim = 20):
        model = Sequential()
        model.add(Dense(80, input_dim = idim, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15(self, idim = 20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15_regularized(self, idim = 20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        # the regularized variant adds l1/l2 penalties on the output layer,
        # matching the pattern of returnSequential6_regularized
        model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model


    def returnSequential21(self, idim = 20):
        model = Sequential()
        model.add(Dense(200, input_dim=idim, activation='relu'))
        model.add(Dense(190, activation='relu'))
        model.add(Dense(180, activation='relu'))
        model.add(Dense(170, activation='relu'))
        model.add(Dense(160, activation='relu'))
        model.add(Dense(150, activation='relu'))
        model.add(Dense(140, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def RNN(self, idim = 20):
        model = Sequential()
        # explicit input_shape=(timesteps, features); train() reshapes 2-D
        # data to (n, 1, idim)
        model.add(SimpleRNN(10, input_shape=(1, idim)))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(14, input_shape=(1, idim), activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN2(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(40, input_shape=(1, idim)))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

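    # Note: unlike the regression models above, this `baseline` is wired for
    # binary classification (sigmoid output + binary cross-entropy), with MAE
    # tracked only as a secondary metric.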
    def baseline(self, idim=20):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mean_absolute_error'])
        return model

    def lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(20, input_shape=(1, idim)))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(14, input_shape=(1, idim), activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model
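
    # The recurrent models above (handled as type 1 in __init__ below) expect
    # 3-D input shaped (samples, timesteps, features); train() and prediction()
    # reshape 2-D tabular data to (n, 1, n_features), i.e. one-step sequences.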

    # Sequential dense network (four hidden layers plus a linear output)
    def returnSequential4(self, idim = 20):
        model = Sequential()
        model.add(Dense(20, activation='relu', input_dim=idim))
        model.add(Dense(units=15, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=5, activation='relu'))
        model.add(Dense(units=1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')

        return model

    # Sequential dense network with seven hidden layers and an L1/L2-regularized output
    def returnSequential8(self, idim=20):
        model = Sequential()
        model.add(Dense(70, activation='relu', input_dim=idim))
        model.add(Dense(units=60, activation='relu'))
        model.add(Dense(units=50, activation='relu'))
        model.add(Dense(units=40, activation='relu'))
        model.add(Dense(units=30, activation='relu'))
        model.add(Dense(units=20, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')

        return model

    def base(self, idim=20):
        model = Sequential()
        model.add(Dense(10, activation='relu', input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def base2(self, idim=20):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=idim))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def __init__(self, m, idim=20):
        # m selects the architecture; type 1 = recurrent, type 2 = dense,
        # type 3 = GAM (type 0, handled in cross_eval, is never assigned here)
        if m == 0:
            self.model = self.base(idim)
            self.type = 2
        elif m == 1:
            self.model = self.base2(idim)
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential4(idim)
            self.type = 2
        elif m == 3:
            self.model = self.returnSequential8(idim)
            self.type = 2
        elif m == 4:
            self.model = self.returnSequential15_regularized(idim)
            self.type = 2
        elif m == 5:
            self.model = self.multi_RNN(idim)
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm(idim)
            self.type = 1
        elif m == 7:
            self.model = LinearGAM()
            self.type = 3
        elif m == 8:
            self.model = self.RNN(idim)
            self.type = 1
        elif m == 9:
            self.model = self.lstm(idim)
            self.type = 1

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X,y)
        else:
            self.model.fit(X, y, batch_size=bs, epochs=epochs, shuffle=True, verbose=0)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval_with_plotting(self, city, X, y, bs=10, ep=100, k=3):
        scores = []
        multiplier = 0
        fig10, ax10 = plt.subplots()
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                         alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), predictions, 'g')

                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)
            scores = []
            multiplier = 0
            fig10, ax10 = plt.subplots()
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test)*multiplier, len(y_test) + len(y_test)*multiplier), y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test)*multiplier, len(y_test) + len(y_test)*multiplier), predictions, 'g')
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 2:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=10, epochs=300, verbose=0)  # note: fixed settings override the bs/ep arguments
                predictions = self.model.predict(X_test)

                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                        alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), predictions, 'g')

                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 3:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)

                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                         alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_pre, 'g')

                scores.append(mean_absolute_error(y_test, y_pre))
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

    def cross_eval(self, X, y, bs=10, ep=100, k=3):
        scores = []
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)

        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)
            scores = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)

        elif self.type == 2:
            kf = KFold(n_splits=k, shuffle=True, random_state=0)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=10, epochs=300, verbose=0)  # note: fixed settings override the bs/ep arguments
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
            return sum(scores) / len(scores)

        elif self.type == 3:
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)
                scores.append(mean_absolute_error(y_test, y_pre))
            return sum(scores) / len(scores)
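
# A minimal usage sketch (not part of the original example). The enclosing
# wrapper class is referred to here as `ModelWrapper`; that name is an
# assumption, since the actual class statement appears earlier in the file.
#
#   import numpy as np
#   X = np.random.rand(200, 20)           # 200 samples, 20 features
#   y = np.random.rand(200)
#   wrapper = ModelWrapper(m=0, idim=20)  # m=0 selects the `base` dense net
#   mae = wrapper.cross_eval(X, y, bs=10, ep=50, k=3)
#   print("Mean cross-validated MAE:", mae)
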
def main(flux_dir):
    K_TO_C = 273.15
    sites = ["AdelaideRiver","Calperum","CapeTribulation","CowBay",\
             "CumberlandPlains","DalyPasture","DalyUncleared",\
             "DryRiver","Emerald","Gingin","GreatWesternWoodlands",\
             "HowardSprings","Otway","RedDirtMelonFarm","RiggsCreek",\
             "Samford","SturtPlains","Tumbarumba","Whroo",\
             "WombatStateForest","Yanco"]

    pfts = ["SAV","SHB","TRF","TRF","EBF","GRA","SAV",\
            "SAV","NA","EBF","EBF",\
            "SAV","GRA","NA","GRA",\
            "GRA","GRA","EBF","EBF",\
            "EBF","GRA"]

    d = dict(zip(sites, pfts))
    pft_id = dict(zip(sites, pd.factorize(pfts)[0]))  # renamed from `id`, which shadows the builtin
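    # pd.factorize assigns each distinct PFT string an integer code
    # (e.g. "SAV" -> 0, "SHB" -> 1, ...), so pft_id maps each site to a code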

    plot_dir = "plots"
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    flux_files = sorted(glob.glob(os.path.join(flux_dir, "*_flux.nc")))
    met_files = sorted(glob.glob(os.path.join(flux_dir, "*_met.nc")))

    data_qle = []
    data_qh = []
    data_tair = []
    data_sw = []
    pft_ids = []

    # collect up data
    for flux_fn, met_fn in zip(flux_files, met_files):
        (site, df_flx, df_met) = open_file(flux_fn, met_fn)

        if d[site] != "NA":
            pft = d[site]
            colour_id = pft_id[site]

            # Mask poor-quality data (keep only records where the QC flag == 1)
            df_met.where(df_flx.Qle_qc == 1, inplace=True)
            df_met.where(df_flx.Qh_qc == 1, inplace=True)

            df_flx.where(df_flx.Qle_qc == 1, inplace=True)
            df_flx.where(df_flx.Qh_qc == 1, inplace=True)
            #df_flx.where(df_met.Tair_qc == 1, inplace=True)
            #df_flx.where(df_met.SWdown == 1, inplace=True)

            #df_met.where(df_met.SWdown == 1, inplace=True)
            #df_met.where(df_met.Tair_qc == 1, inplace=True)

            # Mask dew
            df_met.where(df_flx.Qle > 0., inplace=True)
            df_flx.where(df_flx.Qle > 0., inplace=True)

            df_flx.dropna(inplace=True)
            df_met.dropna(inplace=True)

            df_flx = df_flx.between_time("09:00", "13:00")
            df_met = df_met.between_time("09:00", "13:00")

            if len(df_flx) > 0 and len(df_met) > 0:
                #data_qle[pft].append(df_flx.Qle.values)
                #data_qh[pft].append(df_flx.Qh.values)
                #data_tair[pft].append(df_met.Tair.values - K_TO_C)
                #data_sw[pft].append(df_met.SWdown.values)

                data_qle.append(df_flx.Qle.values)
                data_qh.append(df_flx.Qh.values)
                data_tair.append(df_met.Tair.values - K_TO_C)
                data_sw.append(df_met.SWdown.values)
                pft_ids.append([pft] * len(df_flx))

    pft_ids = list(itertools.chain(*pft_ids))
    data_qle = list(itertools.chain(*data_qle))
    data_qh = list(itertools.chain(*data_qh))
    data_sw = list(itertools.chain(*data_sw))
    data_tair = list(itertools.chain(*data_tair))

    data_qle = np.asarray(data_qle)
    data_qh = np.asarray(data_qh)
    data_tair = np.asarray(data_tair)
    data_sw = np.asarray(data_sw)
    pft_ids = np.asarray(pft_ids)

    colours = ["red", "green", "blue", "yellow", "pink"]

    fig = plt.figure(figsize=(14, 4))
    fig.subplots_adjust(hspace=0.1)
    fig.subplots_adjust(wspace=0.1)
    plt.rcParams['text.usetex'] = False
    plt.rcParams['font.family'] = "sans-serif"
    plt.rcParams['font.sans-serif'] = "Helvetica"
    plt.rcParams['axes.labelsize'] = 14
    plt.rcParams['font.size'] = 14
    plt.rcParams['legend.fontsize'] = 14
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14

    almost_black = '#262626'
    # change the tick colors also to the almost black
    plt.rcParams['ytick.color'] = almost_black
    plt.rcParams['xtick.color'] = almost_black

    # change the text colors also to the almost black
    plt.rcParams['text.color'] = almost_black

    # Change the default axis colors from black to a slightly lighter black,
    # and a little thinner (0.5 instead of 1)
    plt.rcParams['axes.edgecolor'] = almost_black
    plt.rcParams['axes.labelcolor'] = almost_black

    ax1 = fig.add_subplot(221)
    ax2 = fig.add_subplot(222)
    ax3 = fig.add_subplot(223)
    ax4 = fig.add_subplot(224)

    colour_id = 0
    for pft in np.unique(pfts):

        if pft != "NA":
            qle = data_qle[np.argwhere(pft_ids == pft)]
            qh = data_qh[np.argwhere(pft_ids == pft)]
            tair = data_tair[np.argwhere(pft_ids == pft)]
            sw = data_sw[np.argwhere(pft_ids == pft)]

            print(pft, len(qle), len(qh), len(tair), len(sw))

            # Top-left panel: Qh as a function of SWdown
            gam = LinearGAM(n_splines=20).gridsearch(sw, qh)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)

            ax1.plot(XX,
                     gam.predict(XX),
                     color=colours[colour_id],
                     ls='-',
                     lw=2.0)
            ax1.fill_between(XX[:, 0],
                             CI[:, 0],
                             CI[:, 1],
                             color=colours[colour_id],
                             alpha=0.7)

            # Top-right panel: Qh as a function of Tair (matches ax2's
            # temperature x-limits set below)
            gam = LinearGAM(n_splines=20).gridsearch(tair, qh)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)

            ax2.plot(XX,
                     gam.predict(XX),
                     color=colours[colour_id],
                     ls='-',
                     lw=2.0)
            ax2.fill_between(XX[:, 0],
                             CI[:, 0],
                             CI[:, 1],
                             color=colours[colour_id],
                             alpha=0.7)

            # Bottom-left panel: Qle as a function of SWdown
            gam = LinearGAM(n_splines=20).gridsearch(sw, qle)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax3.plot(XX,
                     gam.predict(XX),
                     color=colours[colour_id],
                     ls='-',
                     lw=2.0)
            ax3.fill_between(XX[:, 0],
                             CI[:, 0],
                             CI[:, 1],
                             color=colours[colour_id],
                             alpha=0.7)

            # Bottom-right panel: Qle as a function of Tair
            gam = LinearGAM(n_splines=20).gridsearch(tair, qle)
            XX = generate_X_grid(gam)
            CI = gam.confidence_intervals(XX, width=.95)
            ax4.plot(XX,
                     gam.predict(XX),
                     color=colours[colour_id],
                     ls='-',
                     lw=2.0)
            ax4.fill_between(XX[:, 0],
                             CI[:, 0],
                             CI[:, 1],
                             color=colours[colour_id],
                             alpha=0.7)

            colour_id += 1

    plt.setp(ax1.get_xticklabels(), visible=False)
    plt.setp(ax2.get_xticklabels(), visible=False)

    ax1.set_xlim(0, 1300)
    ax1.set_ylim(0, 1000)
    ax2.set_xlim(0, 45)
    ax2.set_ylim(0, 1000)
    ax3.set_xlabel("SW down (W m$^{-2}$)")
    ax4.set_xlabel(r"Tair ($^\circ$C)")
    ax1.set_ylabel("Qh flux (W m$^{-2}$)")
    ax3.set_ylabel("Qle flux (W m$^{-2}$)")
    #ax1.legend(numpoints=1, loc="best")
    #fig.savefig(os.path.join(plot_dir, "%s.pdf" % (site)),
    #            bbox_inches='tight', pad_inches=0.1)

    fig.savefig(os.path.join(plot_dir, "ozflux_by_pft.png"),
                bbox_inches='tight',
                pad_inches=0.1,
                dpi=150)
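
# Assumed entry point (the original driver is not shown in this excerpt):
# if __name__ == "__main__":
#     import sys
#     main(sys.argv[1])  # directory holding the paired *_flux.nc / *_met.nc files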
Exemple #33
######################################################
# constraints

import matplotlib.pyplot as plt

from pygam import LinearGAM, s
from pygam.datasets import hepatitis

X, y = hepatitis(return_X_y=True)
X.shape

gam1 = LinearGAM(s(0, constraints='monotonic_inc')).fit(X, y)
gam2 = LinearGAM(s(0, constraints='concave')).fit(X, y)

fig, ax = plt.subplots(1, 2)
ax[0].plot(X, y, label='data')
ax[0].plot(X, gam1.predict(X), label='monotonic fit')
ax[0].legend()

ax[1].plot(X, y, label='data')
ax[1].plot(X, gam2.predict(X), label='concave fit')
ax[1].legend()
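
# Besides 'monotonic_inc' and 'concave' used above, pygam's shape constraints
# also include 'monotonic_dec' and 'convex'.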

######################################################
# api

from pygam import LogisticGAM, s, f
from pygam.datasets import toy_classification

X, y = toy_classification(return_X_y=True, n=5000)

gam = LogisticGAM(s(0) + s(1) + s(2) + s(3) + s(4) + f(5))