# Assumed imports for this snippet; BalancedKFold is a project-local CV
# splitter and get_importances is defined in a later example on this page.
import numpy as np
from pygam import LinearGAM, s
from sklearn.metrics import mean_absolute_error


def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)
    gam = LinearGAM(formula)
    # initial fit with a placeholder target; the model is refit per fold below
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    for name, y in Y.items():  # pandas removed iteritems() in 2.0
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out-of-fold predictions
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # get importances, defined as the predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        cv_scores = [{'r': np.corrcoef(y, pred)[0, 1],
                      'R2': np.corrcoef(y, pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, pred)}]

        # in-sample fit on the full data
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = [{'r': np.corrcoef(y, in_pred)[0, 1],
                      'R2': np.corrcoef(y, in_pred)[0, 1]**2,
                      'MAE': mean_absolute_error(y, in_pred)}]
        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
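A minimal usage sketch for run_GAM on synthetic data (hypothetical; `BalancedKFold` is aliased to sklearn's `KFold` here so the sketch runs standalone):

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

BalancedKFold = KFold  # stand-in for the project's splitter, for this sketch only

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
Y = pd.DataFrame({'target': 2 * X['a'] + rng.normal(size=100)})

results = run_GAM(X, Y, folds=5)
print(results['target']['scores_cv'])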
Example #2
# Assumed imports; GAM_line, GAM_spline and GAM_factor are project helpers
# (a plausible reading is sketched after this snippet).
import numpy as np
from pygam import LinearGAM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# `options` is not defined in the original snippet; assumed to be a
# module-level dict of train_test_split settings, e.g.:
options = {"test_size": 0.2, "random_state": 0}


def GAM(X, Y, factor=False):

    """SPLITTING THE DATASET"""
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **options)

    """PREPROCESSING"""
    # NB: No need for one-hot encoding – categorical columns are already binary!
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    """CREATING A DESIGN MATRIX"""
    # degree-1 PolynomialFeatures only prepends a bias column; fit on the
    # training set, then apply the same transform to the test set
    poly = PolynomialFeatures(1)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)

    # hand-labelled flags: 'y' marks features modelled with linear terms,
    # 'n' marks features that get a spline term
    linear = ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'y', 'n', 'y',
              'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']

    # The flags above were produced interactively with a loop like this:
    # for feature in X_train.T:
    #     unique = np.unique(feature)
    #     if len(unique) < 6:
    #         linear.append("n")
    #     else:
    #         idx = np.argsort(feature)
    #         plt.plot(feature[idx], Y.squeeze()[idx])
    #         plt.show()
    #         linear.append(input("Linear?\t"))

    # convert to a boolean mask ('y' -> True); beware that writing 0/1 into a
    # string array and calling astype(bool) marks every non-empty entry True
    linear = np.array(linear) == "y"

    # build the pygam term list: a linear (plus optional factor) term for
    # flagged features, a spline term for the rest
    gam_input = None
    for n, is_linear in enumerate(linear):
        term = GAM_line(n) if is_linear else GAM_spline(n)
        gam_input = term if gam_input is None else gam_input + term
        if is_linear and factor:
            gam_input += GAM_factor(n)

    gam = LinearGAM(gam_input, fit_intercept=False, max_iter=int(1E5))
    gam.fit(X_train, Y_train)
    Y_predict_train = gam.predict(X_train)
    Y_predict_test = gam.predict(X_test)
    # squeeze guards against an (n, 1) target broadcasting into an (n, n) matrix
    MSE_train = np.mean((Y_predict_train - np.squeeze(Y_train))**2)
    MSE_test = np.mean((Y_predict_test - np.squeeze(Y_test))**2)
    return MSE_train, MSE_test
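`GAM_line`, `GAM_spline` and `GAM_factor` are not defined in the snippet above; a plausible reading (an assumption, not the original helpers) is that they wrap pygam's `l`, `s` and `f` term constructors:

from pygam import f, l, s

def GAM_line(i):
    # linear term for design-matrix column i
    return l(i)

def GAM_spline(i):
    # smooth spline term for column i
    return s(i)

def GAM_factor(i):
    # categorical (factor) term for column i
    return f(i)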
Example #4
# Assumed imports; make_te_data, IFLearnerTE, eif_transformation_CATE,
# RR_NAME and binary_gyorfi_baseline come from the surrounding project;
# get_surrogate_predictions appears later on this page.
import numpy as np
import pytest
from pygam import LinearGAM, LogisticGAM
from sklearn.model_selection import StratifiedKFold


def test_if_learner():
    # get data without noise
    X, y, w, ite, p, bs = make_te_data(n=200, noise=False)

    # get surrogate predictions to compare against po predictions
    mu_0_plug, mu_1_plug = get_surrogate_predictions(X, y, w)

    # get surrogate predictions for two folds as inside the iflearner
    splitter = StratifiedKFold(n_splits=2, shuffle=True,
                               random_state=42)
    idx_list = []
    for train_index, test_index in splitter.split(X, w):
        idx_list.append((train_index, test_index))

    fold2_mask = np.zeros(200, dtype=bool)
    fold2_mask[idx_list[0][1]] = True
    mu_0, mu_1 = np.zeros(200), np.zeros(200)
    mu_0[~fold2_mask], mu_1[~fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=~fold2_mask)
    mu_0[fold2_mask], mu_1[fold2_mask] = get_surrogate_predictions(X, y, w, pred_mask=fold2_mask)
    pseudo_outcome = eif_transformation_CATE(y, w, p, mu_0, mu_1)

    # make second stage model
    t_model = LinearGAM()
    t_model.fit(X, pseudo_outcome)
    te_debiased = t_model.predict(X)

    # fit if learner
    if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    # test outcomes
    np.testing.assert_almost_equal(te, te_debiased)
    np.testing.assert_almost_equal(mu_0, mu_0_plug)
    np.testing.assert_almost_equal(mu_1, mu_1_plug)
    np.testing.assert_almost_equal(if_learner.predict(X), te_debiased)

    with pytest.raises(ValueError):
        # predicting potential outcomes without a fitted base model should raise
        if_learner = IFLearnerTE(LinearGAM(), n_folds=2, random_state=42)
        if_learner.fit(X, y, w, p)
        te, mu_0, mu_1 = if_learner.predict(X, return_po=True)

    with pytest.warns(UserWarning):
        # a warning should be raised when only one fold is used
        if_learner = IFLearnerTE(LinearGAM(), n_folds=1, random_state=42)
        if_learner.fit(X, y, w, p)

    # check that binary_y setting also works (smoketest)
    X, y, w, ite, p, bs = make_te_data(n=200, baseline_model=binary_gyorfi_baseline,
                                       noise=False, binary_y=True)
    if_learner = IFLearnerTE(base_estimator=LogisticGAM(), te_estimator=LinearGAM(),
                             binary_y=True, setting=RR_NAME, fit_base_model=True)
    if_learner.fit(X, y, w, p)
    te, mu_0, mu_1 = if_learner.predict(X, return_po=True)
Example #5
# Assumed imports for this snippet
import numpy as np
from pygam import LinearGAM, s


def get_importances(X, y, Xtest, ytest):
    importances = {}
    for predictor, vals in X.items():  # pandas removed iteritems() in 2.0
        # gridsearch fits the model itself, so no separate fit() call is needed
        gam = LinearGAM(s(0), fit_intercept=False)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importance as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
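An illustrative call (hypothetical data; `X` a DataFrame and `y` a Series, split 80/20 by position):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 2)), columns=['a', 'b'])
y = pd.Series(X['a'] + rng.normal(scale=0.5, size=100))

imp = get_importances(X.iloc[:80], y.iloc[:80], X.iloc[80:], y.iloc[80:])
print(sorted(imp.items(), key=lambda kv: -kv[1]))  # highest R2 first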
Example #7
    # assumes `import statsmodels.api as sm` and pygam's LinearGAM at module level
    def tsSSE(self, model='linear'):

        sse = 0

        for i in range(self.m):

            # rows belonging to subgroup i
            index = np.where(self.dataLabel == i)[0]
            Xfit = self.Xall[index, :]
            Afit = self.Aall[index]
            Bfit = self.Ball[index]

            Af = Afit * self.model.decision_function(Xfit)
            Xmat = np.column_stack((Xfit, Af))

            if model == 'linear':
                # linear regression model for B
                Xmat = sm.add_constant(Xmat)
                BModel = sm.OLS(Bfit, Xmat)
                res = BModel.fit()
                pred = res.predict()
            elif model == 'GAM':
                # the GAM model can be specified differently
                BModel = LinearGAM(fit_intercept=True)
                res = BModel.fit(Xmat, Bfit)
                pred = res.predict(Xmat)
            else:
                raise ValueError("model must be 'linear' or 'GAM'")

            sse += np.sum((Bfit - pred)**2)

        return sse
Example #8
def get_surrogate_predictions(X, y, w, pred_mask=None):
    if pred_mask is None:
        pred_mask = np.ones(len(y), dtype=bool)
        fit_mask = pred_mask
    else:
        fit_mask = ~pred_mask
    # get surrogates
    model_1 = LinearGAM()
    model_1.fit(X[fit_mask & (w == 1), :], y[fit_mask & (w == 1)])
    mu_1_plug = model_1.predict(X[pred_mask, :])

    model_0 = LinearGAM()
    model_0.fit(X[fit_mask & (w == 0), :], y[fit_mask & (w == 0)])
    mu_0_plug = model_0.predict(X[pred_mask, :])

    return mu_0_plug, mu_1_plug
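An illustrative call on synthetic data (shapes only; not taken from the original test suite):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
w = rng.integers(0, 2, size=200)                 # binary treatment indicator
y = X[:, 0] + w * X[:, 1] + rng.normal(scale=0.1, size=200)

mu_0, mu_1 = get_surrogate_predictions(X, y, w)  # plug-in outcome surrogates
print(np.mean(mu_1 - mu_0))                      # crude treatment-effect estimate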
Example #9
    def _fit_gams(self, temp_t, temp_m, temp_y):
        """Fits the mediator and outcome GAMs"""
        temp_mediator_model = LinearGAM(
            s(0, n_splines=self.n_splines, spline_order=self.spline_order),
            fit_intercept=True,
            max_iter=self.max_iter,
            lam=self.lambda_,
        )
        temp_mediator_model.fit(temp_t, temp_m)

        temp_outcome_model = LinearGAM(
            s(0, n_splines=self.n_splines, spline_order=self.spline_order) +
            s(1, n_splines=self.n_splines, spline_order=self.spline_order),
            fit_intercept=True,
            max_iter=self.max_iter,
            lam=self.lambda_,
        )
        temp_outcome_model.fit(pd.concat([temp_t, temp_m], axis=1), temp_y)

        return temp_mediator_model, temp_outcome_model
Example #10
# Assumed imports for this snippet (also used by the next example)
import numpy as np
from pygam import LinearGAM
from scipy.stats import spearmanr
from sklearn.model_selection import KFold, LeaveOneOut


def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    prediction = []
    actual_value = []
    n_splines_all = []
    lam_all = []

    # THIS IS OUTER LOOP: for VALIDATION/TESTING
    #train n models and evaluate their average performance
    gene_indexes = index_set
    y = cell_count_aa
    X = gene_expression[gene_expression.columns[gene_indexes]]

    gam = LinearGAM()
    kf = KFold(n_splits=10)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        gam = gam.gridsearch(X_train,
                             y_train,
                             n_splines=np.arange(15, 35),
                             lam=[0.5, 0.6, 0.7])
        n_splines_all.append(gam.n_splines)
        lam_all.append(gam.lam)

    lams = np.array(lam_all)
    lams_mean = lams.mean()
    n_splines_all = np.array(n_splines_all)
    n_splines_mean = n_splines_all.mean()

    # n_splines must be an integer, so round the cross-validated mean
    gam = LinearGAM(n_splines=int(round(n_splines_mean)), lam=lams_mean)
    loo = LeaveOneOut()
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regr = gam.fit(X_train, y_train)
        prediction_val = regr.predict(X_test)[0]
        prediction.append(prediction_val)
        actual_value.append(y_test[0])
        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))
    #calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)

    return lams_mean, n_splines_mean, rho, pval
Example #11
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    prediction = []
    actual_value = []
    n_splines_all = []
    lam_all = []

    # THIS IS OUTER LOOP: for VALIDATION/TESTING
    #train n models and evaluate their average performance
    gene_indexes = index_set
    y = cell_count_aa
    X = gene_expression[gene_expression.columns[gene_indexes]]
    loo = LeaveOneOut()
    loo.get_n_splits(X)
    gam = LinearGAM()
    gam = gam.gridsearch(X,
                         y,
                         n_splines=np.arange(10, 50),
                         lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # THIS IS INNER LOOP: for TRAINING/VALIDATION
        #train model with given optimized parameters
        regr = gam.fit(X_train, y_train)
        #make a prediction on OUTER LOOP test set
        prediction_val = regr.predict(X_test)[0]
        # store predictions and actual values
        prediction.append(prediction_val)
        actual_value.append(y_test[0])
        # add optimal parameter values to arrays
        n_splines_all.append(regr.n_splines)
        lam_all.append(regr.lam)
        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))
    #calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)
    lams = np.array(lam_all)
    lams_mean = lams.mean()
    n_splines_all = np.array(n_splines_all)
    n_splines_mean = n_splines_all.mean()
    return lams_mean, n_splines_mean, rho, pval
Example #12
# Assumed imports for this snippet (standalone Keras API)
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN
from keras import regularizers
from pygam import LinearGAM
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold


class DeepModels:

    # Sequential 6 layer neural network
    def returnSequential6(self, idim = 20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential6_regularized(self, idim = 20):
        model = Sequential()
        model.add(Dense(50, input_dim=idim, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential9(self, idim = 20):
        model = Sequential()
        model.add(Dense(80, input_dim = idim, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15(self, idim = 20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def returnSequential15_regularized(self, idim = 20):
        model = Sequential()
        model.add(Dense(140, input_dim=idim, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        # final-layer L1/L2 penalty, matching the other *_regularized builders
        model.add(Dense(1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model


    def returnSequential21(self, idim = 20):
        model = Sequential()
        model.add(Dense(200, input_dim=idim, activation='relu'))
        model.add(Dense(190, activation='relu'))
        model.add(Dense(180, activation='relu'))
        model.add(Dense(170, activation='relu'))
        model.add(Dense(160, activation='relu'))
        model.add(Dense(150, activation='relu'))
        model.add(Dense(140, activation='relu'))
        model.add(Dense(130, activation='relu'))
        model.add(Dense(120, activation='relu'))
        model.add(Dense(110, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(90, activation='relu'))
        model.add(Dense(80, activation='relu'))
        model.add(Dense(70, activation='relu'))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(40, activation='relu'))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def RNN(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(10, input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN2(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(40, input_dim=idim))
        model.add(Dense(30, activation='relu'))
        model.add(Dense(20, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def baseline(self, idim=20):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mean_absolute_error'])
        return model

    def lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(20, input_dim=idim))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    # Sequential 4 layer neural network
    def returnSequential4(self, idim = 20):
        model = Sequential()
        model.add(Dense(20, activation='relu', input_dim=idim))
        model.add(Dense(units=15, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=5, activation='relu'))
        model.add(Dense(units=1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')

        return model

        # Sequential 8 layer neural network

    def returnSequential8(self, idim=20):
        model = Sequential()
        model.add(Dense(70, activation='relu', input_dim=idim))
        model.add(Dense(units=60, activation='relu'))
        model.add(Dense(units=50, activation='relu'))
        model.add(Dense(units=40, activation='relu'))
        model.add(Dense(units=30, activation='relu'))
        model.add(Dense(units=20, activation='relu'))
        model.add(Dense(units=10, activation='relu'))
        model.add(Dense(units=1, activation='linear', kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        model.compile(optimizer='Adam', loss='mean_absolute_error')

        return model

    def base(self, idim=20):
        model = Sequential()
        model.add(Dense(10, activation='relu', input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def base2(self, idim=20):
        model = Sequential()
        model.add(Dense(14, activation='relu', input_dim=idim))
        model.add(Dense(7, activation='relu', input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def __init__(self, m, idim=20):
        if m == 0:
            self.model = self.base(idim)
            self.type = 2
        elif m == 1:
            self.model = self.base2(idim)
            self.type = 2
        elif m == 2:
            self.model = self.returnSequential4(idim)
            self.type = 2
        elif m == 3:
            self.model = self.returnSequential8(idim)
            self.type = 2
        elif m == 4:
            self.model = self.returnSequential15_regularized(idim)
            self.type = 2
        elif m == 5:
            self.model = self.multi_RNN(idim)
            self.type = 1
        elif m == 6:
            self.model = self.multi_lstm(idim)
            self.type = 1
        elif m == 7:
            self.model = LinearGAM()
            self.type = 3
        elif m == 8:
            self.model = self.RNN(idim)
            self.type = 1
        elif m == 9:
            self.model = self.lstm(idim)
            self.type = 1

    def returnModel(self):
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        if self.type == 3:
            self.model.gridsearch(X,y)
        else:
            self.model.fit(X, y, batch_size = bs, epochs = epochs, shuffle=True, verbose = 0)

    def prediction(self, X):
        if self.type == 1:
            X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
        return self.model.predict(X)

    def cross_eval_with_plotting(self, city, X, y, bs=10, ep=100, k=3):
        scores = []
        multiplier = 0
        fig10, ax10 = plt.subplots()
        if self.type == 0:
            kf = KFold(n_splits=k, shuffle=False)  # random_state is only valid with shuffle=True
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                a, score = self.model.evaluate(X_test, y_test, verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                         alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), predictions, 'g')

                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 1:
            kf = KFold(n_splits=k, shuffle=False)
            scores = []
            multiplier = 0
            fig10, ax10 = plt.subplots()
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                predictions = self.model.predict(X_test)
                plt.plot(range(len(y_test)*multiplier, len(y_test) + len(y_test)*multiplier), y_test, 'm', alpha=0.4)
                plt.plot(range(len(y_test)*multiplier, len(y_test) + len(y_test)*multiplier), predictions, 'g')
                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 2:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.fit(X_train, y_train, batch_size=10, epochs=300, verbose=0)
                predictions = self.model.predict(X_test)

                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                        alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), predictions, 'g')

                score = self.model.evaluate(X_test, y_test, verbose=0)
                scores.append(score)
                multiplier = multiplier + 1
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

        elif self.type == 3:
            multiplier = 0
            fig10, ax10 = plt.subplots()
            kf = KFold(n_splits=k, shuffle=False)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                self.model.gridsearch(X_train, y_train)
                y_pre = self.model.predict(X_test)

                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_test, 'm',
                         alpha=0.4)
                plt.plot(range(len(y_test) * multiplier, len(y_test) + len(y_test) * multiplier), y_pre, 'g')

                scores.append(mean_absolute_error(y_pre, y_test))
                multiplier = multiplier + 1  # advance the plot offset, as in the other branches
            plt.title('True vs. Predicted Cases in {}'.format(city))
            plt.xlabel('Week')
            plt.ylabel('Cases of Dengue')
            plt.legend(['True', 'Predicted'])
            plt.show()
            return sum(scores) / len(scores)

    def cross_eval(self, X, y, bs=10, ep=100, k=3):
            scores = []
            if self.type == 0:
                kf = KFold(n_splits=k, shuffle=True, random_state=0)
                for train_index, test_index in kf.split(X):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                    a, score = self.model.evaluate(X_test, y_test, verbose=0)
                    scores.append(score)
                return sum(scores) / len(scores)

            elif self.type == 1:
                kf = KFold(n_splits=k, shuffle=False)
                scores = []
                for train_index, test_index in kf.split(X):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
                    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
                    self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
                    score = self.model.evaluate(X_test, y_test, verbose=0)
                    scores.append(score)
                return sum(scores) / len(scores)

            elif self.type == 2:
                kf = KFold(n_splits=k, shuffle=True, random_state=0)
                for train_index, test_index in kf.split(X):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    self.model.fit(X_train, y_train, batch_size=10, epochs=300, verbose=0)
                    score = self.model.evaluate(X_test, y_test, verbose=0)
                    scores.append(score)
                return sum(scores) / len(scores)

            elif self.type == 3:
                kf = KFold(n_splits=k, shuffle=False)
                for train_index, test_index in kf.split(X):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]
                    self.model.gridsearch(X_train, y_train)
                    y_pre = self.model.predict(X_test)
                    print(y_pre)
                    scores.append(mean_absolute_error(y_pre, y_test))
                return sum(scores) / len(scores)
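A hypothetical driver for the class (index 7 selects the LinearGAM path; the data here is random placeholder input):

import numpy as np

X = np.random.rand(150, 20)
y = np.random.rand(150)

dm = DeepModels(m=7, idim=20)    # type 3 -> pygam LinearGAM
dm.train(X, y)                   # runs gridsearch for the GAM path
mae = dm.cross_eval(X, y, k=3)   # mean absolute error averaged over folds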
Example #13
    # assumes sklearn's Ridge, DecisionTreeRegressor, train_test_split and
    # explained_variance_score, plus pygam's LinearGAM, at module level
    def explain_instance_with_data(self,
                                   neighborhood_data,
                                   neighborhood_labels,
                                   distances,
                                   label,
                                   num_features,
                                   feature_selection='auto',
                                   model_regressor=None,
                                   gam_type=None):
        """Takes perturbed data, labels and distances, returns explanation.

        Args:
            neighborhood_data: perturbed data, 2d array. first element is
                               assumed to be the original data point.
            neighborhood_labels: corresponding perturbed labels. should have as
                                 many columns as the number of possible labels.
            distances: distances to original data point.
            label: label for which we want an explanation
            num_features: maximum number of features in explanation
            feature_selection: how to select num_features. options are:
                'forward_selection': iteratively add features to the model.
                    This is costly when num_features is high
                'highest_weights': selects the features that have the highest
                    product of absolute weight * original data point when
                    learning with all the features
                'lasso_path': chooses features based on the lasso
                    regularization path
                'none': uses all features, ignores num_features
                'auto': uses forward_selection if num_features <= 6, and
                    'highest_weights' otherwise.
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression if None. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter
                to model_regressor.fit()
            gam_type: accepted but unused in this implementation

        Returns:
            (intercept, exp, score):
            intercept is a float.
            exp is a sorted list of tuples, where each tuple (x,y) corresponds
            to the feature id (x) and the local weight (y). The list is sorted
            by decreasing absolute value of y.
            score is the R^2 value of the returned explanation
        """

        weights = self.kernel_fn(distances)
        labels_column = neighborhood_labels[:, label]
        used_features = self.feature_selection(neighborhood_data,
                                               labels_column, weights,
                                               num_features, feature_selection)

        X = neighborhood_data[:, used_features]
        y = neighborhood_labels[:, label]
        (X_train, X_test, y_train, y_test, train_weights,
         test_weights) = train_test_split(X, y, weights, test_size=0.2)

        linear_model = Ridge(alpha=1,
                             fit_intercept=True,
                             random_state=self.random_state)

        gam = LinearGAM()
        dt = DecisionTreeRegressor()

        linear_model.fit(X_train, y_train, sample_weight=train_weights)
        gam.fit(X_train, y_train, weights=train_weights)
        dt.fit(X_train, y_train, sample_weight=train_weights)

        # # plot
        # for i, term in enumerate(gam.terms):
        #     if term.isintercept:
        #         continue
        #     XX = gam.generate_X_grid(term=i)
        #     # pdep = gam.predict(XX)
        #     pdep = gam.partial_dependence(term=i, X=XX) + linear_model.intercept_
        #     # line = XX[:, term.feature] * linear_model.coef_[term.feature]
        #     line = linear_model.predict(XX)
        #     dect = dt.predict(XX)
        #     plt.figure()
        #     plt.plot(XX[:, term.feature], pdep)
        #     plt.plot(XX[:, term.feature], line)
        #     plt.plot(XX[:, term.feature], dect)
        #     plt.title(repr(term))
        #     plt.show()
        # exit()

        y_lr = linear_model.predict(X_test)
        y_gam = gam.predict(X_test)
        y_dt = dt.predict(X_test)

        # y_lr = linear_model.predict(X_train)
        # y_gam = gam.predict(X_train)
        # y_dt = dt.predict(X_train)

        # mse_lr = mean_squared_error(y_test, y_lr, sample_weight=test_weights)
        # mse_gam = mean_squared_error(y_test, y_gam, sample_weight=test_weights)
        # mse_dt = mean_squared_error(y_test, y_dt, sample_weight=test_weights)

        # NB: despite the mse_* names (kept from the commented-out MSE version
        # above), these are explained-variance scores
        mse_lr = explained_variance_score(y_test,
                                          y_lr,
                                          sample_weight=test_weights)
        mse_gam = explained_variance_score(y_test,
                                           y_gam,
                                           sample_weight=test_weights)
        mse_dt = explained_variance_score(y_test,
                                          y_dt,
                                          sample_weight=test_weights)

        # mse_lr = explained_variance_score(y_train, y_lr, sample_weight=train_weights)
        # mse_gam = explained_variance_score(y_train, y_gam, sample_weight=train_weights)
        # mse_dt = explained_variance_score(y_train, y_dt, sample_weight=train_weights)

        metrics = (mse_lr, mse_gam, mse_dt)

        prediction_score = linear_model.score(neighborhood_data[:,
                                                                used_features],
                                              labels_column,
                                              sample_weight=weights)

        local_pred = linear_model.predict(
            neighborhood_data[0, used_features].reshape(1, -1))

        linear_exp = sorted(zip(used_features, linear_model.coef_),
                            key=lambda x: np.abs(x[1]),
                            reverse=True)
        gam_exp = []
        for i, term in enumerate(gam.terms):
            if term.isintercept:
                continue
            XX = gam.generate_X_grid(term=i)
            y = gam.partial_dependence(term=i, X=XX)
            x = XX[:, i]
            gam_exp.append((used_features[i], x, y))

        if self.verbose:
            print('Intercept', linear_model.intercept_)
            print(
                'Prediction_local',
                local_pred,
            )
            print('Right:', neighborhood_labels[0, label])
        # return (linear_model.intercept_,
        #         sorted(zip(used_features, linear_model.coef_),
        #                key=lambda x: np.abs(x[1]), reverse=True),
        #         prediction_score, local_pred)
        return (metrics, linear_exp, gam_exp)
Example #14
# Assumed imports for this snippet
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
from pygam import LinearGAM, f, l, s  # f/l/s are used inside the eval() below
from sklearn import metrics


def GAMf(df,
         in_var,
         ex_vars,
         city,
         cut,
         pred_end='one_month',
         train_duration='all'):
    """
    Parameters
    ----------
    df: 
        dataframe containing all variables of interest for the whole time of measurement
    in_var: 
        independent variable
    ex_vars: 
        list of explanatory variables
    city: 
        name of specific city
    cut: 
        string of the format '%m/%d/%Y' indicating the date where training set ends & test set starts
    pred_end:
        end of the prediction period
         if 'one_month' pred_end is set to one month after the cut
    train_duration:
        int, indicating the number of months that should be used for training
        defaults to 'all' -> all available data before the cut date will be used as training data
        
    Returns
    -------
    gam:
        fitted gam model instance
        
        
    model_statistics:
        vector containing the following information about the fitted model
        
        rmse:
            RMSE for test set
        r_squared:
            pseudo R-squared for the fitted GAM model
        fac2:
            fraction of predictions that lies between 50% and 200% of the corresponding measurements
        test_len:
            number of observations in the test set
        train_len:
            number of observations in the training set
        ratio:
            ratio of prediction to true values for test set
        avg_err:
        
    preds:
        a dataframe containing all explanatory variables, the independent variable, the predicted values & 
        the absolute error divided by the average value of the pollution variables in the training set
    """

    # drop rows with NaN values for explanatory variables
    df = df.dropna(subset=ex_vars)

    # subset dataset to given city
    df = df[df['city'] == city]

    # convert cut variable to datetime object
    cut = datetime.strptime(cut, '%m/%d/%Y')

    # if pred_end has the default value add one month to cut date to calculate end of the test dataset
    # else convert given string to datetime
    if (pred_end == 'one_month'):
        pred_end = cut + relativedelta(months=+1)
    else:
        pred_end = datetime.strptime(pred_end, '%m/%d/%Y')

    # determine subset of dataset used for training based on the given value for training duration
    if (train_duration == 'all'):
        df_train = df[df.index < cut]
    else:
        train_start = cut - relativedelta(months=+train_duration)
        df_train = df[df.index < cut]
        df_train = df_train[df_train.index > train_start]
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.dropna(subset=ex_vars)

    # determine subset of dataset used for test
    df_test = df[df.index > cut]
    df_test = df_test[df_test.index < pred_end]

    # extract values for independent and explanatory variables
    train_X = df_train[ex_vars].values
    train_y = np.log(df_train[in_var].values)
    test_X = df_test[ex_vars].values
    test_y = np.log(df_test[in_var].values)

    # check if test and training set contain sufficient observations
    if ((len(test_y) != 0) and (len(train_y) != 0)):

        # generate TermList for GAM: factor terms for categorical time
        # variables (given enough data), linear terms for wind speed,
        # penalized splines for everything else
        string = str()
        if isinstance(ex_vars, str):
            length = 1
        else:
            length = len(ex_vars)
        for i in range(0, length):
            if (ex_vars[i] in [
                    'weekday', 'month', 'season', 'hour', 'new_year',
                    'daytime'
            ]) and (len(train_y) > 300):
                string = string + "+f(" + str(i) + ")"
            elif ('ws' in ex_vars[i]):
                string = string + '+l(' + str(i) + ')'
            else:
                string = string + '+s(' + str(i) + ", lam = 0.6, basis = 'ps')"

        string = string[1:]

        # specify and fit GAM model
        gam = LinearGAM(eval(string))
        gam.fit(train_X, train_y)
        y_pred = gam.predict(test_X)

        # get max observed value for y
        max_value = train_y.max()

        # cut prediction to not get higher than maximum value in the training dataset
        y_pred[y_pred > max_value] = max_value

        # calculate model statistics
        ratio = np.mean(y_pred / test_y)
        rmse = np.sqrt(
            metrics.mean_squared_error(np.exp(test_y), np.exp(y_pred)))
        avg_err = np.mean(np.exp(test_y) - np.exp(y_pred))
        r_squared = list(gam.statistics_['pseudo_r2'].items())[0][1]
        # FAC2: fraction of predictions within a factor of two of the
        # measurements, computed on the original (unlogged) scale
        fac2_ratio = np.exp(y_pred) / np.exp(test_y)
        fac2 = np.mean((fac2_ratio >= 0.5) & (fac2_ratio <= 2))

        # dataframe with independent & dependent variables, prediction and prediction error
        preds = df_test.copy()[ex_vars]
        preds['true'] = np.exp(test_y)
        preds['y_pred'] = np.exp(y_pred)
        preds['err'] = abs(preds['true'] -
                           preds['y_pred']) / (np.mean(train_y))

        confidence = gam.prediction_intervals(test_X)

        preds['lower'] = np.exp(confidence[:, 0])
        preds['upper'] = np.exp(confidence[:, 1])
    else:
        # return NaN and warn when the test and/or training set is empty
        print(
            'Problem with test and/or training data length for the station ' +
            city + ' in the month of ' + str(cut.month))
        print('Training Length: ' + str(len(train_y)) + ' Test Length: ' +
              str(len(test_y)))
        rmse = gam = ratio = preds = avg_err = r_squared = fac2 = float("NaN")

    # calculate length of test & training set
    test_len = len(test_X)
    train_len = len(train_X)
    model_statistics = [
        rmse, r_squared, fac2, test_len, train_len, ratio, avg_err
    ]

    return (gam, model_statistics, preds)
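A hypothetical call (the dataframe, column and city names are placeholders, not taken from the original project):

gam, stats, preds = GAMf(df, in_var='no2', ex_vars=['temp', 'ws', 'weekday'],
                         city='Hamburg', cut='03/01/2020')
rmse, r_squared, fac2, test_len, train_len, ratio, avg_err = stats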
Example #15
# Assumed imports for this snippet; `df` is defined upstream
import matplotlib.pyplot as plt
from pygam import LinearGAM
from sklearn.model_selection import train_test_split

df2 = df[df.Year.isin([2013, 2014, 2015, 2016, 2017])]

df3 = df2.dropna()

x = df3[['Farenheit', 'Year', 'Month', 'Day_of_week', 'Hour']]

y = df3['AEP_MW']

# splitting the data into the 80% used to train the model and the 20% used to test the model

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, random_state=6)

# passing the data into the model
gam = LinearGAM()
gam.fit(x_train, y_train)

y_predicted = gam.predict(x_test)

gam.summary()
# building out the axis labels for the graphs of the different features
months = range(13)
month_names = [" ", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep","Oct", "Nov", "Dec"]

days = range(8)
day_names = ['', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

hours = [0, 0, 3, 6, 9, 12, 15, 18, 21, 24]
hour_times = ['', '12AM', '3AM', '6AM', '9AM', '12PM', '3PM', '6PM', '9PM', '12AM']

plt.figure();