Example #1

# assumed imports for this snippet (not shown in the original); pick_a_color()
# is a project-specific helper assumed to be defined elsewhere
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNetCV as en
def OLSregression(y, x, n=0):

    x0 = sm.add_constant(x)
    m1 = sm.OLS(y, x0).fit()

    print(m1.summary())
    print(np.std(m1.predict() - np.array(y)))

    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.plot(y.index,
             pd.Series(m1.predict()),
             label='fitted',
             c=pick_a_color())
    plt.plot(y.index, y, label='actual', c=pick_a_color())

    plt.legend(loc='best')
    plt.title('OLS')
    plt.show()

    if n == 0:
        return m1.params
    else:
        m2 = en(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                l1_ratio=[.01, .1, .5, .9, .99],
                max_iter=5000).fit(x, y)
        print(m2.intercept_, m2.coef_)
        print(np.std(m2.predict(x) - np.array(y)))

        fig = plt.figure(figsize=(10, 5))
        ax = fig.add_subplot(111)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        plt.plot(y.index,
                 pd.Series(m2.predict(x)),
                 label='fitted',
                 c=pick_a_color())
        plt.plot(y.index, y, label='actual', c=pick_a_color())
        plt.legend(loc='best')

        plt.title('Elastic Net')
        plt.show()

        return [m2.intercept_] + [item for item in m2.coef_]
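
A minimal usage sketch of the function above; the DataFrame `df` and its column names are assumptions, not part of the original:

# hypothetical call: OLS only, then OLS plus the cross-validated elastic net
params = OLSregression(df['nok'], df[['usd', 'eur']])
coefs = OLSregression(df['nok'], df[['usd', 'eur']], n=1)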
Example #2

# assumed imports (not shown in the original); VBMFeatureExtractor is a
# project-specific transformer defined elsewhere
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet as en


def get_estimator():
    """Build your estimator here."""

    eln = en(alpha=1,
             l1_ratio=0.05,
             fit_intercept=True,
             normalize=False,  # removed in scikit-learn >= 1.2; drop for newer versions
             precompute=False,
             max_iter=10000,
             copy_X=True,
             tol=0.0001,
             warm_start=False,
             positive=False,
             random_state=123,
             selection='random')

    estimator = make_pipeline(VBMFeatureExtractor(), StandardScaler(), eln)

    return estimator
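
The returned pipeline behaves like any scikit-learn estimator. A minimal usage sketch (X_train, y_train, X_test are assumptions):

est = get_estimator()
est.fit(X_train, y_train)
preds = est.predict(X_test)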
Example #3

    # (snippet truncated above: the enclosing loop over `v` and the loop over
    # reactions that fills `x` are not shown)
    x.append(fam_panEFM[v]['freq_reactions'][r])

    sel = np.random.choice(np.arange(1000), replace=False, size=200)
    x = np.array(x).T

    for m in fam_panEFM[v]['y_metabolome']:
        y.append(fam_panEFM[v]['freq_metabolite_use'][m])
        mets.append(m)

    y = np.array(y).T

    for r in fam_panEFM[v]['reac_met_freq_association']:
        z.append(fam_panEFM[v]['freq_mod_reactions'][r])
    z = np.array(z)
    from sklearn.linear_model import MultiTaskElasticNetCV as en
    enet = en(cv=2, n_jobs=5, verbose=1, max_iter=10000)
    EN = enet.fit(x[sel], y[sel])
    p = EN.predict(z.reshape(1, -1)).flatten()
    d = {mets[i]: p[i] for i in range(len(mets))}

    metab_d[v] = d.copy()

en_res = {}

for fam in metab_d:
    for m in metab_d[fam]:
        if m not in en_res:
            en_res[m] = np.zeros(len(families))

for i, fam in enumerate(families):
    for m in metab_d[fam]:
        pass  # (snippet truncated here in the original)
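
As an aside, a self-contained sketch of the MultiTaskElasticNetCV pattern used in this example; the shapes and synthetic data are assumptions, not values from this project:

import numpy as np
from sklearn.linear_model import MultiTaskElasticNetCV

rng = np.random.default_rng(0)
X = rng.random((200, 50))    # samples x reaction frequencies
Y = rng.random((200, 30))    # samples x metabolite-use frequencies (multi-output)
model = MultiTaskElasticNetCV(cv=2, max_iter=10000).fit(X, Y)
pred = model.predict(X[:1])  # shape (1, 30): one prediction per metabolite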
Example #4
# let's denote data from 2017-04-25 to 2018-04-25 as the backtesting window / test set
x0 = pd.concat([df['usd'], df['gbp'], df['eur'], df['brent']], axis=1)
x1 = sm.add_constant(x0)
x = x1[x1.index < '2017-04-25']
y = df['nok'][df.index < '2017-04-25']

model = sm.OLS(y, x).fit()
print(model.summary(), '\n')

# In[4]:

# nevertheless, the summary shows there is multicollinearity:
# the condition number is skyrocketing
# alternatively, we can use elastic net regression to achieve convergence
m = en(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
       l1_ratio=[.01, .1, .5, .9, .99],
       max_iter=5000).fit(x0[x0.index < '2017-04-25'], y)
print(m.intercept_, m.coef_)
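
# a hedged aside: the multicollinearity flagged above can be checked directly via
# the condition number of the constant-augmented design matrix
print(np.linalg.cond(x1[x1.index < '2017-04-25']))  # large values signal collinearity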

#elastic net estimation results:
#3.79776228406 [ 0.00388958  0.01992038  0.02823187  0.00050092]

# In[5]:

# we plot the difference between the two approaches
# note that the difference is negatively skewed
df['sk_fit'] = (df['usd'] * m.coef_[0] + df['gbp'] * m.coef_[1] +
                df['eur'] * m.coef_[2] + df['brent'] * m.coef_[3] +
                m.intercept_)
df['ols_fit'] = (df['usd'] * model.params[1] + df['gbp'] * model.params[2] +
                 df['eur'] * model.params[3] + df['brent'] * model.params[4] +
                 model.params[0])  # params[0] is the constant from sm.add_constant
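
# a quick hedged check of the "negatively skewed" note above
print((df['ols_fit'] - df['sk_fit']).skew())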
Example #5
# shuffle the whole dataset (test_size=0 keeps all rows in X_r, y_r;
# newer scikit-learn versions may reject test_size=0)
X_r, X_d, y_r, y_d = train_test_split(X, y, test_size=0, random_state=0)
max_score_tra = 0
para_tra = None

max_score_cv = 0
para_cv = None

for precompute in [True, False]:
    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            for copy_X in [True, False]:
                for warm_start in [True, False]:
                    for positive in [True, False]:
                        clf = en(precompute=precompute, fit_intercept=fit_intercept,
                                 normalize=normalize, copy_X=copy_X,
                                 warm_start=warm_start, positive=positive,
                                 random_state=0)
                        clf1 = en(precompute=precompute, fit_intercept=fit_intercept,
                                  normalize=normalize, copy_X=copy_X,
                                  warm_start=warm_start, positive=positive,
                                  random_state=0)
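                        # X_train/X_test/y_train/y_test come from an earlier split not shown here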
                        clf.fit(X_train, y_train)
                        score_tra = clf.score(X_test, y_test)
                        score_cv = cross_val_score(clf1, X_r, y_r, cv=5)
                        if score_tra > max_score_tra:
                            max_score_tra = score_tra
                            para_tra = clf.get_params()  # call it to store the params dict, not the bound method

                        if score_cv.mean() > max_score_cv:
                            max_score_cv = score_cv.mean()
                            para_cv = clf1.get_params()
    #                     y_pred = clf.predict(X_test)
    #                     print(clf.get_params())
# print(clf.scores_)
print(max_score_tra)
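
The nested loops above hand-roll a grid search over boolean settings. A more idiomatic sketch of the same search with GridSearchCV, assuming the same `en` alias and an older scikit-learn where ElasticNet still accepts `normalize`:

from sklearn.model_selection import GridSearchCV

param_grid = {
    'precompute': [True, False],
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'copy_X': [True, False],
    'warm_start': [True, False],
    'positive': [True, False],
}
search = GridSearchCV(en(random_state=0), param_grid, cv=5)
search.fit(X_r, y_r)
print(search.best_score_, search.best_params_)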
Example #6

# the aliases (lr, bay, rr, l, ll, knn, ard, rfr, rcv, en, dtr, ada, gbr) are
# scikit-learn regressors imported earlier in the source, not shown here
br = '\n'
X = np.load('data/X_boston.npy')
y = np.load('data/y_boston.npy')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
regressors = [
    lr(),
    bay(),
    rr(alpha=.5, random_state=0),
    l(alpha=0.1, random_state=0),
    ll(),
    knn(),
    ard(),
    rfr(random_state=0, n_estimators=100),
    SVR(gamma='scale', kernel='rbf'),
    rcv(fit_intercept=False),
    en(random_state=0),
    dtr(random_state=0),
    ada(random_state=0),
    gbr(random_state=0)
]
print('unscaled:', br)
for reg in regressors:
    reg.fit(X_train, y_train)
    rmse, name = get_error(reg, X_test, y_test)
    name = reg.__class__.__name__
    print(name + '(rmse):', end=' ')
    print(rmse)
print()
print('scaled:', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
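
The snippet is cut off here. A hedged sketch of how the scaled comparison presumably continues, mirroring the unscaled loop above:

X_test_std = scaler.transform(X_test)
for reg in regressors:
    reg.fit(X_train_std, y_train)
    rmse, name = get_error(reg, X_test_std, y_test)
    print(reg.__class__.__name__ + '(rmse):', end=' ')
    print(rmse)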
Example #7
reducers_cfg[PCA.__name__] = dict(
    reducer__n_components=[],
    # reducer__whiten = [True, False],
    reducer__svd_solver=['auto'])
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(reducer__n_features_to_select=[],
                                  reducer__step=[0.1])
#########################
####### Models ##########
#########################

# models = [br(), en(), ls(), lo(), ll()]
models = [br(), en(), ll()]

models_cfg = {}


def init(para=None):
    # print(para)
    if para == 2:

        #########################
        ####Data Preprocessor ###
        #########################

        preprocessors = [DummyTransformer]
        preprocessors_cfg = {}
        preprocessors_cfg[DummyTransformer.func.__name__] = {}
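
A hedged sketch of how these config dicts might feed a Pipeline grid search; the step name 'reducer' matches the 'reducer__' prefixes above, and the grid values are assumptions:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

pipe = Pipeline([('reducer', PCA()), ('model', en())])
grid = dict(reducers_cfg[PCA.__name__], reducer__n_components=[5, 10])
search = GridSearchCV(pipe, grid, cv=3)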
Example #8

    # (snippet truncated above: the KFold loop fitting ridge_383 is not shown)
    oof_ridge_383[val_idx] = ridge_383.predict(X_train_383[val_idx])

    predictions_ridge_383 += ridge_383.predict(X_test_383) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_383, target)))

folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_383 = np.zeros(train_shape)
predictions_en_383 = np.zeros(len(X_test_383))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # ElasticNet (elastic net)
    en_383 = en(alpha=1.0, l1_ratio=0.06)
    en_383.fit(tr_x, tr_y)
    oof_en_383[val_idx] = en_383.predict(X_train_383[val_idx])

    predictions_en_383 += en_383.predict(X_test_383) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_383, target)))

folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_383 = np.zeros(train_shape)
predictions_br_383 = np.zeros(len(X_test_383))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # (snippet truncated here in the original; a BayesianRidge fit presumably follows,
    # given the br_383 naming above)