Example #1
def Stepwise_Forward_Selection(Data, Inputs, Output):
    Model_var1 = sm.OLS
    X = Data[Inputs]
    y = Data[Output]
    initial_list = []
    threshold_in = 0.05
    verbose = True
    included = list(initial_list)
    while True:
        changed = False
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = Model_var1(
                y, sm.add_constant(pd.DataFrame(X[included +
                                                  [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()  # idxmin returns the label; argmin would return the position
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))
        if not changed:
            break
    return included
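# A usage sketch (an editorial assumption, not from the original source):
# exercise the function above on toy data. Assumes `import numpy as np`,
# `import pandas as pd` and `import statsmodels.api as sm`, matching the
# names used inside the function.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
demo['target'] = 2.0 * demo['a'] - demo['b'] + rng.normal(size=100)
print(Stepwise_Forward_Selection(demo, ['a', 'b', 'c'], 'target'))
# typically selects ['a', 'b'] on this toy data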
Example #2
    def test_summary_col_ordering_preserved(self):
        # gh-3767
        x = [1, 5, 7, 3, 5]
        x = add_constant(x)
        x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
        x2 = pd.DataFrame(x2, columns=['const', 'b', 'a'])
        y1 = [6, 4, 2, 7, 4]
        y2 = [8, 5, 0, 12, 4]
        reg1 = OLS(y1, x2).fit()
        reg2 = OLS(y2, x2).fit()

        info_dict = {
            'R2': lambda x: '{:.3f}'.format(x.rsquared),
            'N': lambda x: '{0:d}'.format(int(x.nobs))
        }
        original = actual = summary_col([reg1, reg2], float_format='%0.4f')
        actual = summary_col([reg1, reg2],
                             regressor_order=['a', 'b'],
                             float_format='%0.4f',
                             info_dict=info_dict)
        variables = ('const', 'b', 'a')
        for line in str(original).split('\n'):
            for variable in variables:
                if line.startswith(variable):
                    assert line in str(actual)
Example #3
def SUR_model(method):
    from linearmodels.system import SUR
    from collections import OrderedDict
    import statsmodels.api as smlm  # statsmodels.api provides add_constant

    Equation = OrderedDict()
    for i in range(34):
        x_lag1 = np.nan * np.ones(X_data.shape[0])
        y_lag1 = np.nan * np.ones(X_data.shape[0])
        x_lag1[1:] = X_data.iloc[:-1, i]
        y_lag1[1:] = y_data.iloc[:-1, i]
        y_reg = y_data.iloc[:, i]
        y_reg.name = 'netflow_' + str(i)

        X_exo = pd.concat([pd.Series(y_lag1), pd.Series(x_lag1)], axis=1)
        X_exo = smlm.add_constant(X_exo)
        # X_exo = X_exo.iloc[1:, :]
        X_exo.columns = ['const', 'netflow_lag1', 'panic']

        name = 'Platform_' + str(i)
        Equation[name] = {'dependent': y_reg, 'exog': X_exo}

    reg = SUR(Equation).fit(method=method)
    print(reg)

    return reg, Equation
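# A minimal, self-contained sketch of the equation-dict format that SUR_model
# builds above (an editorial assumption, not from the original source;
# assumes linearmodels is installed and the names here are illustrative):
import numpy as np
import pandas as pd
from linearmodels.system import SUR

rng = np.random.default_rng(1)
eq1_x = pd.DataFrame({'const': 1.0, 'z': rng.normal(size=50)})
eq2_x = pd.DataFrame({'const': 1.0, 'w': rng.normal(size=50)})
equations = {'eq1': {'dependent': pd.Series(rng.normal(size=50)), 'exog': eq1_x},
             'eq2': {'dependent': pd.Series(rng.normal(size=50)), 'exog': eq2_x}}
print(SUR(equations).fit(method='gls'))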
Example #4
    def test_summarycol(self):
        # Test for latex output of summary_col object
        desired = r'''
\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lcc}
\hline
      &   y I    &   y II    \\
\midrule
\midrule
const & 7.7500   & 12.4231   \\
      & (1.1058) & (3.1872)  \\
x1    & -0.7500  & -1.5769   \\
      & (0.2368) & (0.6826)  \\
\hline
\end{tabular}
\end{center}
\end{table}
'''
        x = [1, 5, 7, 3, 5]
        x = add_constant(x)
        y1 = [6, 4, 2, 7, 4]
        y2 = [8, 5, 0, 12, 4]
        reg1 = OLS(y1, x).fit()
        reg2 = OLS(y2, x).fit()
        actual = summary_col([reg1, reg2]).as_latex()
        actual = '\n%s\n' % actual
        assert_equal(desired, actual)
Example #6
def stepwise_selection(X,
                       y,
                       initial_list=(),  # avoid a mutable default argument
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection based on p-values"""
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(
                y, sm.add_constant(pd.DataFrame(X[included +
                                                  [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()  # idxmax returns the label, not the position
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(
                    worst_feature, worst_pval))
        if not changed:
            break
    return included
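# A usage sketch (an editorial assumption, not from the original source);
# only 'a' and 'c' should survive both the forward and the backward step on
# this toy data:
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(42)
X_demo = pd.DataFrame(rng.normal(size=(200, 4)), columns=list('abcd'))
y_demo = 3 * X_demo['a'] - 2 * X_demo['c'] + rng.normal(size=200)
print(stepwise_selection(X_demo, y_demo))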
Example #7
    def test_OLSsummary(self):
        # Test that latex output of regular OLS output still contains
        # multiple tables

        x = [1,5,7,3,5]
        x = add_constant(x)
        y1 = [6,4,2,7,4]
        reg1 = OLS(y1,x).fit()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            actual = reg1.summary().as_latex()
        string_to_find = r'''\end{tabular}
\begin{tabular}'''
        result = string_to_find in actual
        assert(result is True)
Example #8
def compute_QTL_gti_peaki(datapoint):

    [peak_sample, gt_sample, weight_sample] = datapoint
    valid_samples = np.where(gt_sample != -1)[0]
    
    y = np.array(peak_sample[valid_samples])
    y = y.astype(float)
    x = np.array(gt_sample[valid_samples])
    x_weights = np.array(weight_sample[valid_samples])
   
    x = sm.add_constant(x) 
    wls_model = sm.WLS(y, x, weights=x_weights)
    results = wls_model.fit()

    return results.pvalues[1]
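# A usage sketch (an editorial assumption, not from the original source): one
# peak/genotype/weight triple, with genotype -1 marking samples the function
# drops. Assumes `import numpy as np` and `import statsmodels.api as sm`.
peak = np.array([2.1, 3.3, 1.8, 4.0, 2.9, 3.5])
genotype = np.array([0, 1, -1, 2, 1, 0])
weights = np.array([1.0, 0.8, 0.5, 1.0, 0.9, 1.0])
print(compute_QTL_gti_peaki([peak, genotype, weights]))  # p-value of the genotype slope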
Example #10
 def test_summarycol_drop_omitted(self):
     # gh-3702
     x = [1, 5, 7, 3, 5]
     x = add_constant(x)
     x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
     y1 = [6, 4, 2, 7, 4]
     y2 = [8, 5, 0, 12, 4]
     reg1 = OLS(y1, x).fit()
     reg2 = OLS(y2, x2).fit()
     actual = summary_col([reg1, reg2], regressor_order=['const', 'x1'],
                          drop_omitted=True)
     assert 'x2' not in str(actual)
     actual = summary_col([reg1, reg2], regressor_order=['x1'],
                          drop_omitted=False)
     assert 'const' in str(actual)
     assert 'x2' in str(actual)
Example #12
 def test_summary_col_ordering_preserved(self):
     # gh-3767
     x = [1, 5, 7, 3, 5]
     x = add_constant(x)
     x2 = np.concatenate([x, np.array([[3], [9], [-1], [4], [0]])], 1)
     y1 = [6, 4, 2, 7, 4]
     y2 = [8, 5, 0, 12, 4]
     reg1 = OLS(y1, x2).fit()
     reg2 = OLS(y2, x2).fit()
     info_dict = {'R2': lambda x: '{:.3f}'.format(x.rsquared),
                  'N': lambda x: '{0:d}'.format(int(x.nobs))}
     original = actual = summary_col([reg1, reg2], float_format='%0.4f')
     actual = summary_col([reg1, reg2], regressor_order=['x2', 'x1'],
                          float_format='%0.4f',
                          info_dict=info_dict)
     variables = ('const', 'x1', 'x2')
     for line in str(original).split('\n'):
         for variable in variables:
             if line.startswith(variable):
                 assert line in str(actual)
Example #13
def iteration(e0, depth):
    y_star = y_data
    Wald_iter_df = pd.DataFrame(index=range(depth), columns=y_star.columns)
    for d in range(depth):
        e = e0.sample(frac=1).reset_index(drop=True)
        for i in range(34):
            ei = e.iloc[:, i]
            y_star_lag = pd.Series(index=y_star.index, dtype=float)
            y_star_lag[1:] = y_star.iloc[:-1, i]

            y_star.iloc[1:, i] = ei + Params_df.loc[
                'const', i] + Params_df.loc[0, i] * y_star_lag[1:]  # generate y-star
            # y_star.iloc[0, i] = y_data.iloc[0, i]

            x_lag1 = pd.Series(index=y_star.index, dtype=float)
            y_lag1 = pd.Series(index=y_star.index, dtype=float)
            x_lag1[1:] = X_data.iloc[:-1, i]
            y_lag1[1:] = y_star.iloc[:-1, i]
            y_reg = y_star.iloc[:, i]

            X_exo = pd.concat([y_lag1, x_lag1], axis=1)
            X_exo = lm.add_constant(X_exo)
            model = lm.OLS(y_reg, X_exo, missing='drop')
            res = model.fit()
            R = np.eye(len(res.params))[2]
            # print(res.wald_test(R).fvalue[0][0])
            # Wald_iter_df.iloc[d, i] = res.wald_test(R).fvalue[0][0]

            try:
                wald_i = res.wald_test(R).fvalue[0][0]
            except ValueError:
                Wald_iter_df.iloc[d, i] = np.nan
                print(d, i, "wald_test raised ValueError; stored NaN")
                # print(X_exo)
            else:
                Wald_iter_df.iloc[d, i] = wald_i

    return Wald_iter_df
Example #14
def test_demo():
    Wald_test_list = []
    e0 = pd.DataFrame(index=X_data.index, columns=y_data.columns)
    for i in range(34):
        x_lag1 = np.nan * np.ones(X_data.shape[0])
        y_lag1 = np.nan * np.ones(X_data.shape[0])
        x_lag1[1:] = X_data.iloc[:-1, i]
        y_lag1[1:] = y_data.iloc[:-1, i]
        y_reg = y_data.iloc[:, i]

        X_exo = pd.concat([pd.Series(y_lag1), pd.Series(x_lag1)], axis=1)
        X_exo = lm.add_constant(X_exo)
        model = lm.OLS(y_reg, X_exo, missing='drop')
        res = model.fit()
        R = np.eye(len(res.params))[2]
        print(res.params)
        print(R)
        print(res.wald_test(R))
        Params_df.iloc[:, i] = res.params
        Wald_test_list.append(res.wald_test(R).fvalue[0][0])  # F-value for the original sample
        e0.iloc[:, i] = y_reg - res.params[
            'const'] - res.params[0] * y_lag1  # residuals for each of the 34 series

    return e0, Wald_test_list
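# A minimal sketch of the Wald F-test both functions above rely on (an
# editorial assumption, not from the original source): R = np.eye(k)[2]
# selects the third coefficient, so wald_test(R) tests H0: that coefficient
# equals zero.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X_toy = sm.add_constant(rng.normal(size=(100, 2)))
y_toy = X_toy @ np.array([1.0, 0.5, 0.0]) + rng.normal(size=100)
res_toy = sm.OLS(y_toy, X_toy).fit()
print(res_toy.wald_test(np.eye(3)[2], use_f=True))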
Example #15
#==============================================================================

#==============================================================================
# Linear regression diagnostics
# 1. Normality test on the residuals
# 2. Autocorrelation test on the residuals
# 3. Condition number of the regressors
#==============================================================================

from sklearn.datasets import make_regression
import statsmodels.api as sm  # statsmodels.api provides both OLS and add_constant
import pandas as pd

# noise: standard deviation of the gaussian noise applied to the output
X0, y, coef = make_regression(n_samples=100, n_features=1, noise=20,
                              coef=True, random_state=0)
dfX0 = pd.DataFrame(X0, columns=['X1'])
dfX = sm.add_constant(dfX0)  # prepend a column of ones to hold the intercept
dfy = pd.DataFrame(y, columns=['y'])

model = sm.OLS(dfy, dfX)
result = model.fit()
print(result.summary())
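# The summary above already reports these diagnostics; a sketch of computing
# them directly (an editorial assumption, not from the original source):
import numpy as np
from statsmodels.stats.stattools import durbin_watson, jarque_bera

jb_stat, jb_pvalue, jb_skew, jb_kurtosis = jarque_bera(result.resid)
print('Jarque-Bera p-value (residual normality):', jb_pvalue)
print('Durbin-Watson statistic (residual autocorrelation):', durbin_watson(result.resid))
print('Condition number of the design matrix:', np.linalg.cond(dfX.values))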


# Ways to resolve multicollinearity (when some regressors can be explained by
# the others, the design matrix loses full rank and coefficient estimation
# breaks down):
# 1. Remove the dependent regressors via variable selection
# 2. Extract new variables with PCA
# 3. Apply regularization (Lasso, Ridge, ElasticNet, etc.)

from statsmodels.datasets.longley import load_pandas
import seaborn as sns
import matplotlib.pyplot as plt
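# A sketch continuing the multicollinearity point with the Longley data, a
# classic near-collinear dataset (an editorial assumption, not from the
# original source): the very large condition number signals the problem, and
# a ridge penalty is remedy 3 from the list above.
import numpy as np
from sklearn.linear_model import Ridge

longley = load_pandas()
X_longley = sm.add_constant(longley.exog)
print('condition number:', np.linalg.cond(X_longley.values))  # very large
print('ridge coefficients:', Ridge(alpha=1.0).fit(longley.exog, longley.endog).coef_)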
Example #16
# Use sds to make a vector of noise.
# (This excerpt assumes s, sds, m and c are defined earlier in the script.)
eta = random.randn(len(s)) * sds        # In MatLab version: eta = randn(size(s)).*sds;
# Pretend some data points are extreme.
eta[7]=-1
eta[8]=-3
eta[10]=-3

# Find observed values of x (with noise added).
x = m*s + c + eta

# Weighted Least Squares regression. The solution could also be found from the smallest value in the F array below.
# Find weightings w (discount) for each data point.
vars0 = sds ** 2
w = 1 / vars0                           # In MatLab version: w = 1./vars0
#w=ones(size(w)) # un-comment this line to get solution based on uniform noise terms.
ss = sm.add_constant(s)                 # In MatLab version: ss = [ones(size(s)) s]; prepend a column of ones so the solution includes an intercept term.
model = sm.WLS(x, ss, weights=w)        # In MatLab version: [params,stdx,mse,S] = lscov(ss,x,w)
results = model.fit()
cest2, mest2 = results.params  # constant first, then slope (ss = [1, s]), matching the second fit below
########
xest2 = mest2 * s + cest2               #   In MatLabversion: xest2 = mest2.*s + cest2;
c0 = cest2
m0 = mest2

# Plot fitted line xest (=xhat in text) and data points.
fig1 = plt.figure() # ; clf;
plt.plot(s,x, 'k*', s, xest2, 'k')
plt.xlabel('Salary, ' + r'$s$' + ' (groats)')  # raw-string trick gives an italic math font
plt.ylabel('Height, ' + r'$x$' + ' (feet)')
plt.xlim((0,12))
sds = sds * arange(1, 12) / 10.0
# Use noise values copied from book (based on sds above).
eta = [
    -0.0023, -0.0728, 0.1104, 0.6076, -0.3034, -0.2237, 0.7407, -1.0, -3.0,
    -2.4653, -3.0
]
# Find observed values of x (with noise added).
x = m * s + c + eta

# Weighted Least Squares regression.
# Find weightings w (discount) for each data point.
vars0 = sds**2
w = 1 / vars0
# Un-comment next line for solution based on un-weighted regression.
#w=ones(size(w))
ss = sm.add_constant(s)  # Add column of 1s for regression.
model = sm.WLS(x, ss, weights=w)
results = model.fit()
cest2, mest2 = results.params
print('Estimated slope = %.3f,' % mest2)
print(' estimated intercept = %.3f.' % cest2)

# Make line xest2 based on fitted slope and intercept.
s2 = arange(0, 13)
xest2 = mest2 * s2 + cest2

# Plot fitted line xest, data points, and error bars.
fig1 = plt.figure()
plt.errorbar(s, x, yerr=sds, fmt='o', color='k')
plt.plot(s, x, 'k*', s2, xest2, 'k--')
plt.xlabel('Salary, $s$ (groats)')

Example #17
WeekDayNo_dummies = pd.get_dummies(df['WeekDayNo']).rename(columns=lambda x: 'WeekDayNo_' + str(x))
WeekDayNo_dummies = pd.DataFrame(WeekDayNo_dummies)
df = pd.concat([df, WeekDayNo_dummies], axis=1) 
df = df.drop('WeekDayNo',axis=1)

Event_dummies = pd.get_dummies(df['Event']).rename(columns=lambda x: 'Event_' + str(x))
Event_dummies = pd.DataFrame(Event_dummies)
df = pd.concat([df, Event_dummies], axis=1) 
df = df.drop('Event',axis=1)

df_predictor = df[['Holiday','WeekDayNo_1','WeekDayNo_2','WeekDayNo_3','WeekDayNo_4','WeekDayNo_5','WeekDayNo_6','WeekDayNo_7']]
y_target = df['DailyLongTerm_Vessey']

df_predictor = lm.add_constant(df_predictor)
df_fit, df_eval, y_fit, y_eval = train_test_split(df_predictor, y_target, test_size=.2, random_state=1)

ols_model = lm.OLS(y_fit, df_fit).fit()
prediction = ols_model.predict(df_eval)
print(ols_model.summary())

prediction = pd.DataFrame(prediction)
prediction.columns = ['predicted_values']
y_eval = y_eval.reset_index(drop=True)
y_eval.columns = ['DailyLongTerm_Vessey']

RMSE = mean_squared_error(y_eval, prediction)**0.5
result_compare = pd.concat([prediction, y_eval], axis=1)
print(result_compare, RMSE)
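# Caveat (an editorial assumption about the model above): with the constant
# added by lm.add_constant, the seven WeekDayNo dummies sum to the constant
# column, which makes the design matrix perfectly collinear (the dummy
# variable trap). Dropping one level avoids it:
# WeekDayNo_dummies = pd.get_dummies(df['WeekDayNo'], drop_first=True) \
#                       .rename(columns=lambda x: 'WeekDayNo_' + str(x))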
Example #19
def model(model_name, train_x, train_y, test_x, alpha=0.1):
        summary = None
        from sklearn.neural_network import MLPRegressor
        from sklearn.svm import SVR
        import sklearn.linear_model  # plain `import sklearn` does not expose sklearn.linear_model
        import statsmodels.api as sm  # statsmodels.api provides add_constant and OLS
        if model_name == 'Random':
            test_y = pd.Series(np.random.random_sample((len(test_x),)), index=test_x.index)
        if model_name == 'None':
            test_y = test_x.iloc[:,0]
        if model_name == 'MLPRegressor':
            mlp = MLPRegressor(hidden_layer_sizes=(20, 20))
            mlp.fit(train_x, train_y)
            y_pred = mlp.predict(test_x)
            test_y = pd.Series(y_pred, index=test_x.index)
        if model_name == 'Lasso':
            model = sklearn.linear_model.Lasso(0.001,fit_intercept = False)
            lasso = model.fit(train_x, train_y)
            test_y = pd.Series(lasso.predict(test_x), index=test_x.index)
            summary = lasso.score(train_x,train_y)
        if model_name == 'Ridge':
            model = sklearn.linear_model.Ridge(1.0,fit_intercept = False)
            ridge = model.fit(train_x, train_y)
            test_y = pd.Series(ridge.predict(test_x), index=test_x.index)
            summary = ridge.score(train_x, train_y)
        if model_name == 'SVR':
            svr_rbf = SVR(kernel='rbf', C=1, gamma=0.0001, epsilon=0.1)
            svr_rbf.fit(train_x, train_y)
            y_pred_rbf = svr_rbf.predict(test_x)
            test_y = pd.Series(y_pred_rbf, index=test_x.index)
        if model_name == 'StepWise':

            feature_col = list(train_x.columns.values)
            length = len(feature_col)
            final_feature = []
            for i in range(length):
                pvalue_min = 1
                column_min = ""
                for feature in feature_col:
                    temp_feature = final_feature + [feature]
                    x = sm.add_constant(train_x.loc[:,temp_feature])
                    model = sm.OLS(train_y, x)
                    pvalue = model.fit().pvalues[feature]  # label-based lookup of the candidate's p-value
                    # print(pvalue)
                    if pvalue < pvalue_min and pvalue < alpha:
                        pvalue_min = pvalue
                        column_min = feature

                if column_min != "":
                    feature_col.remove(column_min)
                    final_feature.append(column_min)
                else:
                    break

            X = sm.add_constant(train_x.loc[:,final_feature])
            model = sm.OLS(train_y, X)
            res = model.fit()
            summary = pd.Series(res.pvalues, index=['const'] + final_feature)
            if ~np.isnan(res.f_pvalue):
                summary['f_test'] = res.f_pvalue
            if ~np.isnan(res.rsquared_adj):
                summary['score'] = res.rsquared_adj
            xx = sm.add_constant(test_x.loc[:,final_feature],has_constant='raise')
            test_y = res.predict(xx)

        if model_name == 'AdaBoost':
            from sklearn.ensemble import AdaBoostRegressor
            model = AdaBoostRegressor(n_estimators=100,learning_rate = 0.5)
            adaboost = model.fit(train_x,train_y)
            test_y = pd.Series(adaboost.predict(test_x),index = test_x.index)

        if model_name == 'RandomForestRegressor':
            from sklearn.ensemble import RandomForestRegressor
            rfr = RandomForestRegressor(n_estimators=100, criterion='mse',max_features='auto')
            rfr.fit(train_x, train_y)
            y_pred_rfr = rfr.predict(test_x)
            test_y = pd.Series(y_pred_rfr, index=test_x.index)


        return test_y,summary
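# A usage sketch for the dispatcher above (an editorial assumption, not from
# the original source), with a toy train/test split:
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
demo_x = pd.DataFrame(rng.normal(size=(60, 3)), columns=['f1', 'f2', 'f3'])
demo_y = pd.Series(2.0 * demo_x['f1'] + rng.normal(size=60))
pred, fit_summary = model('Ridge', demo_x.iloc[:40], demo_y.iloc[:40], demo_x.iloc[40:])
print(fit_summary)  # in-sample R^2 for the Ridge branch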
Example #20
def __model(model_name, train_x, train_y, test_x, alpha=0.1, *args, **kwargs):
    summary = None
    from sklearn.neural_network import MLPRegressor
    from sklearn.svm import SVR
    import sklearn.linear_model  # plain `import sklearn` does not expose sklearn.linear_model
    import statsmodels.api as sm  # statsmodels.api provides add_constant and OLS
    from sklearn.model_selection import TimeSeriesSplit

    cv = TimeSeriesSplit(n_splits=3)
    if model_name == 'Random':
        test_y = pd.Series(np.random.random_sample((len(test_x),)), index=test_x.index)
    if model_name == 'None':
        test_y = test_x.iloc[:,0]
    if model_name == 'MLPRegressor':
        mlp = MLPRegressor(hidden_layer_sizes=(20, 20))
        mlp.fit(train_x, train_y)
        y_pred = mlp.predict(test_x)
        test_y = pd.Series(y_pred, index=test_x.index)
    if model_name == 'Lasso':
        model = sklearn.linear_model.Lasso(0.001,fit_intercept = False)
        lasso = model.fit(train_x, train_y)
        test_y = pd.Series(lasso.predict(test_x), index=test_x.index)
        summary = lasso.score(train_x,train_y)
        # print(test_y.head())
        # model = sklearn.linear_model.Lasso()
        # param_grid = {'alpha':[1e-5,0.5*1e-4,1e-4,1e-3,1e-2,1e-1]}
        # opt = sklearn.model_selection.GridSearchCV(model,param_grid,cv=cv)
        # opt = opt.fit(train_x,train_y)
        # test_y = pd.Series(opt.predict(test_x),index=test_x.index)
        # summary = opt.score(train_x,train_y)
        # print(opt.best_params_, summary)
    if model_name == 'Ridge':
        model = sklearn.linear_model.Ridge(1.0,fit_intercept = False)
        ridge = model.fit(train_x, train_y)
        test_y = pd.Series(ridge.predict(test_x), index=test_x.index)

        summary = ridge.score(train_x, train_y)
    if model_name == 'SVR':
        # param_grid = {'gamma':list(1.0/k*np.array([1e-4,1e-3,1e-2])),\
        #               'C':[0.01,0.05,0.25,1.25]}
        # param_grid = {
        #               'C':[0.002,0.01,0.05,0.25,1.25]}
        # opt = sklearn.model_selection.GridSearchCV(svr_rbf,param_grid,cv=cv)
        # opt = opt.fit(train_x,train_y)
        # y_pred_rbf = opt.predict(test_x)
        # summary = opt.score(train_x,train_y)
        # print(opt.best_params_, summary)

        k = len(train_x.columns)
        svr_rbf = SVR(kernel='rbf', C=0.05, gamma=1.0/k*1e-4,epsilon = 0.005, max_iter = 5000)
        svr_rbf = svr_rbf.fit(train_x, train_y)
        y_pred_rbf = svr_rbf.predict(test_x)
        test_y = pd.Series(y_pred_rbf, index=test_x.index)
        summary = svr_rbf.score(train_x,train_y)
        # print(test_y.head())

    if model_name == 'StepWise':

        feature_col = list(train_x.columns.values)
        length = len(feature_col)
        final_feature = []
        for i in range(length):
            pvalue_min = 1
            column_min = ""
            for feature in feature_col:
                temp_feature = final_feature + [feature]
                x = sm.add_constant(train_x.loc[:,temp_feature])
                model = sm.OLS(train_y, x)
                pvalue = model.fit().pvalues[feature]  # label-based lookup of the candidate's p-value
                # print(pvalue)
                if pvalue < pvalue_min and pvalue < alpha:
                    pvalue_min = pvalue
                    column_min = feature

            if column_min != "":
                feature_col.remove(column_min)
                final_feature.append(column_min)
            else:
                break

        X = sm.add_constant(train_x.loc[:,final_feature])
        model = sm.OLS(train_y, X)
        res = model.fit()
        summary = pd.Series(res.pvalues, index=['const'] + final_feature)
        if ~np.isnan(res.f_pvalue):
            summary['f_test'] = res.f_pvalue
        if ~np.isnan(res.rsquared_adj):
            summary['score'] = res.rsquared_adj
        xx = sm.add_constant(test_x.loc[:,final_feature],has_constant='raise')
        test_y = res.predict(xx)


    if model_name == 'AdaBoost':
        from sklearn.ensemble import  AdaBoostRegressor
        model = AdaBoostRegressor(n_estimators=100,learning_rate = 0.1)
        adaboost = model.fit(train_x,train_y)
        test_y = pd.Series(adaboost.predict(test_x),index = test_x.index)
        summary = adaboost.score(train_x,train_y)

    if model_name == 'RandomForestRegressor':
        from sklearn.ensemble import RandomForestRegressor
        rfr = RandomForestRegressor(n_estimators=100, criterion='mse',max_features='auto')
        rfr.fit(train_x, train_y)
        y_pred_rfr = rfr.predict(test_x)
        test_y = pd.Series(y_pred_rfr, index=test_x.index)

    return test_y,summary
Example #21
# -*- coding: utf-8 -*-
"""
  Name     : c7_03_random_OLS.py
  Book     : Python for Finance (2nd ed.)
  Publisher: Packt Publishing Ltd. 
  Author   : Yuxing Yan
  Date     : 6/6/2017
  email    : [email protected]
             [email protected]
"""

import numpy as np
import scipy as sp
import statsmodels.api as sm  # statsmodels.api provides add_constant and OLS

n = 100
np.random.seed(12345)

y = [1, 2, 3, 4, 2, 3, 4]
x1 = range(1, 8)
x2 = [4, 2, -1, 4, 2, 3, 5]
x3 = [0, 2, 3, 4, 2, 4, -1]
x = np.array([x1, x2, x3]).T  # stack the regressors as columns
x = sm.add_constant(x)
#est = sm.OLS(formula='Sales ~ TV + Radio', data=df_adv).fit()
results = sm.OLS(y, x).fit()
print(results.summary())
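# The commented line above refers to statsmodels' formula interface, which
# lives in statsmodels.formula.api rather than the linear-model namespace; a
# sketch of the equivalent call (an editorial assumption, not from the
# original source):
import pandas as pd
import statsmodels.formula.api as smf

df_xy = pd.DataFrame({'y': y, 'x1': list(x1), 'x2': x2, 'x3': x3})
print(smf.ols('y ~ x1 + x2 + x3', data=df_xy).fit().summary())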
Example #22
#%%
#FIRST DATASET REGRESSIONS

#Regression by Asset Class on all Indicators
start_date, end_date = "2000 01 01", "2008 01 01"

Y_assets = data_returns(asset_classes, start_date, end_date, "M", 1)
X_macro = data_lagged(
    macro_data[[
        "Monetary Policy", "International Trade", "Risk Sentiment", "Growth",
        "Inflation"
    ]], start_date, end_date, "M", 1)

# regression on each asset class
for asset in Y_assets.columns:
    res = sm.OLS(Y_assets[asset], sm.add_constant(X_macro)).fit()
    print(res.summary())

#%%

start_date, end_date = "1990 01 01", "2008 01 01"

Y_assets = data_returns(asset_classes, start_date, end_date, "M", 1)
X_macro = data_lagged(
    macro_data[[
        "Monetary Policy", "International Trade", "Risk Sentiment", "Growth",
        "Inflation"
    ]], start_date, end_date, "M", 1)

#Regression Per Indicator
for asset in Y_assets.columns:
Example #23
xx = np.linspace(-10, 10, 1000)
plt.plot(xx, (1 / (1 + np.exp(-xx))) * 2 - 1, label='logistic (scaled)')
plt.plot(xx, np.tanh(xx), label='tanh')
plt.legend(loc=2)
plt.show()

#sklearn logistic regression
from sklearn.datasets import make_classification
import statsmodels.api as sm  # statsmodels.api provides add_constant

X0, y = make_classification(n_features=1,
                            n_redundant=0,
                            n_informative=1,
                            n_clusters_per_class=1,
                            random_state=4)
X = sm.add_constant(X0)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X0, y)

import matplotlib as mpl
xx = np.linspace(-3, 3, 100)
sigm = 1.0 / (1 + np.exp(-model.coef_[0][0] * xx - model.intercept_[0]))
plt.plot(xx, sigm)
plt.scatter(X0, y, marker='o', c=y,
            s=100)  # plot the (X0, y) points: 'o' markers, colored by y, size 100
plt.scatter(X0,
            model.predict(X0),
            marker='x',
            c=y,
            s=200,