Example no. 1
def backward_elimination(X_opt, y, significant_level):
    # Repeatedly drop the predictor with the highest p-value; stop if removing
    # it would lower the adjusted R-squared or once all p-values pass the level.
    import numpy as np
    import statsmodels.regression.linear_model as sm

    num_var = len(X_opt[0])
    while (num_var > 0):
        regressor = sm.OLS(y, X_opt).fit()
        max_Pvalue = max(regressor.pvalues).astype(float)
        if (max_Pvalue > significant_level):
            real_num_var = num_var
            for i in range(num_var):
                prev_X_opt = X_opt
                cur_adjusted_rvalue = regressor.rsquared_adj

                if (regressor.pvalues[i] == max_Pvalue):
                    X_opt = np.delete(X_opt, i, 1)

                    temp_regressor = sm.OLS(y, X_opt).fit()
                    new_adjusted_rvalue = temp_regressor.rsquared_adj
                    if (new_adjusted_rvalue < cur_adjusted_rvalue):
                        return prev_X_opt

                    real_num_var -= 1

            num_var = real_num_var
        else:
            break

    return X_opt
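A minimal usage sketch with synthetic data (the names below are illustrative and not part of the original example):

import numpy as np

rng = np.random.default_rng(0)
n = 100
X = np.column_stack([np.ones(n),           # x0 = 1 intercept column
                     rng.normal(size=n),   # informative predictor
                     rng.normal(size=n)])  # pure-noise predictor
y = 3 + 2 * X[:, 1] + rng.normal(size=n)

X_selected = backward_elimination(X, y, significant_level=0.05)
print(X_selected.shape)  # the noise column is expected to be eliminated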
Example no. 2
def backwards_elimination(X, y, sl):
    # Add the x0 = 1 column for the intercept coefficient b0 of the
    # multiple linear regression equation.
    X = np.append(
        arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1
    )
    # Start with all independent variables and eliminate them one by one.
    X_opt = X
    SL = sl
    rows, cols = X_opt.shape

    for i in range(0, cols):
        regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

        maxP = max(regressor_OLS.pvalues).astype(float)
        if (maxP > SL):
            for j in range(0, cols - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxP):
                    # Drop the column whose coefficient has the highest p-value.
                    X_opt = np.delete(X_opt, j, 1)
    regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
    return regressor_OLS
Example no. 3
def vif_sub(x, regressors):
    # Variance inflation factor for regressor column x: regress it on the
    # remaining regressors and take 1 / (1 - adjusted R-squared).
    import statsmodels.regression.linear_model as linear_model
    k = regressors.shape[1]  # number of regressor columns (was a module-level name)
    x_i = regressors.iloc[:, x]
    mask = np.arange(k) != x
    x_not_i = regressors.iloc[:, mask]
    rsq = linear_model.OLS(x_i, x_not_i, missing="drop").fit().rsquared_adj
    vif = 1.0 / (1.0 - rsq)
    return vif
def backwardElimination(
        x,
        y,
        sl,
        con=True):  # con indicates whether the constant column has already been added
    import matplotlib.pyplot as plt
    import numpy as np
    import statsmodels.regression.linear_model as lmd
    if con == False:
        x = np.append(arr=np.ones((len(x), 1)).astype(int), values=x, axis=1)

    numVars = len(x[0])
    r_sq = 0
    for i in range(0, numVars):
        regressor_OLS = lmd.OLS(endog=y, exog=x).fit()
        if regressor_OLS.rsquared_adj > r_sq:  # to see the complete graph, replace r_sq with 0
            r_sq = regressor_OLS.rsquared_adj
            maxVar = max(regressor_OLS.pvalues).astype(float)
            if maxVar > sl:
                for j in range(0, numVars - i):
                    if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                        x = np.delete(x, j, 1)
            print(regressor_OLS.rsquared_adj)
            plt.scatter(i, regressor_OLS.rsquared_adj)
            plt.draw()
            plt.pause(0.2)
    plt.show()
    print(regressor_OLS.summary())
    return x
Example no. 5
def cluster_vs_meta_granger_TM(c, X, M, Ml, lags=7, thresh=0.05):
    # use the Toda Yamamoto method (environmental data is stationary, but clusters are not)
    x1 = X[c].sum(0)
    adf = stattools.adfuller(x1, maxlag=lags)
    if (adf[0] > adf[4]['5%']):
        m1 = adf[2]
    else:
        m1 = 0
    R = []
    for j, x2 in enumerate(M):
        have_values = np.isfinite(x2)
        xi = x1[have_values]
        x2i = x2[have_values]
        adf = stattools.adfuller(x2i, maxlag=lags)
        if (adf[0] > adf[4]['5%']):
            m2 = adf[2]
        else:
            m2 = 0
        m = max(m1, m2)
        y = [
            xi[i + max(0, m2 - m1):len(xi) + i - (m1 + lags)]
            for i in range(m1 + lags)
        ] + [
            x2i[i + max(0, m1 - m2):len(xi) + i - (m2 + lags)]
            for i in range(m2 + lags)
        ]
        y = np.array(y).T
        lm = linear_model.OLS(xi[max(m1, m2) + lags:], y)
        result = lm.fit()
        Restr = np.eye(y.shape[1])[m + lags:]
        wald = result.wald_test(Restr)
        if wald.pvalue < thresh:
            R.append((wald.pvalue, Ml[j]))
    return m, sorted(R)
Example no. 6
def _za_thread(x, regression, start, end, nobs, basecols, baselags, res,
               residx):
    # first-diff y
    dy = np.diff(x, axis=0)[:, 0]
    zastat = bpidx = np.inf
    for bp in range(start, end):
        # reserve exog space
        exog = np.zeros((dy[baselags:].shape[0], basecols + baselags))
        # constant
        exog[:, 0] = 1
        # intercept dummy / trend / trend dummy
        if regression != 't':
            exog[(bp - (baselags + 1)):, 1] = 1
            exog[:, 2] = np.arange(baselags + 2, nobs + 1)
            if regression == 'ct':
                exog[(bp - (baselags + 1)):, 3] = np.arange(1, nobs - bp + 1)
        else:
            exog[:, 1] = np.arange(baselags + 2, nobs + 1)
            exog[(bp - (baselags + 1)):, 2] = np.arange(1, nobs - bp + 1)
        # lagged y
        exog[:, basecols - 1] = x[baselags:(nobs - 1), 0]
        # lagged dy
        exog[:, basecols:] = tsa.lagmat(
            dy, baselags, trim='none')[baselags:exog.shape[0] + baselags]
        stat = lm.OLS(dy[baselags:], exog).fit().tvalues[basecols - 1]
        if stat < zastat:
            zastat = stat
            bpidx = bp - 1
            crit = zacrit.za_crit(zastat, regression)
            pval = crit[0]
            cvdict = crit[1]
    res[residx] = [zastat, pval, cvdict, bpidx]
def polynomial_regression() -> List[float]:
    """ Define the model directly through the design matrix.
        Similar to MATLAB's "regress" command.
        
        Returns
        -------
        params : coefficients for the quadratic model
        """

    # Generate the data: a noisy second order polynomial

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)

    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # --- >>> START stats <<< ---
    # Make the fit. Note that this is another "OLS" than the one in
    # "model_formulas", as it works directly with the design matrix!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()
    # --- >>> STOP stats <<< ---

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())

    return res.params  # should be [ 4.74244177,  2.60675788,  2.03793634]
Example no. 8
def test_arch_lm(simulated_data):
    zm = ZeroMean(simulated_data, volatility=GARCH())
    res = zm.fit(disp=DISPLAY)
    wald = res.arch_lm_test()
    nobs = simulated_data.shape[0]
    df = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
    assert wald.df == df
    assert "Standardized" not in wald.null
    assert "Standardized" not in wald.alternative
    assert "H0: Standardized" not in wald.__repr__()
    assert "heteroskedastic" in wald.__repr__()

    resids2 = pd.Series(res.resid ** 2)
    data = [resids2.shift(i) for i in range(df + 1)]
    data = pd.concat(data, axis=1).dropna()
    lhs = data.iloc[:, 0]
    rhs = smtools.add_constant(data.iloc[:, 1:])
    ols_res = smlm.OLS(lhs, rhs).fit()
    assert_almost_equal(wald.stat, nobs * ols_res.rsquared)
    assert len(wald.critical_values) == 3
    assert "10%" in wald.critical_values

    wald = res.arch_lm_test(lags=5)
    assert wald.df == 5
    assert_almost_equal(wald.pval, 1 - stats.chi2(5).cdf(wald.stat))

    wald = res.arch_lm_test(standardized=True)
    assert wald.df == df
    assert "Standardized" in wald.null
    assert "Standardized" in wald.alternative
    assert_almost_equal(wald.pval, 1 - stats.chi2(df).cdf(wald.stat))
    assert "H0: Standardized" in wald.__repr__()
Example no. 9
def cohort_1d_regressions(configs):

    for fpath in sorted(
            glob.glob(os.path.join(configs["metagroup_path"], "*.csv"))):
        groupname = fpath.split("/")[-1][:-4]
        print("Group: {}".format(groupname))
        with open(fpath, 'r') as fp:
            csv_fp = csv.reader(fp, delimiter=',')
            next(csv_fp)
            x = []
            y = []
            for line in csv_fp:
                if (line[0].strip() == "held_out" or line[1] == ""
                        or line[2] == ""):
                    continue
                x.extend([1, 0])
                y.extend([float(line[1]), float(line[2])])

        lm = statslm.OLS(y,
                         statstools.add_constant(
                             np.array(x).reshape(len(x), 1)),
                         missing='raise',
                         hasconst=True)
        results = lm.fit()
        print(
            "Group result for group {} intercept/group effect: {}, t-test p-val: {}"
            .format(groupname, results.params, results.pvalues))
Example no. 10
def interaction_effects(data, dep_variable, threshold):
    # remove date variable and booking/click depending on dep_variable
    data = data.iloc[:, 1:].copy()
    if dep_variable == "click_bool":
        data = data.drop("booking_bool", axis=1)
    elif dep_variable == "booking_bool":
        data = data.drop("click_bool", axis=1)

    # choose dependent variable
    X = data.drop(dep_variable, axis=1)
    y = data[dep_variable]
    # Generate interaction terms
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    X_interaction = poly.fit_transform(X)
    # Get names of these terms
    names = PolynomialFeatureNames(poly.get_feature_names(), X)
    # Fit model to check importance of features
    model = linear_model.OLS(y, X_interaction).fit()
    # save results
    with open("output/interaction_effects/modelsummary.csv", "w") as f:
        f.write(model.summary().as_csv())
    # show significant results
    results = pd.read_csv(
        "output/interaction_effects/modelsummary.csv",
        skiprows=10,
        skipfooter=10,
        index_col=0,
    )
    results.index = names
    sign_results = results[results["P>|t| "] < threshold]
    print(sign_results.sort_values(by="   coef   ", ascending=False))
    def calculate(self, key, return_data=False):
        lob = self._lobagent.get(key).get_data()
        price = self._priceagent.get(key).get_data().price[0]

        lob_buy = lob.loc[lob.buysell == 'B',
                          ['price', 'volume']].sort_values(by='price',
                                                           ascending=False)
        lob_sell = lob.loc[lob.buysell == 'S',
                           ['price', 'volume']].sort_values(by='price',
                                                            ascending=True)

        tr_costs_sell = copy.copy(lob_buy)
        tr_costs_buy = copy.copy(lob_sell)

        tr_costs_sell['costs'] = np.cumsum(
            np.abs(tr_costs_sell.price - price) * tr_costs_sell.volume)
        tr_costs_sell.volume = np.cumsum(tr_costs_sell.volume)

        tr_costs_buy['costs'] = np.cumsum(
            np.abs(tr_costs_buy.price - price) * tr_costs_buy.volume)
        tr_costs_buy.volume = -np.cumsum(tr_costs_buy.volume)

        tr_costs = pd.concat(
            (tr_costs_buy,
             tr_costs_sell)).sort_values(by='volume').reset_index(drop=True)
        tr_costs = tr_costs[['volume', 'costs']]
        if return_data:
            return tr_costs

        try:
            DIV = DIVIDER[key[1]]
        except KeyError:
            DIV = 1
        V = np.array(tr_costs.volume) / DIV
        costs = np.array(tr_costs.costs)

        y = costs
        X = np.stack([V, V**2, V**3, V**4], axis=1)

        model = linear_model.OLS(y, X)
        results = model.fit()

        params = np.array([results.params])
        covparams = results.cov_params()

        column_names = self._transactioncoststable.get_column_names()
        key_names = self._transactioncoststable.get_key()

        row = pd.DataFrame(columns=column_names)
        row[key_names[0]] = [key[0]]
        row[key_names[1]] = [key[1]]
        row[key_names[2]] = [key[2]]
        row['params'] = [params.tolist()]
        row['cov_params'] = [covparams.tolist()]

        transactioncosts = TransactionCosts()
        transactioncosts.set_key(key)
        transactioncosts.set_data(row)

        return transactioncosts
Example no. 12
def get_vif(exog):
    vif = {}
    for col in exog.columns:
        endog = exog.loc[:, col]
        exog_coef = exog.drop([col], axis=1)
        exog_coef = sm.add_constant(exog_coef)
        model = smlr.OLS(endog, exog_coef).fit()
        vif[col] = 1 / (1 - model.rsquared)
    return vif
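A short usage sketch, assuming the aliases used above ("import statsmodels.api as sm" and "import statsmodels.regression.linear_model as smlr") together with numpy and pandas:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
exog = pd.DataFrame({
    "x1": x1,
    "x2": 0.9 * x1 + rng.normal(scale=0.3, size=200),  # correlated with x1
    "x3": rng.normal(size=200),                        # independent noise
})
print(get_vif(exog))  # x1 and x2 should show clearly higher VIFs than x3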
Example no. 13
def regression_affine(X, Y, details=True):
    Xreg = add_constant(X)  # Add a constant to fit an affine model

    model = linear_model.OLS(Y, Xreg)  #Linear regression
    results = model.fit()
    [b, a] = results.params  #parameters of the affine curve
    Yreg = a * X + b  #regression curve

    return Yreg, a, b, results.summary()
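A brief usage sketch, assuming the module-level imports the function relies on (add_constant from statsmodels.tools.tools and linear_model from statsmodels.regression):

import numpy as np

x = np.linspace(0, 10, 50)
y = 2.0 + 1.5 * x + np.random.normal(scale=0.5, size=x.size)

y_reg, a, b, summary = regression_affine(x, y)
print(a, b)  # slope close to 1.5, intercept close to 2.0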
Example no. 14
def calculate_vif(data):
    vif_df = pd.DataFrame(columns=['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y, x).fit().rsquared
        vif = round(1 / (1 - r_squared), 2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by='Vif', axis=0, ascending=False, inplace=False)
def stepwise_selection(X,
                       y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection based on p-values"""
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(
                y, sm.add_constant(pd.DataFrame(X[included +
                                                  [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(
                    worst_feature, worst_pval))
        if not changed:
            break
    return included
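A minimal usage sketch (assumes "import statsmodels.api as sm", numpy and pandas; the data below is synthetic):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 4)), columns=["a", "b", "c", "d"])
y = 2 * X["a"] - 3 * X["c"] + rng.normal(size=200)

selected = stepwise_selection(X, y)
print(selected)  # expected to contain 'a' and 'c' only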
def backwardElimination(X, sl):
    epochs = len(X[0])
    for i in range(0, epochs):
        regressor_OLS = sm.OLS(y, X).fit()
        varWithMaxPValue = max(regressor_OLS.pvalues).astype(float)
        if varWithMaxPValue > sl:
            for j in range(0, epochs - i):
                if (regressor_OLS.pvalues[j].astype(float) == varWithMaxPValue):
                    X = np.delete(X, j, 1)
    print(regressor_OLS.summary())
    return X
Example no. 17
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    print(regressor_OLS.summary())
    return x
Example no. 18
def CAPM(ticker, benchmark, start='2016-09-27'):
    import statsmodels.regression.linear_model as lm
    import statsmodels.tools.tools as ct

    df = make_portfolio([ticker, benchmark], start=start)
    ticker_df = df[ticker].pct_change()[1:]
    bench_df = df[benchmark].pct_change()[1:]
    const = ct.add_constant(bench_df)
    capm = lm.OLS(ticker_df, const)
    results = capm.fit()
    return results.summary()
Example no. 19
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    mdl = regressor_OLS.get_robustcov_results(cov_type='HAC', maxlags=1)
    print(mdl.summary())
    return x
Example no. 20
def get_regression(x, y) -> RegressionResults:

    y = y.values
    x = x.values
    x = add_constant(x)

    # intialize the model
    model = linear_model.OLS(y, x, missing='drop')

    # fit the regression
    fit_result = model.fit()

    return fit_result
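A short usage sketch, assuming the imports used above (add_constant and linear_model from statsmodels) and pandas Series inputs:

import numpy as np
import pandas as pd

x = pd.Series(np.arange(30, dtype=float))
y = 1.0 + 0.5 * x + pd.Series(np.random.normal(scale=0.2, size=30))

result = get_regression(x, y)
print(result.params)  # roughly [1.0, 0.5]: intercept, then slope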
Example no. 21
def backwardelimination(x, sl):
    numvar = len(x[1])
    for i in range(0, numvar):
        regressor_ols = lm.OLS(y, x).fit()
        maxpval = max(regressor_ols.pvalues).astype(float)

        if maxpval > sl:
            for j in range(0, numvar - i):
                if (regressor_ols.pvalues[j].astype(float) == maxpval):
                    x = np.delete(x, j, 1)

    print(regressor_ols.summary())
    return x
Example no. 22
def backwardElimination(x, SL):    
    numVars = len(x[0])    
    temp = np.zeros((50,6)).astype(int)    
    for i in range(0, numVars):        
        regressor_OLS = sm.OLS(y, x.tolist()).fit()        
        maxVar = max(regressor_OLS.pvalues).astype(float)        
        adjR_before = regressor_OLS.rsquared_adj.astype(float)        
        if maxVar > SL:            
            for j in range(0, numVars - i):                
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):                    
                    temp[:,j] = x[:, j]                    
                    x = np.delete(x, j, 1)                    
                    tmp_regressor = sm.OLS(y, x.tolist()).fit()                    
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)                    
                    if (adjR_before >= adjR_after):                        
                        x_rollback = np.hstack((x, temp[:,[0,j]]))                        
                        x_rollback = np.delete(x_rollback, j, 1)     
                        print (regressor_OLS.summary())                        
                        return x_rollback                    
                    else:                        
                        continue    
    print(regressor_OLS.summary())
    return x 
Example no. 23
def cohort_one_regression(configs):

    group_idxs = []

    for idx, fpath in enumerate(
            sorted(glob.glob(os.path.join(configs["metagroup_path"],
                                          "*.csv")))):
        groupname = fpath.split("/")[-1][:-4]
        group_idxs.append((idx, groupname))

    x = []
    y = []

    for fpath in sorted(
            glob.glob(os.path.join(configs["metagroup_path"], "*.csv"))):
        groupname = fpath.split("/")[-1][:-4]
        print("Group: {}".format(groupname))
        with open(fpath, 'r') as fp:
            csv_fp = csv.reader(fp, delimiter=',')
            next(csv_fp)
            for line in csv_fp:
                if line[1] == "":
                    continue
                x.append([(1 if group_idxs[idx][1] == groupname else 0)
                          for idx in range(len(group_idxs))])
                y.extend([float(line[1])])

    y.append(0.9)
    y = np.array(y)
    x.append(np.zeros(len(group_idxs)))
    x = np.array(x)
    print("Size of design matrix: {}".format(x.shape))
    lm = statslm.OLS(y,
                     statstools.add_constant(x),
                     missing='raise',
                     hasconst=None)

    results = lm.fit()

    print("Intercept, Coefficient: {}, p-value: {}".format(
        results.params[0], results.pvalues[0]))

    for idx in range(len(group_idxs)):
        print("Group: {}, Coefficient: {}, p-value: {}".format(
            group_idxs[idx][1], results.params[idx + 1],
            results.pvalues[idx + 1]))
Example no. 24
    def fit(self, x, y, cens, verbose=False):
        """
        Fit a maximum-likelihood Tobit regression
        :param x: Pandas DataFrame (n_samples, n_features): Data
        :param y: Pandas Series (n_samples,): Target
        :param cens: Pandas Series (n_samples,): -1 indicates left-censored samples, 0 for uncensored, 1 for right-censored
        :param verbose: boolean, show info from minimization
        :return:
        """
        x_copy = x.copy()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            init_reg = sm.OLS(y, x_copy).fit()
            b0 = init_reg.params.values
            y_pred = init_reg.predict(x_copy)

        resid = y - y_pred
        resid_var = np.var(resid)
        s0 = np.sqrt(resid_var)
        params0 = np.append(b0, s0)

        #params0 = b0
        xs, ys = split_left_right_censored(x_copy, y, cens)

        result = minimize(
            lambda params: tobit_neg_log_likelihood(xs, ys, params),
            params0,
            method='BFGS',
            jac=lambda params: tobit_neg_log_likelihood_der(xs, ys, params),
            options={'disp': verbose})

        if verbose:
            print(result)
        self.ols_coef_ = b0[1:]
        self.ols_intercept = b0[0]
        if self.fit_intercept:
            self.intercept_ = result.x[1]
            self.coef_ = result.x[1:-1]
        else:
            self.coef_ = result.x[:-1]
            self.intercept_ = 0
        self.sigma_ = result.x[-1]

        self.result = result

        return self
def backwardElimination(x, Y, sl, columns):
    numVars = len(columns.columns)
    numVars2 = len(columns.columns)
    print("\nStarting backward elimination...")
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(endog = Y.astype(float), exog = columns.astype(float)).fit()
        maxVar = float(max(regressor_OLS.pvalues))
        print("Highest P value is: "+str(maxVar))
        if maxVar > sl:
            for j in range(0, numVars2):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    print("Removing column "+str(j))
                    columns = columns.drop(columns.columns[j], axis=1)
                    numVars2 -= 1
        
    print(regressor_OLS.summary())
    return columns
Example no. 26
def polynomial_regression():
    ''' Define the model directly through the design matrix. Similar to MATLAB's "regress" command '''

    # Generate the data
    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # Make the fit. Note that this is another "OLS" than the one in "model_formulas"!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())
Example no. 27
def smart_regression(df, ignore_cols):
    endog = df.iloc[:, 0]
    exog = df.iloc[:, 1:]
    exog = exog.drop(ignore_cols, axis=1)
    vifs = get_vif(exog)
    large_vifs = {k: vifs[k] for k in vifs.keys() if vifs[k] > 10}
    coef = {}

    if len(large_vifs) != 0:
        max_vif = max(large_vifs, key=large_vifs.get)
        ignore_cols.append(max_vif)
        coef = smart_regression(df, ignore_cols)
    else:
        exog = sm.add_constant(exog)
        model = smlr.OLS(endog, exog).fit()
        coef = model.params
    return coef
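A hypothetical usage sketch, assuming the get_vif helper from Example no. 12 is in scope together with the aliases used above ("import statsmodels.api as sm", "import statsmodels.regression.linear_model as smlr"):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x1 = rng.normal(size=150)
df = pd.DataFrame({
    "y":  2 * x1 + rng.normal(size=150),
    "x1": x1,
    "x2": 0.98 * x1 + rng.normal(scale=0.05, size=150),  # nearly collinear with x1
    "x3": rng.normal(size=150),
})
coef = smart_regression(df, ignore_cols=[])
print(coef)  # one of x1/x2 is dropped for its large VIF before the final fit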
Example no. 28
    def apply_OLS(self):

        y_fit = np.log(self.data[self.column_y])

        if not hasattr(self, 'X'):
            self.create_X_variables()

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            model = sm.OLS(y_fit, self.X.astype(float)).fit()
            predictions = model.predict(self.X.astype(float))
            print_model = model.summary()
            prstd, iv_l, iv_u = wls_prediction_std(model)

            # save the OLS information on the instance
            self.fit_ols_predictions = predictions
            self.fit_ols_stats = print_model
            self.fit_ols_iv_l = iv_l
            self.fit_ols_iv_u = iv_u
            self.fit_ols_prstd = prstd  # originally re-assigned fit_ols_stats, overwriting the summary

        return model
Example no. 29
def iteration(e0, depth):
    y_star = y_data
    Wald_iter_df = pd.DataFrame(index=range(depth), columns=y_star.columns)
    for d in range(depth):
        e = e0.sample(frac=1).reset_index(drop=True)
        for i in range(34):
            ei = e.iloc[:, i]
            y_star_lag = pd.Series(index=y_star.index)
            y_star_lag[1:] = y_star.iloc[:-1, i]

            y_star.iloc[1:, i] = ei + Params_df.loc[
                'const', i] + Params_df.loc[0, i] * y_star_lag[1:]  # generate y_star
            # y_star.iloc[0, i] = y_data.iloc[0, i]

            x_lag1 = pd.Series(index=y_star.index)
            y_lag1 = pd.Series(index=y_star.index)
            x_lag1[1:] = X_data.iloc[:-1, i]
            y_lag1[1:] = y_star.iloc[:-1, i]
            y_reg = y_star.iloc[:, i]

            X_exo = pd.concat([y_lag1, x_lag1], axis=1)
            X_exo = lm.add_constant(X_exo)
            model = lm.OLS(y_reg, X_exo, missing='drop')
            res = model.fit()
            R = np.eye(len(res.params))[2]
            # print(res.wald_test(R).fvalue[0][0])
            # Wald_iter_df.iloc[d, i] = res.wald_test(R).fvalue[0][0]

            try:
                wald_i = res.wald_test(R).fvalue[0][0]
            except ValueError:
                Wald_iter_df.iloc[d, i] = np.nan
                print(d, i, "Appear")
                # print(X_exo)
            else:
                Wald_iter_df.iloc[d, i] = wald_i

    return Wald_iter_df
Example no. 30
def polynomial_regression():
    ''' Define the model directly through the design matrix. Similar to MATLAB's "regress" command '''

    # Generate the data

    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)

    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # Make the fit. Note that this is another "OLS" than the one in "model_formulas"!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())

    return res.params  # should be [ 4.74244177,  2.60675788,  2.03793634]