conv_factor = float(2.41e+8)  # moved here to test the regression line on absolute gain
# conv_factor = float(1. / 6)
for i in range(len(Gain)):
    AbsGain.append(Gain[i] * conv_factor)
print('AbsGain ', AbsGain)

for i in range(len(Temp)):
    # regression line for Vb = 28 V - 30 V, 0.5 V steps
    X = sm.add_constant(Volt)
    Y = AbsGain[i]
    y_err = Error[i]
    weights = pd.Series(y_err)
    # NB: statsmodels WLS expects weights proportional to 1/sigma**2;
    # 1/sigma is kept here to preserve the original behaviour.
    wls_model = sm.WLS(Y, X, weights=1 / weights)
    results = wls_model.fit()
    print('results', results.params)
    inter, slo = results.params
    slope.append(slo)
    intercept.append(inter)
    # breakdown voltage = x-intercept of the fitted line
    vbreak = -intercept[i] / slope[i]
    # if (vbreak < 60) or (vbreak > 70): vbreak = 60
    Vbr.append(vbreak)
    Vbrmean = np.mean(Vbr)
    OVlist.append(np.array(Volt) - np.array(Vbr[i]))

Vbr = np.asarray(Vbr)
print('Vbr ', Vbr)
print('OVlist ', OVlist)
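# A minimal, self-contained sketch of the same breakdown-voltage extraction on
# synthetic data (all values here are made up for illustration): the x-intercept
# of the fitted line, -intercept/slope, recovers the assumed breakdown voltage.
import numpy as np
import statsmodels.api as sm

volt = np.arange(28.0, 30.5, 0.5)
true_vbr = 26.0
gain = 2.0e6 * (volt - true_vbr)           # ideal linear gain above breakdown
X = sm.add_constant(volt)
res = sm.WLS(gain, X, weights=np.ones_like(gain)).fit()
inter, slo = res.params
print(-inter / slo)                         # ~26.0, the assumed breakdown voltage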
# Note that `exog` must be a 2-dimensional array with `x` as a column and
# an extra column of ones. Adding this column of ones means you want to fit
# the model `y = a * x + b`; leaving it off means you want to fit the model
# `y = a * x`.
#
# And you have to use the option `cov_type='fixed scale'` to tell
# `statsmodels` that you really have measurement errors with an absolute
# scale. If you do not, `statsmodels` will treat the weights as relative
# weights between the data points and internally re-scale them so that the
# best-fit model will have `chi**2 / ndf = 1`.

exog = sm.add_constant(data["x"])
endog = data["y"]
weights = 1.0 / (data["y_err"] ** 2)

wls = sm.WLS(endog, exog, weights=weights)
results = wls.fit(cov_type="fixed scale")
print(results.summary())

# ### Check against scipy.optimize.curve_fit

# You can use `scipy.optimize.curve_fit` to get the best-fit parameters
# and parameter errors.
from scipy.optimize import curve_fit


def f(x, a, b):
    return a * x + b


xdata = data["x"]
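# The snippet stops right after defining `xdata`. A plausible completion of the
# cross-check (an assumption, not part of the original): pass the absolute
# errors via `sigma` with `absolute_sigma=True`, so the parameter errors match
# the fixed-scale WLS fit above.
import numpy as np

ydata = data["y"]
popt, pcov = curve_fit(f, xdata, ydata,
                       sigma=data["y_err"],      # absolute measurement errors
                       absolute_sigma=True)      # do not rescale to chi2/ndf = 1
perr = np.sqrt(np.diag(pcov))
print("a, b =", popt, "+/-", perr)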
if i == 0:
    # Replace one outlying gain point (index G_ign) by the average of its
    # neighbours before fitting; it is restored below.
    G_ign = 7
    Gainsave = Gain[0, G_ign]
    Gain[0, G_ign] = (Gain[0, G_ign - 1] + Gain[0, G_ign + 1]) / 2
'''
# regression line for Gain (disabled)
X = Volt[regrleft:regrright:]
X = sm.add_constant(X)
Y = Gain[i][regrleft:regrright:]
y_err = Error[i][regrleft:regrright:]
weights1 = pd.Series(y_err)
wls_model1 = sm.WLS(Y, X, weights=1 / weights1)
results1 = wls_model1.fit()
print('results ', results1.params)
inter1, slo1 = results1.params
slope.append(slo1)
intercept.append(inter1)
vbreak1 = -intercept[i] / slope[i]
# vbreak1 = 64.2
Vbr.append(vbreak1)
'''
if i == 0:
    Gain[0, G_ign] = Gainsave
def EM(data, number_of_components, debug=False):
    """EM for a mixture of linear regressions (one WLS fit per component)."""
    N, D = np.shape(data)
    D = D - 1  # last column is the target
    X, Y = data[:, :-1], data[:, -1]
    if D == 1:
        X = X.reshape((N, 1))
        Y = Y.reshape((N, 1))
    if debug:
        print("N = ", N, " D = ", D, " K = ", number_of_components)
        print("Data size: ", data.shape)
        print("X size: ", X.shape)
        print("Y size: ", Y.shape)

    if len(data) > MAX_DF_SIZE:
        # Initialise on a random subsample to keep EM_init cheap.
        n = min(len(data), number_of_components * 1000)
        sampled_data = data[np.random.randint(data.shape[0], size=n), :]
        priors, mu, sigma, coefficients, y_sigma = EM_init(
            sampled_data, number_of_components)
    else:
        priors, mu, sigma, coefficients, y_sigma = EM_init(
            data, number_of_components)
    if debug:
        print("Priors0 ", priors.shape, " =\n", priors)
        print("Mu0 ", mu.shape, " =\n", mu)
        print("Sigma0 ", sigma.shape, " =\n", sigma)
        print("Coefficients0 ", coefficients.shape, " =\n", coefficients)
        print("Ysigma0 ", y_sigma.shape, " =\n", y_sigma)

    gamma = np.ndarray(shape=(N, number_of_components))
    min_value = sys.float_info.min
    max_value = sys.float_info.max
    loglikelihood_threshold = 1e-10
    old_loglikelihood = -1 * max_value
    loglikelihood = 0
    iteration = 1
    while True:
        # Expectation: responsibilities under the current parameters.
        for k in range(number_of_components):
            mu_large = np.append(mu[k, :], 0)
            sigma_large = np.insert(np.insert(sigma[k, :, :], D, 0, axis=1),
                                    D, 0, axis=0)
            sigma_large[-1, -1] = y_sigma[k]
            # Temporarily replace y by the regression residual of component k.
            keep_y = np.copy(data[:, -1])
            data[:, -1] -= coefficients[k, 0] + np.dot(data[:, :-1],
                                                       coefficients[k, 1:])
            gamma[:, k] = priors[k] * scipy.stats.multivariate_normal.pdf(
                data, mu_large, sigma_large, allow_singular=True)
            data[:, -1] = np.copy(keep_y)
        denominator = np.sum(gamma, axis=1)  # CHECK
        denominator = denominator.reshape((N, 1))
        denominator = np.where(denominator < min_value, min_value, denominator)
        gamma = gamma / denominator
        if debug:
            print("GAMMA ~ min: {}, max: {}".format(np.min(gamma),
                                                    np.max(gamma)))
        n_component = np.sum(gamma, axis=0)
        n_component = n_component.reshape((number_of_components, 1))

        # Maximization: priors, means, covariances, per-component WLS fits.
        for k in range(number_of_components):
            priors[k] = n_component[k] / N
            mu[k, :] = np.dot(gamma[:, k], X) / n_component[k]
            sigma[k, :, :] = np.matmul(
                (X - mu[k]).T, gamma[:, k, np.newaxis] * (X - mu[k])
            ) / n_component[k] + 0.00001 * np.diag(np.diag(np.ones((D, D))))
            # The responsibilities act as the WLS weights.
            model = sm.WLS(Y, sm.add_constant(X), weights=gamma[:, k])
            res = model.fit()
            coefficients[k, :] = res.params
            y_sigma[k] = np.dot(
                gamma[:, k, np.newaxis].T,
                np.power(Y - np.dot(coefficients[k, :],
                                    sm.add_constant(X).T)[:, np.newaxis], 2)
            ) / n_component[k]  # SQRT?
        if debug:
            print("\t Updated all parameters!")

        # Log-likelihood of the data under the updated model.
        new_gamma = np.ndarray(shape=(N, number_of_components))
        for k in range(number_of_components):
            mu_large = np.append(mu[k, :], 0)
            sigma_large = np.insert(np.insert(sigma[k, :, :], D, 0, axis=1),
                                    D, 0, axis=0)
            sigma_large[-1, -1] = y_sigma[k]
            keep_y = np.copy(data[:, -1])
            data[:, -1] -= coefficients[k, 0] + np.dot(data[:, :-1],
                                                       coefficients[k, 1:])
            new_gamma[:, k] = scipy.stats.multivariate_normal.pdf(
                data, mu_large, sigma_large,
                allow_singular=True)  # CHECK * priors[k]
            data[:, -1] = np.copy(keep_y)
        if debug:
            print("\t Computed New Gamma!")
        probs = np.dot(new_gamma, priors)
        probs = np.where(probs < min_value, min_value, probs)
        probs = np.reshape(probs, (N, 1))
        if debug:
            print("\t Computed probs!")
        loglikelihood = np.mean(np.log10(probs), 0)
        ret_ll = np.sum(np.log(probs))
        if np.absolute((loglikelihood / old_loglikelihood) - 1) < loglikelihood_threshold:
            break
        iteration += 1
        if iteration > 1000:
            break
        old_loglikelihood = loglikelihood
    return priors, mu, sigma, coefficients, y_sigma, ret_ll
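# A hedged usage sketch of EM() above (illustrative only; EM_init,
# MAX_DF_SIZE, and the module imports are assumed to exist as in the
# surrounding file): two noisy lines, stacked as [x, y] rows.
import numpy as np

rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, size=(400, 1))
y = np.where(x[:, 0] > 0, 2 * x[:, 0] + 1, -3 * x[:, 0]) + 0.1 * rng.randn(400)
data = np.column_stack([x, y])            # last column is the target
priors, mu, sigma, coef, y_sigma, ll = EM(data, number_of_components=2)
print(coef)                               # one (intercept, slope) row per component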
def setup(self):
    # fit for each test, because results will be changed by the tests
    x = self.exog
    np.random.seed(987689)
    y = x.sum(1) + np.random.randn(x.shape[0])
    self.results = sm.WLS(y, self.exog, weights=np.ones(len(y))).fit()
def set(self, endog, exog=None):
    self.model = sm.WLS(endog, exog=exog, **self.rmodelparams)
def factor_ret(self):
    weight = self.cap ** 0.5
    weight = 1 / weight
    weight_tr = weight.loc[:, self.today:self.today]
    df_weight = weight_tr.unstack()
    self.rtn = []
    self.pct = []
    self.expos = []
    ret_tr = self.data.iloc[self.today:self.today, :, 6]
    self.df_ret = ret_tr.unstack()
    self.tot_ret = self.ret.loc["ret_pct", self.today]
    if len(ret_tr) and len(self.bigsize) and len(self.medsize) \
            and len(self.retv) and len(self.turn) and len(self.wgt_rt) \
            and len(self.halpha) and len(self.increase) and len(self.EY) \
            and len(self.ROE) and len(self.BLEV) and len(self.B800) \
            and len(self.R800) and len(self.KDJ):
        style_factors = [self.df_bigsize, self.df_medsize, self.df_retv,
                         self.df_turn, self.df_wgt_rt, self.df_halpha,
                         self.df_increase, self.df_EY, self.df_ROE,
                         self.df_BLEV, self.df_B800, self.df_R800,
                         self.df_KDJ]
        df_regression = pd.DataFrame(
            [self.df_ret.values] + [f.values for f in style_factors],
            index=['ret', 'bigsize', 'medsize', 'retv', 'turn', 'wgt_rt',
                   'halpha', 'increase', 'EY', 'ROE', 'BLEV', 'B800',
                   'R800', 'KDJ'])
        df_regression = df_regression.T
        for i in range(0, len(self.Industry)):
            df_regression[self.Industry[i].unstack().T.index.name] = \
                self.Industry[i].values
        df_regression['weight'] = df_weight.values
        df_regression = df_regression.dropna()
        y = df_regression.iloc[:, 0].tolist()
        temp = []
        for i in range(1, df_regression.shape[1] - 1):
            temp.append(df_regression.iloc[:, i].tolist())
        X = np.column_stack(temp)
        W = df_regression.iloc[:, -1].tolist()
        # `weights` is the WLS keyword; the original passed `weight=W`,
        # which statsmodels does not accept.
        fit = sm.WLS(y, X, weights=W).fit()
        self.pfl_W = self.hold.iloc[:, self.today:self.today]
        if not self.pfl_W.empty:
            # Exposure, return contribution, and share of total return for
            # each style factor; the 13 parameter slots follow the column
            # order used to build df_regression above.
            for k, df_factor in enumerate(style_factors):
                exp_k = np.dot(self.pfl_W.iloc[:, 0].values,
                               df_factor.fillna(0).values)
                ret_k = exp_k * fit.params[k]
                pct_k = ret_k / self.tot_ret
                self.rtn.append(ret_k)
                self.pct.append(pct_k)
                self.expos.append(exp_k)
            # Industry dummies occupy the remaining parameter slots.
            for i in range(0, len(self.Industry)):
                exp_tr = np.dot(self.pfl_W.iloc[:, 0].values,
                                self.Industry[i].fillna(0).values)
                ret_tr = exp_tr * fit.params[i + 13]
                pct_tr = ret_tr / self.tot_ret
                self.rtn.append(ret_tr)
                self.pct.append(pct_tr)
                self.expos.append(exp_tr)
            self.result = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\rtn.xlsx",
                sheetname=0, index_col=0)
            self.percentage = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\pct.xlsx",
                sheetname=0, index_col=0)
            self.exposure = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\expos.xlsx",
                sheetname=0, index_col=0)
            self.result[format(self.today, "%Y-%m-%d")] = self.rtn
            self.result.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\rtn.xlsx")
            self.percentage[format(self.today, "%Y-%m-%d")] = self.pct
            self.percentage.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\pct.xlsx")
            self.exposure[format(self.today, "%Y-%m-%d")] = self.expos
            self.exposure.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\expos.xlsx")
        # (excerpt; the `if` matching the `else` below precedes this fragment)
        this_stock_factor_list = []
        i = 0
        while i < len(factor_list):
            this_stock_factor_list.append(factor_dict[(stock, date)][i])
            i += 1
        i = 0
        while i < len(this_stock_dummy_list):
            this_stock_factor_list.append(this_stock_dummy_list[i])
            i += 1
        this_whole_stock_factors_list.append(this_stock_factor_list)
    else:
        pass

X = sm.add_constant(this_whole_stock_factors_list)
wls_model = sm.WLS(ROR_list, X, weights=sqrt_liquid_list)
results = wls_model.fit()
U_list_temp = results.resid                # residuals, i.e. the idiosyncratic-factor series
T_value_list_temp = results.tvalues        # t-value of each factor in the model
R_squared_temp = results.rsquared          # model R-squared
R_squared_adj_temp = results.rsquared_adj  # adjusted R-squared
f_list_temp = results.params               # model parameters, i.e. the factor-return series
U_list.append(U_list_temp)
T_value_list.append(T_value_list_temp)
R_squared.append(R_squared_temp)
R_squared_adj.append(R_squared_adj_temp)
f_list.append(f_list_temp)
U2_list_temp = xyk_common_data_processing.element_cal_between_list(
    U_list_temp, U_list_temp, "*")
WU2_list_temp = xyk_common_data_processing.element_cal_between_list(
    U2_list_temp, sqrt_liquid_list, "*")
def analyze(df, save=False):
    pca_parms = ['mo_t_pi', 'mo_t_ds', 'mo_t_dz', 'mo_t_sz']
    pca = PCA(n_components=len(pca_parms))
    pca.fit(df[pca_parms])
    print(pca.explained_variance_ratio_)
    Xbar = df[pca_parms]
    # Equivalent to pca.transform(Xbar) up to centering:
    Xbar = np.dot(Xbar, pca.components_.T)
    for i in range(len(pca_parms)):
        df['x' + str(i)] = Xbar[:, i]

    parms = ['mo_n_3d', 'mo_n_2ppi', 'mo_n_2pz', 'Us', 'x0', 'x1']  # ,'x2','x3']
    X = sm.add_constant(df[parms])
    y = df['energy']
    ols = sm.OLS(y, X).fit()
    print(ols.summary())
    print(ols.params)
    pca_params = list(ols.params[-2:].values) + [0, 0]
    # Map the PCA-space coefficients back onto the original descriptors.
    reg_params = np.dot(pca_params, pca.components_)
    print(pca_params)
    print(reg_params)
    exit(0)  # everything below is unreachable scratch work

    df['resid'] = df['energy'] - ols.predict()
    sns.pairplot(df, vars=['resid', 'x0', 'x1', 'x2', 'x3'])
    plt.show()
    exit(0)

    # Boltzmann-weighted fits: low-energy samples get more weight.
    beta = 2

    def wls_fit(model):
        X = sm.add_constant(df[model])
        y = df['energy']
        return sm.WLS(y, X, weights=np.exp(-beta * (y - min(y)))).fit()

    # Each variant adds one hopping term; its coefficient goes into the
    # corresponding off-diagonal slot passed to diagonalize().
    for extra, slot in [('mo_t_pi', 0), ('mo_t_dz', 1), ('mo_t_sz', 2),
                        ('mo_t_ds', 3)]:
        ols = wls_fit(['mo_n_4s', 'mo_n_2ppi', 'mo_n_2pz', extra, 'Us'])
        params = ols.params[1:]
        offd = [0, 0, 0, 0, 0]
        offd[slot] = params[-2]
        ed = diagonalize(list(params[:-2]) + offd + [params[-1]])
        print(ols.summary())
        print(ed)

    # Baseline model without any hopping term.
    ols = wls_fit(['mo_n_4s', 'mo_n_2ppi', 'mo_n_2pz', 'Us'])
    params = ols.params[1:]
    params = list(params[:3]) + [0] + list(params[3:-1]) + [0, 0, 0, 0,
                                                            params[-1]]
    ed = diagonalize(params)
    print(ols.summary())
    print(ed)
    exit(0)
# plt.ylabel(r'$\frac{dj}{dE}$')
# plt.savefig('standard_specter.png')
# plt.show()
#
# plt.plot(x1, x2, x1, x3)
# plt.legend(['top layer', 'core'], loc=4)
# plt.grid()
# plt.xlabel('energy')
# plt.ylabel(r'$\frac{dj}{dE}$')
# plt.savefig('meritve_specter.png')
# plt.show()

mod_wls = sm.WLS(x3, A)
res_wls = mod_wls.fit()
parametri = res_wls.params
print(res_wls.summary())
# NB: this difference uses the parameters from the x3 fit against x2.
razlika1 = x2 - np.dot(A, parametri)

mod_wls = sm.WLS(x2, A)
res_wls = mod_wls.fit()
parametri = res_wls.params
def fit_pspec(self, brk=None, log_break=False, low_cut=None,
              high_cut=None, min_fits_pts=10, weighted_fit=False,
              bootstrap=False, bootstrap_kwargs={}, verbose=False):
    '''
    Fit the 1D Power spectrum using a segmented linear model. Note that
    the current implementation allows for only 1 break point in the
    model. If the break point is estimated via a spline, the breaks are
    tested, starting from the largest, until the model finds a good fit.

    Parameters
    ----------
    brk : float or None, optional
        Guesses for the break points. If given as a list, the length of
        the list sets the number of break points to be fit. If a choice
        is outside of the allowed range from the data, Lm_Seg will raise
        an error. If None, a spline is used to estimate the breaks.
    log_break : bool, optional
        Sets whether the provided break estimates are log-ed (base 10)
        values. This is disabled by default. When enabled, the brk must
        be a unitless `~astropy.units.Quantity`
        (`u.dimensionless_unscaled`).
    low_cut : `~astropy.units.Quantity`, optional
        Lowest frequency to consider in the fit.
    high_cut : `~astropy.units.Quantity`, optional
        Highest frequency to consider in the fit.
    min_fits_pts : int, optional
        Sets the minimum number of points needed to fit. If not met, the
        break found is rejected.
    weighted_fit : bool, optional
        Fit using weighted least-squares. The weights are the
        inverse-squared standard deviations in each radial bin.
    bootstrap : bool, optional
        Bootstrap using the model residuals to estimate the parameter
        standard errors. This tends to give more realistic intervals
        than the covariance matrix.
    bootstrap_kwargs : dict, optional
        Pass keyword arguments to
        `~turbustat.statistics.fitting_utils.residual_bootstrap`.
    verbose : bool, optional
        Enables verbose mode in Lm_Seg.
    '''

    self._bootstrap_flag = bootstrap

    # Make the data to fit to
    if low_cut is None:
        # Default to the largest frequency, since this is just 1 pixel
        # in the 2D PSpec.
        self.low_cut = 1. / (0.5 * float(max(self.ps2D.shape)) * u.pix)
    else:
        self.low_cut = self._to_pixel_freq(low_cut)

    if high_cut is None:
        self.high_cut = self.freqs.max().value / u.pix
    else:
        self.high_cut = self._to_pixel_freq(high_cut)

    x = np.log10(self.freqs[clip_func(self.freqs.value, self.low_cut.value,
                                      self.high_cut.value)].value)

    clipped_ps1D = self.ps1D[clip_func(self.freqs.value, self.low_cut.value,
                                       self.high_cut.value)]
    y = np.log10(clipped_ps1D)

    if weighted_fit:
        clipped_stddev = self.ps1D_stddev[clip_func(self.freqs.value,
                                                    self.low_cut.value,
                                                    self.high_cut.value)]
        clipped_stddev[clipped_stddev == 0.] = np.NaN
        # Error propagation into log10 space: d(log10 y) = dy / (y ln 10).
        y_err = 0.434 * clipped_stddev / clipped_ps1D

    if brk is not None:
        # Try the fit with a break in it.
        if not log_break:
            brk = self._to_pixel_freq(brk).value
            brk = np.log10(brk)
        else:
            # A value given in log shouldn't have dimensions
            if hasattr(brk, "unit"):
                assert brk.unit == u.dimensionless_unscaled
                brk = brk.value

        if weighted_fit:
            weights = 1 / y_err**2
        else:
            weights = None

        brk_fit = Lm_Seg(x, y, brk, weights=weights)
        brk_fit.fit_model(verbose=verbose, cov_type='HC3')

        if brk_fit.params.size == 5:
            # Check to make sure this leaves enough to fit to.
            if sum(x < brk_fit.brk) < min_fits_pts:
                warnings.warn("Not enough points to fit to. Ignoring break.")
                self._brk = None
            else:
                good_pts = x.copy() < brk_fit.brk
                x = x[good_pts]
                y = y[good_pts]

                self._brk = 10**brk_fit.brk / u.pix
                self._slope = brk_fit.slopes

                if bootstrap:
                    stderrs = residual_bootstrap(brk_fit.fit,
                                                 **bootstrap_kwargs)
                    self._slope_err = stderrs[1:-1]
                    self._brk_err = np.log(10) * self.brk.value * \
                        stderrs[-1] / u.pix
                else:
                    self._slope_err = brk_fit.slope_errs
                    self._brk_err = np.log(10) * self.brk.value * \
                        brk_fit.brk_err / u.pix

                self.fit = brk_fit.fit
                self._model = brk_fit
        else:
            self._brk = None
            # Break fit failed, revert to normal model
            warnings.warn("Model with break failed, reverting to model "
                          "without break.")
    else:
        self._brk = None
        self._brk_err = None

    if self.brk is None:
        x = sm.add_constant(x)
        if weighted_fit:
            model = sm.WLS(y, x, missing='drop', weights=1 / y_err**2)
        else:
            model = sm.OLS(y, x, missing='drop')

        self.fit = model.fit(cov_type='HC3')

        self._slope = self.fit.params[1]

        if bootstrap:
            stderrs = residual_bootstrap(self.fit, **bootstrap_kwargs)
            self._slope_err = stderrs[1]
        else:
            self._slope_err = self.fit.bse[1]
def WLS_regression(x, y, w):
    # Weighted least-squares regression.
    # regr.tvalues: t-values; regr.resid: residuals; regr.params: betas;
    # regr.t_test([1, 0]) tests a linear restriction on the parameters.
    X = sm.add_constant(x)
    regr = sm.WLS(y, X, weights=w).fit()
    return regr
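# A hedged usage sketch of the helper above (the data here are made up for
# illustration; `sm` and numpy come from the surrounding module):
import numpy as np

x = np.linspace(0, 10, 50)
y = 3.0 + 0.7 * x + np.random.randn(50)
w = np.ones_like(x)                 # uniform weights reduce WLS to OLS
regr = WLS_regression(x, y, w)
print(regr.params)                  # [intercept, slope]
print(regr.t_test([0, 1]))          # t-test on the slope coefficient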
def signal_factor_test(data, current_price, market_values):
    """
    Single-factor test: regression and IC.

    :param data: time + industry (column name `industry`) + factors +
        stock return (y)
    :param current_price: per-stock float market value, used as the WLS
        weight
    :param market_values: market-cap factor, one of the regressors in
        the IC regression
    :return: table_reg: regression-test results for the factors
             table_ic: IC analysis of the factors
    """
    # 1. Data preparation
    industy = pd.get_dummies(data.iloc[:, 1])  # one-hot industry dummies
    # Extract the regression target y; it is re-appended after merging the
    # dummies so that the dependent variable stays in the last column.
    y = DataFrame(data.iloc[:, -1])
    yname = y.columns.values
    newdata = data.drop(["industry", yname[0]], axis=1)  # drop industry and y
    # time + factors + industry dummies
    newdata = pd.merge(newdata, industy, left_index=True, right_index=True,
                       how="outer")
    # Data for the WLS regression: time + factors + dummies + y (+ weight)
    newdata_reg = pd.merge(newdata, y, left_index=True, right_index=True,
                           how="outer")
    newdata_wls = pd.merge(newdata_reg, current_price, left_index=True,
                           right_index=True, how="outer")
    # Data for the IC regression: time + factors + dummies + market cap + y
    newdata_mfv = pd.merge(newdata, market_values, left_index=True,
                           right_index=True, how="outer")
    newdata_ic = pd.merge(newdata_mfv, y, left_index=True, right_index=True,
                          how="outer")

    # 2. Fit the regressions, collect results, build the output tables
    time = data.iloc[:, 0].unique()  # all cross-section dates
    fnum = data.shape[1] - 3         # number of factors
    table_reg = DataFrame(np.zeros((1, 7)),
                          columns=["factor", "mean |t|", "share |t|>2",
                                   "mean t", "mean t / std t",
                                   "mean factor return",
                                   "t-test of factor returns"])
    table_ic = DataFrame(np.zeros((1, 6)),
                         columns=["factor", "IC mean", "IC std", "IR",
                                  "share IC>0", "share |IC|>0.02"])
    fig, ax = plt.subplots(1, 2)
    for i in range(fnum):
        # Reset the per-factor accumulators.
        tlist = []   # t-values
        rlist = []   # factor returns
        iclist = []  # IC values
        for j in range(len(time)):
            # WLS: one cross-section (stocks + factor + dummies + y + weight)
            newdata2 = newdata_wls[newdata_wls.iloc[:, 0].isin([time[j]])]
            # IC OLS: one cross-section (stocks + factor + dummies + cap + y)
            newdata3 = newdata_ic[newdata_ic.iloc[:, 0].isin([time[j]])]
            # Regressor names: industry dummies plus the current factor.
            col = list(industy.columns.values)
            factor_data = DataFrame(newdata2.iloc[:, i + 1])
            global factor_name
            factor_name = list(factor_data.columns.values)[0]
            col.append(factor_name)
            # WLS regression
            y = newdata2.iloc[:, -2]
            x = sm.add_constant(newdata2.loc[:, col])  # add intercept
            reg = sm.WLS(y, x, weights=newdata2.iloc[:, -1])
            model = reg.fit()
            # IC regression (no weights, so this WLS is effectively OLS)
            col.append(list(DataFrame(market_values).columns.values)[0])
            x_ic = sm.add_constant(newdata3.loc[:, col])
            reg_ic = sm.WLS(y, x_ic)
            model_ic = reg_ic.fit()
            # Collect the WLS results
            tvalues = DataFrame(model.tvalues).iloc[-1, :]
            weight = DataFrame(model.params)
            tlist.append(DataFrame(model.tvalues).iloc[-1, :])
            rlist.append(weight.iloc[-1, :])
            # Collect the IC results
            iclist.append(np.sqrt(1 - model_ic.rsquared))
        # Aggregate the WLS results
        tarr = np.array(tlist)
        rarr = np.array(rlist)
        table = {
            "factor": factor_name,
            "mean |t|": np.mean(np.abs(tarr)),
            "share |t|>2": list(np.where(np.abs(tarr) > 2, 1, 0)).count(1) / len(time),
            "mean t": np.mean(tarr),
            "mean t / std t": np.mean(tarr) / np.std(tarr),
            "mean factor return": np.mean(rarr),
            # NB: despite the label, the original stores the std of the
            # factor-return series here, not a t statistic.
            "t-test of factor returns": np.std(rarr)
        }
        table_reg0 = DataFrame(table,
                               columns=["factor", "mean |t|", "share |t|>2",
                                        "mean t", "mean t / std t",
                                        "mean factor return",
                                        "t-test of factor returns"],
                               index=[i + 1])
        ax[0].plot(time, rarr.cumsum(), label=factor_name)
        table_reg = pd.concat([table_reg, table_reg0])  # append row
        # Aggregate the IC results
        icarr = np.array(iclist)
        table1 = {
            "factor": factor_name,
            "IC mean": np.mean(icarr),
            "IC std": np.std(icarr),
            "IR": np.mean(icarr) / np.std(icarr),
            "share IC>0": list(np.where(icarr > 0, 1, 0)).count(1) / len(icarr),
            "share |IC|>0.02": list(np.where(np.abs(icarr) > 0.02, 1, 0)).count(1) / len(icarr)
        }
        table_ic0 = DataFrame(table1,
                              columns=["factor", "IC mean", "IC std", "IR",
                                       "share IC>0", "share |IC|>0.02"],
                              index=[i + 1])
        table_ic = pd.concat([table_ic, table_ic0])  # append row
        ax[1].plot(time, icarr.cumsum(), "b-")
    table_reg = table_reg.iloc[1:, :].set_index("factor")
    table_ic = table_ic.iloc[1:, :].set_index("factor")
    plt.legend(loc="upper center")
    plt.show()
    return table_reg, table_ic
def __init__(self, x, y, w):
    self.x = x
    self.y = y
    self.w = w
    self.model = sm.WLS(y, sm.add_constant(x), weights=self.w)
    self.fit = self.model.fit()
        dS = np.min(np.stack(
            [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0)
        d = np.stack([dY, dQ, dS])
        # Widen the windows until at least 100 observations fall inside.
        if n > 100:
            hh = np.repeat(h[:, None], n, axis=1)
            bW = False
            while not bW:  # `~bW` in the original; `not` is the safe spelling
                bW = np.min(np.sum((hh - d) > 0, axis=1)) > 100
                hh = hh * 1.1 if not bW else hh
        else:
            htemp = np.max(d, axis=1) * 1.1
            hh = np.repeat(htemp[:, None], n, axis=1)
        # Tricube weights in each dimension, combined multiplicatively.
        w = (1 - (d / hh)**3)**3
        w[w < 0] = 0
        wAll = w[0] * w[1] * w[2]
        ind = np.where(wAll > 0)[0]
        ww = wAll[ind]
        # fit WLS
        Y = df1.iloc[ind][code].values
        X = df1.iloc[ind][xVarLst].values
        model = sm.WLS(Y, X, weights=ww).fit()
        xp = df2.loc[t][xVarLst].values
        yp = model.predict(xp)[0]
        dfYP.loc[t][code] = np.exp(yp) - sn
    t1 = time.time()
    print(k, siteNo, code, t1 - t0)
saveName = os.path.join(dirOut, siteNo)
dfYP.to_csv(saveName)
    data_day_dummies, how='left', left_index=True, right_index=True,
    sort=False)  # (continuation of a merge/join call from the preceding code)
industry_t = list(data_day_style_t.loc[loc_t, 'INDUSTRY'].unique())
columns_t = industry_t + style
x = data_day_style_t.loc[loc_t, columns_t].values
X = sm.add_constant(x)
# notice: if a second identical loop is written, this line must be changed
# (specific_risk_NW)
y = data_day_style_t.loc[loc_t, 'specific_risk_raw'].values
Y = np.log(y)
stock_weights = data_day_style_t.loc[loc_t, 'WEIGHT'].values
stock_weights = np.sqrt(stock_weights)
wls_model = sm.WLS(Y, X, weights=stock_weights)  # Notice: stock_weights
wls_results = wls_model.fit()
params_t = wls_results.params

x_predict = data_day_style_t.loc[~loc_t, columns_t].values
X_predict = sm.add_constant(x_predict)
Y_predict = np.mat(X_predict) * np.mat(params_t.reshape(-1, 1))
y_predict = np.array(Y_predict.T)[0]
y_predict = E_0 * np.exp(y_predict)

data_day_style_t.loc[loc_t, 'specific_risk_SM_1'] = data_day_style_t.loc[
    loc_t, 'specific_risk_raw']
specific_risk_t = data_day_style_t.loc[
    ~loc_t, 'coordination_coef'].values * data_day_style_t.loc[
    ~loc_t, 'specific_risk_raw'].values
# wh3 = sm.stats.diagnostic.het_white(res3.resid, res3.model.exog)

########## Linearity
ln1 = sm.stats.diagnostic.linear_reset(res1, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)
ln2 = sm.stats.diagnostic.linear_reset(res2, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)
ln3 = sm.stats.diagnostic.linear_reset(res3, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)

##################### Correcting the autocorrelation
x = sm.add_constant(x)
y1 = dados['Taxa de Transmissão -B1']
y2 = dados['Taxa de Transmissão -B2']
wls_model = sm.WLS(y1, x, weights=list(range(1, 99)))
results = wls_model.fit()
wls_model2 = sm.WLS(y2, x, weights=list(range(1, 99)))
results2 = wls_model2.fit()

############### Tests
########### Heteroskedasticity
bp1 = sm.stats.diagnostic.het_breuschpagan(results.resid, results.model.exog)
bp2 = sm.stats.diagnostic.het_breuschpagan(results2.resid, results2.model.exog)
############# Autocorrelation
bg1 = sm.stats.diagnostic.acorr_breusch_godfrey(results)
bg2 = sm.stats.diagnostic.acorr_breusch_godfrey(results2)  # was assigned to bg1 twice, an apparent typo
def estime_income(hhcat, finalhhframe):
    '''
    Estimates income brought by each category of adults/elderly. The
    objective is not to have a good model of income but rather to find a
    starting point for making the income grow based on the household's
    composition. If the income of the unemployed or elderly is found
    negative, it is set equal to zero and we re-estimate. If the
    coefficients are not significant, we try different categories (by
    grouping existing categories) and keep the new coefficients only if
    they become significant. We ignore the richest 5%.
    '''
    select = finalhhframe.Y < float(
        perc_with_spline(finalhhframe.Y,
                         finalhhframe.weight * finalhhframe.nbpeople, 0.95))
    cat_cols = ['cat1workers', 'cat2workers', 'cat3workers', 'cat4workers',
                'cat5workers', 'cat6workers', 'cat7workers', 'old']
    # `.loc` replaces the deprecated `.ix` used in the original.
    X = finalhhframe.loc[select, cat_cols].copy()
    w = finalhhframe.loc[select, 'weight'].copy()
    w[w == 0] = 10**(-10)
    Y = finalhhframe.loc[select, 'Y'] * finalhhframe.loc[select, 'nbpeople']
    result = sm.WLS(Y, X, weights=1 / w).fit()
    inc = result.params
    nonworkers = inc[['cat7workers', 'old']].copy()
    negs = nonworkers[nonworkers < 0].index
    if len(negs) > 0:
        # Drop the negative categories, re-estimate, and zero them out.
        X.drop(negs.values, axis=1, inplace=True)
        result = sm.WLS(Y, X, weights=1 / w).fit()
        inc = result.params
        for ii in negs:
            inc[ii] = 0
    a = result.pvalues
    nonsign1 = a[a > 0.05].index
    nonsign2 = []
    nonsign3 = []
    if len(nonsign1) > 0:
        # Regroup by sector: services / agriculture / manufacturing.
        X = finalhhframe.loc[select, cat_cols].copy()
        X['serv'] = X['cat1workers'] + X['cat2workers']
        X['ag'] = X['cat3workers'] + X['cat4workers']
        X['manu'] = X['cat5workers'] + X['cat6workers']
        X.drop(['cat1workers', 'cat2workers', 'cat3workers', 'cat4workers',
                'cat5workers', 'cat6workers'], axis=1, inplace=True)
        result3 = sm.WLS(Y, X, weights=1 / w).fit()
        a3 = result3.pvalues
        nonsign3 = a3[a3 > 0.05].index
        if len(nonsign3) == 0:
            inctemp = result3.params
            inc['cat2workers'] = inctemp['serv']
            inc['cat4workers'] = inctemp['ag']
            inc['cat6workers'] = inctemp['manu']
            inc['cat1workers'] = inctemp['serv']
            inc['cat3workers'] = inctemp['ag']
            inc['cat5workers'] = inctemp['manu']
        else:
            # Regroup by skill level instead.
            X = finalhhframe.loc[select, cat_cols].copy()
            X['skilled'] = X['cat2workers'] + X['cat4workers'] + X['cat6workers']
            X['unskilled'] = X['cat1workers'] + X['cat3workers'] + X['cat5workers']
            X.drop(['cat1workers', 'cat2workers', 'cat3workers',
                    'cat4workers', 'cat5workers', 'cat6workers'],
                   axis=1, inplace=True)
            result2 = sm.WLS(Y, X, weights=1 / w).fit()
            a2 = result2.pvalues
            nonsign2 = a2[a2 > 0.05].index
            # NB: parenthesised explicitly; the original relied on `|`/`&`
            # binding tighter than `==`/`<`, which inverts the intended test.
            if (len(nonsign2) == 0) | ((len(nonsign2) < len(nonsign1)) &
                                       (len(nonsign2) < len(nonsign3))):
                inctemp = result2.params
                inc['cat2workers'] = inctemp['skilled']
                inc['cat4workers'] = inctemp['skilled']
                inc['cat6workers'] = inctemp['skilled']
                inc['cat1workers'] = inctemp['unskilled']
                inc['cat3workers'] = inctemp['unskilled']
                inc['cat5workers'] = inctemp['unskilled']
            else:
                if (len(nonsign3) < len(nonsign1)) & (len(nonsign3) < len(nonsign2)):
                    inctemp = result3.params
                    inc['cat2workers'] = inctemp['serv']
                    inc['cat4workers'] = inctemp['ag']
                    inc['cat6workers'] = inctemp['manu']
                    inc['cat1workers'] = inctemp['serv']
                    inc['cat3workers'] = inctemp['ag']
                    inc['cat5workers'] = inctemp['manu']
    return inc
def match(self, obj=None, cat=None, sr=5./3600, verbose=False, predict=True,
          ra=None, dec=None, x=None, y=None, mag=None, magerr=None,
          flags=None, filter_name='V', order=4, bg_order=None,
          color_order=None, hard_mag_limit=99, mag_id=0, magerr0=0.02,
          sn=None, thresh=5.0, mask=None, good_flags=0x0):
    """Match a set of points with the catalogue"""
    self.success = False
    self.ngoodstars = 0

    self.order = order
    self.bg_order = bg_order
    self.color_order = color_order
    self.mag_id = mag_id
    self.filter_name = filter_name

    if filter_name in ['B', 'V', 'R', 'I', 'g', 'r', 'i', 'z']:
        # Generic names
        cmag, cmagerr = cat[filter_name], cat[filter_name + 'err']
        self.cat_filter_name = filter_name
    elif filter_name == 'Clear':
        # Mini-MegaTORTORA
        cmag, cmagerr = cat['V'], cat['Verr']
        self.cat_filter_name = 'V'
    elif filter_name == 'N':
        # FRAMs
        cmag, cmagerr = cat['R'], cat['Rerr']
        self.cat_filter_name = 'R'
    else:
        if verbose:
            print('Unsupported filter name: %s' % filter_name)
        return False

    # TODO: make it configurable?..
    color = cat['B'] - cat['V']
    self.cat_color_name = 'B - V'

    # Objects to match
    if obj is not None:
        ra = obj['ra']
        dec = obj['dec']
        x = obj['x']
        y = obj['y']
        mag = obj['mag']
        magerr = obj['magerr']
        flags = obj['flags']
    else:
        if ra is None or dec is None or x is None or y is None or mag is None:
            raise ValueError('Data for matching are missing')
        if magerr is None:
            magerr = np.ones_like(mag) * np.std(mag)
        if flags is None:
            flags = np.zeros_like(ra, dtype=int)  # np.int is deprecated

    if self.width is None or self.height is None:
        self.x0, self.y0 = np.mean(x), np.mean(y)
        self.width, self.height = np.max(x) - np.min(x), np.max(y) - np.min(y)

    # Match stars
    h = htm.HTM(10)
    oidx, cidx, dist = h.match(ra, dec, cat['ra'], cat['dec'], sr, maxmatch=0)
    if verbose:
        print(len(oidx), 'matches between', len(ra), 'objects and',
              len(cat['ra']), 'stars, sr = %.1f arcsec' % (3600.0 * sr))

    self.oidx, self.cidx, self.dist = oidx, cidx, dist

    self.cmag = cmag[cidx]
    self.cmagerr = cmagerr[cidx]
    self.color = color[cidx]

    self.ox, self.oy = x[oidx], y[oidx]
    self.oflags = flags[oidx]
    self.omag, self.omagerr = mag[oidx], magerr[oidx]
    if len(self.omag.shape) > 1:
        # If we are given a multi-aperture magnitude column
        self.omag, self.omagerr = self.omag[:, mag_id], self.omagerr[:, mag_id]

    # Scaled spatial coordinates for fitting
    sx = (self.ox - self.x0) * 2 / self.width
    sy = (self.oy - self.y0) * 2 / self.height

    # Optimal magnitude cutoff for fitting, as a mean mag where S/N = 10
    idx = (1.0 / self.omagerr > 5) & (1.0 / self.omagerr < 15)
    if np.sum(idx) > 10:
        X = make_series(1.0, sx, sy, order=order)
        X = np.vstack(X).T
        Y = self.cmag
        self.C_mag_limit = sm.RLM(Y[idx], X[idx]).fit()
        mag_limit = np.sum(X * self.C_mag_limit.params, axis=1)
    else:
        if verbose:
            print('Not enough matches with SN~10:', np.sum(idx))
        self.C_mag_limit = None
        mag_limit = 99.0 * np.ones_like(cmag)

    self.zero = self.cmag - self.omag  # We will build a model for this variable
    self.zeroerr = np.hypot(self.omagerr, self.cmagerr)
    self.zeroerr = np.hypot(self.zeroerr, magerr0)
    self.weights = 1.0 / self.zeroerr**2

    X = make_series(1.0, sx, sy, order=self.order)
    if self.bg_order is not None:
        X += make_series(-2.5 / np.log(10) / 10**(-0.4 * self.omag), sx, sy,
                         order=self.bg_order)
    if self.color_order is not None:
        X += make_series(self.color, sx, sy, order=self.color_order)
        X += make_series(self.color**2, sx, sy, order=self.color_order)
        X += make_series(self.color**3, sx, sy, order=self.color_order)
    X = np.vstack(X).T

    self.idx0 = ((self.oflags & (~good_flags)) == 0) & \
        (self.cmag < hard_mag_limit) & (self.cmag < mag_limit)
    if mask is not None:
        # Exclude masked objects
        self.idx0 &= ~mask
    if sn is not None:
        self.idx0 &= (self.omagerr < 1.0 / sn)

    # Actual fitting, with iterative sigma-clipping of outliers
    self.idx = self.idx0.copy()
    for iter in range(3):
        if np.sum(self.idx) < 3:
            if verbose:
                print("Fit failed - %d objects" % np.sum(self.idx))
            return False

        self.C = sm.WLS(self.zero[self.idx], X[self.idx],
                        weights=self.weights[self.idx]).fit()
        # self.C = sm.RLM(self.zero[self.idx], X[self.idx]).fit()

        self.zero_model = np.sum(X * self.C.params, axis=1)

        self.idx = self.idx0.copy()
        if thresh and thresh > 0:
            self.idx &= (np.abs((self.zero - self.zero_model) / self.zeroerr) < thresh)

    self.std = np.std((self.zero - self.zero_model)[self.idx])
    self.ngoodstars = np.sum(self.idx)
    self.success = True
    if verbose:
        print('Fit finished:', self.ngoodstars, 'stars, rms', self.std)

    if predict:
        self.predict(obj=obj, x=x, y=y, mag=mag, magerr=magerr,
                     mag_id=mag_id, verbose=verbose)

    return True
Wchengfen = pd.DataFrame(flow_ev.iloc[i, :].copy())
Wchengfen = Wchengfen.drop(['000061'], axis=0)
Wchengfen = pd.DataFrame(Wchengfen.replace(0, 1000))
summ = np.array(Wchengfen)
temp_Hp = Wchengfen / sum(summ)
temp_HpT = temp_Hp.transpose()
Hp[i] = temp_Hp
for j in range(120):
    row = i + j
    temp_Y = pd.DataFrame(newret.iloc[row, :])
    # WLS: regress on the remaining stocks (the financial factors of the
    # last three stocks were dropped earlier), weighted by the square root
    # of the float market value. NB: the code passes 1/sqrt(cap) as the
    # weight.
    temp_W = pd.DataFrame((flow_ev.iloc[row, :].copy())**0.5)
    temp_W = temp_W.drop(['000063'], axis=0)
    temp_W = pd.DataFrame(temp_W.replace(0, 1000))
    mod_wls = sm.WLS(temp_Y, temp_X, weights=1. / temp_W)
    res_wls = mod_wls.fit()
    residual_here = pd.DataFrame(res_wls.resid)
    temp_residual[j, :] = residual_here.transpose()
    temp_factor_return[j, :] = res_wls.params
    # Record the WLS regression weights
    WLS_weight[i, :] = temp_W.transpose()
# Store the residuals and factor returns in the dictionaries
residual[i] = temp_residual
factor_return[i] = temp_factor_return
# Total portfolio variance
temp_residual_cov = pd.DataFrame(np.cov(temp_residual.transpose()))
X = temp_X
XT = X.transpose()
# Factor-return variance-covariance matrix
temp_factor_return_cov = pd.DataFrame(
sidak2.sort_values('unadj_p', inplace=True)
print(sidak2)

fdr2 = ols_model.outlier_test('fdr_bh')
fdr2.sort_values('unadj_p', inplace=True)
print(fdr2)

# * Let's delete that line
l = ax.lines[-1]
l.remove()
del l

weights = np.ones(len(X))
weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
wls_model = sm.WLS(y, X, weights=weights).fit()
abline_plot(model_results=wls_model, ax=ax, color='green')

# * MM estimators are good for this type of problem; unfortunately, we
# don't have these yet.
# * It's being worked on, but it gives a good excuse to look at the R cell
# magics in the notebook.
yy = y.values[:, None]
xx = X['log.Te'].values[:, None]

print(params)
abline_plot(intercept=params[0], slope=params[1], ax=ax, color='red')

# ### Exercise: Breakdown points of M-estimator
def is_variable_long_term(self, date_col="Julian Date",
                          radvel_col="Radial Velocity (m/s)",
                          err_col="Error (m/s)"):
    """
    TODO:: Get this program double-checked against Zechmeister et al. 2009
    AND Dr. Haywood.

    Checks whether the radial velocity data for a star contains a
    long-term trend, as described by Zechmeister et al. (2009). The
    default format for date_col, radvel_col, and err_col is based on the
    HiRES publicly available radial velocity data (Butler, Vogt, Laughlin
    et al. 2017).

    Note:: If the dataset contains fewer than 6 data points, the function
    returns 0, as the sample is too small for the statistical tests used.

    :param date_col: string, name of the DataFrame column containing the
        Julian Date.
    :param radvel_col: string, name of the DataFrame column containing
        the radial velocity data.
    :param err_col: string, name of the DataFrame column containing the
        measurement errors.
    :return: boolean, True if the star has significant long-term
        variability and False if it does not.
    """
    y = self.df[radvel_col].to_numpy()
    X = self.df[date_col].to_numpy()
    w = self.df[err_col]
    if y.size <= 5:
        return 0
    else:
        # Fit a linear line of best fit (weighted least squares).
        # NB: statsmodels expects weights ~ 1/sigma**2; passing the raw
        # errors is kept from the original, pending the TODO above.
        mod_wls = sm.WLS(y, X, weights=w)
        res_wls = mod_wls.fit()
        print(res_wls.summary())
        m = res_wls.params[0]
        # Calculate chi-squared statistics for the line of best fit and
        # the constant model.
        # Stack Overflow ref:
        # https://stackoverflow.com/questions/35730534/numpy-generate-data-from-linear-function
        x = np.arange(y.size)  # generate data using the linear function (Garret R, Stack Overflow)
        delta = np.random.uniform(-1 * np.amax(w), np.amax(w), size=y.size)
        y_ = np.add(m * x, delta)
        pslope = scipy.stats.chisquare(y_, self.weighted_mean)[1]
        pconstant = scipy.stats.chisquare(y, self.weighted_mean)[1]
        # F-statistic from the p-values of the slope and constant models
        # (Zechmeister et al. 2009)
        fslope = (y.size - 2) * ((pconstant - pslope) / pslope)
        # p-value from the F-statistic
        p = 1 - scipy.stats.f.cdf(fslope, y.size, y.size)
        check = check_p(p, self.alpha)
        y_diff = []
        if check:
            # First differences of the series (currently unused below).
            for i in range(1, y.size):
                y_diff.append(y[i] - y[i - 1])
        return check
        hq_dict_no_suspension[stock][i - j - 1][2] - 1.0)
    # `dict.has_key` is Python 2 only; `in` is the modern equivalent.
    if hq_dict_no_suspension[stock][i - j][0] in SHIBOR_dict:
        this_shibor_list.append(
            SHIBOR_dict[hq_dict_no_suspension[stock][i - j][0]])
    else:
        this_shibor_list.append(SHIBOR_dict['20061008'])
    this_index_ROR_list.append(index_return_dict[(
        Now_Index, hq_dict_no_suspension[stock][i - j][0])])
    j += 1
this_minus_list = xyk_common_data_processing.element_cal_between_list(
    temp_ROR_list, this_shibor_list, "-")
X = sm.add_constant(this_index_ROR_list)
Y = this_minus_list
wls_model = sm.WLS(Y, X, weights=half_life_list)
results = wls_model.fit()
this_beta = float(results.params[1])
resid_list = results.resid
this_resid_mean = sum(resid_list) / float(len(resid_list))
this_treated_list = []
for resid_data in resid_list:
    this_treated_list.append((resid_data - this_resid_mean) *
                             (resid_data - this_resid_mean))
this_HSIGMA = math.sqrt(
    xyk_common_data_processing.weighted_mean(this_treated_list,
                                             half_life_list, use_df=1))
result_list.append([stock, data[0], this_beta, this_HSIGMA])
'''
*** write results to the DB ***
def simulations(sim_type, save=False):
    rs = np.random.RandomState(seed)
    remaining = NUM_SIM
    results = defaultdict(list)
    start = dt.datetime.now()
    while remaining > 0:
        this_iter = min(remaining, MAX_SIM_SIZE)
        remaining -= this_iter
        if sim_type == 'normal':
            dist = rs.standard_normal
        else:
            dist = rs.standard_exponential
        rvs = dist((MAX_SIZE, this_iter))
        sample_sizes = [ss for ss in SAMPLE_SIZES
                        if ss >= MIN_SAMPLE_SIZE[sim_type]]
        for ss in sample_sizes:
            sample = rvs[:ss]
            mu = sample.mean(0)
            if sim_type == 'normal':
                std = sample.std(0, ddof=1)
                z = (sample - mu) / std
                cdf_fn = stats.norm.cdf
            else:
                z = sample / mu
                cdf_fn = stats.expon.cdf
            z = np.sort(z, axis=0)
            nobs = ss
            cdf = cdf_fn(z)
            # Kolmogorov-Smirnov statistic: max of D+ and D-.
            plus = np.arange(1.0, nobs + 1) / nobs
            d_plus = (plus[:, None] - cdf).max(0)
            minus = np.arange(0.0, nobs) / nobs
            d_minus = (cdf - minus[:, None]).max(0)
            d = np.max(np.abs(np.c_[d_plus, d_minus]), 1)
            results[ss].append(d)
        logging.log(logging.INFO,
                    'Completed {0}, remaining {1}'.format(
                        NUM_SIM - remaining, remaining))
        elapsed = dt.datetime.now() - start
        rem = elapsed.total_seconds() / (NUM_SIM - remaining) * remaining
        logging.log(logging.INFO,
                    '({0}) Time remaining {1:0.1f}s'.format(sim_type, rem))
    for key in results:
        results[key] = np.concatenate(results[key])
    if save:
        file_name = 'lilliefors-sim-{0}-results.pkl.gz'.format(sim_type)
        with gzip.open(file_name, 'wb', 5) as pkl:
            pickle.dump(results, pkl)
    crit_vals = {}
    for key in results:
        crit_vals[key] = np.percentile(results[key], PERCENTILES)
    start = 20
    num = len([k for k in crit_vals if k >= start])
    all_x = np.zeros((num * len(PERCENTILES), len(PERCENTILES) + 2))
    all_y = np.zeros(num * len(PERCENTILES))
    loc = 0
    for i, perc in enumerate(PERCENTILES):
        y = pd.DataFrame(results).quantile(perc / 100.)
        y = y.loc[start:]
        all_y[loc:loc + len(y)] = np.log(y)
        x = y.index.values.astype(float)  # np.float is deprecated
        all_x[loc:loc + len(y), -2:] = np.c_[np.log(x), np.log(x)**2]
        all_x[loc:loc + len(y), i:(i + 1)] = 1
        loc += len(y)
    w = np.ones_like(all_y).reshape(len(PERCENTILES), -1)
    w[6:, -5:] = 3
    w = w.ravel()
    res = sm.WLS(all_y, all_x, weights=w).fit()
    params = []
    for i in range(len(PERCENTILES)):
        params.append(np.r_[res.params[i], res.params[-2:]])
    params = np.array(params)
    df = pd.DataFrame(params).T
    df.columns = PERCENTILES
    asymp_crit_vals = {}
    for col in df:
        asymp_crit_vals[col] = df[col].values
    code = '{0}_crit_vals = '.format(sim_type)
    code += str(crit_vals).strip() + '\n\n'
    code += '\n# Coefficients are model '
    code += 'log(cv) = b[0] + b[1] log(n) + b[2] log(n)**2\n'
    code += '{0}_asymp_crit_vals = '.format(sim_type)
    code += str(asymp_crit_vals) + '\n\n'
    return code
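# A hypothetical helper (not part of the original script) showing how the
# fitted asymptotic model above would be evaluated for a coefficient triple
# b = [b0, b1, b2] at sample size n:
import numpy as np

def asymp_crit_val(b, n):
    # log(cv) = b[0] + b[1]*log(n) + b[2]*log(n)**2, so exponentiate back.
    logn = np.log(n)
    return np.exp(b[0] + b[1] * logn + b[2] * logn ** 2)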
def func(siteNo, fitAll=True):
    # prep data
    print(siteNo)
    saveName = os.path.join(dirOut, siteNo)
    if os.path.exists(saveName):
        return ()
    t0 = time.time()
    varQ = '00060'
    varLst = codeLst + [varQ]
    df = waterQuality.readSiteTS(siteNo, varLst=varLst, freq='W')
    dfYP = pd.DataFrame(index=df.index, columns=codeLst)
    dfX = pd.DataFrame({'date': df.index}).set_index('date')
    dfX = dfX.join(np.log(df[varQ] + sn)).rename(columns={varQ: 'logQ'})
    yr = dfX.index.year.values
    t = yr + dfX.index.dayofyear.values / 365
    dfX['sinT'] = np.sin(2 * np.pi * t)
    dfX['cosT'] = np.cos(2 * np.pi * t)
    dfX['yr'] = yr
    dfX['t'] = t
    xVarLst = ['yr', 'logQ', 'sinT', 'cosT']
    # train / test
    fitCodeLst = list()
    for code in codeLst:
        if siteNo in dictSite[code]:
            fitCodeLst.append(code)
    for code in fitCodeLst:
        ind1 = np.where(yr < 2010)[0]
        ind2 = np.where(yr >= 2010)[0]
        dfXY = dfX.join(np.log(df[code] + sn))
        df1 = dfXY.iloc[ind1].dropna()
        if fitAll:
            df2 = dfXY[xVarLst + ['t']].dropna()
        else:
            df2 = dfXY.iloc[ind2].dropna()  # only fit for observations now
        n = len(df1)
        if n == 0:
            break
        # calculate weights
        h = np.array([7, 2, 0.5])  # window [Y Q S] from EGRET
        tLst = df2.index.tolist()
        for t in tLst:
            dY = np.abs((df2.loc[t]['t'] - df1['t']).values)
            dQ = np.abs((df2.loc[t]['logQ'] - df1['logQ']).values)
            dS = np.min(np.stack(
                [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0)
            d = np.stack([dY, dQ, dS])
            # Widen the windows until at least 100 observations fall inside.
            if n > 100:
                hh = np.repeat(h[:, None], n, axis=1)
                bW = False
                while not bW:  # `~bW` in the original; `not` is the safe spelling
                    bW = np.min(np.sum((hh - d) > 0, axis=1)) > 100
                    hh = hh * 1.1 if not bW else hh
            else:
                htemp = np.max(d, axis=1) * 1.1
                hh = np.repeat(htemp[:, None], n, axis=1)
            # Tricube weights in each dimension, combined multiplicatively.
            w = (1 - (d / hh)**3)**3
            w[w < 0] = 0
            wAll = w[0] * w[1] * w[2]
            ind = np.where(wAll > 0)[0]
            ww = wAll[ind]
            # fit WLS
            Y = df1.iloc[ind][code].values
            X = df1.iloc[ind][xVarLst].values
            model = sm.WLS(Y, X, weights=ww).fit()
            xp = df2.loc[t][xVarLst].values
            yp = model.predict(xp)[0]
            dfYP.loc[t][code] = np.exp(yp) - sn
        t1 = time.time()
        print(siteNoLst.index(siteNo), siteNo, code, t1 - t0)
    saveName = os.path.join(dirOut, siteNo)
    dfYP.to_csv(saveName)
    return
df['females_education'] = np.log1p(df['females_education'])
df['life_expectancy'] = np.log1p(df['life_expectancy'])
df['unemployment'] = np.log1p(df['unemployment'])

# declare variables
y = df[['Adult_victims']]
x = df[["life_expectancy", "policy_index", "females_education"]]
X = sm.add_constant(x)
x2 = df[["policy_index", "females_education", "life_expectancy",
         "unemployment"]]
X2 = sm.add_constant(x2)

# Model preparation
# NB: `w` is a single scalar (the number of rows), so every observation
# gets the same weight and this WLS fit coincides with OLS.
w = len(df['country'])
mod_wls = sm.WLS(y, X2, weights=1. / w)
res_wls = mod_wls.fit()
print(res_wls.summary())

# new dataset
df_new = pd.read_csv("new.csv")
df_new = df_new.replace(np.nan, 0)
df_new.rename(columns={
    'persons prosecuted': 'persons_prosecuted',
    'policy index': 'policy_index',
    'child victims': 'child_victims',
    'Adult victims': 'Adult_victims',
    'life expectancy': 'life_expectancy',
    '% females in primary education': 'females_education'
}, inplace=True)
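# Because the weight above is constant, the WLS estimates should match plain
# OLS; a quick illustrative check using the names defined in this snippet:
res_ols_check = sm.OLS(y, X2).fit()
print(np.allclose(res_wls.params, res_ols_check.params))  # expected: True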
nsample = 25
# x = np.linspace(0, 25, nsample)
x = df.plate_maria
X = np.column_stack((x, (x - 5)**2))
X = sm.add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[nsample * 6 // 10:] = 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# Step-3.
mod_wls = sm.WLS(y, X, weights=1. / (w**2))
res_wls = mod_wls.fit()
print(res_wls.summary())

# Step-4.
res_ols = sm.OLS(y, X).fit()
print(res_ols.params)
print(res_wls.params)

# Step-5.
se = np.vstack([[res_wls.bse], [res_ols.bse], [res_ols.HC0_se],
                [res_ols.HC1_se], [res_ols.HC2_se], [res_ols.HC3_se]])
se = np.round(se, 4)
# add_constant puts the intercept first, so label the columns accordingly;
# the original also listed 'OLS_HC3' twice where 'OLS_HC2' was meant.
colnames = ['const', 'x1']
rownames = ['WLS', 'OLS', 'OLS_HC0', 'OLS_HC1', 'OLS_HC2', 'OLS_HC3']
tabl = SimpleTable(se, colnames, rownames, txt_fmt=default_txt_fmt)
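# Render the standard-error comparison; rows are estimators, columns the
# coefficients. With heteroskedastic noise, the HC-corrected OLS errors
# should sit closer to the WLS errors than the naive OLS ones do.
print(tabl)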
plt.ylabel('residual')
plt.title('Scatter of ypred and residual')
plt.subplots_adjust(hspace=.5, wspace=.5)
plt.savefig(save_path + 'Scatter_res.jpg')
plt.close()
# plt.show()

# Feasible WLS, step 1: model the absolute residuals to estimate the
# conditional error scale from a subset of the regressors.
est = sm.OLS(np.abs(residual), x[[0, 2, 6]])
est1 = est.fit()
print(est1.summary())
sigma_pred = est1.predict(x[[0, 2, 6]])

# Step 2: refit with weights 1/sigma_hat**2.
mod_wls = sm.WLS(y, x, weights=1. / (sigma_pred ** 2))
res_wls = mod_wls.fit()
print(res_wls.summary())
print(res_wls.summary().as_latex())

# Weighted R^2 computed by hand.
w = 1. / (sigma_pred ** 2)
ypred = res_wls.predict(x)
residual = y - ypred
RSS = sum(w * (residual)**2)
weighted_mean = sum(w * y) / sum(w)
TSS = sum(w * (y - weighted_mean)**2)
print(TSS, RSS)
print(1 - RSS / TSS)

plt.plot(1 / sigma_pred**2, '.')
plt.show()
    coeff[i] = results.params[1]
    std_err[i] = results.bse[1]
    p_val[i] = results.pvalues[1]
    R2[i] = results.rsquared
toc = time.perf_counter()
print(f"Execution time of OLS: {toc-tic}s")

# Full stats (with diagnostics) WLS model
tic = time.perf_counter()
for i in range(lm_data.shape[0]):
    X = SM.add_constant(lm_data.columns)
    y = lm_data.iloc[i]
    w = 1 / (std_data.iloc[i] ** 2)
    model = SM.WLS(y, X, weights=w)
    results = model.fit()
    coeff[i] = results.params[1]
    std_err[i] = results.bse[1]
    p_val[i] = results.pvalues[1]
    R2[i] = results.rsquared
toc = time.perf_counter()
print(f"Execution time of WLS: {toc-tic}s")

# Full stats (with diagnostics) Robust OLS model
tic = time.perf_counter()
for i in range(lm_data.shape[0]):
    X = SM.add_constant(lm_data.columns)
    y = lm_data.iloc[i]
def predict_abc(interp, extrap, interp_index, extrap_index, weight,
                interp_weights, extrap_weights, cs, abc, verbose=True):
    # set up age range (Python 3: range objects must be wrapped in list()
    # before concatenation; the original used Python 2 range addition)
    ages = list(range(22, 30)) + list(range(31, 68))

    # set up dictionaries to store output
    params_interp = {}
    params_extrap = {}
    error_mat = {}

    # set up matrices for interpolation/extrapolation parameters, and errors
    for sex in ['pooled', 'male', 'female']:
        params_interp[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.interp.predictors) + 3)]
             for k in range(22, 30)],
            index=range(22, 30))
        params_interp[sex].index.names = ['age']
        params_interp[sex].columns = (['Intercept'] + cols.interp.predictors
                                      + ['y'] + ['rmse'])

        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 3)]
             for k in range(31, 68)],
            index=range(31, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (['Intercept'] + cols.extrap.predictors
                                      + ['y'] + ['rmse'])

        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:
        if age in range(22, 30):
            aux = deepcopy(interp.loc[interp_index, :])
            if age == 22:
                interp_weights.reset_index(inplace=True)
                del interp_weights['draw']
                interp_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    interp_weights.loc[pd.IndexSlice[interp_index], :])
            age_x = age - 1
            predictors = cols.interp.predictors + ['inc_labor{}'.format(age_x)]
        elif age in range(31, 68):
            aux = deepcopy(extrap.loc[extrap_index, :])
            if age == 31:
                age_x = 29
            else:
                age_x = age - 1
            predictors = cols.extrap.predictors + ['inc_labor{}'.format(age_x)]
            if age == 31:
                extrap_index_weight = [x[1] for x in extrap_index]
                extrap_weights.reset_index(inplace=True)
                del extrap_weights['draw']
                extrap_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    extrap_weights.loc[extrap_index_weight, :])

        c = 'inc_labor{}'.format(age)

        # drop black
        aux = aux.loc[aux.black == 1]

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:
            if sex == 'pooled':
                data = aux
                abcd = abc
                abcd_count = abcd.shape[0]
            elif sex == 'male':
                data = aux.loc[aux.male == 1]
                abcd = abc.loc[abc.male == 1]
                abcd_count = abcd.loc[abcd['male'] == 1]['male'].count()
            else:
                data = aux.loc[aux.male == 0]
                abcd = abc.loc[abc.male == 0]
                abcd_count = abcd.loc[abcd['male'] == 0]['male'].count()

            if weight == 'treat':
                abcd = abcd.loc[abcd.R == 1]
            elif weight == 'control':
                abcd = abcd.loc[abcd.R == 0]

            # reset auxiliary index (because dmatrices won't use id)
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]
            weight_array.reset_index('id', drop=True, inplace=True)
            weight_array.index = [j for j in range(weight_array.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            exog_index = [x for x in exog.index]
            weight_forWLS = weight_array.loc[pd.IndexSlice[exog_index]]
            weight_type = 'wtabc_allids_c' + cs + '_' + weight
            weight_forWLS = weight_forWLS.loc[:, weight_type]
            weight_forWLS.dropna(axis=0, inplace=True)
            exog = exog.loc[weight_forWLS.index, :]
            endog = endog.loc[weight_forWLS.index, :]

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.WLS(endog, exog, weights=weight_forWLS)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                fail_switch = 1
                if age in range(22, 30):
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.interp.predictors + ['y'])
                else:
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.extrap.predictors + ['y'])
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)
            params.rename({'inc_labor{}'.format(age_x): 'y'}, inplace=True)
            if age in range(22, 30):
                params_interp[sex].loc[age, :] = params
            else:
                params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

            if verbose:
                print('Successful predictions, age {}, n={}'.format(
                    age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat], axis=1,
                                   join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    male_interp_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.interp.predictors]).any(axis=1)].index
    female_interp_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.interp.predictors]).any(axis=1)].index
    male_extrap_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.extrap.predictors]).any(axis=1)].index
    female_extrap_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.extrap.predictors]).any(axis=1)].index

    # remove errors for ABC individuals for whom we do not predict earnings
    # interp (we only check age 22 since the predictability of each year is
    # based on the same set of outcomes)
    error_mat['male'].loc[male_interp_nix, slice(0, 8)] = np.nan
    error_mat['female'].loc[female_interp_nix, slice(0, 8)] = np.nan
    error_mat['pooled'].loc[female_interp_nix.append(male_interp_nix),
                            slice(0, 8)] = np.nan
    # extrap (we only check age 31 since the predictability of each year is
    # based on the same set of outcomes)
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]
    for sex in ['pooled', 'male', 'female']:
        if sex == 'pooled':
            abcd = abc
        elif sex == 'male':
            abcd = abc.loc[abc.male == 1]
        else:
            abcd = abc.loc[abc.male == 0]
        abcd_interp = abcd.loc[:, ['Intercept'] + cols.interp.predictors + ['y']]
        abcd_extrap = abcd.loc[:, ['Intercept'] + cols.extrap.predictors + ['y']]
        projection_interp[sex] = pd.DataFrame([])
        projection_extrap[sex] = pd.DataFrame([])
        for age in ages:
            if age in range(22, 30):
                if age == 22:
                    abcd_interp['y'] = 0
                params_interp_trans = pd.DataFrame(
                    params_interp[sex].loc[age].drop('rmse').T)
                interp_dot = abcd_interp.dot(
                    params_interp_trans) + error_mat[sex][[age]]
                abcd_interp['y'] = interp_dot
                projection_interp[sex] = pd.concat(
                    [projection_interp[sex], interp_dot], axis=1)
            else:
                if age == 31:
                    params_extrap[sex].loc[31]['y'] = 0
                    abcd_extrap['y'] = interp_dot
                    abcd_extrap['y'].fillna(value=0, inplace=True)
                params_extrap_trans = pd.DataFrame(
                    params_extrap[sex].loc[age].drop('rmse').T)
                extrap_dot = abcd_extrap.dot(
                    params_extrap_trans) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot
                projection_extrap[sex] = pd.concat(
                    [projection_extrap[sex], extrap_dot], axis=1)

    return (params_interp, params_extrap, error_mat, projection_interp,
            projection_extrap)