conv_factor = float(2.41e+8)  # moved here to test the regression line on absolute gain
# conv_factor = float(1. / 6)
for i in range(len(Gain)):
    AbsGain.append(Gain[i] * conv_factor)
print('AbsGain ', AbsGain)

for i in range(len(Temp)):
    # regression line for Vb = 28 V - 30 V, 0.5 V steps
    X = sm.add_constant(Volt)
    Y = AbsGain[i]
    y_err = Error[i]
    weights = pd.Series(y_err)
    # NB: statsmodels WLS expects weights proportional to 1/sigma**2;
    # 1/sigma is kept here to preserve the original behaviour.
    wls_model = sm.WLS(Y, X, weights=1 / weights)
    results = wls_model.fit()
    print('results', results.params)
    inter, slo = results.params
    slope.append(slo)
    intercept.append(inter)
    # breakdown voltage = x-intercept of the fitted line
    vbreak = -intercept[i] / slope[i]
    # if (vbreak < 60) or (vbreak > 70): vbreak = 60
    Vbr.append(vbreak)
    Vbrmean = np.mean(Vbr)
    OVlist.append(np.array(Volt) - np.array(Vbr[i]))

Vbr = np.asarray(Vbr)
print('Vbr ', Vbr)
print('OVlist ', OVlist)
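# A minimal, self-contained sketch of the same breakdown-voltage extraction on
# synthetic data (all values here are made up for illustration): the x-intercept
# of the fitted line, -intercept/slope, recovers the assumed breakdown voltage.
import numpy as np
import statsmodels.api as sm

volt = np.arange(28.0, 30.5, 0.5)
true_vbr = 26.0
gain = 2.0e6 * (volt - true_vbr)           # ideal linear gain above breakdown
X = sm.add_constant(volt)
res = sm.WLS(gain, X, weights=np.ones_like(gain)).fit()
inter, slo = res.params
print(-inter / slo)                         # ~26.0, the assumed breakdown voltage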
# Note that `exog` must be a 2-dimensional array with `x` as a column and
# an extra column of ones. Adding this column of ones means you want to fit
# the model `y = a * x + b`; leaving it off means you want to fit the model
# `y = a * x`.
#
# And you have to use the option `cov_type='fixed scale'` to tell
# `statsmodels` that you really have measurement errors with an absolute
# scale. If you do not, `statsmodels` will treat the weights as relative
# weights between the data points and internally re-scale them so that the
# best-fit model will have `chi**2 / ndf = 1`.

exog = sm.add_constant(data["x"])
endog = data["y"]
weights = 1.0 / (data["y_err"] ** 2)

wls = sm.WLS(endog, exog, weights=weights)
results = wls.fit(cov_type="fixed scale")
print(results.summary())

# ### Check against scipy.optimize.curve_fit

# You can use `scipy.optimize.curve_fit` to get the best-fit parameters
# and parameter errors.
from scipy.optimize import curve_fit


def f(x, a, b):
    return a * x + b


xdata = data["x"]
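# The snippet stops right after defining `xdata`. A plausible completion of the
# cross-check (an assumption, not part of the original): pass the absolute
# errors via `sigma` with `absolute_sigma=True`, so the parameter errors match
# the fixed-scale WLS fit above.
import numpy as np

ydata = data["y"]
popt, pcov = curve_fit(f, xdata, ydata,
                       sigma=data["y_err"],      # absolute measurement errors
                       absolute_sigma=True)      # do not rescale to chi2/ndf = 1
perr = np.sqrt(np.diag(pcov))
print("a, b =", popt, "+/-", perr)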
if i == 0:
    # Replace one outlying gain point (index G_ign) by the average of its
    # neighbours before fitting; it is restored below.
    G_ign = 7
    Gainsave = Gain[0, G_ign]
    Gain[0, G_ign] = (Gain[0, G_ign - 1] + Gain[0, G_ign + 1]) / 2
'''
# regression line for Gain (disabled)
X = Volt[regrleft:regrright:]
X = sm.add_constant(X)
Y = Gain[i][regrleft:regrright:]
y_err = Error[i][regrleft:regrright:]
weights1 = pd.Series(y_err)
wls_model1 = sm.WLS(Y, X, weights=1 / weights1)
results1 = wls_model1.fit()
print('results ', results1.params)
inter1, slo1 = results1.params
slope.append(slo1)
intercept.append(inter1)
vbreak1 = -intercept[i] / slope[i]
# vbreak1 = 64.2
Vbr.append(vbreak1)
'''
if i == 0:
    Gain[0, G_ign] = Gainsave
def EM(data, number_of_components, debug=False):
    """EM for a mixture of linear regressions (one WLS fit per component)."""
    N, D = np.shape(data)
    D = D - 1  # last column is the target
    X, Y = data[:, :-1], data[:, -1]
    if D == 1:
        X = X.reshape((N, 1))
        Y = Y.reshape((N, 1))
    if debug:
        print("N = ", N, " D = ", D, " K = ", number_of_components)
        print("Data size: ", data.shape)
        print("X size: ", X.shape)
        print("Y size: ", Y.shape)

    if len(data) > MAX_DF_SIZE:
        # Initialise on a random subsample to keep EM_init cheap.
        n = min(len(data), number_of_components * 1000)
        sampled_data = data[np.random.randint(data.shape[0], size=n), :]
        priors, mu, sigma, coefficients, y_sigma = EM_init(
            sampled_data, number_of_components)
    else:
        priors, mu, sigma, coefficients, y_sigma = EM_init(
            data, number_of_components)
    if debug:
        print("Priors0 ", priors.shape, " =\n", priors)
        print("Mu0 ", mu.shape, " =\n", mu)
        print("Sigma0 ", sigma.shape, " =\n", sigma)
        print("Coefficients0 ", coefficients.shape, " =\n", coefficients)
        print("Ysigma0 ", y_sigma.shape, " =\n", y_sigma)

    gamma = np.ndarray(shape=(N, number_of_components))
    min_value = sys.float_info.min
    max_value = sys.float_info.max
    loglikelihood_threshold = 1e-10
    old_loglikelihood = -1 * max_value
    loglikelihood = 0
    iteration = 1
    while True:
        # Expectation: responsibilities under the current parameters.
        for k in range(number_of_components):
            mu_large = np.append(mu[k, :], 0)
            sigma_large = np.insert(np.insert(sigma[k, :, :], D, 0, axis=1),
                                    D, 0, axis=0)
            sigma_large[-1, -1] = y_sigma[k]
            # Temporarily replace y by the regression residual of component k.
            keep_y = np.copy(data[:, -1])
            data[:, -1] -= coefficients[k, 0] + np.dot(data[:, :-1],
                                                       coefficients[k, 1:])
            gamma[:, k] = priors[k] * scipy.stats.multivariate_normal.pdf(
                data, mu_large, sigma_large, allow_singular=True)
            data[:, -1] = np.copy(keep_y)
        denominator = np.sum(gamma, axis=1)  # CHECK
        denominator = denominator.reshape((N, 1))
        denominator = np.where(denominator < min_value, min_value, denominator)
        gamma = gamma / denominator
        if debug:
            print("GAMMA ~ min: {}, max: {}".format(np.min(gamma),
                                                    np.max(gamma)))
        n_component = np.sum(gamma, axis=0)
        n_component = n_component.reshape((number_of_components, 1))

        # Maximization: priors, means, covariances, per-component WLS fits.
        for k in range(number_of_components):
            priors[k] = n_component[k] / N
            mu[k, :] = np.dot(gamma[:, k], X) / n_component[k]
            sigma[k, :, :] = np.matmul(
                (X - mu[k]).T, gamma[:, k, np.newaxis] * (X - mu[k])
            ) / n_component[k] + 0.00001 * np.diag(np.diag(np.ones((D, D))))
            # The responsibilities act as the WLS weights.
            model = sm.WLS(Y, sm.add_constant(X), weights=gamma[:, k])
            res = model.fit()
            coefficients[k, :] = res.params
            y_sigma[k] = np.dot(
                gamma[:, k, np.newaxis].T,
                np.power(Y - np.dot(coefficients[k, :],
                                    sm.add_constant(X).T)[:, np.newaxis], 2)
            ) / n_component[k]  # SQRT?
        if debug:
            print("\t Updated all parameters!")

        # Log-likelihood of the data under the updated model.
        new_gamma = np.ndarray(shape=(N, number_of_components))
        for k in range(number_of_components):
            mu_large = np.append(mu[k, :], 0)
            sigma_large = np.insert(np.insert(sigma[k, :, :], D, 0, axis=1),
                                    D, 0, axis=0)
            sigma_large[-1, -1] = y_sigma[k]
            keep_y = np.copy(data[:, -1])
            data[:, -1] -= coefficients[k, 0] + np.dot(data[:, :-1],
                                                       coefficients[k, 1:])
            new_gamma[:, k] = scipy.stats.multivariate_normal.pdf(
                data, mu_large, sigma_large,
                allow_singular=True)  # CHECK * priors[k]
            data[:, -1] = np.copy(keep_y)
        if debug:
            print("\t Computed New Gamma!")
        probs = np.dot(new_gamma, priors)
        probs = np.where(probs < min_value, min_value, probs)
        probs = np.reshape(probs, (N, 1))
        if debug:
            print("\t Computed probs!")
        loglikelihood = np.mean(np.log10(probs), 0)
        ret_ll = np.sum(np.log(probs))
        if np.absolute((loglikelihood / old_loglikelihood) - 1) < loglikelihood_threshold:
            break
        iteration += 1
        if iteration > 1000:
            break
        old_loglikelihood = loglikelihood
    return priors, mu, sigma, coefficients, y_sigma, ret_ll
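# A hedged usage sketch of EM() above (illustrative only; EM_init,
# MAX_DF_SIZE, and the module imports are assumed to exist as in the
# surrounding file): two noisy lines, stacked as [x, y] rows.
import numpy as np

rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, size=(400, 1))
y = np.where(x[:, 0] > 0, 2 * x[:, 0] + 1, -3 * x[:, 0]) + 0.1 * rng.randn(400)
data = np.column_stack([x, y])            # last column is the target
priors, mu, sigma, coef, y_sigma, ll = EM(data, number_of_components=2)
print(coef)                               # one (intercept, slope) row per component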
def setup(self):
    # fit for each test, because results will be changed by the tests
    x = self.exog
    np.random.seed(987689)
    y = x.sum(1) + np.random.randn(x.shape[0])
    self.results = sm.WLS(y, self.exog, weights=np.ones(len(y))).fit()
def set(self, endog, exog=None):
    self.model = sm.WLS(endog, exog=exog, **self.rmodelparams)
def factor_ret(self):
    weight = self.cap ** 0.5
    weight = 1 / weight
    weight_tr = weight.loc[:, self.today:self.today]
    df_weight = weight_tr.unstack()
    self.rtn = []
    self.pct = []
    self.expos = []
    ret_tr = self.data.iloc[self.today:self.today, :, 6]
    self.df_ret = ret_tr.unstack()
    self.tot_ret = self.ret.loc["ret_pct", self.today]
    if len(ret_tr) and len(self.bigsize) and len(self.medsize) \
            and len(self.retv) and len(self.turn) and len(self.wgt_rt) \
            and len(self.halpha) and len(self.increase) and len(self.EY) \
            and len(self.ROE) and len(self.BLEV) and len(self.B800) \
            and len(self.R800) and len(self.KDJ):
        style_factors = [self.df_bigsize, self.df_medsize, self.df_retv,
                         self.df_turn, self.df_wgt_rt, self.df_halpha,
                         self.df_increase, self.df_EY, self.df_ROE,
                         self.df_BLEV, self.df_B800, self.df_R800,
                         self.df_KDJ]
        df_regression = pd.DataFrame(
            [self.df_ret.values] + [f.values for f in style_factors],
            index=['ret', 'bigsize', 'medsize', 'retv', 'turn', 'wgt_rt',
                   'halpha', 'increase', 'EY', 'ROE', 'BLEV', 'B800',
                   'R800', 'KDJ'])
        df_regression = df_regression.T
        for i in range(0, len(self.Industry)):
            df_regression[self.Industry[i].unstack().T.index.name] = \
                self.Industry[i].values
        df_regression['weight'] = df_weight.values
        df_regression = df_regression.dropna()
        y = df_regression.iloc[:, 0].tolist()
        temp = []
        for i in range(1, df_regression.shape[1] - 1):
            temp.append(df_regression.iloc[:, i].tolist())
        X = np.column_stack(temp)
        W = df_regression.iloc[:, -1].tolist()
        # `weights` is the WLS keyword; the original passed `weight=W`,
        # which statsmodels does not accept.
        fit = sm.WLS(y, X, weights=W).fit()
        self.pfl_W = self.hold.iloc[:, self.today:self.today]
        if not self.pfl_W.empty:
            # Exposure, return contribution, and share of total return for
            # each style factor; the 13 parameter slots follow the column
            # order used to build df_regression above.
            for k, df_factor in enumerate(style_factors):
                exp_k = np.dot(self.pfl_W.iloc[:, 0].values,
                               df_factor.fillna(0).values)
                ret_k = exp_k * fit.params[k]
                pct_k = ret_k / self.tot_ret
                self.rtn.append(ret_k)
                self.pct.append(pct_k)
                self.expos.append(exp_k)
            # Industry dummies occupy the remaining parameter slots.
            for i in range(0, len(self.Industry)):
                exp_tr = np.dot(self.pfl_W.iloc[:, 0].values,
                                self.Industry[i].fillna(0).values)
                ret_tr = exp_tr * fit.params[i + 13]
                pct_tr = ret_tr / self.tot_ret
                self.rtn.append(ret_tr)
                self.pct.append(pct_tr)
                self.expos.append(exp_tr)
            self.result = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\rtn.xlsx",
                sheetname=0, index_col=0)
            self.percentage = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\pct.xlsx",
                sheetname=0, index_col=0)
            self.exposure = pd.read_excel(
                r"C:\DELL\internship\CICC\Trans\result\expos.xlsx",
                sheetname=0, index_col=0)
            self.result[format(self.today, "%Y-%m-%d")] = self.rtn
            self.result.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\rtn.xlsx")
            self.percentage[format(self.today, "%Y-%m-%d")] = self.pct
            self.percentage.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\pct.xlsx")
            self.exposure[format(self.today, "%Y-%m-%d")] = self.expos
            self.exposure.to_excel(
                r"C:\DELL\internship\CICC\Trans\result\expos.xlsx")
        # (excerpt; the `if` matching the `else` below precedes this fragment)
        this_stock_factor_list = []
        i = 0
        while i < len(factor_list):
            this_stock_factor_list.append(factor_dict[(stock, date)][i])
            i += 1
        i = 0
        while i < len(this_stock_dummy_list):
            this_stock_factor_list.append(this_stock_dummy_list[i])
            i += 1
        this_whole_stock_factors_list.append(this_stock_factor_list)
    else:
        pass

X = sm.add_constant(this_whole_stock_factors_list)
wls_model = sm.WLS(ROR_list, X, weights=sqrt_liquid_list)
results = wls_model.fit()
U_list_temp = results.resid                # residuals, i.e. the idiosyncratic-factor series
T_value_list_temp = results.tvalues        # t-value of each factor in the model
R_squared_temp = results.rsquared          # model R-squared
R_squared_adj_temp = results.rsquared_adj  # adjusted R-squared
f_list_temp = results.params               # model parameters, i.e. the factor-return series
U_list.append(U_list_temp)
T_value_list.append(T_value_list_temp)
R_squared.append(R_squared_temp)
R_squared_adj.append(R_squared_adj_temp)
f_list.append(f_list_temp)
U2_list_temp = xyk_common_data_processing.element_cal_between_list(
    U_list_temp, U_list_temp, "*")
WU2_list_temp = xyk_common_data_processing.element_cal_between_list(
    U2_list_temp, sqrt_liquid_list, "*")
def analyze(df, save=False):
    pca_parms = ['mo_t_pi', 'mo_t_ds', 'mo_t_dz', 'mo_t_sz']
    pca = PCA(n_components=len(pca_parms))
    pca.fit(df[pca_parms])
    print(pca.explained_variance_ratio_)
    Xbar = df[pca_parms]
    # Equivalent to pca.transform(Xbar) up to centering:
    Xbar = np.dot(Xbar, pca.components_.T)
    for i in range(len(pca_parms)):
        df['x' + str(i)] = Xbar[:, i]

    parms = ['mo_n_3d', 'mo_n_2ppi', 'mo_n_2pz', 'Us', 'x0', 'x1']  # ,'x2','x3']
    X = sm.add_constant(df[parms])
    y = df['energy']
    ols = sm.OLS(y, X).fit()
    print(ols.summary())
    print(ols.params)
    pca_params = list(ols.params[-2:].values) + [0, 0]
    # Map the PCA-space coefficients back onto the original descriptors.
    reg_params = np.dot(pca_params, pca.components_)
    print(pca_params)
    print(reg_params)
    exit(0)  # everything below is unreachable scratch work

    df['resid'] = df['energy'] - ols.predict()
    sns.pairplot(df, vars=['resid', 'x0', 'x1', 'x2', 'x3'])
    plt.show()
    exit(0)

    # Boltzmann-weighted fits: low-energy samples get more weight.
    beta = 2

    def wls_fit(model):
        X = sm.add_constant(df[model])
        y = df['energy']
        return sm.WLS(y, X, weights=np.exp(-beta * (y - min(y)))).fit()

    # Each variant adds one hopping term; its coefficient goes into the
    # corresponding off-diagonal slot passed to diagonalize().
    for extra, slot in [('mo_t_pi', 0), ('mo_t_dz', 1), ('mo_t_sz', 2),
                        ('mo_t_ds', 3)]:
        ols = wls_fit(['mo_n_4s', 'mo_n_2ppi', 'mo_n_2pz', extra, 'Us'])
        params = ols.params[1:]
        offd = [0, 0, 0, 0, 0]
        offd[slot] = params[-2]
        ed = diagonalize(list(params[:-2]) + offd + [params[-1]])
        print(ols.summary())
        print(ed)

    # Baseline model without any hopping term.
    ols = wls_fit(['mo_n_4s', 'mo_n_2ppi', 'mo_n_2pz', 'Us'])
    params = ols.params[1:]
    params = list(params[:3]) + [0] + list(params[3:-1]) + [0, 0, 0, 0,
                                                            params[-1]]
    ed = diagonalize(params)
    print(ols.summary())
    print(ed)
    exit(0)
# plt.ylabel(r'$\frac{dj}{dE}$')
# plt.savefig('standard_specter.png')
# plt.show()
#
# plt.plot(x1, x2, x1, x3)
# plt.legend(['top layer', 'core'], loc=4)
# plt.grid()
# plt.xlabel('energy')
# plt.ylabel(r'$\frac{dj}{dE}$')
# plt.savefig('meritve_specter.png')
# plt.show()

mod_wls = sm.WLS(x3, A)
res_wls = mod_wls.fit()
parametri = res_wls.params
print(res_wls.summary())
# NB: this difference uses the parameters from the x3 fit against x2.
razlika1 = x2 - np.dot(A, parametri)

mod_wls = sm.WLS(x2, A)
res_wls = mod_wls.fit()
parametri = res_wls.params
def fit_pspec(self, brk=None, log_break=False, low_cut=None,
              high_cut=None, min_fits_pts=10, weighted_fit=False,
              bootstrap=False, bootstrap_kwargs={}, verbose=False):
    '''
    Fit the 1D Power spectrum using a segmented linear model. Note that
    the current implementation allows for only 1 break point in the
    model. If the break point is estimated via a spline, the breaks are
    tested, starting from the largest, until the model finds a good fit.

    Parameters
    ----------
    brk : float or None, optional
        Guesses for the break points. If given as a list, the length of
        the list sets the number of break points to be fit. If a choice
        is outside of the allowed range from the data, Lm_Seg will raise
        an error. If None, a spline is used to estimate the breaks.
    log_break : bool, optional
        Sets whether the provided break estimates are log-ed (base 10)
        values. This is disabled by default. When enabled, the brk must
        be a unitless `~astropy.units.Quantity`
        (`u.dimensionless_unscaled`).
    low_cut : `~astropy.units.Quantity`, optional
        Lowest frequency to consider in the fit.
    high_cut : `~astropy.units.Quantity`, optional
        Highest frequency to consider in the fit.
    min_fits_pts : int, optional
        Sets the minimum number of points needed to fit. If not met, the
        break found is rejected.
    weighted_fit : bool, optional
        Fit using weighted least-squares. The weights are the
        inverse-squared standard deviations in each radial bin.
    bootstrap : bool, optional
        Bootstrap using the model residuals to estimate the parameter
        standard errors. This tends to give more realistic intervals
        than the covariance matrix.
    bootstrap_kwargs : dict, optional
        Pass keyword arguments to
        `~turbustat.statistics.fitting_utils.residual_bootstrap`.
    verbose : bool, optional
        Enables verbose mode in Lm_Seg.
    '''

    self._bootstrap_flag = bootstrap

    # Make the data to fit to
    if low_cut is None:
        # Default to the largest frequency, since this is just 1 pixel
        # in the 2D PSpec.
        self.low_cut = 1. / (0.5 * float(max(self.ps2D.shape)) * u.pix)
    else:
        self.low_cut = self._to_pixel_freq(low_cut)

    if high_cut is None:
        self.high_cut = self.freqs.max().value / u.pix
    else:
        self.high_cut = self._to_pixel_freq(high_cut)

    x = np.log10(self.freqs[clip_func(self.freqs.value, self.low_cut.value,
                                      self.high_cut.value)].value)

    clipped_ps1D = self.ps1D[clip_func(self.freqs.value, self.low_cut.value,
                                       self.high_cut.value)]
    y = np.log10(clipped_ps1D)

    if weighted_fit:
        clipped_stddev = self.ps1D_stddev[clip_func(self.freqs.value,
                                                    self.low_cut.value,
                                                    self.high_cut.value)]
        clipped_stddev[clipped_stddev == 0.] = np.NaN
        # Error propagation into log10 space: d(log10 y) = dy / (y ln 10).
        y_err = 0.434 * clipped_stddev / clipped_ps1D

    if brk is not None:
        # Try the fit with a break in it.
        if not log_break:
            brk = self._to_pixel_freq(brk).value
            brk = np.log10(brk)
        else:
            # A value given in log shouldn't have dimensions
            if hasattr(brk, "unit"):
                assert brk.unit == u.dimensionless_unscaled
                brk = brk.value

        if weighted_fit:
            weights = 1 / y_err**2
        else:
            weights = None

        brk_fit = Lm_Seg(x, y, brk, weights=weights)
        brk_fit.fit_model(verbose=verbose, cov_type='HC3')

        if brk_fit.params.size == 5:
            # Check to make sure this leaves enough to fit to.
            if sum(x < brk_fit.brk) < min_fits_pts:
                warnings.warn("Not enough points to fit to. Ignoring break.")
                self._brk = None
            else:
                good_pts = x.copy() < brk_fit.brk
                x = x[good_pts]
                y = y[good_pts]

                self._brk = 10**brk_fit.brk / u.pix
                self._slope = brk_fit.slopes

                if bootstrap:
                    stderrs = residual_bootstrap(brk_fit.fit,
                                                 **bootstrap_kwargs)
                    self._slope_err = stderrs[1:-1]
                    self._brk_err = np.log(10) * self.brk.value * \
                        stderrs[-1] / u.pix
                else:
                    self._slope_err = brk_fit.slope_errs
                    self._brk_err = np.log(10) * self.brk.value * \
                        brk_fit.brk_err / u.pix

                self.fit = brk_fit.fit
                self._model = brk_fit
        else:
            self._brk = None
            # Break fit failed, revert to normal model
            warnings.warn("Model with break failed, reverting to model "
                          "without break.")
    else:
        self._brk = None
        self._brk_err = None

    if self.brk is None:
        x = sm.add_constant(x)
        if weighted_fit:
            model = sm.WLS(y, x, missing='drop', weights=1 / y_err**2)
        else:
            model = sm.OLS(y, x, missing='drop')

        self.fit = model.fit(cov_type='HC3')

        self._slope = self.fit.params[1]

        if bootstrap:
            stderrs = residual_bootstrap(self.fit, **bootstrap_kwargs)
            self._slope_err = stderrs[1]
        else:
            self._slope_err = self.fit.bse[1]
def WLS_regression(x, y, w):
    # Weighted least-squares regression.
    # regr.tvalues: t-values; regr.resid: residuals; regr.params: betas;
    # regr.t_test([1, 0]) tests a linear restriction on the parameters.
    X = sm.add_constant(x)
    regr = sm.WLS(y, X, weights=w).fit()
    return regr
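# A hedged usage sketch of the helper above (the data here are made up for
# illustration; `sm` and numpy come from the surrounding module):
import numpy as np

x = np.linspace(0, 10, 50)
y = 3.0 + 0.7 * x + np.random.randn(50)
w = np.ones_like(x)                 # uniform weights reduce WLS to OLS
regr = WLS_regression(x, y, w)
print(regr.params)                  # [intercept, slope]
print(regr.t_test([0, 1]))          # t-test on the slope coefficient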
def signal_factor_test(data, current_price, market_values):
    """
    Single-factor test: regression and IC.

    :param data: time + industry (column name `industry`) + factors +
        stock return (y)
    :param current_price: per-stock float market value, used as the WLS
        weight
    :param market_values: market-cap factor, one of the regressors in
        the IC regression
    :return: table_reg: regression-test results for the factors
             table_ic: IC analysis of the factors
    """
    # 1. Data preparation
    industy = pd.get_dummies(data.iloc[:, 1])  # one-hot industry dummies
    # Extract the regression target y; it is re-appended after merging the
    # dummies so that the dependent variable stays in the last column.
    y = DataFrame(data.iloc[:, -1])
    yname = y.columns.values
    newdata = data.drop(["industry", yname[0]], axis=1)  # drop industry and y
    # time + factors + industry dummies
    newdata = pd.merge(newdata, industy, left_index=True, right_index=True,
                       how="outer")
    # Data for the WLS regression: time + factors + dummies + y (+ weight)
    newdata_reg = pd.merge(newdata, y, left_index=True, right_index=True,
                           how="outer")
    newdata_wls = pd.merge(newdata_reg, current_price, left_index=True,
                           right_index=True, how="outer")
    # Data for the IC regression: time + factors + dummies + market cap + y
    newdata_mfv = pd.merge(newdata, market_values, left_index=True,
                           right_index=True, how="outer")
    newdata_ic = pd.merge(newdata_mfv, y, left_index=True, right_index=True,
                          how="outer")

    # 2. Fit the regressions, collect results, build the output tables
    time = data.iloc[:, 0].unique()  # all cross-section dates
    fnum = data.shape[1] - 3         # number of factors
    table_reg = DataFrame(np.zeros((1, 7)),
                          columns=["factor", "mean |t|", "share |t|>2",
                                   "mean t", "mean t / std t",
                                   "mean factor return",
                                   "t-test of factor returns"])
    table_ic = DataFrame(np.zeros((1, 6)),
                         columns=["factor", "IC mean", "IC std", "IR",
                                  "share IC>0", "share |IC|>0.02"])
    fig, ax = plt.subplots(1, 2)
    for i in range(fnum):
        # Reset the per-factor accumulators.
        tlist = []   # t-values
        rlist = []   # factor returns
        iclist = []  # IC values
        for j in range(len(time)):
            # WLS: one cross-section (stocks + factor + dummies + y + weight)
            newdata2 = newdata_wls[newdata_wls.iloc[:, 0].isin([time[j]])]
            # IC OLS: one cross-section (stocks + factor + dummies + cap + y)
            newdata3 = newdata_ic[newdata_ic.iloc[:, 0].isin([time[j]])]
            # Regressor names: industry dummies plus the current factor.
            col = list(industy.columns.values)
            factor_data = DataFrame(newdata2.iloc[:, i + 1])
            global factor_name
            factor_name = list(factor_data.columns.values)[0]
            col.append(factor_name)
            # WLS regression
            y = newdata2.iloc[:, -2]
            x = sm.add_constant(newdata2.loc[:, col])  # add intercept
            reg = sm.WLS(y, x, weights=newdata2.iloc[:, -1])
            model = reg.fit()
            # IC regression (no weights, so this WLS is effectively OLS)
            col.append(list(DataFrame(market_values).columns.values)[0])
            x_ic = sm.add_constant(newdata3.loc[:, col])
            reg_ic = sm.WLS(y, x_ic)
            model_ic = reg_ic.fit()
            # Collect the WLS results
            tvalues = DataFrame(model.tvalues).iloc[-1, :]
            weight = DataFrame(model.params)
            tlist.append(DataFrame(model.tvalues).iloc[-1, :])
            rlist.append(weight.iloc[-1, :])
            # Collect the IC results
            iclist.append(np.sqrt(1 - model_ic.rsquared))
        # Aggregate the WLS results
        tarr = np.array(tlist)
        rarr = np.array(rlist)
        table = {
            "factor": factor_name,
            "mean |t|": np.mean(np.abs(tarr)),
            "share |t|>2": list(np.where(np.abs(tarr) > 2, 1, 0)).count(1) / len(time),
            "mean t": np.mean(tarr),
            "mean t / std t": np.mean(tarr) / np.std(tarr),
            "mean factor return": np.mean(rarr),
            # NB: despite the label, the original stores the std of the
            # factor-return series here, not a t statistic.
            "t-test of factor returns": np.std(rarr)
        }
        table_reg0 = DataFrame(table,
                               columns=["factor", "mean |t|", "share |t|>2",
                                        "mean t", "mean t / std t",
                                        "mean factor return",
                                        "t-test of factor returns"],
                               index=[i + 1])
        ax[0].plot(time, rarr.cumsum(), label=factor_name)
        table_reg = pd.concat([table_reg, table_reg0])  # append row
        # Aggregate the IC results
        icarr = np.array(iclist)
        table1 = {
            "factor": factor_name,
            "IC mean": np.mean(icarr),
            "IC std": np.std(icarr),
            "IR": np.mean(icarr) / np.std(icarr),
            "share IC>0": list(np.where(icarr > 0, 1, 0)).count(1) / len(icarr),
            "share |IC|>0.02": list(np.where(np.abs(icarr) > 0.02, 1, 0)).count(1) / len(icarr)
        }
        table_ic0 = DataFrame(table1,
                              columns=["factor", "IC mean", "IC std", "IR",
                                       "share IC>0", "share |IC|>0.02"],
                              index=[i + 1])
        table_ic = pd.concat([table_ic, table_ic0])  # append row
        ax[1].plot(time, icarr.cumsum(), "b-")
    table_reg = table_reg.iloc[1:, :].set_index("factor")
    table_ic = table_ic.iloc[1:, :].set_index("factor")
    plt.legend(loc="upper center")
    plt.show()
    return table_reg, table_ic
def __init__(self, x, y, w):
    self.x = x
    self.y = y
    self.w = w
    self.model = sm.WLS(y, sm.add_constant(x), weights=self.w)
    self.fit = self.model.fit()
        dS = np.min(np.stack(
            [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0)
        d = np.stack([dY, dQ, dS])
        # Widen the windows until at least 100 observations fall inside.
        if n > 100:
            hh = np.repeat(h[:, None], n, axis=1)
            bW = False
            while not bW:  # `~bW` in the original; `not` is the safe spelling
                bW = np.min(np.sum((hh - d) > 0, axis=1)) > 100
                hh = hh * 1.1 if not bW else hh
        else:
            htemp = np.max(d, axis=1) * 1.1
            hh = np.repeat(htemp[:, None], n, axis=1)
        # Tricube weights in each dimension, combined multiplicatively.
        w = (1 - (d / hh)**3)**3
        w[w < 0] = 0
        wAll = w[0] * w[1] * w[2]
        ind = np.where(wAll > 0)[0]
        ww = wAll[ind]
        # fit WLS
        Y = df1.iloc[ind][code].values
        X = df1.iloc[ind][xVarLst].values
        model = sm.WLS(Y, X, weights=ww).fit()
        xp = df2.loc[t][xVarLst].values
        yp = model.predict(xp)[0]
        dfYP.loc[t][code] = np.exp(yp) - sn
    t1 = time.time()
    print(k, siteNo, code, t1 - t0)
saveName = os.path.join(dirOut, siteNo)
dfYP.to_csv(saveName)
    data_day_dummies, how='left', left_index=True, right_index=True,
    sort=False)  # (continuation of a merge/join call from the preceding code)
industry_t = list(data_day_style_t.loc[loc_t, 'INDUSTRY'].unique())
columns_t = industry_t + style
x = data_day_style_t.loc[loc_t, columns_t].values
X = sm.add_constant(x)
# notice: if a second identical loop is written, this line must be changed
# (specific_risk_NW)
y = data_day_style_t.loc[loc_t, 'specific_risk_raw'].values
Y = np.log(y)
stock_weights = data_day_style_t.loc[loc_t, 'WEIGHT'].values
stock_weights = np.sqrt(stock_weights)
wls_model = sm.WLS(Y, X, weights=stock_weights)  # Notice: stock_weights
wls_results = wls_model.fit()
params_t = wls_results.params

x_predict = data_day_style_t.loc[~loc_t, columns_t].values
X_predict = sm.add_constant(x_predict)
Y_predict = np.mat(X_predict) * np.mat(params_t.reshape(-1, 1))
y_predict = np.array(Y_predict.T)[0]
y_predict = E_0 * np.exp(y_predict)

data_day_style_t.loc[loc_t, 'specific_risk_SM_1'] = data_day_style_t.loc[
    loc_t, 'specific_risk_raw']
specific_risk_t = data_day_style_t.loc[
    ~loc_t, 'coordination_coef'].values * data_day_style_t.loc[
    ~loc_t, 'specific_risk_raw'].values
# wh3 = sm.stats.diagnostic.het_white(res3.resid, res3.model.exog)

########## Linearity
ln1 = sm.stats.diagnostic.linear_reset(res1, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)
ln2 = sm.stats.diagnostic.linear_reset(res2, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)
ln3 = sm.stats.diagnostic.linear_reset(res3, power=3, test_type='fitted',
                                       use_f=False, cov_type='nonrobust',
                                       cov_kwargs=None)

##################### Correcting the autocorrelation
x = sm.add_constant(x)
y1 = dados['Taxa de Transmissão -B1']
y2 = dados['Taxa de Transmissão -B2']
wls_model = sm.WLS(y1, x, weights=list(range(1, 99)))
results = wls_model.fit()
wls_model2 = sm.WLS(y2, x, weights=list(range(1, 99)))
results2 = wls_model2.fit()

############### Tests
########### Heteroskedasticity
bp1 = sm.stats.diagnostic.het_breuschpagan(results.resid, results.model.exog)
bp2 = sm.stats.diagnostic.het_breuschpagan(results2.resid, results2.model.exog)
############# Autocorrelation
bg1 = sm.stats.diagnostic.acorr_breusch_godfrey(results)
bg2 = sm.stats.diagnostic.acorr_breusch_godfrey(results2)  # was assigned to bg1 twice, an apparent typo
def estime_income(hhcat, finalhhframe):
    '''
    Estimates income brought by each category of adults/elderly. The
    objective is not to have a good model of income but rather to find a
    starting point for making the income grow based on the household's
    composition. If the income of the unemployed or elderly is found
    negative, it is set equal to zero and we re-estimate. If the
    coefficients are not significant, we try different categories (by
    grouping existing categories) and keep the new coefficients only if
    they become significant. We ignore the richest 5%.
    '''
    select = finalhhframe.Y < float(
        perc_with_spline(finalhhframe.Y,
                         finalhhframe.weight * finalhhframe.nbpeople, 0.95))
    cat_cols = ['cat1workers', 'cat2workers', 'cat3workers', 'cat4workers',
                'cat5workers', 'cat6workers', 'cat7workers', 'old']
    # `.loc` replaces the deprecated `.ix` used in the original.
    X = finalhhframe.loc[select, cat_cols].copy()
    w = finalhhframe.loc[select, 'weight'].copy()
    w[w == 0] = 10**(-10)
    Y = finalhhframe.loc[select, 'Y'] * finalhhframe.loc[select, 'nbpeople']
    result = sm.WLS(Y, X, weights=1 / w).fit()
    inc = result.params
    nonworkers = inc[['cat7workers', 'old']].copy()
    negs = nonworkers[nonworkers < 0].index
    if len(negs) > 0:
        # Drop the negative categories, re-estimate, and zero them out.
        X.drop(negs.values, axis=1, inplace=True)
        result = sm.WLS(Y, X, weights=1 / w).fit()
        inc = result.params
        for ii in negs:
            inc[ii] = 0
    a = result.pvalues
    nonsign1 = a[a > 0.05].index
    nonsign2 = []
    nonsign3 = []
    if len(nonsign1) > 0:
        # Regroup by sector: services / agriculture / manufacturing.
        X = finalhhframe.loc[select, cat_cols].copy()
        X['serv'] = X['cat1workers'] + X['cat2workers']
        X['ag'] = X['cat3workers'] + X['cat4workers']
        X['manu'] = X['cat5workers'] + X['cat6workers']
        X.drop(['cat1workers', 'cat2workers', 'cat3workers', 'cat4workers',
                'cat5workers', 'cat6workers'], axis=1, inplace=True)
        result3 = sm.WLS(Y, X, weights=1 / w).fit()
        a3 = result3.pvalues
        nonsign3 = a3[a3 > 0.05].index
        if len(nonsign3) == 0:
            inctemp = result3.params
            inc['cat2workers'] = inctemp['serv']
            inc['cat4workers'] = inctemp['ag']
            inc['cat6workers'] = inctemp['manu']
            inc['cat1workers'] = inctemp['serv']
            inc['cat3workers'] = inctemp['ag']
            inc['cat5workers'] = inctemp['manu']
        else:
            # Regroup by skill level instead.
            X = finalhhframe.loc[select, cat_cols].copy()
            X['skilled'] = X['cat2workers'] + X['cat4workers'] + X['cat6workers']
            X['unskilled'] = X['cat1workers'] + X['cat3workers'] + X['cat5workers']
            X.drop(['cat1workers', 'cat2workers', 'cat3workers',
                    'cat4workers', 'cat5workers', 'cat6workers'],
                   axis=1, inplace=True)
            result2 = sm.WLS(Y, X, weights=1 / w).fit()
            a2 = result2.pvalues
            nonsign2 = a2[a2 > 0.05].index
            # NB: parenthesised explicitly; the original relied on `|`/`&`
            # binding tighter than `==`/`<`, which inverts the intended test.
            if (len(nonsign2) == 0) | ((len(nonsign2) < len(nonsign1)) &
                                       (len(nonsign2) < len(nonsign3))):
                inctemp = result2.params
                inc['cat2workers'] = inctemp['skilled']
                inc['cat4workers'] = inctemp['skilled']
                inc['cat6workers'] = inctemp['skilled']
                inc['cat1workers'] = inctemp['unskilled']
                inc['cat3workers'] = inctemp['unskilled']
                inc['cat5workers'] = inctemp['unskilled']
            else:
                if (len(nonsign3) < len(nonsign1)) & (len(nonsign3) < len(nonsign2)):
                    inctemp = result3.params
                    inc['cat2workers'] = inctemp['serv']
                    inc['cat4workers'] = inctemp['ag']
                    inc['cat6workers'] = inctemp['manu']
                    inc['cat1workers'] = inctemp['serv']
                    inc['cat3workers'] = inctemp['ag']
                    inc['cat5workers'] = inctemp['manu']
    return inc
def match(self, obj=None, cat=None, sr=5./3600, verbose=False, predict=True,
          ra=None, dec=None, x=None, y=None, mag=None, magerr=None,
          flags=None, filter_name='V', order=4, bg_order=None,
          color_order=None, hard_mag_limit=99, mag_id=0, magerr0=0.02,
          sn=None, thresh=5.0, mask=None, good_flags=0x0):
    """Match a set of points with the catalogue"""
    self.success = False
    self.ngoodstars = 0

    self.order = order
    self.bg_order = bg_order
    self.color_order = color_order
    self.mag_id = mag_id
    self.filter_name = filter_name

    if filter_name in ['B', 'V', 'R', 'I', 'g', 'r', 'i', 'z']:
        # Generic names
        cmag, cmagerr = cat[filter_name], cat[filter_name + 'err']
        self.cat_filter_name = filter_name
    elif filter_name == 'Clear':
        # Mini-MegaTORTORA
        cmag, cmagerr = cat['V'], cat['Verr']
        self.cat_filter_name = 'V'
    elif filter_name == 'N':
        # FRAMs
        cmag, cmagerr = cat['R'], cat['Rerr']
        self.cat_filter_name = 'R'
    else:
        if verbose:
            print('Unsupported filter name: %s' % filter_name)
        return False

    # TODO: make it configurable?..
    color = cat['B'] - cat['V']
    self.cat_color_name = 'B - V'

    # Objects to match
    if obj is not None:
        ra = obj['ra']
        dec = obj['dec']
        x = obj['x']
        y = obj['y']
        mag = obj['mag']
        magerr = obj['magerr']
        flags = obj['flags']
    else:
        if ra is None or dec is None or x is None or y is None or mag is None:
            raise ValueError('Data for matching are missing')
        if magerr is None:
            magerr = np.ones_like(mag) * np.std(mag)
        if flags is None:
            flags = np.zeros_like(ra, dtype=int)  # np.int is deprecated

    if self.width is None or self.height is None:
        self.x0, self.y0 = np.mean(x), np.mean(y)
        self.width, self.height = np.max(x) - np.min(x), np.max(y) - np.min(y)

    # Match stars
    h = htm.HTM(10)
    oidx, cidx, dist = h.match(ra, dec, cat['ra'], cat['dec'], sr, maxmatch=0)
    if verbose:
        print(len(oidx), 'matches between', len(ra), 'objects and',
              len(cat['ra']), 'stars, sr = %.1f arcsec' % (3600.0 * sr))

    self.oidx, self.cidx, self.dist = oidx, cidx, dist

    self.cmag = cmag[cidx]
    self.cmagerr = cmagerr[cidx]
    self.color = color[cidx]

    self.ox, self.oy = x[oidx], y[oidx]
    self.oflags = flags[oidx]
    self.omag, self.omagerr = mag[oidx], magerr[oidx]
    if len(self.omag.shape) > 1:
        # If we are given a multi-aperture magnitude column
        self.omag, self.omagerr = self.omag[:, mag_id], self.omagerr[:, mag_id]

    # Scaled spatial coordinates for fitting
    sx = (self.ox - self.x0) * 2 / self.width
    sy = (self.oy - self.y0) * 2 / self.height

    # Optimal magnitude cutoff for fitting, as a mean mag where S/N = 10
    idx = (1.0 / self.omagerr > 5) & (1.0 / self.omagerr < 15)
    if np.sum(idx) > 10:
        X = make_series(1.0, sx, sy, order=order)
        X = np.vstack(X).T
        Y = self.cmag
        self.C_mag_limit = sm.RLM(Y[idx], X[idx]).fit()
        mag_limit = np.sum(X * self.C_mag_limit.params, axis=1)
    else:
        if verbose:
            print('Not enough matches with SN~10:', np.sum(idx))
        self.C_mag_limit = None
        mag_limit = 99.0 * np.ones_like(cmag)

    self.zero = self.cmag - self.omag  # We will build a model for this variable
    self.zeroerr = np.hypot(self.omagerr, self.cmagerr)
    self.zeroerr = np.hypot(self.zeroerr, magerr0)
    self.weights = 1.0 / self.zeroerr**2

    X = make_series(1.0, sx, sy, order=self.order)
    if self.bg_order is not None:
        X += make_series(-2.5 / np.log(10) / 10**(-0.4 * self.omag), sx, sy,
                         order=self.bg_order)
    if self.color_order is not None:
        X += make_series(self.color, sx, sy, order=self.color_order)
        X += make_series(self.color**2, sx, sy, order=self.color_order)
        X += make_series(self.color**3, sx, sy, order=self.color_order)
    X = np.vstack(X).T

    self.idx0 = ((self.oflags & (~good_flags)) == 0) & \
        (self.cmag < hard_mag_limit) & (self.cmag < mag_limit)
    if mask is not None:
        # Exclude masked objects
        self.idx0 &= ~mask
    if sn is not None:
        self.idx0 &= (self.omagerr < 1.0 / sn)

    # Actual fitting, with iterative sigma-clipping of outliers
    self.idx = self.idx0.copy()
    for iter in range(3):
        if np.sum(self.idx) < 3:
            if verbose:
                print("Fit failed - %d objects" % np.sum(self.idx))
            return False

        self.C = sm.WLS(self.zero[self.idx], X[self.idx],
                        weights=self.weights[self.idx]).fit()
        # self.C = sm.RLM(self.zero[self.idx], X[self.idx]).fit()

        self.zero_model = np.sum(X * self.C.params, axis=1)

        self.idx = self.idx0.copy()
        if thresh and thresh > 0:
            self.idx &= (np.abs((self.zero - self.zero_model) / self.zeroerr) < thresh)

    self.std = np.std((self.zero - self.zero_model)[self.idx])
    self.ngoodstars = np.sum(self.idx)
    self.success = True
    if verbose:
        print('Fit finished:', self.ngoodstars, 'stars, rms', self.std)

    if predict:
        self.predict(obj=obj, x=x, y=y, mag=mag, magerr=magerr,
                     mag_id=mag_id, verbose=verbose)

    return True
Wchengfen = pd.DataFrame(flow_ev.iloc[i, :].copy())
Wchengfen = Wchengfen.drop(['000061'], axis=0)
Wchengfen = pd.DataFrame(Wchengfen.replace(0, 1000))
summ = np.array(Wchengfen)
temp_Hp = Wchengfen / sum(summ)
temp_HpT = temp_Hp.transpose()
Hp[i] = temp_Hp
for j in range(120):
    row = i + j
    temp_Y = pd.DataFrame(newret.iloc[row, :])
    # WLS: regress on the remaining stocks (the financial factors of the
    # last three stocks were dropped earlier), weighted by the square root
    # of the float market value. NB: the code passes 1/sqrt(cap) as the
    # weight.
    temp_W = pd.DataFrame((flow_ev.iloc[row, :].copy())**0.5)
    temp_W = temp_W.drop(['000063'], axis=0)
    temp_W = pd.DataFrame(temp_W.replace(0, 1000))
    mod_wls = sm.WLS(temp_Y, temp_X, weights=1. / temp_W)
    res_wls = mod_wls.fit()
    residual_here = pd.DataFrame(res_wls.resid)
    temp_residual[j, :] = residual_here.transpose()
    temp_factor_return[j, :] = res_wls.params
    # Record the WLS regression weights
    WLS_weight[i, :] = temp_W.transpose()
# Store the residuals and factor returns in the dictionaries
residual[i] = temp_residual
factor_return[i] = temp_factor_return
# Total portfolio variance
temp_residual_cov = pd.DataFrame(np.cov(temp_residual.transpose()))
X = temp_X
XT = X.transpose()
# Factor-return variance-covariance matrix
temp_factor_return_cov = pd.DataFrame(
sidak2.sort_values('unadj_p', inplace=True)
print(sidak2)

fdr2 = ols_model.outlier_test('fdr_bh')
fdr2.sort_values('unadj_p', inplace=True)
print(fdr2)

# * Let's delete that line
l = ax.lines[-1]
l.remove()
del l

weights = np.ones(len(X))
weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
wls_model = sm.WLS(y, X, weights=weights).fit()
abline_plot(model_results=wls_model, ax=ax, color='green')

# * MM estimators are good for this type of problem; unfortunately, we
# don't have these yet.
# * It's being worked on, but it gives a good excuse to look at the R cell
# magics in the notebook.
yy = y.values[:, None]
xx = X['log.Te'].values[:, None]

print(params)
abline_plot(intercept=params[0], slope=params[1], ax=ax, color='red')

# ### Exercise: Breakdown points of M-estimator
def is_variable_long_term(self, date_col="Julian Date",
                          radvel_col="Radial Velocity (m/s)",
                          err_col="Error (m/s)"):
    """
    TODO:: Get this program double-checked against Zechmeister et al. 2009
    AND Dr. Haywood.

    Checks whether the radial velocity data for a star contains a
    long-term trend, as described by Zechmeister et al. (2009). The
    default format for date_col, radvel_col, and err_col is based on the
    HiRES publicly available radial velocity data (Butler, Vogt, Laughlin
    et al. 2017).

    Note:: If the dataset contains fewer than 6 data points, the function
    returns 0, as the sample is too small for the statistical tests used.

    :param date_col: string, name of the DataFrame column containing the
        Julian Date.
    :param radvel_col: string, name of the DataFrame column containing
        the radial velocity data.
    :param err_col: string, name of the DataFrame column containing the
        measurement errors.
    :return: boolean, True if the star has significant long-term
        variability and False if it does not.
    """
    y = self.df[radvel_col].to_numpy()
    X = self.df[date_col].to_numpy()
    w = self.df[err_col]
    if y.size <= 5:
        return 0
    else:
        # Fit a linear line of best fit (weighted least squares).
        # NB: statsmodels expects weights ~ 1/sigma**2; passing the raw
        # errors is kept from the original, pending the TODO above.
        mod_wls = sm.WLS(y, X, weights=w)
        res_wls = mod_wls.fit()
        print(res_wls.summary())
        m = res_wls.params[0]
        # Calculate chi-squared statistics for the line of best fit and
        # the constant model.
        # Stack Overflow ref:
        # https://stackoverflow.com/questions/35730534/numpy-generate-data-from-linear-function
        x = np.arange(y.size)  # generate data using the linear function (Garret R, Stack Overflow)
        delta = np.random.uniform(-1 * np.amax(w), np.amax(w), size=y.size)
        y_ = np.add(m * x, delta)
        pslope = scipy.stats.chisquare(y_, self.weighted_mean)[1]
        pconstant = scipy.stats.chisquare(y, self.weighted_mean)[1]
        # F-statistic from the p-values of the slope and constant models
        # (Zechmeister et al. 2009)
        fslope = (y.size - 2) * ((pconstant - pslope) / pslope)
        # p-value from the F-statistic
        p = 1 - scipy.stats.f.cdf(fslope, y.size, y.size)
        check = check_p(p, self.alpha)
        y_diff = []
        if check:
            # First differences of the series (currently unused below).
            for i in range(1, y.size):
                y_diff.append(y[i] - y[i - 1])
        return check
        hq_dict_no_suspension[stock][i - j - 1][2] - 1.0)
    # `dict.has_key` is Python 2 only; `in` is the modern equivalent.
    if hq_dict_no_suspension[stock][i - j][0] in SHIBOR_dict:
        this_shibor_list.append(
            SHIBOR_dict[hq_dict_no_suspension[stock][i - j][0]])
    else:
        this_shibor_list.append(SHIBOR_dict['20061008'])
    this_index_ROR_list.append(index_return_dict[(
        Now_Index, hq_dict_no_suspension[stock][i - j][0])])
    j += 1
this_minus_list = xyk_common_data_processing.element_cal_between_list(
    temp_ROR_list, this_shibor_list, "-")
X = sm.add_constant(this_index_ROR_list)
Y = this_minus_list
wls_model = sm.WLS(Y, X, weights=half_life_list)
results = wls_model.fit()
this_beta = float(results.params[1])
resid_list = results.resid
this_resid_mean = sum(resid_list) / float(len(resid_list))
this_treated_list = []
for resid_data in resid_list:
    this_treated_list.append((resid_data - this_resid_mean) *
                             (resid_data - this_resid_mean))
this_HSIGMA = math.sqrt(
    xyk_common_data_processing.weighted_mean(this_treated_list,
                                             half_life_list, use_df=1))
result_list.append([stock, data[0], this_beta, this_HSIGMA])
'''
*** write results to the DB ***
def simulations(sim_type, save=False):
    rs = np.random.RandomState(seed)
    remaining = NUM_SIM
    results = defaultdict(list)
    start = dt.datetime.now()
    while remaining > 0:
        this_iter = min(remaining, MAX_SIM_SIZE)
        remaining -= this_iter
        if sim_type == 'normal':
            dist = rs.standard_normal
        else:
            dist = rs.standard_exponential
        rvs = dist((MAX_SIZE, this_iter))
        sample_sizes = [ss for ss in SAMPLE_SIZES
                        if ss >= MIN_SAMPLE_SIZE[sim_type]]
        for ss in sample_sizes:
            sample = rvs[:ss]
            mu = sample.mean(0)
            if sim_type == 'normal':
                std = sample.std(0, ddof=1)
                z = (sample - mu) / std
                cdf_fn = stats.norm.cdf
            else:
                z = sample / mu
                cdf_fn = stats.expon.cdf
            z = np.sort(z, axis=0)
            nobs = ss
            cdf = cdf_fn(z)
            # Kolmogorov-Smirnov statistic: max of D+ and D-.
            plus = np.arange(1.0, nobs + 1) / nobs
            d_plus = (plus[:, None] - cdf).max(0)
            minus = np.arange(0.0, nobs) / nobs
            d_minus = (cdf - minus[:, None]).max(0)
            d = np.max(np.abs(np.c_[d_plus, d_minus]), 1)
            results[ss].append(d)
        logging.log(logging.INFO,
                    'Completed {0}, remaining {1}'.format(
                        NUM_SIM - remaining, remaining))
        elapsed = dt.datetime.now() - start
        rem = elapsed.total_seconds() / (NUM_SIM - remaining) * remaining
        logging.log(logging.INFO,
                    '({0}) Time remaining {1:0.1f}s'.format(sim_type, rem))
    for key in results:
        results[key] = np.concatenate(results[key])
    if save:
        file_name = 'lilliefors-sim-{0}-results.pkl.gz'.format(sim_type)
        with gzip.open(file_name, 'wb', 5) as pkl:
            pickle.dump(results, pkl)
    crit_vals = {}
    for key in results:
        crit_vals[key] = np.percentile(results[key], PERCENTILES)
    start = 20
    num = len([k for k in crit_vals if k >= start])
    all_x = np.zeros((num * len(PERCENTILES), len(PERCENTILES) + 2))
    all_y = np.zeros(num * len(PERCENTILES))
    loc = 0
    for i, perc in enumerate(PERCENTILES):
        y = pd.DataFrame(results).quantile(perc / 100.)
        y = y.loc[start:]
        all_y[loc:loc + len(y)] = np.log(y)
        x = y.index.values.astype(float)  # np.float is deprecated
        all_x[loc:loc + len(y), -2:] = np.c_[np.log(x), np.log(x)**2]
        all_x[loc:loc + len(y), i:(i + 1)] = 1
        loc += len(y)
    w = np.ones_like(all_y).reshape(len(PERCENTILES), -1)
    w[6:, -5:] = 3
    w = w.ravel()
    res = sm.WLS(all_y, all_x, weights=w).fit()
    params = []
    for i in range(len(PERCENTILES)):
        params.append(np.r_[res.params[i], res.params[-2:]])
    params = np.array(params)
    df = pd.DataFrame(params).T
    df.columns = PERCENTILES
    asymp_crit_vals = {}
    for col in df:
        asymp_crit_vals[col] = df[col].values
    code = '{0}_crit_vals = '.format(sim_type)
    code += str(crit_vals).strip() + '\n\n'
    code += '\n# Coefficients are model '
    code += 'log(cv) = b[0] + b[1] log(n) + b[2] log(n)**2\n'
    code += '{0}_asymp_crit_vals = '.format(sim_type)
    code += str(asymp_crit_vals) + '\n\n'
    return code
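# A hypothetical helper (not part of the original script) showing how the
# fitted asymptotic model above would be evaluated for a coefficient triple
# b = [b0, b1, b2] at sample size n:
import numpy as np

def asymp_crit_val(b, n):
    # log(cv) = b[0] + b[1]*log(n) + b[2]*log(n)**2, so exponentiate back.
    logn = np.log(n)
    return np.exp(b[0] + b[1] * logn + b[2] * logn ** 2)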
def func(siteNo, fitAll=True):
    # prep data
    print(siteNo)
    saveName = os.path.join(dirOut, siteNo)
    if os.path.exists(saveName):
        return ()
    t0 = time.time()
    varQ = '00060'
    varLst = codeLst + [varQ]
    df = waterQuality.readSiteTS(siteNo, varLst=varLst, freq='W')
    dfYP = pd.DataFrame(index=df.index, columns=codeLst)
    dfX = pd.DataFrame({'date': df.index}).set_index('date')
    dfX = dfX.join(np.log(df[varQ] + sn)).rename(columns={varQ: 'logQ'})
    yr = dfX.index.year.values
    t = yr + dfX.index.dayofyear.values / 365
    dfX['sinT'] = np.sin(2 * np.pi * t)
    dfX['cosT'] = np.cos(2 * np.pi * t)
    dfX['yr'] = yr
    dfX['t'] = t
    xVarLst = ['yr', 'logQ', 'sinT', 'cosT']
    # train / test
    fitCodeLst = list()
    for code in codeLst:
        if siteNo in dictSite[code]:
            fitCodeLst.append(code)
    for code in fitCodeLst:
        ind1 = np.where(yr < 2010)[0]
        ind2 = np.where(yr >= 2010)[0]
        dfXY = dfX.join(np.log(df[code] + sn))
        df1 = dfXY.iloc[ind1].dropna()
        if fitAll:
            df2 = dfXY[xVarLst + ['t']].dropna()
        else:
            df2 = dfXY.iloc[ind2].dropna()  # only fit for observations now
        n = len(df1)
        if n == 0:
            break
        # calculate weights
        h = np.array([7, 2, 0.5])  # window [Y Q S] from EGRET
        tLst = df2.index.tolist()
        for t in tLst:
            dY = np.abs((df2.loc[t]['t'] - df1['t']).values)
            dQ = np.abs((df2.loc[t]['logQ'] - df1['logQ']).values)
            dS = np.min(np.stack(
                [abs(np.ceil(dY) - dY), abs(dY - np.floor(dY))]), axis=0)
            d = np.stack([dY, dQ, dS])
            # Widen the windows until at least 100 observations fall inside.
            if n > 100:
                hh = np.repeat(h[:, None], n, axis=1)
                bW = False
                while not bW:  # `~bW` in the original; `not` is the safe spelling
                    bW = np.min(np.sum((hh - d) > 0, axis=1)) > 100
                    hh = hh * 1.1 if not bW else hh
            else:
                htemp = np.max(d, axis=1) * 1.1
                hh = np.repeat(htemp[:, None], n, axis=1)
            # Tricube weights in each dimension, combined multiplicatively.
            w = (1 - (d / hh)**3)**3
            w[w < 0] = 0
            wAll = w[0] * w[1] * w[2]
            ind = np.where(wAll > 0)[0]
            ww = wAll[ind]
            # fit WLS
            Y = df1.iloc[ind][code].values
            X = df1.iloc[ind][xVarLst].values
            model = sm.WLS(Y, X, weights=ww).fit()
            xp = df2.loc[t][xVarLst].values
            yp = model.predict(xp)[0]
            dfYP.loc[t][code] = np.exp(yp) - sn
        t1 = time.time()
        print(siteNoLst.index(siteNo), siteNo, code, t1 - t0)
    saveName = os.path.join(dirOut, siteNo)
    dfYP.to_csv(saveName)
    return
df['females_education'] = np.log1p(df['females_education'])
df['life_expectancy'] = np.log1p(df['life_expectancy'])
df['unemployment'] = np.log1p(df['unemployment'])

# declare variables
y = df[['Adult_victims']]
x = df[["life_expectancy", "policy_index", "females_education"]]
X = sm.add_constant(x)
x2 = df[["policy_index", "females_education", "life_expectancy",
         "unemployment"]]
X2 = sm.add_constant(x2)

# Model preparation
# NB: `w` is a single scalar (the number of rows), so every observation
# gets the same weight and this WLS fit coincides with OLS.
w = len(df['country'])
mod_wls = sm.WLS(y, X2, weights=1. / w)
res_wls = mod_wls.fit()
print(res_wls.summary())

# new dataset
df_new = pd.read_csv("new.csv")
df_new = df_new.replace(np.nan, 0)
df_new.rename(columns={
    'persons prosecuted': 'persons_prosecuted',
    'policy index': 'policy_index',
    'child victims': 'child_victims',
    'Adult victims': 'Adult_victims',
    'life expectancy': 'life_expectancy',
    '% females in primary education': 'females_education'
}, inplace=True)
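# Because the weight above is constant, the WLS estimates should match plain
# OLS; a quick illustrative check using the names defined in this snippet:
res_ols_check = sm.OLS(y, X2).fit()
print(np.allclose(res_wls.params, res_ols_check.params))  # expected: True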
nsample = 25
# x = np.linspace(0, 25, nsample)
x = df.plate_maria
X = np.column_stack((x, (x - 5)**2))
X = sm.add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[nsample * 6 // 10:] = 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# Step-3.
mod_wls = sm.WLS(y, X, weights=1. / (w**2))
res_wls = mod_wls.fit()
print(res_wls.summary())

# Step-4.
res_ols = sm.OLS(y, X).fit()
print(res_ols.params)
print(res_wls.params)

# Step-5.
se = np.vstack([[res_wls.bse], [res_ols.bse], [res_ols.HC0_se],
                [res_ols.HC1_se], [res_ols.HC2_se], [res_ols.HC3_se]])
se = np.round(se, 4)
# add_constant puts the intercept first, so label the columns accordingly;
# the original also listed 'OLS_HC3' twice where 'OLS_HC2' was meant.
colnames = ['const', 'x1']
rownames = ['WLS', 'OLS', 'OLS_HC0', 'OLS_HC1', 'OLS_HC2', 'OLS_HC3']
tabl = SimpleTable(se, colnames, rownames, txt_fmt=default_txt_fmt)
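# Render the standard-error comparison; rows are estimators, columns the
# coefficients. With heteroskedastic noise, the HC-corrected OLS errors
# should sit closer to the WLS errors than the naive OLS ones do.
print(tabl)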
plt.ylabel('residual')
plt.title('Scatter of ypred and residual')
plt.subplots_adjust(hspace=.5, wspace=.5)
plt.savefig(save_path + 'Scatter_res.jpg')
plt.close()
# plt.show()

# Feasible WLS, step 1: model the absolute residuals to estimate the
# conditional error scale from a subset of the regressors.
est = sm.OLS(np.abs(residual), x[[0, 2, 6]])
est1 = est.fit()
print(est1.summary())
sigma_pred = est1.predict(x[[0, 2, 6]])

# Step 2: refit with weights 1/sigma_hat**2.
mod_wls = sm.WLS(y, x, weights=1. / (sigma_pred ** 2))
res_wls = mod_wls.fit()
print(res_wls.summary())
print(res_wls.summary().as_latex())

# Weighted R^2 computed by hand.
w = 1. / (sigma_pred ** 2)
ypred = res_wls.predict(x)
residual = y - ypred
RSS = sum(w * (residual)**2)
weighted_mean = sum(w * y) / sum(w)
TSS = sum(w * (y - weighted_mean)**2)
print(TSS, RSS)
print(1 - RSS / TSS)

plt.plot(1 / sigma_pred**2, '.')
plt.show()
    coeff[i] = results.params[1]
    std_err[i] = results.bse[1]
    p_val[i] = results.pvalues[1]
    R2[i] = results.rsquared
toc = time.perf_counter()
print(f"Execution time of OLS: {toc-tic}s")

# Full stats (with diagnostics) WLS model
tic = time.perf_counter()
for i in range(lm_data.shape[0]):
    X = SM.add_constant(lm_data.columns)
    y = lm_data.iloc[i]
    w = 1 / (std_data.iloc[i] ** 2)
    model = SM.WLS(y, X, weights=w)
    results = model.fit()
    coeff[i] = results.params[1]
    std_err[i] = results.bse[1]
    p_val[i] = results.pvalues[1]
    R2[i] = results.rsquared
toc = time.perf_counter()
print(f"Execution time of WLS: {toc-tic}s")

# Full stats (with diagnostics) Robust OLS model
tic = time.perf_counter()
for i in range(lm_data.shape[0]):
    X = SM.add_constant(lm_data.columns)
    y = lm_data.iloc[i]
def predict_abc(interp, extrap, interp_index, extrap_index, weight,
                interp_weights, extrap_weights, cs, abc, verbose=True):
    # set up age range (Python 3: range objects must be wrapped in list()
    # before concatenation; the original used Python 2 range addition)
    ages = list(range(22, 30)) + list(range(31, 68))

    # set up dictionaries to store output
    params_interp = {}
    params_extrap = {}
    error_mat = {}

    # set up matrices for interpolation/extrapolation parameters, and errors
    for sex in ['pooled', 'male', 'female']:
        params_interp[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.interp.predictors) + 3)]
             for k in range(22, 30)],
            index=range(22, 30))
        params_interp[sex].index.names = ['age']
        params_interp[sex].columns = (['Intercept'] + cols.interp.predictors
                                      + ['y'] + ['rmse'])

        params_extrap[sex] = pd.DataFrame(
            [[np.nan for j in range(len(cols.extrap.predictors) + 3)]
             for k in range(31, 68)],
            index=range(31, 68))
        params_extrap[sex].index.names = ['age']
        params_extrap[sex].columns = (['Intercept'] + cols.extrap.predictors
                                      + ['y'] + ['rmse'])

        error_mat[sex] = pd.DataFrame([])

    # obtain parameters for every age
    for age in ages:
        if age in range(22, 30):
            aux = deepcopy(interp.loc[interp_index, :])
            if age == 22:
                interp_weights.reset_index(inplace=True)
                del interp_weights['draw']
                interp_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    interp_weights.loc[pd.IndexSlice[interp_index], :])
            age_x = age - 1
            predictors = cols.interp.predictors + ['inc_labor{}'.format(age_x)]
        elif age in range(31, 68):
            aux = deepcopy(extrap.loc[extrap_index, :])
            if age == 31:
                age_x = 29
            else:
                age_x = age - 1
            predictors = cols.extrap.predictors + ['inc_labor{}'.format(age_x)]
            if age == 31:
                extrap_index_weight = [x[1] for x in extrap_index]
                extrap_weights.reset_index(inplace=True)
                del extrap_weights['draw']
                extrap_weights.set_index('id', inplace=True, drop=True)
                weight_array = deepcopy(
                    extrap_weights.loc[extrap_index_weight, :])

        c = 'inc_labor{}'.format(age)

        # drop black
        aux = aux.loc[aux.black == 1]

        # obtain parameters for different sexes
        for sex in ['pooled', 'male', 'female']:
            if sex == 'pooled':
                data = aux
                abcd = abc
                abcd_count = abcd.shape[0]
            elif sex == 'male':
                data = aux.loc[aux.male == 1]
                abcd = abc.loc[abc.male == 1]
                abcd_count = abcd.loc[abcd['male'] == 1]['male'].count()
            else:
                data = aux.loc[aux.male == 0]
                abcd = abc.loc[abc.male == 0]
                abcd_count = abcd.loc[abcd['male'] == 0]['male'].count()

            if weight == 'treat':
                abcd = abcd.loc[abcd.R == 1]
            elif weight == 'control':
                abcd = abcd.loc[abcd.R == 0]

            # reset auxiliary index (because dmatrices won't use id)
            data.reset_index('id', drop=True, inplace=True)
            data.index = [j for j in range(data.shape[0])]
            weight_array.reset_index('id', drop=True, inplace=True)
            weight_array.index = [j for j in range(weight_array.shape[0])]

            # create design matrix for regressions
            fmla = '{} ~ {}'.format(c, ' + '.join(predictors))
            endog, exog = dmatrices(fmla, data, return_type='dataframe')
            exog = sm.add_constant(exog)
            exog_index = [x for x in exog.index]
            weight_forWLS = weight_array.loc[pd.IndexSlice[exog_index]]
            weight_type = 'wtabc_allids_c' + cs + '_' + weight
            weight_forWLS = weight_forWLS.loc[:, weight_type]
            weight_forWLS.dropna(axis=0, inplace=True)
            exog = exog.loc[weight_forWLS.index, :]
            endog = endog.loc[weight_forWLS.index, :]

            # estimate coefficients
            fail_switch = 0
            try:
                model = sm.WLS(endog, exog, weights=weight_forWLS)
                fit = model.fit()
                params = fit.params
                resid = fit.resid
            except Exception:
                fail_switch = 1
                if age in range(22, 30):
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.interp.predictors + ['y'])
                else:
                    params = pd.Series(
                        [np.nan for j in range(1 + len(predictors))],
                        index=['Intercept'] + cols.extrap.predictors + ['y'])
                resid = pd.Series([np.nan for j in range(endog.shape[0])])

            # calculate RMSE
            rmse = resid * resid
            rmse = pd.Series(sqrt(rmse.mean(axis=0)), index=['rmse'])
            params = pd.concat([params, rmse], axis=0)
            params.rename({'inc_labor{}'.format(age_x): 'y'}, inplace=True)
            if age in range(22, 30):
                params_interp[sex].loc[age, :] = params
            else:
                params_extrap[sex].loc[age, :] = params

            # resample the errors, and merge in with ABC IDs
            if fail_switch == 0:
                ehat = pd.DataFrame(np.random.choice(resid, size=abcd_count))
            else:
                ehat = pd.DataFrame([np.nan for j in range(abcd_count)])
            abcd_ix = abcd.reset_index(level=0)
            ehat = pd.concat([abcd_ix.loc[:, 'id'], ehat], axis=1)
            ehat.columns = ['id', age]
            ehat.columns.name = 'age'
            ehat.set_index('id', inplace=True)
            error_mat[sex] = pd.concat([error_mat[sex], ehat], axis=1)

            if verbose:
                print('Successful predictions, age {}, n={}'.format(
                    age, exog.shape[0]))

    # add treatment indicator back into error matrix, add column names
    treat = abc.loc[:, 'R']
    for sex in ['pooled', 'male', 'female']:
        error_mat[sex] = pd.concat([error_mat[sex], treat], axis=1,
                                   join='inner')
        params_interp[sex].columns.name = 'variable'
        params_extrap[sex].columns.name = 'variable'

    male_interp_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.interp.predictors]).any(axis=1)].index
    female_interp_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.interp.predictors]).any(axis=1)].index
    male_extrap_nix = abcd.loc[abcd.male == 1].loc[pd.isnull(
        abcd.loc[abcd.male == 1, cols.extrap.predictors]).any(axis=1)].index
    female_extrap_nix = abcd.loc[abcd.male == 0].loc[pd.isnull(
        abcd.loc[abcd.male == 0, cols.extrap.predictors]).any(axis=1)].index

    # remove errors for ABC individuals for whom we do not predict earnings
    # interp (we only check age 22 since the predictability of each year is
    # based on the same set of outcomes)
    error_mat['male'].loc[male_interp_nix, slice(0, 8)] = np.nan
    error_mat['female'].loc[female_interp_nix, slice(0, 8)] = np.nan
    error_mat['pooled'].loc[female_interp_nix.append(male_interp_nix),
                            slice(0, 8)] = np.nan
    # extrap (we only check age 31 since the predictability of each year is
    # based on the same set of outcomes)
    error_mat['male'].loc[male_extrap_nix, slice(9, 45)] = np.nan
    error_mat['female'].loc[female_extrap_nix, slice(9, 45)] = np.nan
    error_mat['pooled'].loc[female_extrap_nix.append(male_extrap_nix),
                            slice(9, 45)] = np.nan

    # predict earnings
    projection_interp = {}
    projection_extrap = {}
    abc.loc[:, 'Intercept'] = [1 for j in range(abc.shape[0])]
    for sex in ['pooled', 'male', 'female']:
        if sex == 'pooled':
            abcd = abc
        elif sex == 'male':
            abcd = abc.loc[abc.male == 1]
        else:
            abcd = abc.loc[abc.male == 0]
        abcd_interp = abcd.loc[:, ['Intercept'] + cols.interp.predictors + ['y']]
        abcd_extrap = abcd.loc[:, ['Intercept'] + cols.extrap.predictors + ['y']]
        projection_interp[sex] = pd.DataFrame([])
        projection_extrap[sex] = pd.DataFrame([])
        for age in ages:
            if age in range(22, 30):
                if age == 22:
                    abcd_interp['y'] = 0
                params_interp_trans = pd.DataFrame(
                    params_interp[sex].loc[age].drop('rmse').T)
                interp_dot = abcd_interp.dot(
                    params_interp_trans) + error_mat[sex][[age]]
                abcd_interp['y'] = interp_dot
                projection_interp[sex] = pd.concat(
                    [projection_interp[sex], interp_dot], axis=1)
            else:
                if age == 31:
                    params_extrap[sex].loc[31]['y'] = 0
                    abcd_extrap['y'] = interp_dot
                    abcd_extrap['y'].fillna(value=0, inplace=True)
                params_extrap_trans = pd.DataFrame(
                    params_extrap[sex].loc[age].drop('rmse').T)
                extrap_dot = abcd_extrap.dot(
                    params_extrap_trans) + error_mat[sex][[age]]
                abcd_extrap['y'] = extrap_dot
                projection_extrap[sex] = pd.concat(
                    [projection_extrap[sex], extrap_dot], axis=1)

    return (params_interp, params_extrap, error_mat, projection_interp,
            projection_extrap)