def get_coef(data, pretype, econamelist, quantile):
    # Fit a linear quantile regression and return the fitted relationship.
    # pretype is the response-variable name (a str); econamelist is a list
    # of regressor names. The formula is built by joining the regressors,
    # which removes the old five-variable limit of the if/elif chain.
    formula = '%s ~ %s' % (pretype, ' + '.join(econamelist))
    mod = smf.quantreg(formula, data)
    res = mod.fit(q=quantile)
    # print(res.summary())
    # Return the quantile, the intercept, each regressor's coefficient,
    # and each coefficient's lower/upper confidence bounds.
    return (quantile, res.params['Intercept'], res.params[econamelist],
            res.conf_int().loc[econamelist])
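# Usage sketch for get_coef (illustrative, not from the original source):
# assumes numpy/pandas/statsmodels are importable and uses synthetic,
# made-up column names.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
demo = pd.DataFrame({'gdp': rng.normal(0, 1, 200),
                     'cpi': rng.normal(0, 1, 200)})
demo['price'] = 2.0 + 0.5 * demo['gdp'] - 0.2 * demo['cpi'] + rng.normal(0, 1, 200)

q, intercept, coefs, bounds = get_coef(demo, 'price', ['gdp', 'cpi'], 0.5)
print(q, intercept)
print(coefs)   # pandas Series indexed by regressor name
print(bounds)  # DataFrame of lower/upper confidence bounds per regressor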
def __construct_from_points(self, points):
    import statsmodels.formula.api as smf

    # Fit quartic median (q=0.5) regressions of y and z on x over the
    # normalised points.
    normed_pnts = self.__get_normed_pnts(points)
    dat_pnts = pd.DataFrame(normed_pnts, columns=('x', 'y', 'z'))
    mod_y = smf.quantreg('y ~ x + I(x**2.0) + I(x**3.0) + I(x**4.0)', dat_pnts)
    mod_z = smf.quantreg('z ~ x + I(x**2.0) + I(x**3.0) + I(x**4.0)', dat_pnts)
    self.res_y = mod_y.fit(q=0.5)
    self.res_z = mod_z.fit(q=0.5)
def fit(self, y_name, features_name):
    if self.temp is None:
        return None
    # A rank-1 design means mileage and weight_mileage are collinear.
    if np.linalg.matrix_rank(self.temp[['mileage', 'weight_mileage']]) == 1:
        return None
    try:
        model = smf.quantreg(y_name + '~' + features_name, self.temp)
        res = model.fit(q=.5, max_iter=10000)
        self.res = res
    except Exception:
        self.res = None
        return None
    # Reject fits with non-positive coefficients.
    if self.res.params['mileage'] <= 0 or \
            self.res.params['weight_mileage'] <= 0 or \
            self.res.params['Intercept'] <= 0:
        dirs = 'output/QR/' + str(self.start_cityid) + '/no/'
        if not os.path.exists(dirs):
            os.makedirs(dirs)
        self.show_result(dirs)
        return None
    dirs = 'output/QR/' + str(self.start_cityid) + '/yes/'
    if not os.path.exists(dirs):
        os.makedirs(dirs)
    self.show_result(dirs)
    return self.res
def calcuTD(x, O, Y, SeqDepth, Grid, Tau):
    TauGroup, D = Grid[x]
    D = int(D)
    try:
        polyX, centre, scale, alpha, beta = poly.poly(O, D)
    except Exception:
        polyX = None
    if polyX is None:
        return -50
    colVars = ['var_' + str(j) for j in range(D)]
    polydata = pd.concat([pd.DataFrame({'Y': Y}),
                          pd.DataFrame(polyX, columns=colVars)], axis=1)
    try:
        rqfit = smf.quantreg('Y~' + '+'.join(colVars), polydata).fit(q=TauGroup)
        revX = poly.predict_poly(polyX, centre, scale, alpha, beta, SeqDepth)
        revX = pd.DataFrame(revX, columns=colVars)
        pdvalsrq = rqfit.predict(revX)
        if min(pdvalsrq) > 0:
            S = QuantReg(pdvalsrq.values,
                         tools.add_constant(SeqDepth)).fit(q=Tau).params[1]
        else:
            S = -50
    except Exception:
        S = -50
    return S
def quantile_regression(categorical_mrna, categorical_protein):
    data = pd.DataFrame({'mrna': categorical_mrna,
                         'protein': categorical_protein})
    mod = smf.quantreg('mrna ~ protein', data)
    res = mod.fit(q=.5)
    return res.prsquared
def __qfit_l(self):
    """ Fit a quantile regression at every quantile and horizon """
    # Prepare a container for each individual fit (convenient later)
    QFit = namedtuple('QFit', ['depvar', 'horizon', 'tau', 'qfit'])
    qfit_l = list()  # Container
    for h, depvar in zip(self.horizon_l, self.depvar_l):
        reg_f = self.regform_d[depvar]  # Formula
        for tau in self.quantile_l:  # For every tau
            # Estimate the quantile regression
            p = {'q': tau, 'maxiter': 1000, 'p_tol': 1e-05}
            qfit = smf.quantreg(formula=reg_f, data=self.data).fit(**p)
            # Package it into a container
            nt = {'depvar': depvar, 'horizon': h, 'tau': tau, 'qfit': qfit}
            qfit_l.append(QFit(**nt))
    print(f'{len(qfit_l)} quantile regressions estimated '
          f'for {len(self.horizon_l)} horizons '
          f'and {len(self.quantile_l)} quantiles')
    return qfit_l
def fitMinSpline(self, Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
    '''Fit/interpolate a low-quantile spline through the data.'''
    # Use patsy to build a natural cubic spline basis (7 df)
    X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
    # redefine dataframe with columns X1-X7
    modDat = pd.DataFrame(X, index=Yvar.index)
    modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
    # integer half-window so the slice indices stay integral on Python 3
    half = self._smoothingWindow // 2
    modDatTrunc = modDat.iloc[half:-half].copy()
    window = np.ones(self._smoothingWindow) / float(self._smoothingWindow)
    modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
    mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
    res = mod.fit(q=0.01)
    preds = pd.Series(res.predict(modDat), index=Xvar.index)
    if plot:
        plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
        print(plotDF.columns)
        plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
        p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() +\
            geom_line(aes(y='fitted'), color='red') +\
            ylim(0, 5) +\
            xlab('') + ylab('Sensor (V)')
        print(p)
    # return regression predictions
    return preds
def fitMinSpline(Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
    '''
    Returns a minimal interpolation spline.
    Inputs:
        Yvar : dependent variable to be fit
        Xvar : independent variable to be fit
        smoothingWindow : the smoothing time average
        plot : boolean, whether to plot; default is not to plot
        plotVar : plot a specific variable; default None
    '''
    X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
    modDat = pd.DataFrame(X, index=Yvar.index)
    modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
    half = smoothingWindow // 2  # integer division so slicing works on Python 3
    modDatTrunc = modDat.iloc[half:-half].copy()
    window = np.ones(smoothingWindow) / float(smoothingWindow)
    modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
    mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
    res = mod.fit(q=0.01)
    preds = pd.Series(res.predict(modDat), index=Xvar.index)
    if plot:
        plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
        print(plotDF.columns)
        plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
        p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() +\
            geom_line(aes(y='fitted'), color='red') +\
            ylim(0, 5) +\
            xlab('') + ylab('Sensor (V)')
        print(p)
    return preds
def QR_beta(f, g, df=merge1):
    reg_p = []
    Y = df[f].astype(float)
    for i in g:
        X = df[i].astype(float)
        # patsy resolves Y and X from the enclosing scope, since they are
        # local variables rather than columns of df
        tryme2 = smf.quantreg('Y~X', data=df).fit(maxiter=100000, q=0.5)
        reg_p.append(str(round(tryme2.params[1], 8)) +
                     " (" + str(round(tryme2.pvalues[1], 4)) + " )")
    return reg_p
def QR(f, g, df=merge1):
    reg_p = []
    Y = df[f].astype(float)
    for i in g:
        X = df[i].astype(float)
        tryme2 = smf.quantreg('Y~X', data=df).fit(maxiter=100000, q=0.5)
        reg_p.append(tryme2.pvalues[1])
    return reg_p
def quant_summary(self, q=0.5):
    '''
    Func: quantile-regression summary

    q --> quantile to fit
    '''
    mod = smf.quantreg(formula=self.formula, data=self.df)
    res = mod.fit(q=q)
    return res.summary()
def fit_quanmod(df: pd.DataFrame, q: Number) -> List[float]:
    """
    - quantile regression of STR on NDVI
    - returns array of floats [Intercept, NDVI]
    """
    mod = smf.quantreg('STR ~ NDVI', df)
    res = mod.fit(q=q)
    return [res.params['Intercept'], res.params['NDVI']]
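# Usage sketch for fit_quanmod (synthetic data; the 0.95 quantile and the
# STR/NDVI relationship below are illustrative assumptions):
rng = np.random.default_rng(1)
ndvi = rng.uniform(0.1, 0.9, 500)
demo = pd.DataFrame({'NDVI': ndvi,
                     'STR': 2.0 + 3.0 * ndvi + rng.normal(0, 0.3, 500)})
intercept, slope = fit_quanmod(demo, 0.95)
print(intercept, slope)  # should be close to 2.0 and 3.0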
def quant_pred(q, data, **params):
    mod = smf.quantreg(params['formula'], data)
    reg_res = mod.fit(q=q, **params['method_args'])
    out = pd.DataFrame({
        'x': [data['x'].min(), data['x'].max()],
        'quantile': q,
        'group': '{}-{}'.format(data['group'].iloc[0], q)})
    out['y'] = reg_res.predict(out)
    return out
def quantile_q(d, p, c, s):
    # Newsvendor order quantity: the critical-ratio quantile of demand.
    underage_cost = p - c
    overage_cost = c - s
    ratio = underage_cost / (underage_cost + overage_cost)
    tmp = pd.DataFrame({'y': d})
    # 'y ~ 1' fits an intercept-only model, whose fitted intercept is the
    # ratio-quantile of y. Adding a separate constant column would
    # duplicate the implicit intercept and make the design singular.
    mod = smf.quantreg('y ~ 1', tmp)
    res = mod.fit(q=ratio)
    return res.params['Intercept']
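# Context for quantile_q: with price p, unit cost c and salvage value s,
# the newsvendor critical ratio is (p - c) / ((p - c) + (c - s)), and the
# optimal order quantity is that quantile of demand, which the
# intercept-only quantile regression above recovers. A worked sketch on
# synthetic demand (numbers are illustrative):
rng = np.random.default_rng(2)
demand = pd.Series(rng.normal(100, 15, 1000))

# p=10, c=6, s=2  ->  ratio = 4 / (4 + 4) = 0.5, i.e. the median
order_qty = quantile_q(demand, p=10, c=6, s=2)
print(order_qty)                 # ~ the median demand
print(np.quantile(demand, 0.5))  # direct empirical check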
def quantile_reg(df, quantile):
    xname, yname = df.columns[0], df.columns[1]
    mod = smf.quantreg(yname + '~' + xname, df)
    res = mod.fit(q=quantile)
    print(res.summary())
    qre_df = pd.DataFrame(
        data=[[quantile, res.params['Intercept'], res.params[xname]] +
              res.conf_int().loc[xname].tolist()],
        index=['quantile_reg'],
        columns=['qt', 'intercept', 'x_coef',
                 'cf_lower_bound', 'cf_upper_bound'])
    return qre_df
def quantile_regression_q(d, p, c, s):
    underage_cost = p - c
    overage_cost = c - s
    ratio = underage_cost / (underage_cost + overage_cost)
    tmp = d.copy()
    # Intercept-only fit ('y ~ 1'); the fitted intercept is the
    # ratio-quantile of y. The original built a 2-D np.ones(tmp.shape)
    # column and duplicated the implicit intercept.
    mod = smf.quantreg('y ~ 1', tmp)
    res = mod.fit(q=ratio)
    print(res.summary())
    return res
def __qfit_dict(self):
    """ Estimate the quantile fit for every quantile """
    qfit_dict = dict()
    for tau in self.quantile_list:
        reg_f = self.reg_formula
        qfit = smf.quantreg(formula=reg_f, data=self.data).fit(
            q=tau, maxiter=2000, p_tol=1e-05)
        qfit_dict[tau] = qfit
    return qfit_dict
def ols_annotations(x, y, data=None, ax=None, color='black', font_size=8,
                    textxy=[0.05, 0.95], textva='top', method='quantreg',
                    stats=['N', 'slope', 'slope_p']):
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    if data is None:
        data = pandas.DataFrame({'X': x, 'Y': y})
        x = 'X'
        y = 'Y'
    data = data.sort_values(x)
    if method == 'ols':
        X = sm.add_constant(data.loc[:, x])
        Y = data.loc[:, y]
        mod = sm.OLS(Y, X)
        res = mod.fit()
    elif method == 'quantreg':
        mod = smf.quantreg(y + ' ~ ' + x, data)
        res = mod.fit(q=0.5)
    N = data.shape[0]
    slope = res.params[x]
    slope_p = res.pvalues[x]
    # R-squared statistics are only defined for the OLS fit
    rsquared = res.rsquared_adj if method == 'ols' else float('nan')
    rsquared_p = res.f_pvalue if method == 'ols' else float('nan')
    text = ''
    for stat in stats:
        if stat == 'N':
            text += 'N = {:,}\n'.format(N)
        if stat == 'slope':
            text += 'slope = {}\n'.format('%.2f' % Decimal(slope))
        if stat == 'slope_p':
            text += 'P = {}\n'.format('%.2E' % Decimal(slope_p))
        if stat == 'rsquared':
            text += 'R2 = {}\n'.format('%.2f' % Decimal(rsquared))
        if stat == 'rsquared_p':
            text += 'P = {}\n'.format('%.2E' % Decimal(rsquared_p))
    ax.text(textxy[0], textxy[1], text, transform=ax.transAxes, va=textva,
            color=color, fontsize=font_size)
    # positional indexing into the (sorted) fitted values
    ax.plot(data[x].values[[0, N - 1]],
            np.asarray(res.predict())[[0, N - 1]], color=color)
def fit(self, X, y=None):
    """
    :param X: covariate dataframe
    :param y: currently unused
    """
    # Build formula for prediction
    formula = 'num_det_target ~ np.log(num_det+1)'
    if self.covariates:
        formula += ' + ' + ' + '.join(self.covariates)
    self.fit_result = smf.quantreg(formula, data=X).fit(q=self.quantile)
    return self.fit_result
def time_varying_delta_covar_regression(macroData, instData, quantile,
                                        instName, writeToFile):
    data = macroData.merge(instData, on='5_Day_Dates')
    mod = smf.quantreg(
        str(instName) + ' ~ Change_3_M_TR + Change_TR_Slope + TED + '
        'Baa_3_M_TR + SP_500 + RE_excess_FS + SP_500_Vol', data)
    varqres = mod.fit(q=quantile)
    var50res = mod.fit(q=0.5)
    mod = smf.quantreg(
        'Fin_Sec_Loss ~ Change_3_M_TR + Change_TR_Slope + TED + '
        'Baa_3_M_TR + SP_500 + RE_excess_FS + SP_500_Vol + ' + str(instName),
        data)
    covarqres = mod.fit(q=quantile)
    if writeToFile:
        with open(str(instName) + ' Time Varying Delta CoVaR Parameters.txt',
                  'w') as f:
            f.write(str(quantile) + ' Quantile VaR for institution\n')
            f.write(str(varqres.summary()) + '\n\n')
            f.write('0.5 Quantile VaR for institution\n')
            f.write(str(var50res.summary()) + '\n\n')
            f.write(str(quantile) +
                    ' Quantile CoVaR for system given institution\n')
            f.write(str(covarqres.summary()))
    return {
        'VaRqParams': varqres.params,
        'VaR50Params': var50res.params,
        'CoVaRqParams': covarqres.params
    }
def getcoef(data, q=0.5):
    '''
    Get the slope coefficient of the regression
    '''
    data.columns = ['income', 'depend']
    # For some reason the regression was not run correctly on the DataFrame
    # itself: the independent-variable axis was being read as several axes
    # rather than one, so the columns are passed as plain lists instead.
    table = {'income': data['income'].values.tolist(),
             'depend': data['depend'].values.tolist()}
    mod = smf.quantreg('depend ~ income', table)
    res = mod.fit(q=q)
    return res.params['income']
def quantile(self, q=0.5):
    '''
    Func: do not use!!!
    '''
    mod = smf.quantreg(formula=self.formula, data=self.df)
    res = mod.fit(q=q)
    df_lu = res.conf_int()
    df_lu.columns = ['lb', 'ub']
    result_dict = df_lu.to_dict()
    result_dict['params'] = dict(res.params)
    result_dict['pvalue'] = dict(res.pvalues)
    result_dict['q'] = str(res.q)[:4]
    result_dict['Pseudo R-squared'] = res.prsquared
    return pd.DataFrame(result_dict)
def fit_model(self):
    """
    Fit the linear quantile regression model using the train dataset

    Returns
    -------
    output: statsmodels.regression.linear_model.RegressionResultsWrapper object
        the fitted linear quantile regression model
    """
    x_columns = list(self.x.columns.values)
    equation = self.y.name + '~' + '+'.join(x_columns)
    df = pd.concat([self.y, self.x], axis=1)
    self.linear_quantile = smf.quantreg(equation, data=df).fit(q=self.qt)
    return self.linear_quantile
def fit(self, X, y):
    # Build the design matrix via a tensor basis expansion of natural
    # cubic spline bases, one basis per column of X
    data = {'x{}'.format(i + 1): x for i, x in enumerate(X.T)}
    design_matrix = dmatrix(
        "te(" + ",".join(['cr(x{}, df={})'.format(i + 1, self.df)
                          for i in range(X.shape[1])]) +
        ", constraints='center')", data)
    # Save the design information for future predictions
    self.design_info = design_matrix.design_info
    # Fit the model using the basis
    mod = smf.quantreg('y ~ x - 1', {'y': y, 'x': design_matrix})
    if np.isscalar(self.quantiles):
        self.model = mod.fit(q=self.quantiles)
    else:
        self.model = [mod.fit(q=q) for q in self.quantiles]
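# The fit method above saves design_info, but the matching predict step is
# not shown. This is a hedged sketch of what it could look like, assuming
# patsy's build_design_matrices and the same {'x': matrix} exog convention
# used in fit (not the original author's code):
from patsy import build_design_matrices

def predict(self, X):
    # Rebuild the spline basis for new data from the saved design
    # information, then evaluate the fitted quantile model(s) on it.
    data = {'x{}'.format(i + 1): x for i, x in enumerate(X.T)}
    (design_matrix,) = build_design_matrices([self.design_info], data)
    if np.isscalar(self.quantiles):
        return self.model.predict({'x': design_matrix})
    return [m.predict({'x': design_matrix}) for m in self.model]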
def table_rq_res(formula, taus, data, alpha, R, n, sigma, jacobian):
    m = len(taus)
    tab = pd.DataFrame([], index=[0])
    setab = pd.DataFrame([], index=[0])
    for i in range(m):
        fit_model = smf.quantreg(formula, data)
        fit = fit_model.fit(q=taus[i])
        coeff = np.dot(R.T, np.array(fit.params))
        tab[str(i)] = coeff
        sigmatau = sigma(data, n, taus[i], fit.resid)
        jacobtau = jacobian(data, n, taus[i], fit.resid, alpha)
        solved_jacobtau = np.linalg.inv(jacobtau)
        V = np.dot(np.dot(solved_jacobtau, sigmatau), solved_jacobtau) / n
        secoeff = float(np.dot(np.dot(R.T, V), R)) ** .5
        setab[str(i)] = secoeff
    tab = tab.transpose()
    setab = setab.transpose()
    return (tab, setab)
def time_constant_delta_covar_regression(macroData, instData, quantile,
                                         instName, writeToFile):
    data = macroData.merge(instData, on='5_Day_Dates')
    mod = smf.quantreg('Fin_Sec_Loss ~ ' + str(instName), data)
    res = mod.fit(q=quantile)
    var50 = data[str(instName)].quantile(0.5)
    varq = data[str(instName)].quantile(quantile)
    if writeToFile:
        with open(str(instName) + ' Constant Time Delta CoVaR Summary.txt',
                  'w') as f:
            f.write(str(quantile) + ' Quantile\n')
            f.write(str(res.summary()) + '\n')
            f.write('Delta Covar ' + str(quantile) + " : " +
                    str(res.params[1] * (varq - var50)))
def quantCV(q, alpha, L1_wt, data, folds):
    import statsmodels.formula.api as smf
    # sklearn.cross_validation was removed; use sklearn.model_selection
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error as MSE
    import warnings
    warnings.filterwarnings("ignore")

    ## KFold over the unique index values (unshuffled, so a random_state
    ## would have no effect)
    ids = np.unique(data.index)
    kf = KFold(n_splits=folds)
    score = np.zeros(folds)
    ct = 0

    ## Train Model
    for train_index, test_index in kf.split(ids):
        data_train = data[np.array(
            pd.DataFrame(data.index).isin(train_index).values.tolist())]
        data_test = data[np.array(
            pd.DataFrame(data.index).isin(test_index).values.tolist())]
        mod = smf.quantreg(
            'Delay_100_Mile ~ TC + ATP + IP + TC_TC + TC_ATP + TC_IP + '
            'ATP_ATP + ATP_IP + IP_IP', data_train)
        res = mod.fit_regularized(q=q, alpha=alpha, L1_wt=L1_wt,
                                  maxiter=3000, random_state=0,
                                  cnvrg_tol=1e-08)
        ## Predict values
        features_predict = data_test.groupby(by=['TC', 'ATP', 'IP'])[
            data.drop(['Delay_100_Mile'], axis=1).columns].mean()
        params = res.params
        delay_predicted = params[0] + np.dot(features_predict, params[1:])
        ## Corresponding value of the same percentile
        target_per = data_test.groupby(
            by=['TC', 'ATP', 'IP'])['Delay_100_Mile'].quantile(q)
        score[ct] = MSE(target_per, delay_predicted) ** .5
        ct += 1
    return np.mean(score)
def quantile_fit(xi, yi, q=0.5):
    """Perform quantile regression.

    See for instance:
    https://www.statsmodels.org/dev/examples/notebooks/generated/quantile_regression.html
    (valid on 2091-04-16)

    Parameters:
        xi, yi    np.array, x and y values
    Returns:
        slope     regression slope estimate
        intercept regression intercept estimate
    """
    data = {'xi': xi, 'yi': yi}
    df = pd.DataFrame.from_dict(data=data)
    mod = smf.quantreg('yi ~ xi', df)
    res = mod.fit(q=q)
    # return slope, intercept, covariance_matrix
    return res.params['xi'], res.params['Intercept'], res.cov_params().values
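# Usage sketch for quantile_fit on synthetic data (illustrative):
rng = np.random.default_rng(3)
xi = np.linspace(0.0, 10.0, 200)
yi = 2.0 + 1.5 * xi + rng.normal(0, 1, 200)

slope, intercept, cov = quantile_fit(xi, yi, q=0.5)
print(slope, intercept)  # should be close to 1.5 and 2.0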
def fit(self, train, quantiles=[0.025, 0.975], startx=None, endx=None):
    """
    Uses the statsmodels implementation of quantile regression.
    Optionally fits the exponential decay only beyond a certain leadtime.
    Works on a dataframe with reset index (i.e. leadtime as a column).
    """
    train = train.reset_index('leadtime')
    if startx is not None:
        train = train.loc[train[self.predcol] >= startx, :]
    if endx is not None:
        train = train.loc[train[self.predcol] <= endx, :]
    mod = smf.quantreg(self.obscol + ' ~ np.log(' + self.predcol + ')', train)
    self.fits = pd.DataFrame(np.zeros((len(quantiles), 2)),
                             index=quantiles, columns=self.model_coefs)
    for q in quantiles:
        res = mod.fit(q=q)
        self.fits.loc[q, :] = res.params.values
def subsamplek(formula, V, tau, coeffs, data, n, b, B, R):
    k = np.zeros(B)
    RVR = (float(np.dot(np.dot(R.T, V), R) / b)) ** (-1 / 2)
    probs = np.array(data['perwt']) / np.sum(np.array(data['perwt']))
    for s in range(B):
        sing = 0
        while sing == 0:
            # Resample until the subsample design matrix is non-singular
            sample = np.random.choice(np.arange(0, n), size=int(b),
                                      replace=True, p=probs)
            sdata = data.iloc[sample, :]
            x = sdata[["educ", "exper", "exper2", "black", "perwt"]].to_numpy()
            sing = np.linalg.det(np.dot(x.T, x))
        # Didn't use weights here
        sqr_model = smf.quantreg(formula, sdata)
        sqr = sqr_model.fit(q=tau)
        k[s] = np.abs(np.dot(np.dot(RVR, R.T),
                             coeffs - np.array(sqr.params)))
    return k
def CoVar():
    df = pd.read_csv("Data/Index_data.csv", sep=";")
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.dropna().ffill().set_index("Date")
    data = np.log(df).diff().dropna()
    data = data.rename(columns={"S&P": "SP"})
    data = data.replace(np.inf, 0).replace(-np.inf, 0)
    mCovar = np.zeros((len(data.columns), len(data.columns)))
    q = 0.05
    for nr1, j in enumerate(data.columns):
        for nr2, k in enumerate(data.columns):
            if nr1 != nr2:
                mod = smf.quantreg(str(j) + "~" + str(k), data)
                res = mod.fit(q=q)
                # np.percentile takes a percentage in [0, 100], hence q * 100
                var5 = np.percentile(data[k], q * 100)
                var50 = mod.fit(q=0.5).params[0]
                covar = res.params[0] + res.params[1] * var5
                print(covar)
                print(res.summary())
                dcovar = res.params[1] * (var5 - var50)
                mCovar[nr1, nr2] = round(dcovar, 3)
                icepts = []
                for i in np.arange(0.01, 1, 0.01):
                    res = mod.fit(q=i)
                    icepts.append(res.params[1])
                plt.plot(np.arange(0.01, 1, 0.01), icepts)
                print(j, k)
                plt.show()
            else:
                mCovar[nr1, nr2] = np.nan
    print(pd.DataFrame(mCovar, columns=data.columns, index=data.columns))
def _set_quantiles(self, data):
    # Compute quantiles of the transformed power conditional on the
    # transformed power prediction, for a specific location and lead time.
    # smf.quantreg generates warnings - see documentation for details -
    # so warnings are switched off just for this section.
    warnings.filterwarnings("ignore")
    # Perform the actual quantile regressions and store the coefficients
    prob = np.concatenate([[0.001], np.arange(0.05, 0.951, 0.05), [0.999]])
    self.betas = expando()
    for location in data.metadata.id_nodes:
        print(location)
        setattr(self.betas, location, expando())
        for ileadT, leadT in enumerate(data.metadata.fore_leadT, start=1):
            clim_concurr_loc_leadT = getattr(
                getattr(self.clim.concurr, location), leadT)
            betas_aux = pd.DataFrame(
                0, columns=['probabilities', 'intercept', 'coefficient'],
                index=range(len(prob)))
            betas_aux.loc[:, 'probabilities'] = prob
            # For solar cases, all quantiles are kept at zero
            if not np.all(clim_concurr_loc_leadT.observations == 0.):
                mod = smf.quantreg('observations ~ predictions',
                                   clim_concurr_loc_leadT)
                for iq, q in enumerate(prob):
                    res = mod.fit(q=q)
                    betas_aux.loc[iq, 'intercept'] = res.params['Intercept']
                    betas_aux.loc[iq, 'coefficient'] = res.params['predictions']
                del res
                del mod
            setattr(getattr(self.betas, location), leadT, betas_aux)
            del betas_aux
            gc.collect()
    # warnings back on
    warnings.filterwarnings("always")
# -*- coding: utf-8 -*-
from __future__ import print_function
import patsy
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from statsmodels.regression.quantile_regression import QuantReg
from matplotlib import rc

data = sm.datasets.engel.load_pandas().data
print(data.head())

mod = smf.quantreg('foodexp ~ income', data)
res = mod.fit(q=.5)
print(res.summary())

quantiles = np.arange(.05, .96, .1)


def fit_model(q):
    res = mod.fit(q=q)
    return [q, res.params['Intercept'], res.params['income']] + \
        res.conf_int().loc['income'].tolist()


models = [fit_model(x) for x in quantiles]
models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub'])

ols = smf.ols('foodexp ~ income', data).fit()
ols_ci = ols.conf_int().loc['income'].tolist()
ols = dict(a=ols.params['Intercept'],
           b=ols.params['income'],
           lb=ols_ci[0],
           ub=ols_ci[1])
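# A sketch of the usual continuation of the engel example above: plot the
# quantile-regression slopes with their confidence bands against the OLS
# slope (this mirrors the statsmodels documentation example the script
# follows; the styling choices are assumptions).
plt.plot(models.q, models.b, color='black', label='Quantile Reg.')
plt.plot(models.q, models.ub, linestyle='dotted', color='black')
plt.plot(models.q, models.lb, linestyle='dotted', color='black')
plt.axhline(y=ols['b'], color='red', label='OLS')
plt.axhline(y=ols['lb'], linestyle='dotted', color='red')
plt.axhline(y=ols['ub'], linestyle='dotted', color='red')
plt.ylabel(r'$\beta_{income}$')
plt.xlabel('Quantile')
plt.legend()
plt.show()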
def do_GLM(self, disp=1):
    """
    Generalised Linear Models

    This fits a GLM to the training data set and then applies it to the
    testing dataset. Different families and links can be included if need
    be, simply using the statsmodels API.
    """
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    import statsmodels.genmod as smg

    # Decide the family
    if self.family_name == "Gamma":
        if self.link == "log":
            self.family = sm.families.Gamma(link=smg.families.links.log)
        else:
            self.family = sm.families.Gamma()
    elif self.family_name == "Quantile":
        self.family = self.family_name
        self.link = "None"
    else:
        self.logger.info("You can only pick the family: Gamma and Quantile")

    # Decide the formula
    poly = lambda x, power: x**power
    if not self.formula:
        formula = "redshift ~ poly(PC1, 2) +"
        for i in range(self.num_components):
            if i < self.num_components - 1:
                formula += "PC{0}*".format(i + 1)
            else:
                formula += "PC{0}".format(i + 1)
        self.formula = formula

    self.logger.info("Family: {0} with \tformula: {1}\tlink: {2}".format(
        self.family_name, self.formula, self.link))
    self.logger.info("Fitting...")
    t1 = time.time()
    if self.family == "Quantile":
        # Quantile regression (median)
        model = smf.quantreg(formula=self.formula, data=self.data_frame_train)
        results = model.fit(q=.5)
        if disp:
            self.logger.info(results.summary())
    else:
        model = smf.glm(formula=self.formula, data=self.data_frame_train,
                        family=self.family)
        results = model.fit()
        self.logger.info(results.summary())
    t2 = time.time()
    self.dt = (t2 - t1)
    self.logger.info("Time taken: {0} seconds".format(self.dt))

    # Plot the model with our test data
    ## Prediction
    if self.cross_validate:
        self.logger.info("Cross validating")
        self.measured = np.array(self.data_frame_test["redshift"].values)
        self.predicted = results.predict(self.data_frame_test)
    else:
        self.measured = np.array(self.data_frame_train["redshift"].values)
        self.predicted = results.predict(self.data_frame_train)
    self.fitted = results.predict(self.data_frame_test)

    ## Outliers: (z_phot - z_spec)/(1 + z_spec)
    self.deltas = abs(self.predicted - self.measured)
    self.median = np.median(self.deltas)
    self.std = np.std(self.deltas)
    # First we will remove the outliers
    mega_out_indx = (self.deltas / (1 + self.measured)) > 0.15
    self.num_mega_outliers = mega_out_indx.sum() / (1.0 * len(self.deltas))
    self.average = np.mean(self.deltas[~mega_out_indx])
    self.rms = np.sqrt(np.mean(self.deltas**2))
    self.rms_outliers = np.sqrt(np.mean(self.deltas[~mega_out_indx]**2))
    self.std_outliers = np.std(self.deltas[~mega_out_indx])
    self.bias_outliers = np.mean(self.deltas[~mega_out_indx])
    self.logger.info("Median (dz):.............................................{0}".format(self.median))
    self.logger.info("Standard deviation (dz):.................................{0}".format(self.std))
    self.logger.info("RMS (dz).................................................{0}".format(self.rms))
    self.logger.info("............................................................")
    self.logger.info("Number of outliers removed...............................{0}".format(self.num_mega_outliers))
    self.logger.info("Average (removed outliers for > 0.15) (dz):..............{0}".format(self.average))
    self.logger.info("Standard deviation (removed outliers for > 0.15) (dz):...{0}".format(self.std_outliers))
    self.logger.info("RMS (removed outliers for z > 0.15)......................{0}".format(self.rms_outliers))
    self.logger.info("Bias (removed outliers for z > 0.15).....................{0}".format(self.bias_outliers))
    self.outliers = (self.predicted - self.measured) / (1.0 + self.measured)
    # R code:
    # Out<-100*length(PHAT0.Pred$fit[(abs(PHAT0.test.PCA$redshift-PHAT0.Pred$fit))>0.15*(1+PHAT0.test.PCA$redshift)])/length(PHAT0.Pred$fit)
    self.catastrophic_error = 100.0 * (
        abs(self.measured - self.predicted) >
        (0.15 * (1 + self.measured))).sum() / (1.0 * self.measured.shape[0])
    self.logger.info("Catastrophic Error:......................................{0}%".format(
        self.catastrophic_error))
plt.yticks(())
plt.xlabel("x")
plt.ylabel("y and predicted y")
plt.title("Linear regression on data with non-constant variance")

## Quantile regression for the median, the 0.5 quantile
import pandas as pd
data = pd.DataFrame(data=np.hstack([x_, y_]), columns=["x", "y"])
print(data.head())

import statsmodels.formula.api as smf
mod = smf.quantreg('y ~ x', data)
res = mod.fit(q=.5)
print(res.summary())

## Build the model for the other quantiles
quantiles = np.arange(0.1, 1, 0.1)
print(quantiles)
models = []
params = []
for qt in quantiles:
    print(qt)
    res = mod.fit(q=qt)
    models.append(res)
    params.append([qt, res.params['Intercept'], res.params['x']] +
                  res.conf_int().loc['x'].tolist())
print()
print(u'-' * 30)
print(u'Variable for close distance:', d_dist)

# NO CONTROL
ols_res = smf.ols('pct_rr ~ {:s}'.format(d_dist), data=df_compa).fit()

ls_res = []
ls_quantiles = [0.25, 0.5, 0.75]  # use 0.7501 if issue
for quantile in ls_quantiles:
    ls_res.append(smf.quantreg('pct_rr ~ {:s}'.format(d_dist),
                               data=df_compa[~df_compa[d_dist].isnull()])
                  .fit(quantile))

print(summary_col([ols_res] + ls_res,
                  stars=True,
                  float_format='%0.2f',
                  model_names=['OLS'] + [u'Q{:2.0f}'.format(quantile * 100)
                                         for quantile in ls_quantiles],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.2f}".format(x.rsquared)}))

# WITH CONTROLS
ols_res_ctrl = smf.ols('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev),
                       data=df_compa).fit()
ls_res_ctrl = [smf.quantreg('pct_rr ~ {:s} + {:s}'.format(d_dist, str_ev),
                            data=df_compa).fit(quantile)
               for quantile in ls_quantiles]
slope, intercept, r_value, p_value, std_err = stats.linregress(y, dur)
correlation[q, sea_ID, state, 2] = slope
correlation[q, sea_ID, state, 3] = p_value
correlation[q, sea_ID, state, 4] = intercept

df = pd.DataFrame(data={'dur': dur, 'y': y})
mod = smf.quantreg('dur ~ y', df)
for qu, qui in zip(quantiles, range(5)):
    try:
        res = mod.fit(q=qu)
        slope, p_value, interc = (res.params['y'], res.pvalues['y'],
                                  res.params['Intercept'])
        correlation_qu[q, sea_ID, state, qui, :] = [slope, p_value, interc]
    except Exception:
        pass

out_file = ('data/_TMean/91_7/gridded/91_7_TMean_duration_' +
            variable + '_cor.nc')
def quant_mincer(q, data):
    r = smf.quantreg('logwk ~ educ + black + exper + exper2 + wt - 1', data)
    result = r.fit(q=q)
    coef = result.params['educ']
    se = result.bse['educ']
    return [coef, se]
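# Sketch of driving quant_mincer over the deciles ('cps' is a hypothetical
# DataFrame with columns logwk, educ, black, exper, exper2 and wt):
deciles = np.arange(0.1, 1.0, 0.1)
for q in deciles:
    coef, se = quant_mincer(q, cps)
    print('q={:.1f}  return to education: {:.4f} (se {:.4f})'.format(q, coef, se))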
def elast_calc(self, key, Y, X, P, stub='', parts=100):
    """
    Add elasticities using log-log quantile regressions.
    The number of elasticities is that of hypothetical delimiters in
    parts, i.e. parts - 1.

    key - household key
    Y - dependent variable - resource consumption
    X - independent variable - income
    P - household population weights
    stub - suffix to name variables containing quantiles and elasticities
    parts - number of parts
    """
    dt = self.dataset
    quantstub = 'quant' + stub
    elaststub = 'elast' + stub
    print('\nElasticity calculator started - please be patient')
    # take the logs of Y and X
    dt['__lnY'] = np.log(dt[Y])
    dt['__lnX'] = np.log(dt[X])
    # log of 0 is -infinite, replace with missing (NaN)
    dt.loc[dt[Y] == 0, '__lnY'] = np.nan
    dt.loc[dt[X] == 0, '__lnX'] = np.nan
    # rescale and round weights to inform replication
    dt['__' + P] = dt[P] / dt[P].min()
    dt['__rdwgt'] = dt['__' + P].round()
    # define quantiles based on parts and mark
    dt.sort_values(Y, inplace=True)
    dt[quantstub] = (dt['__' + P].cumsum() / dt['__' + P].sum()
                     * parts).astype(int) / float(parts)
    dt.sort_values(key, inplace=True)
    # the quantile of the regression can't be 0 or 1: get the smallest
    # non-zero quantile and the largest quantile < 1
    quantiles = dt[quantstub].unique()
    quantiles.sort()
    quantiles = quantiles[1:-1]
    dt.loc[dt[quantstub] == 0, quantstub] = quantiles[0]
    dt.loc[dt[quantstub] == 1, quantstub] = quantiles[-1]
    # dataframe with replications: repeat each observation __rdwgt times
    # (vectorised equivalent of the original row-by-row append loop)
    print('Replicating observations, {} to {}...'.format(
        dt['__rdwgt'].count(), int(dt['__rdwgt'].sum())))
    reps = dt['__rdwgt'].astype(int).values
    estdt = pd.DataFrame({'lnY': np.repeat(dt['__lnY'].values, reps),
                          'lnX': np.repeat(dt['__lnX'].values, reps)})
    # calculate elasticities
    print('Fitting models...')
    model = smf.quantreg('lnY ~ lnX', estdt)
    elastseries = ()
    print('Quantile\telasticity\tse_elast\tintercept\tse_intercept')
    for quantile in quantiles:
        elast = model.fit(quantile)
        elastseries += (elast.params[1],)
        print('{}\t{:8.6f}\t{:8.6f}\t{:8.6f}\t{:8.6f}'.format(
            quantile, elast.params[1], elast.bse[1],
            elast.params[0], elast.bse[0]))
    elastdt = pd.DataFrame()
    elastdt[quantstub] = quantiles
    elastdt[elaststub] = elastseries
    # add elasticities and clean dataset
    todrop = [var for var in dt.keys() if '__' in var]
    self.dataset = pd.merge(dt, elastdt, on=quantstub)
    self.dataset.sort_values(key, inplace=True)
    self.dataset.reset_index(drop=True, inplace=True)
    self.dataset.drop(todrop, axis=1, inplace=True)
    self.seedvars += [quantstub, elaststub]
def trend_CI(x_var, y_var, n_boot=1000, ci=95, trendtype="linreg", q=0.5,
             frac=0.6, it=3, autocorr=None, CItype="bootstrap"):
    """Calculates a bootstrap confidence interval and significance level
    for a trend, either ignoring autocorrelation or accounting for it.

    Parameters
    ----------
    x_var : list
        independent variable
    y_var : list
        dependent variable, same length as x_var
    q : float, optional, only if trendtype=="quantreg"
        quantile for which the regression is to be calculated
    n_boot : int, optional
        number of bootstrap samples
    ci : int, optional
        confidence level. Default gives a 95% confidence interval
    frac : float, optional, only if trendtype=="lowess"
        lowess parameter (fraction of the time period used in each local
        regression)
    it : int, optional, only if trendtype=="lowess"
        lowess parameter (number of iterations)
    autocorr : str, optional
        way of accounting for autocorrelation; possible values: None,
        "bootstrap"
    trendtype : str, optional
        method of trend derivation; possible values: lowess, linreg,
        quantreg, TheilSen
    CItype : str, optional
        method of CI derivation; possible values: "analytical" and
        "bootstrap". If trendtype is "lowess", CItype will be set to None;
        if CItype is "analytical", autocorrelation will be set to None

    Returns
    -------
    dict with the following elements:
        slope - slope of the trend
        CI_high - upper CI on the slope value
        CI_low - lower CI on the slope value
        pvalue - trend's significance level
        trend - trend line, or rather its y values for all x_var
        trendCI_high - upper confidence bound for each value of y
        trendCI_low - lower confidence bound for each value of y

    Remarks
    -------
    The fit function occasionally crashes on resampled data; the
    workaround is to wrap it in a try statement.
    """
    import numpy as np
    import pandas as pd
    # for linreg
    import statsmodels.api as sm
    # for quantreg
    import statsmodels.formula.api as smf
    # for lowess
    import statsmodels.nonparametric.api as npsm
    # other
    from statsmodels.distributions.empirical_distribution import ECDF
    from scipy.stats import mstats, t, kendalltau
    from arch.bootstrap import StationaryBootstrap, IIDBootstrap

    # preparing data
    if CItype == "analytical" and trendtype == "TheilSen":
        CItype = "bootstrap"
    x_var = np.array(x_var)
    y_var = np.ma.masked_invalid(y_var)
    n_data = len(y_var)
    ci_low = (100 - ci) / 2
    ci_high = 100 - ci_low

    # setting the bootstrapping function
    if autocorr == "bootstrap":
        bs = StationaryBootstrap(3, np.array(range(len(y_var))))
    else:
        bs = IIDBootstrap(np.array(range(len(y_var))))

    if trendtype == "quantreg":
        print("Quantile regression, CI type: " + CItype +
              ", autocorrelation adjustment: " + str(autocorr) + "\n")
        xydata = pd.DataFrame(np.column_stack([x_var, y_var]),
                              columns=['X', 'Y'])
        model = smf.quantreg('Y ~ X', xydata)
        res = model.fit(q=q)
        intcpt = res.params.Intercept
        slope = res.params.X
        pvalue = res.pvalues['X']
        CI_low = res.conf_int()[0]['X']
        CI_high = res.conf_int()[1]['X']
        y_pred = res.predict(xydata)
        # calculating residuals
        resids = y_var - y_pred
        # calculate autocorrelation indices
        autocorr_test(x_var, resids)
        if CItype == "bootstrap":
            # bootstrapping
            bs_trends = np.copy(y_pred).reshape(-1, 1)
            bs_slopes = []
            bs_intcpts = []
            for data in bs.bootstrap(n_boot):
                ind = data[0][0]
                model = smf.quantreg('Y ~ X', xydata.iloc[ind, :])
                try:
                    res = model.fit(q=q)
                    bs_slopes = bs_slopes + [res.params.X]
                    bs_intcpts = bs_intcpts + [res.params.Intercept]
                    bs_trends = np.append(
                        bs_trends, res.predict(xydata).reshape(-1, 1), 1)
                except Exception:
                    goingdownquietly = 1

    if trendtype == "linreg":
        print("Linear regression, CI type: " + CItype +
              ", autocorrelation adjustment: " + str(autocorr) + "\n")
        x_varOLS = sm.add_constant(x_var)
        model = sm.OLS(y_var, x_varOLS, hasconst=True, missing='drop')
        res = model.fit()
        intcpt, slope = res.params
        pvalue = res.pvalues[1]
        CI_low, CI_high = res.conf_int()[1]
        y_pred = res.predict(x_varOLS)
        # calculating residuals
        resids = y_var - y_pred
        # calculate autocorrelation indices
        autocorr_test(x_var, resids)
        if CItype == "bootstrap":
            # bootstrapping for confidence intervals
            bs_slopes = []
            bs_intcpts = []
            bs_trends = np.copy(y_pred).reshape(-1, 1)
            for data in bs.bootstrap(n_boot):
                ind = data[0][0]
                model = sm.OLS(y_var[ind], x_varOLS[ind, :], hasconst=True,
                               missing='drop')
                try:
                    res = model.fit()
                    bs_slopes = bs_slopes + [res.params[1]]
                    bs_intcpts = bs_intcpts + [res.params[0]]
                    bs_trends = np.append(
                        bs_trends, res.predict(x_varOLS).reshape(-1, 1), 1)
                except Exception:
                    goingdownquietly = 1

    if trendtype == "TheilSen":
        # print("Theil-Sen slope, CI type: " + CItype +
        #       ", autocorrelation adjustment: " + str(autocorr) + "\n")
        # significance of MK tau
        tau, pvalue = kendalltau(x_var, y_var)
        # print("raw MK tau:", tau, "raw MK pvalue:", pvalue)
        # TS slope and confidence intervals
        slope, intercept, CI_low, CI_high = mstats.theilslopes(
            y_var, x_var, alpha=0.95)
        # getting the slope line's y values
        y_pred = intercept + slope * x_var
        # calculating residuals
        resids = y_var - y_pred
        # calculate autocorrelation indices
        autocorr_test(x_var, resids)
        if CItype == "bootstrap":
            # bootstrapping for confidence intervals
            bs_slopes = []
            bs_intcpts = []
            bs_trends = np.copy(y_pred).reshape(-1, 1)
            for data in bs.bootstrap(n_boot):
                ind = data[0][0]
                res = mstats.theilslopes(y_var[ind], x_var[ind], alpha=0.95)
                bs_slopes = bs_slopes + [res[0]]
                bs_intcpts = bs_intcpts + [res[1]]
                bs_trends = np.append(
                    bs_trends, (res[1] + res[0] * x_var).reshape(-1, 1), 1)

    if trendtype == "lowess":
        print("Lowess\n")
        temp = dict(npsm.lowess(y_var, x_var, frac=frac, it=it,
                                missing="drop"))
        y_pred = np.array([temp.get(v) for v in x_var]).astype("float").reshape(-1, 1)
        bs_trends = np.copy(y_pred)
        for data in bs.bootstrap(n_boot):
            ind = data[0][0]
            try:
                temp = dict(npsm.lowess(y_var[ind], x_var[ind], frac=frac,
                                        it=it, missing="drop"))
                temp = np.array([temp.get(v) for v in x_var]).astype("float").reshape(-1, 1)
                pred = pd.DataFrame(temp, index=x_var)
                temp_interp = pred.interpolate().values
                bs_trends = np.append(bs_trends, temp_interp, 1)
            except Exception:
                goingdownquietly = 1

    # calculating final values of CI and p-value; slope statistics are
    # skipped when trendtype is lowess
    if trendtype == "lowess":
        CI_low = np.nan
        CI_high = np.nan
        slope = np.nan
        intcpt = np.nan
        pvalue = np.nan
        confint = np.nanpercentile(bs_trends, [ci_low, ci_high], 1)
        trendCI_low = confint[:, 0]
        trendCI_high = confint[:, 1]
    else:
        if CItype == "bootstrap":
            # Values for slope, intercept and trend could be taken as
            # medians of the bootstrap distributions, but normally the
            # analytical parameters are used instead. If the bootstrap bias
            # (difference between the analytical value and the bootstrap
            # median) is strong, it might be better to use the bootstrap
            # values; these lines would then need to be uncommented:
            # slope = np.median(bs_slopes)
            # intcpt = np.median(bs_intcpts)
            # trend = intcpt + slope * x_var
            # The CI and p-value, however, have to come from the bootstrap,
            # because accounting for autocorrelation is the point here.
            CI_low, CI_high = np.percentile(bs_slopes, [5, 95])
            ecdf = ECDF(bs_slopes)
            pvalue = ecdf(0)
            # make sure the p-value is calculated on the correct side of
            # the distribution; this yields a one-sided p-value
            if pvalue > 0.5:
                pvalue = 1 - pvalue
            confint = np.nanpercentile(bs_trends, [ci_low, ci_high], 1)
            print("bs_trends:", bs_trends.shape, confint.shape)
            trendCI_low = confint[:, 0]
            trendCI_high = confint[:, 1]
        else:
            # Analytical calculation of the trend confidence interval; it
            # works the same way for each trend type, so it is done here
            # rather than in the per-trendtype sections.
            # making sure x are floats
            xtemp = np.array(x_var) * 1.0
            # squared anomaly
            squanom = (xtemp - np.mean(xtemp)) ** 2
            temp = ((1. / len(x_var)) + (squanom / sum(squanom))) ** 0.5
            # standard error of estimation
            see = (np.nansum((np.array(y_var) - np.nanmean(y_pred)) ** 2) /
                   len(x_var)) ** 0.5
            # adjusting ci
            ci_adj = 1 - ((1 - ci / 100.) / 2)
            # accounting for uncertainty in the mean through Student's t
            tcomp = t.ppf(ci_adj, len(x_var) - 2)
            # confidence interval
            cint = tcomp * see * temp
            # for the trend only
            trendCI_high = y_pred + cint
            trendCI_low = y_pred - cint

    print(trendtype, "slope:", slope, "pvalue (one sided):", pvalue,
          "conf interval:", CI_low, CI_high,
          "autocorrelation adjustment:", autocorr, "\n")
    output = {"slope": slope, "CI_high": CI_high, "CI_low": CI_low,
              "pvalue": pvalue, "trend": y_pred,
              "trendCI_low": trendCI_low, "trendCI_high": trendCI_high}
    return output