import pandas as pd
import statsmodels.api as sm
from statsmodels.api import OLS


def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    """Forward-backward stepwise feature selection based on OLS p-values."""
    initial_features = data.columns.tolist()
    best_features = []
    while len(initial_features) > 0:
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=float)
        # Forward step: try each remaining candidate and record its p-value
        for new_column in remaining_features:
            model = OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < SL_in:
            best_features.append(new_pval.idxmin())
            # Backward step: drop features whose p-value has risen above SL_out
            while len(best_features) > 0:
                best_features_with_constant = sm.add_constant(data[best_features])
                p_values = OLS(target, best_features_with_constant).fit().pvalues[1:]
                max_p_value = p_values.max()
                if max_p_value >= SL_out:
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features
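# Hypothetical usage sketch for stepwise_selection on synthetic data where only
# x1 and x2 drive the target; all names below are illustrative, not from the source.
#
# import numpy as np
# import pandas as pd
# rng = np.random.default_rng(0)
# demo = pd.DataFrame(rng.normal(size=(200, 4)), columns=['x1', 'x2', 'x3', 'x4'])
# demo_target = 2 * demo['x1'] - 3 * demo['x2'] + rng.normal(scale=0.5, size=200)
# print(stepwise_selection(demo, demo_target))  # expect ['x1', 'x2'] (order may vary)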
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.api import OLS


def remove_outliers(train, targetField, dropVal, studentResid, verbose=True):
    """
    Remove outliers from training data based on statsmodels OLS fit
    studentized residuals and a specified drop value across features.

    :param pandas.DataFrame train: data for training
    :param str targetField: target column in the train :py:class:`pandas.DataFrame`
    :param obj dropVal: drop any row containing this value
    :param float studentResid: threshold on the absolute value of the
        studentized residuals
    :param bool verbose: flag to print OLS summary information and the
        number of outliers removed
    """
    train = train.dropna()
    if dropVal is not None:
        train = train.loc[(train.T != dropVal).all()]
    design = train[[i for i in train if i != targetField]]
    target = train[targetField]
    design = StandardScaler().fit_transform(design)
    result = OLS(target, design).fit()
    mask = np.ones(train.shape[0]).astype(bool)
    if studentResid is not None:
        # use the threshold passed in, rather than a hard-coded 2
        mask = (result.outlier_test()['student_resid'].abs() < studentResid).values
    if verbose:
        print(result.summary())
        print('Removed: ' + str(train.shape[0] - mask.sum()))
    return train.loc[mask]
import numpy as np
import statsmodels
from pandas import DataFrame
from statsmodels.api import OLS
# fair_scm is assumed to come from the FaIR simple climate model package
# (v1.x exposes it as fair.forward.fair_scm)
from fair.forward import fair_scm


def calc_gwi(obs, obs_years, reg_type='mon', base_low=1850., base_high=1900., name=''):
    # Express the observations relative to the base period
    obs = obs - np.mean(obs[np.logical_and(obs_years >= base_low, obs_years < (base_high + 1))])

    # Load the best-estimate forcings from Piers
    forc_file = './Data/Annualforcings_Mar2014_GHGrevised.txt'
    data = np.genfromtxt(forc_file, skip_header=4)
    years = data[:, 0]
    tot_forc = data[:, 13]
    ant_forc = data[:, 14]

    # Integrate anthropogenic and natural forcing with standard FAIR parameters
    C, t_nat = fair_scm(other_rf=tot_forc - ant_forc)
    C, t_anthro = fair_scm(other_rf=ant_forc)

    # Express relative to the centre of the base period
    t_nat = t_nat - np.mean(t_nat[np.logical_and(years >= base_low, years < base_high + 1)])
    t_anthro = t_anthro - np.mean(t_anthro[np.logical_and(years >= base_low, years < base_high + 1)])

    # Interpolate the annual forced responses onto the year grid of the
    # observations, so the FaIR temperatures can be compared with them
    if reg_type != 'mon':
        t_nat = np.interp(obs_years + 0.5, years + 0.5, t_nat)
        t_anthro = np.interp(obs_years + 0.5, years + 0.5, t_anthro)
    else:
        t_nat = np.interp(obs_years, years + 0.5, t_nat)
        t_anthro = np.interp(obs_years, years + 0.5, t_anthro)

    # Linearly project the final half year
    t_anthro[obs_years > (years[-1] + 0.5)] = (
        12 * (t_anthro[obs_years <= (years[-1] + 0.5)][-1]
              - t_anthro[obs_years <= (years[-1] + 0.5)][-2])
        * (obs_years[obs_years > (years[-1] + 0.5)]
           - obs_years[obs_years <= (years[-1] + 0.5)][-1])
        + t_anthro[obs_years <= (years[-1] + 0.5)][-1])
    t_nat[obs_years > (years[-1] + 0.5)] = (
        12 * (t_nat[obs_years <= (years[-1] + 0.5)][-1]
              - t_nat[obs_years <= (years[-1] + 0.5)][-2])
        * (obs_years[obs_years > (years[-1] + 0.5)]
           - obs_years[obs_years <= (years[-1] + 0.5)][-1])
        + t_nat[obs_years <= (years[-1] + 0.5)][-1])

    # OLS regression of the observations on the natural and anthropogenic
    # warming (from FaIR, driven by the best-estimate forcing) with a constant
    y = np.copy(obs)
    x = DataFrame({'x1': t_anthro, 'x2': t_nat})
    x = statsmodels.tools.tools.add_constant(x)
    result = OLS(y, x).fit()
    # Scaling factors for the anthropogenic and natural temperature timeseries
    sf = result.params

    # Scaled anthropogenic warming index
    awi = t_anthro * sf['x1']
    # Scaled natural warming index
    nwi = t_nat * sf['x2']
    # Scaled total externally forced warming index
    gwi = awi + nwi
    print(name, ' AWI scale factor: ', sf['x1'], '\n',
          name, ' NWI scale factor: ', sf['x2'])
    return awi, nwi
def _capm_mu(self, asset, markets, mu, sigma, X):
    """Calculate mean return estimated by CAPM."""
    freq = tools.freq(X.index)
    X = X[[asset] + markets].dropna()
    res = OLS(X[asset] - 1 - self.rfr / freq,
              add_constant(X[markets] - 1 - self.rfr / freq)).fit()
    beta = res.params.drop(['const'])
    prev_mu = mu[asset]
    new_mu = self.rfr + (mu[markets] - self.rfr).dot(beta)

    alpha = res.params.const * freq
    alpha_std = freq * np.sqrt(res.cov_params().loc['const', 'const'])
    if self.verbose:
        print(f'Beta of {[x for x in beta.round(2)]} changed {asset} mean return '
              f'from {prev_mu:.1%} to {new_mu:.1%} with alpha {alpha:.2%} ({alpha_std:.2%})')

    # Be benevolent and add alpha if it is positive.
    # k = 0.2 was fine-tuned on DPST in order to get it out of the portfolio.
    k = 0.2
    if alpha - k * alpha_std > 0 and asset in ('KRE', 'DPST'):
        if self.verbose:
            print(f'  Adding alpha of {alpha - k * alpha_std:.2%} for {asset}')
        new_mu += alpha - k * alpha_std
    return new_mu
def alpha_beta(self):
    # Regress excess returns on the average market return, with a constant
    rr = (self.X - 1).mean(1)
    m = OLS(self.r - 1, np.vstack([np.ones(len(self.r)), rr]).T)
    reg = m.fit()
    # annualize the daily alpha by 252 trading days
    alpha, beta = reg.params.const * 252, reg.params.x1
    return alpha, beta
def backwardElimination(x, SL):
    # NOTE: relies on a module-level target vector `y`; `temp` (hard-coded to
    # 50 rows x 6 columns) caches removed columns so the last deletion can be
    # rolled back if adjusted R^2 drops.
    numVars = len(x[0])
    temp = np.zeros((50, 6)).astype(int)
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        print(regressor_OLS.summary())
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    temp[:, j] = x[:, j]
                    x = np.delete(x, j, 1)
                    tmp_regressor = OLS(y, x).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if adjR_before >= adjR_after:
                        # deletion hurt adjusted R^2: restore the column
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
        else:
            break
    return x
def prosperity_score_regression(cards, metadata, score_columns=score_column_names):
    """
    Perform a linear regression to determine the degree to which the
    Prosperity add-on treasure and victory cards contribute to a good score.
    """
    prosperity = set(cards['currency'].columns.get_level_values(1))
    # victory_cards = set(cards['victory'].columns.get_level_values(1))
    # cards = currency_cards.union(victory_cards)
    scores = np.mean(metadata.loc[:, tuple(score_columns)], axis=1)
    # Ignore missing cells
    refine_idx = np.isfinite(scores)
    scores = scores[refine_idx]
    set_counts = pd.concat([
        pd.DataFrame(cards.loc[refine_idx, pd.IndexSlice[:, :, c]].values, columns=[c])
        for c in prosperity
    ] + [
        pd.DataFrame(np.ones((scores.size, 1)), columns=['Average game score'])
    ], axis=1).fillna(0)
    results = OLS(scores, set_counts).fit()
    print(results.summary())
def run_acc_compare(self, print_summary=False, data_df=None):
    if data_df is None:
        self.set_flat_c_stats_df()
        data_df = self.flat_c_stats_df
    data_df.dropna(inplace=True, axis=0)
    y_df = data_df.loc[:, 'accuracy']
    X_df = data_df.drop(labels='accuracy', axis=1, inplace=False)
    X_dtypes_ = dict(X_df.dtypes)
    obj_vars = [var for var, dtype in X_dtypes_.items() if dtype == 'object']
    # Convert object-typed columns to numeric before fitting
    X_float_df = self.floatify_df(X_df, obj_vars)
    # X_float_df = add_constant(X_float_df)
    self.X_float_df = X_float_df
    self.y_df = y_df
    self.model = OLS(y_df, X_float_df)
    self.model_result = self.model.fit()
    if print_summary:
        print('OLS results for modeldict:')
        print(self.modeldict)
        print(self.model_result.summary())
def linear_regression(data):
    """
    Goal: fit a linear regression, i.e. compute the slope and the
    intercept of the regression line for each word.

    Input:
        data -- parsed JSON content mapping each word to its time series
    Output:
        dict mapping each word to [slope, intercept]
    Packages:
        numpy (ones, arange), statsmodels.api (OLS)
    """
    dict_linreg = {}
    # For each entry in the JSON data, regress the series on a time index
    for k, v in data.items():
        mat_x = np.ones((len(v), 2))
        mat_x[:, 1] = np.arange(0, len(v))
        results = OLS(v, mat_x).fit()
        dict_linreg[k] = [results.params[1], results.params[0]]
    return dict_linreg
def get_cointLst(corrList, df_is):  # called in main
    # Test for cointegration; the test has to be performed on both sides of the spread.
    cointLst = []
    for pair in corrList:
        X1, X2 = df_is[pair[0]].values, df_is[pair[1]].values
        x1 = add_constant(X1)
        x2 = add_constant(X2)
        r1 = OLS(X2, x1).fit()
        r2 = OLS(X1, x2).fit()
        adf1 = adfuller(r1.resid)[1]
        if adf1 < 0.01:
            adf2 = adfuller(r2.resid)[1]
            # Keep only pairs that are strongly cointegrated on both sides.
            if adf2 < 0.01 and adf1 < adf2:
                cointLst.append(["{0}_{1}".format(pair[0], pair[1])]
                                + pair + [adf1] + list(r1.params))
            elif adf2 < 0.01:
                cointLst.append(["{0}_{1}".format(pair[1], pair[0])]
                                + [pair[1], pair[0], pair[2], pair[3], adf2]
                                + list(r2.params))
    # print("There are {0} pairs strongly cointegrated.".format(len(cointLst)))
    return cointLst
def test_linearity(x, y, n_knots=5, verbose=True):
    """Test linearity between two variables.

    Run a linear regression of y on x, and take the residuals.
    Fit the residuals with a natural spline with `n_knots` knots.
    Conduct a joint F-test for all columns in the natural spline basis matrix.

    Example:
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> x = np.linspace(0., 1., 101)
    >>> y = 5 * x + 3 + rng.random(size=101) / 5
    >>> test_linearity(x, y, n_knots=5, verbose=False)
    0.194032
    """
    residuals = OLS(y, add_constant(x)).fit().resid
    basis_matrix = patsy.dmatrix(
        f"cr(x, df={n_knots - 1}, constraints='center') - 1",
        {'x': x}, return_type='dataframe')
    results = OLS(residuals, basis_matrix).fit()
    nobs = results.nobs
    f_value = results.fvalue
    p_value = np.round(results.f_pvalue, 6)
    if verbose:
        # only report when requested, as in the docstring example
        print('Test for Linearity: '
              f'N = {nobs:.0f}; df={nobs - n_knots - 1:.0f}; '
              f'F = {f_value:.3f}; p = {p_value:.6f}.')
    return p_value
def nuevo_regress():
    # Regress the overall score on the mathematics score and check residual normality
    modelo = OLS(DATASET.puntaje_global, DATASET.puntaje_matematicas).fit()
    summary = modelo.summary()
    vals_residuales = modelo.resid
    print(summary)
    print(anderson(vals_residuales))  # Anderson-Darling test for normality
    grafica_qq(vals_residuales)
def testPow(n):
    # Fit SalePrice on OverallQual raised to the power n and report R^2
    raw_X = trainData.OverallQual.values.reshape(-1, 1)
    OLS_y = trainData.SalePrice
    X = raw_X**n
    features = sm.add_constant(X)
    model = OLS(OLS_y.values, features).fit()
    return model.rsquared
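# Hypothetical usage sketch, assuming `trainData` (e.g. the Kaggle housing
# data) is already loaded as above: sweep a few exponents and keep the one
# with the best R-squared.
#
# best_n = max([1, 2, 3, 4], key=testPow)
# print(best_n, testPow(best_n))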
def fit(self, x, y):
    # Quadratic fit of y on x; evaluate the mean and prediction std on the AGES grid
    x = array(x).reshape(-1, 1)
    model = OLS(y, PolynomialFeatures(2).fit_transform(x)).fit()
    ages = PolynomialFeatures(2).fit_transform(AGES.reshape(-1, 1))
    self.m = model.predict(ages)
    self.s = wls_prediction_std(model, ages)[0]
    return self
def _capm(self):
    rfr = self.rf_rate / self.freq()
    rr = self.ucrp_r - rfr
    # Exclude the cash allocation from the risk-free adjustment
    cash = self.B.CASH if 'CASH' in self.B.columns else 0
    m = OLS(self.r - 1 - (1 - cash) * rfr,
            np.vstack([np.ones(len(self.r)), rr - 1]).T)
    return m.fit()
def backwardElimination(x, sl):
    # NOTE: relies on a module-level target vector `y`.
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        print(regressor_OLS.summary())
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    # Drop the least significant column and refit on the next pass
                    x = np.delete(x, j, 1)
    return x
def stats_models(self, X_train, y_train, show_summary=False):
    '''Perform OLS via statsmodels and return the fitted results.'''
    X = sm.add_constant(X_train)
    results_stats = OLS(y_train, X).fit()
    if show_summary:
        # summary() only builds the table; it has to be printed
        print(results_stats.summary())
    return results_stats
def find_apex(decel):
    # Split the deceleration trace at each candidate time t, fit a line to each
    # side, and pick the split that minimises the combined sum of squared residuals
    res = []
    for t in decel.index[10::10]:
        left = decel[:t]['accelY']
        right = decel[t:]['accelY']
        left_mod = OLS(left, add_constant(range(len(left)))).fit()
        right_mod = OLS(right, add_constant(range(len(right)))).fit()
        res.append([t, left_mod.ssr, right_mod.ssr])
    apex = min(res, key=lambda x: x[1] + x[2])[0]
    return apex
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns, index=data.index)

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. '
            'Use SGD Regression for nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }
    else:
        # Compute linear regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)
        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
# `LinearRegression` here is assumed to be statsmodels OLS imported under an
# alias, e.g. `from statsmodels.api import OLS as LinearRegression`.
def est_via_ols(self):
    """
    Estimate average treatment effects with linear regression.
    """
    # Design matrix: treatment indicator in the first column, covariates after
    regressor = np.zeros((self.data.n, 1 + self.data.X.shape[1]))
    regressor[:, 0] = self.data.Z
    regressor[:, 1:] = self.data.X
    ols_model = LinearRegression(self.data.Y, regressor)
    reg_results = ols_model.fit()
    ate = reg_results.params[0]
    # HC0_se already holds White's robust standard errors, so no extra sqrt
    se = reg_results.HC0_se[0]
    return self._get_results(ate, se)
def LRFunc(self, measureType):
    '''
    Linear regression using OLS.
    Cut-off for leverage: 3k/n
    Cut-off for influence (Cook's distance): 1
    Cut-off for DFFITS: 2*sqrt(k/n)
    Cut-off for DFBETAS: 2/sqrt(n)
    where k = 1
    '''
    dft = self.dfA[(self.dfA.MEASURE_TYPE == measureType)
                   & (self.dfA.FILTER_FLAG != 'WHO')].copy()
    reg = linear_model.LinearRegression()
    print(dft.MEASURE_VAL, dft.AGE)
    regression = OLS(dft.MEASURE_VAL, dft.AGE).fit()
    infl = regression.get_influence()
    test = regression.outlier_test()
    k = 1
    N = len(dft)
    print(N)
    dft['OLS_BONFPVAL'] = test['bonf(p)']
    dft['OLS_STUDENTRES'] = test['student_resid']
    dft['OLS_INFLUENCE'] = infl.summary_frame().cooks_d
    dft['OLS_DFFITS'] = infl.summary_frame().dffits
    dft['OLS_DFB_AGE'] = infl.summary_frame().dfb_AGE
    dft['N'] = [N] * N
    coL, coI, coDf1, coDf2 = 3.0 * k / N, 1, 2 * (k / N)**0.5, 2 / (N**0.5)
    # Keep rows below the influence, DFFITS and DFBETAS cut-offs
    dft1 = dft[(abs(dft['OLS_INFLUENCE']) <= coI)
               & (abs(dft['OLS_DFFITS']) <= coDf1)
               & (abs(dft['OLS_DFB_AGE']) <= coDf2)]
    if len(dft1) <= 2:
        for idx, row in dft.iterrows():
            self.dfA.loc[idx, 'FILTER_FLAG'] = 'OLS_FEW_REMAIN'
        return
    # Refit on the retained rows and flag outliers by standardized deviation
    reg.fit(dft1[['AGE']], dft1['SDS'])
    dft['pred1'] = reg.predict(dft[['AGE']])
    dft['diff1'] = dft['SDS'] - dft['pred1']
    stdVal = dft[dft.index.isin(dft1.index)].diff1.std()
    dft['STD_FOLD'] = dft.diff1 / stdVal
    self.stdVal[measureType] = stdVal
    self.coef[measureType] = reg.coef_[0]
    self.intercept[measureType] = reg.intercept_
    for idx, row in dft.iterrows():
        if abs(row.STD_FOLD) <= LRCutoffSD[measureType]:
            self.dfA.loc[idx, 'FILTER_FLAG'] = 'PLAUSIBLE'
        else:
            self.dfA.loc[idx, 'FILTER_FLAG'] = 'OLS_OUTLIER'
    return
from math import log

from statsmodels.api import OLS, add_constant


def half_life(spread):
    """Half-life of mean reversion from an OLS fit of spread changes on the lagged level."""
    lag = spread.shift(1)
    lag.iloc[0] = lag.iloc[1]
    ret = spread - lag
    ret.iloc[0] = ret.iloc[1]
    lag2 = add_constant(lag)
    res = OLS(ret, lag2).fit()
    # .iloc[1] selects the slope positionally, regardless of the column label
    halflife = int(round(-log(2) / res.params.iloc[1], 0))
    if halflife <= 0:
        halflife = 1
    return halflife
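# Hypothetical usage sketch: an AR(1) spread with phi = 0.9 has an OLS-implied
# half-life of about -log(2)/(0.9 - 1) ≈ 7 bars. Names are illustrative.
#
# import numpy as np
# import pandas as pd
# rng = np.random.default_rng(0)
# vals = np.zeros(500)
# for t in range(1, 500):
#     vals[t] = 0.9 * vals[t - 1] + rng.normal()
# print(half_life(pd.Series(vals)))  # roughly 7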
def fit(self, X, y, **kwargs):
    if self.fit_intercept:
        X = sm.add_constant(X)
    try:
        # pop so that alpha is not passed twice to fit_regularized below
        self.alpha = kwargs.pop('alpha')
    except KeyError:
        raise Exception('cannot find alpha! please set the penalty of Lasso')
    self.model = OLS(y, X)
    self.res = self.model.fit_regularized(alpha=self.alpha, L1_wt=1, **kwargs)
def run_regr(self):
    if self.pca_flag:
        self.train_x, self.test_x = self.pca(
            self.train_x, self.test_x, n_components=self.n_components)
    regr = OLS(self.train_y['Y_M_1'], add_constant(self.train_x)).fit()
    # print(regr.summary())
    try:
        y_pred = regr.predict(add_constant(self.test_x))
    except Exception as e:
        print(e)
        return None
    # print(f'R-square is {r2_score(self.test_y.Y_M_1, y_pred)}')
    # print(f'Mean - y_pred {np.mean(y_pred)}, Mean - y {np.mean(self.test_y.Y_M_1)}')
    return r2_score(self.test_y.Y_M_1, y_pred)
def get_half_life_from_scratch(stockX, stockY, beta, df_is):  # called in get_df_coint
    # Build the spread series, then regress its one-step change on the lagged level
    z_array = get_z(stockX, stockY, beta, df_is)
    z_lag = np.roll(z_array, 1)
    z_lag[0] = 0
    z_ret = z_array - z_lag
    # add an intercept term to the regressors
    z_lag2 = add_constant(z_lag)
    res = OLS(z_ret, z_lag2).fit()
    return int(-np.log(2) / res.params[1])
def ols_cluster_robust(formula, cluster, covs, coef):
    """Model clusters with cluster-robust OLS, same signature as :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    res = OLS.from_formula(formula, data=cov_rep).fit(
        cov_type='cluster', cov_kwds=dict(groups=cov_rep['id']))
    return get_ptc(res, coef)
def optimal_spreads_regression(cov_matrix, mid, market_rel_spread):
    # Regress observed absolute spreads on instrument variance plus a constant term
    regressors = 3 * pd.DataFrame([np.diag(cov_matrix)], ['Variance'], mid.index).T
    regressors['Inverse decay'] = 1
    fit = OLS(market_rel_spread * mid, regressors).fit()
    risk_aversion = fit.params['Variance']
    intensity_decay = 2 / fit.params['Inverse decay']
    return risk_aversion, intensity_decay, fit.rsquared
def capm(y: pd.Series, bases: pd.DataFrame, rf=0.0, fee=0.0):
    freq = _freq(y.index)
    rf = rf / freq
    fee = fee / freq

    R = y.pct_change() - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    #   R = alpha + rf + beta * (Rm - rf)
    model = OLS(R, R_base.assign(Intercept=1), missing="drop").fit()
    alpha = model.params["Intercept"] * freq
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio
    proxy = R_base @ betas + (1 - betas.sum()) * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    r = y.pct_change() - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        "alpha": alpha,
        "betas": betas,
        "cumproxy": cumproxy,
        "model": model,
        "residual": residual,
    }
import numpy as np
from statsmodels.api import OLS, WLS


def _compute_vif(exog, exog_idx, weights=None, model_config=None):
    """
    Compute the variance inflation factor (VIF) for one exogenous variable,
    for OLS, or for WLS when weights are supplied.

    Parameters
    ----------
    exog : array of features [X_1, X_2, ..., X_n]
    exog_idx : index of the feature to compute the VIF for
    weights : optional WLS weights
    model_config : dict, {"hasconst": True, "cov_type": "HC3"} by default

    Returns
    -------
    vif : float
    """
    if model_config is None:
        model_config = {"hasconst": True, "cov_type": "HC3"}
    k_vars = exog.shape[1]
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    # R^2 from regressing the chosen column on all the others
    if weights is None:
        r_squared_i = OLS(x_i, x_noti,
                          hasconst=model_config["hasconst"]).fit().rsquared
    else:
        r_squared_i = WLS(x_i, x_noti, hasconst=model_config["hasconst"],
                          weights=weights).fit(
                              cov_type=model_config["cov_type"]).rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
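# Hypothetical usage sketch for _compute_vif: column 2 is nearly a copy of
# column 0, so its VIF should be large. All names are illustrative.
#
# rng = np.random.default_rng(0)
# demo_X = rng.normal(size=(200, 3))
# demo_X[:, 2] = demo_X[:, 0] + 0.05 * rng.normal(size=200)
# print(_compute_vif(demo_X, 2))  # a value far above 10 flags collinearity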
def capm(y: pd.Series, bases: pd.DataFrame, rf=0., fee=0.):
    freq = _freq(y.index)
    rf = rf / freq
    fee = fee / freq

    R = y.pct_change() - rf
    R.name = y.name
    R_base = bases.pct_change().sub(rf, axis=0)

    # CAPM:
    #   R = alpha + rf + beta * (Rm - rf)
    model = OLS.from_formula(f"Q('{y.name}') ~ {'+'.join(bases.columns)}",
                             R_base.join(R)).fit()
    alpha = model.params['Intercept'] * freq
    betas = model.params[bases.columns]

    # reconstruct artificial portfolio
    proxy = R_base @ betas + (1 - betas.sum()) * (rf + fee)
    cumproxy = (1 + proxy).cumprod()

    # residual portfolio
    r = y.pct_change() - cumproxy.pct_change()
    residual = (1 + r).cumprod()

    return {
        'alpha': alpha,
        'betas': betas,
        'cumproxy': cumproxy,
        'model': model,
        'residual': residual,
    }
import numpy
from statsmodels.api import OLS, add_constant


def fit(xyz, xlim=None, ylim=None, zlim=None, **kwargs):
    # Fit the plane z = a*x + b*y + c to points within the given limits.
    # ones_like (not empty_like, which is uninitialized) gives an all-True mask.
    all_true = numpy.ones_like(xyz[:, 0], dtype=bool) \
        if None in [xlim, ylim, zlim] \
        else None
    xbool = numpy.abs(xyz[:, 0]) < xlim if xlim else all_true
    ybool = numpy.abs(xyz[:, 1]) < ylim if ylim else all_true
    zbool = numpy.abs(xyz[:, 2]) < zlim if zlim else all_true
    bools = numpy.logical_and(numpy.logical_and(xbool, ybool), zbool)
    XYZ = xyz[bools, :]
    XY = add_constant(XYZ[:, :2], prepend=False)
    Z = XYZ[:, -1]
    result = OLS(Z, XY).fit()
    coeffs = result.params
    stderr = result.HC1_se  # heteroskedasticity-robust (HC1) standard errors
    return coeffs, stderr
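# Hypothetical usage sketch: recover the plane z = 0.5*x - 2*y + 3 from noisy
# samples. Names and numbers are illustrative.
#
# rng = numpy.random.default_rng(0)
# pts = rng.uniform(-1, 1, size=(500, 3))
# pts[:, 2] = 0.5 * pts[:, 0] - 2.0 * pts[:, 1] + 3.0 + rng.normal(scale=0.01, size=500)
# coeffs, stderr = fit(pts, xlim=1.0, ylim=1.0)  # coeffs ≈ [0.5, -2.0, 3.0]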
def linear(data, **kwargs):
    '''Linear regression model fitted with ordinary least squares.

    Parameters
    ----------
    data : array or dataframe
        First column is endogenous, second column is a column of ones,
        the rest are exogenous data.

    **Keyword Arguments**
    prior_type : str
        'uniform' or 'collinear adjusted dilution'

    Returns
    -------
    rslts : array
        1-d array of parameter coefficients
    '''
    prior_type = kwargs.get('prior_type', 'uniform')
    endog = data[:, [0]]
    exog = data[:, 1:]
    model = OLS(endog=endog, exog=exog, missing='drop')
    # Covariance of each regressor with the response, scaled by the response variance
    adj = (np.cov(np.hstack((model.wexog, endog)), rowvar=0)[:-1, -1]
           / np.var(endog)).reshape((-1, 1))
    fit = model.fit()
    par_rsquared = fit.params.reshape((-1, 1)) * adj
    if prior_type == 'uniform':
        prior = 1.
    elif prior_type == 'collinear adjusted dilution':
        prior = collinear_adj_prior(exog)
    else:
        raise ValueError('prior {} not supported'.format(prior_type))
    posterior = math.exp(fit.llf) * prior
    return np.hstack((fit.nobs, posterior, fit.rsquared, fit.params,
                      fit.pvalues, fit.bse, par_rsquared.flat))
def mixed_model_cluster(formula, cluster, covs, coef):
    """Model clusters with a mixed model, same signature as :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    # TODO: remove this once a newer version of statsmodels is out.
    # Speeds convergence by using fixed-effect estimates from OLS.
    params = OLS.from_formula(formula, data=cov_rep).fit().params
    res = MixedLM.from_formula(formula, groups='id', data=cov_rep).fit(
        start_params=dict(fe=params), reml=False, method='bfgs')
    return get_ptc(res, coef)
class OLSRegressor(BaseRegressor):
    degree = Property(depends_on='_degree')
    _degree = Int
    constant = None

    def __degree_changed(self):
        self.calculate()

    def calculate(self):
        '''
        vander is equivalent to
        sm.add_constant(np.column_stack((x**n, ..., x**2, x**1))),
        i.e. vander(x, n + 1)
        '''
        if not len(self.xs) or not len(self.ys):
            return
        if len(self.xs) != len(self.ys):
            return

        ys = asarray(self.ys)
        X = self._get_X()
        if X is not None:
            try:
                self._ols = OLS(ys, X)
                self._result = self._ols.fit()
            except Exception as e:
                print(e)
# In[49]:

# Quadratic time trend: regressors are t and t**2
X = pd.DataFrame([timevncats.index.to_series(), timevncats.index.to_series()**2],
                 index='x x**2'.split()).T

import statsmodels.api as sm


# In[65]:

ols = OLS(timevncats, sm.add_constant(X))


# In[66]:

ols = ols.fit()
nclients = Clientes.shape[0]
predtime = (ols.predict([1, nclients, nclients**2]) / 60 / 60)[0]
print('Full data set should take %i hours' % int(predtime))
def fit_ols(y, x, idx=-1):
    # Return the parameter at position `idx` (the slope, by default)
    # together with its sampling variance
    ols = OLS(y, add_constant(x))
    results = ols.fit()
    return results.params.values[idx], results.cov_params().values[idx, idx]
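# Hypothetical usage sketch for fit_ols: recover the slope (idx=-1) and its
# sampling variance from noisy linear data. Names are illustrative.
#
# import numpy as np
# import pandas as pd
# rng = np.random.default_rng(0)
# demo_x = pd.Series(np.linspace(0., 1., 100))
# demo_y = 2.0 * demo_x + 1.0 + rng.normal(scale=0.1, size=100)
# slope, slope_var = fit_ols(demo_y, demo_x)  # slope near 2.0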
""" import numpy as np from statsmodels.api import add_constant, OLS, WLS import matplotlib.pyplot as plt # (x, y) is the set of observations. w contains precomputed weights; we'll # also compute these weights in this script. x, y, w = np.loadtxt('draper_smith_table9p1.txt', unpack=True) X = add_constant(x, prepend=True) # --- OLS --------------------------------------------------------------- # Ordinary least squares fit. ols_result = OLS(y, X).fit() print ols_result.summary() # Make a plot of the OLS residuals vs y and vs x. # The following recreates Fig. 9.1. plt.figure(1) plt.clf() plt.subplot(2, 1, 1) plt.plot(ols_result.fittedvalues, ols_result.resid, 'bo') plt.title("OLS Residuals versus fitted values") plt.xlabel('y') plt.ylabel('e') plt.grid() plt.subplot(2, 1, 2) plt.plot(x, ols_result.resid, 'bo')
def test_beta(self, b0_vals, param_nums, ftol=10 ** -5, maxiter=30,
              print_weights=1):
    """
    Returns the profile log likelihood for regression parameters
    'param_nums' at 'b0_vals.'

    Parameters
    ----------
    b0_vals : list
        The values of the parameters to be tested
    param_nums : list
        Which parameters to test
    maxiter : int, optional
        How many iterations to use in the EM algorithm.  Default is 30
    ftol : float, optional
        The function tolerance for the EM optimization.
        Default is 10**-5
    print_weights : bool
        If true, returns the weights that maximize the profile
        log likelihood.  Default is False

    Returns
    -------
    test_results : tuple
        The log-likelihood and p-value of the test.

    Notes
    -----
    The function will warn if the EM reaches the maxiter.  However, when
    optimizing over nuisance parameters, it is possible to reach a maximum
    number of inner iterations for a specific value of the nuisance
    parameters while the results of the function are still valid.  This
    usually occurs when the optimization over the nuisance parameters
    selects parameter values that yield a log-likelihood ratio close to
    infinity.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> import numpy as np

    # Test that the parameter is .05 in a one-regressor, no-intercept model
    >>> data = sm.datasets.heart.load()
    >>> y = np.log10(data.endog)
    >>> x = data.exog
    >>> cens = data.censors
    >>> model = sm.emplike.emplikeAFT(y, x, cens)
    >>> res = model.test_beta([0], [0])
    >>> res
    (1.4657739632606308, 0.22601365256959183)

    # Test that the slope is 0 in a model with an intercept
    >>> model = sm.emplike.emplikeAFT(y, sm.add_constant(x, prepend=1), cens)
    >>> res = model.test_beta([0], [1])
    >>> res
    (4.623487775078047, 0.031537049752572731)
    """
    censors = self.model.censors
    endog = self.model.endog
    exog = self.model.exog
    uncensored = (censors == 1).flatten()
    censored = (censors == 0).flatten()
    uncens_endog = endog[uncensored]
    uncens_exog = exog[uncensored, :]
    reg_model = OLS(uncens_endog, uncens_exog).fit()
    llr, pval, new_weights = reg_model.el_test(
        b0_vals, param_nums, return_weights=True)  # Step 0
    km = self.model._make_km(endog, censors).flatten()  # Needs to be changed when merged
    uncens_nobs = self.model.uncens_nobs
    F = np.asarray(new_weights).reshape(uncens_nobs)
    params = self.params()
    survidx = np.where(censors == 0)
    survidx = survidx[0] - np.arange(len(survidx[0]))
    numcensbelow = np.int_(np.cumsum(1 - censors))
    if len(param_nums) == len(params):
        llr = self._EM_test([], F=F, params=params, param_nums=param_nums,
                            b0_vals=b0_vals, survidx=survidx,
                            uncens_nobs=uncens_nobs,
                            numcensbelow=numcensbelow, km=km,
                            uncensored=uncensored, censored=censored,
                            ftol=ftol, maxiter=25)
        return llr, chi2.sf(llr, self.model.nvar)
    else:
        x0 = np.delete(params, param_nums)
        try:
            res = optimize.fmin(self._EM_test, x0,
                                (params, param_nums, b0_vals, F, survidx,
                                 uncens_nobs, numcensbelow, km, uncensored,
                                 censored, maxiter, ftol),
                                full_output=1, disp=0)
            llr = res[1]
            return llr, chi2.sf(llr, len(param_nums))
        except np.linalg.linalg.LinAlgError:
            return np.inf, 0
    square = lambda row: row**2
    sum_of_squares = df['difference'].apply(square).sum()
    return sum_of_squares


x0 = [-20, .0008, 1.1]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead',
                  options={'xtol': 1e-8, 'disp': True})

clf = linear_model.LinearRegression()
x = df[['AADT', 'L']].values  # .as_matrix() was removed in newer pandas
y = df['Crashes']
clf.fit(x, y)
clf.coef_
clf.intercept_

model = OLS(y, add_constant(x))
model_fit = model.fit()
print(model_fit.summary())


def estimator(x, row_in='Crashes'):
    # Negative Poisson likelihood (as a product of probabilities) for the crash model
    estimated = lambda row: exp(x[0] + x[1] * row['AADT'] + x[2] * row['L'])
    df['estimated'] = df.apply(estimated, axis=1)
    # probability = lambda row: (row['estimated']**row[row_in] * exp(-row['estimated'])) / factorial(row[row_in])
    probability = lambda row: poisson.pmf(row[row_in], row['estimated'])
    df['probability'] = df.apply(probability, axis=1)
    product = df['probability'].product()
    return -product


x0 = [1.6, .0000026, .032]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead',
                  options={'xtol': 1e-8, 'disp': True})