def run_xval_stn(self, stn_id, bw_nngh=100):
    '''
    Run a single leave-one-out cross validation of a geographically
    weighted regression model of a station's monthly and annual normals
    (norm~lst+elev+lon+lat).

    Parameters
    ----------
    stn_id : str
        The stn_id for which to run the cross validation
    bw_nngh : int, optional
        The number of neighbors to use for the geographically weighted
        regression. Default: 100.

    Returns
    -------
    errs : ndarray
        A 13-element array of differences between predicted and observed
        (predicted minus observed): one entry per month plus the annual
        normal.
    '''

    xval_stn = self.stn_da.stns[self.stn_da.stn_idxs[stn_id]]
    df_xval_stn = self.df_stns.loc[stn_id, :]

    self.stn_slct.set_ngh_stns(xval_stn[LAT], xval_stn[LON], bw_nngh,
                               load_obs=False, stns_rm=stn_id)
    df_nghs = self.df_stns.loc[self.stn_slct.ngh_stns[STN_ID], :]

    errs = np.empty(13)

    # Errors for monthly normals
    for mth in np.arange(1, 13):
        ls_form = 'norm%.2d~lst%.2d+elevation+longitude+latitude' % (mth, mth)
        ls_fit = sm.wls(ls_form, data=df_nghs,
                        weights=self.stn_slct.ngh_wgt).fit()
        err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm%.2d' % mth]
        errs[mth - 1] = err

    # Error for annual normal
    ls_form = 'norm~lst+elevation+longitude+latitude'
    ls_fit = sm.wls(ls_form, data=df_nghs,
                    weights=self.stn_slct.ngh_wgt).fit()
    err = ls_fit.predict(df_xval_stn)[0] - df_xval_stn['norm']
    errs[-1] = err

    return errs
def _fit_hdd_only(df, weighted=False):

    bps = [i[4:] for i in df.columns if i[:3] == 'HDD']
    best_bp, best_rsquared, best_mod, best_res = None, -9e9, None, None
    best_formula, hdd_qualified = None, False

    try:  # TODO: fix big try block anti-pattern
        for bp in bps:
            candidate_hdd_formula = 'upd ~ HDD_' + bp
            if (np.nansum(df['HDD_' + bp] > 0) < 10) or \
               (np.nansum(df['HDD_' + bp]) < 20):
                continue
            if weighted:
                candidate_hdd_mod = smf.wls(formula=candidate_hdd_formula,
                                            data=df, weights=df['ndays'])
            else:
                candidate_hdd_mod = smf.ols(formula=candidate_hdd_formula,
                                            data=df)
            candidate_hdd_res = candidate_hdd_mod.fit()
            candidate_hdd_rsquared = candidate_hdd_res.rsquared_adj
            if (candidate_hdd_rsquared > best_rsquared and
                    candidate_hdd_res.params['Intercept'] >= 0 and
                    candidate_hdd_res.params['HDD_' + bp] >= 0 and
                    candidate_hdd_res.pvalues['HDD_' + bp] < 0.1):
                best_bp, best_rsquared = int(bp), candidate_hdd_rsquared
                best_mod, best_res = candidate_hdd_mod, candidate_hdd_res
                hdd_qualified = True
                best_formula = 'upd ~ HDD_' + bp
    except:  # TODO: catch specific error
        best_rsquared, hdd_qualified = 0, False
        best_formula, best_mod, best_res = None, None, None
        best_bp = None

    return (best_formula, best_mod, best_res, best_rsquared, hdd_qualified,
            best_bp)
def determineTrendWLS(dictParam):
    NLag = dictParam['NLag']
    import statsmodels.formula.api as sm
    df = dictParam['df']
    dfOLS = pd.DataFrame(df['Close'])
    dfOLS['i'] = range(0, dfOLS.index.size)
    dfOLS['weight'] = dfOLS['i']
    dfOLS['weight'] = dfOLS['weight'] - dfOLS['i'].mean()
    dfOLS['weight'] = dfOLS['weight'].apply(lambda x: np.power(abs(x), 2))
    dfOLS['weight'] = dfOLS['weight'] / dfOLS['weight'].sum()
    dfOLS = dfOLS.dropna()
    #wls = sm.wls(formula='Close ~ i', data=dfOLS, weights=dfOLS.weight.values).fit()
    wls = sm.wls(formula='Close ~ i', data=dfOLS,
                 weights=dfOLS.weight).fit(cov_type='HAC',
                                           cov_kwds={'maxlags': NLag})
    #wls = sm.wls(formula='Close ~ i', data=dfOLS, weights=dfOLS.weight).fit(cov_type='HC0')
    t = wls.tvalues['i']
    tThreshold = 2
    if t > tThreshold:
        return 1
    elif t < -tThreshold:
        return -1
    else:
        return 0
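A minimal usage sketch for determineTrendWLS. The synthetic price frame, the NLag value, and the module-level numpy/pandas imports are illustrative assumptions, not from the original source.

import numpy as np
import pandas as pd

# an upward-drifting synthetic price series with noise
prices = pd.DataFrame({'Close': np.linspace(100, 110, 60)
                       + np.random.normal(0, 0.5, 60)})
trend = determineTrendWLS({'df': prices, 'NLag': 5})
print(trend)  # 1 = uptrend, -1 = downtrend, 0 = no significant trend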
def model_at(formula, **kwargs):
    data = data_at(**kwargs)
    data.dropna(inplace=True)
    print(data)
    print(formula)
    model = smf.wls(formula, weights=data.weight, data=data)
    return model, data
def _estimate_hour_of_week_occupancy(model_data, threshold):
    index = pd.CategoricalIndex(range(168))
    if model_data.dropna().empty:
        return pd.Series(np.nan, index=index, name="occupancy")

    usage_model = smf.wls(
        formula="meter_value ~ cdd_65 + hdd_50",
        data=model_data,
        weights=model_data.weight,
    )

    model_data_with_residuals = model_data.merge(
        pd.DataFrame({"residuals": usage_model.fit().resid}),
        left_index=True,
        right_index=True,
    )

    def _is_high_usage(df):
        if df.empty:
            return np.nan
        n_positive_residuals = sum(df.residuals > 0)
        n_residuals = float(len(df.residuals))
        ratio_positive_residuals = n_positive_residuals / n_residuals
        return int(ratio_positive_residuals > threshold)

    return (
        model_data_with_residuals.groupby(["hour_of_week"])
        .apply(_is_high_usage)
        .rename("occupancy")
        .reindex(index)  # guarantee an index value for all hours
        .astype(bool)
    )
def wls_cluster(formula, df, wt, clt):
    """
    wt : name of the weight column in df
    clt : name of the cluster-group column in df
    """
    model = wls(formula=formula, data=df, weights=df[wt])
    reg = model.fit(cov_type='cluster', cov_kwds={'groups': df[clt]})
    return reg
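A minimal usage sketch for wls_cluster, assuming wls was imported via from statsmodels.formula.api import wls; the toy panel and its column names are illustrative.

import numpy as np
import pandas as pd
from statsmodels.formula.api import wls

rng = np.random.default_rng(0)
panel = pd.DataFrame({
    'y': rng.normal(size=200),
    'x': rng.normal(size=200),
    'w': rng.uniform(0.5, 1.5, size=200),   # observation weights
    'firm': rng.integers(0, 20, size=200),  # cluster identifier
})
res = wls_cluster('y ~ x', panel, wt='w', clt='firm')
print(res.summary())  # standard errors are cluster-robust by firm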
def WLS(xdata, ydata, xerr):
    ws = pandas.DataFrame({'x': xdata, 'y': ydata})
    weights = pandas.Series(xerr)
    fit = sm.wls('y ~ x', data=ws, weights=1 / weights).fit()
    Int, x = fit.pvalues
    residuals = fit.resid
    rval = fit.rsquared
    residuals = [abs(i) for i in residuals]
    newerr = numpy.sqrt(sum(residuals) / (len(residuals) - 2))
    return round(rval, 2), fit.predict(), round(newerr, 2)
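A minimal usage sketch for WLS, assuming module-level import pandas, import numpy and import statsmodels.formula.api as sm. Note the per-point errors enter as inverse weights, so a larger xerr means less influence; the data below are illustrative.

import numpy
import pandas
import statsmodels.formula.api as sm

xdata = [1.0, 2.0, 3.0, 4.0, 5.0]
ydata = [2.1, 3.9, 6.2, 8.0, 9.8]
xerr = [0.1, 0.1, 0.5, 0.1, 0.1]  # the third point is down-weighted
rsq, predicted, scatter = WLS(xdata, ydata, xerr)
print(rsq, scatter)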
def get_calibration(data):
    data = data.copy()
    try:
        data['weight'] = data.known_concentration**-2
    except ZeroDivisionError:
        data['weight'] = np.nan
    data = data.replace([np.inf, -np.inf], np.nan).dropna(subset=['weight',
                                                                  'area'])
    if not len(data) > 1:
        return

    # Deal with presence/absence of an intercept term according to
    # calibration_config
    _intercept = data.intercept.unique()
    assert len(_intercept) == 1
    intercept = _intercept[0]

    try:
        if intercept == 0:
            fit = sm.wls('area ~ known_concentration - 1',
                         data=data, weights=data.weight).fit()
        else:
            fit = sm.wls('area ~ known_concentration',
                         data=data, weights=data.weight).fit()
    except ValueError as err:
        print(data, file=sys.stderr)
        raise err

    out = {}
    if hasattr(fit.params, 'Intercept'):
        out['intercept'] = fit.params.Intercept
        out['slope'] = fit.params[1]
    else:
        out['intercept'] = 0
        out['slope'] = fit.params[0]
    out['limit_of_detection'] = np.nan  # TODO
    out['observations'] = fit.nobs
    out['relative_standard_error'] = relative_standard_error(fit)
    out['rsquared'] = fit.rsquared
    return pd.Series(out)
def __get_model_fit(
    self, serie: Optional[int] = None
) -> sm.RegressionResultsWrapper:
    if serie is None:
        calibration_data: pd.DataFrame = self.data.calibration_data
    else:
        calibration_data: pd.DataFrame = self.data.get_serie(serie,
                                                             "calibration")
    return smf.wls(
        formula=self.formula,
        weights=dmatrix(self.weight, calibration_data),
        data=calibration_data,
    ).fit()
def statsmodels_results(xdata, ydata, xerr=None):
    ws = pandas.DataFrame({'x': xdata, 'y': ydata})
    if xerr is not None:
        weights = pandas.Series(xerr)
        fit = sm.wls('y ~ x', data=ws, weights=1 / weights).fit()
    else:
        fit = sm.ols('y ~ x', data=ws).fit()
    Int, x = fit.pvalues
    residuals = fit.resid
    rval = fit.rsquared
    residuals = [abs(i) for i in residuals]
    newerr = numpy.sqrt(sum(residuals) / (len(residuals) - 2))
    return fit, round(rval, 2), round(newerr, 2)
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    def _get_hourly_model_formula(data):
        if (np.sum(data.loc[data.weight > 0].occupancy) == 0) or (
            np.sum(data.loc[data.weight > 0].occupancy)
            == len(data.loc[data.weight > 0].occupancy)
        ):
            bin_occupancy_interactions = "".join(
                [" + {}".format(c) for c in data.columns if "bin" in c]
            )
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions
            )
        else:
            bin_occupancy_interactions = "".join(
                [" + {}:C(occupancy)".format(c)
                 for c in data.columns if "bin" in c]
            )
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions
            )

    warnings = []
    if segment_data.dropna().empty:
        model = None
        formula = None
        model_params = None
        warnings.append(
            EEMeterWarning(
                qualified_name="eemeter.fit_caltrack_hourly_model_segment.no_nonnull_data",
                description="The segment contains either an empty dataset or all NaNs.",
                data={
                    "n_rows": segment_data.shape[0],
                    "n_rows_after_dropna": segment_data.dropna().shape[0],
                },
            )
        )
    else:
        formula = _get_hourly_model_formula(segment_data)
        model = smf.wls(formula=formula, data=segment_data,
                        weights=segment_data.weight)
        model_params = {coeff: value
                        for coeff, value in model.fit().params.items()}

    return CalTRACKSegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
def rdestimate(data, y, x, controls=None, cutpoint=0, weights=1):
    """ Wrapper around `smf.wls` to produce `RDestimate`"""
    data["TREATED"] = np.where(data[x] >= cutpoint, 1, 0)
    equation = f"{y} ~ TREATED + {x}"
    if controls is not None:
        if isinstance(controls, list):
            eq_controls = " + ".join(controls)
        elif isinstance(controls, str):
            eq_controls = controls
        else:
            print(type(controls), "controls should be either list or str")
            eq_controls = ""
        if eq_controls:
            equation += " + " + eq_controls
    return smf.wls(equation, data=data, weights=weights)
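A minimal usage sketch for rdestimate on a synthetic sharp-RD dataset, assuming statsmodels.formula.api is imported as smf at module level; the +2.0 jump at the cutoff is what the TREATED coefficient should recover, and all data and column names are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'score': rng.uniform(-1, 1, 500)})
df['outcome'] = (0.5 * df.score + 2.0 * (df.score >= 0)
                 + rng.normal(0, 0.3, 500))
model = rdestimate(df, 'outcome', 'score', cutpoint=0)
print(model.fit().params['TREATED'])  # close to 2.0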
def rdd(input_data, xname, yname=None, cut=0, equation=None, controls=None,
        noconst=False, weights=1, verbose=True):
    '''
    This function implements a linear regression (ordinary or weighted
    least squares) of the outcome variable on the running variable. A
    "TREATED" variable is created, whose coefficient is the causal effect
    of being to the right of the threshold. The user may specify a list
    of controls to be added linearly, or supply their own equation.

    INPUT:

    input_data: dataset with outcome and running variables (and
        potentially controls) (pandas DataFrame)
    xname: name of running variable (string)
    yname: name of outcome variable (string) (default is None - not
        needed if you include your own equation)
    cut: location of threshold in xname (scalar) (default is 0)
    equation: estimation equation as a string (see Statsmodels formula
        syntax for more info)
    controls: list of controls to include in the estimation (list of
        strings) (not needed if you include your own equation)
    noconst: if True, model does not estimate an intercept (bool)
        (default is False)
    weights: weights for weighted least squares (numpy array) (default
        is equal weights, i.e. OLS)
    verbose: if True, print the estimation equation (bool) (default is
        True)

    OUTPUT:

    Statsmodels object
    '''
    if yname is None and equation is None:
        raise NameError(
            "You must supply either an outcome variable name or an "
            "equation to estimate.")
    if 'TREATED' in input_data.columns:
        raise NameError(
            "TREATED is a reserved column name. Please change the name.")
    data = input_data.copy()  # To avoid SettingWithCopy warnings
    data['TREATED'] = np.where(data[xname] >= cut, 1, 0)
    if equation is None:
        equation = yname + ' ~ TREATED + ' + xname
        if controls is not None:
            equation_controls = ' + '.join(controls)
            equation += ' + ' + equation_controls
        if noconst:
            equation += ' -1'
    if verbose:
        print('Estimation Equation:\t', equation)
    rdd_model = smf.wls(equation, data=data, weights=weights)
    return rdd_model
def _fit_full(df, weighted=False, billing=False):

    hdd_bps = [i[4:] for i in df.columns if i[:3] == 'HDD']
    cdd_bps = [i[4:] for i in df.columns if i[:3] == 'CDD']

    best_hdd_bp, best_cdd_bp, best_rsquared, best_mod, best_res = \
        None, None, -9e9, None, None
    best_formula, full_qualified = None, False

    try:  # TODO: fix big try block anti-pattern
        for hdd_bp in hdd_bps:
            for cdd_bp in cdd_bps:
                if cdd_bp < hdd_bp:
                    continue
                candidate_full_formula = 'upd ~ CDD_' + cdd_bp + \
                    ' + HDD_' + hdd_bp
                if not billing:
                    if (np.nansum(df['HDD_' + hdd_bp] > 0) < 10) or \
                       (np.nansum(df['HDD_' + hdd_bp]) < 20):
                        continue
                    if (np.nansum(df['CDD_' + cdd_bp] > 0) < 10) or \
                       (np.nansum(df['CDD_' + cdd_bp]) < 20):
                        continue
                if weighted:
                    candidate_full_mod = smf.wls(
                        formula=candidate_full_formula, data=df,
                        weights=df['ndays'])
                else:
                    candidate_full_mod = smf.ols(
                        formula=candidate_full_formula, data=df)
                candidate_full_res = candidate_full_mod.fit()
                candidate_full_rsquared = candidate_full_res.rsquared_adj
                if (candidate_full_rsquared > best_rsquared and
                        candidate_full_res.params['Intercept'] >= 0 and
                        candidate_full_res.params['HDD_' + hdd_bp] >= 0 and
                        candidate_full_res.params['CDD_' + cdd_bp] >= 0 and
                        candidate_full_res.pvalues['HDD_' + hdd_bp] < 0.1 and
                        candidate_full_res.pvalues['CDD_' + cdd_bp] < 0.1):
                    best_hdd_bp, best_cdd_bp, best_rsquared = \
                        int(hdd_bp), int(cdd_bp), candidate_full_rsquared
                    best_mod, best_res = candidate_full_mod, candidate_full_res
                    full_qualified = True
                    best_formula = 'upd ~ CDD_' + cdd_bp + ' + HDD_' + hdd_bp
    except:  # TODO: catch specific error
        best_rsquared, full_qualified = 0, False
        best_formula, best_mod, best_res = None, None, None
        best_hdd_bp, best_cdd_bp = None, None

    return (best_formula, best_mod, best_res, best_rsquared, full_qualified,
            best_hdd_bp, best_cdd_bp)
def rolling_ols(formula: str, data: pd.DataFrame, window: int, r2_adj=False,
                expanding=False, robust=False,
                M=sm.robust.norms.AndrewWave()):
    para_res = {}
    r_2_res = {}
    model_sig = {}
    forcast_res = pd.Series([])
    for i in range(len(data) - window + 1):
        if expanding:
            start_index = 0
        else:
            start_index = i
        tmp_df = data.iloc[start_index:i + window]
        forcast_x = data.iloc[i + window:i + window + 1]
        if robust:
            rlm_model = smf.rlm(formula, data=tmp_df, M=M)
            ols_result = smf.wls(formula, data=tmp_df,
                                 weights=rlm_model.fit().weights).fit()
            # ols_result = sm.WLS(rlm_model.endog, rlm_model.exog,
            #                     weights=rlm_model.fit().weights).fit()
        else:
            ols_result = smf.ols(formula, data=tmp_df).fit()
        para_res[data.index[i + window - 1]] = ols_result.params
        model_sig[data.index[i + window - 1]] = ols_result.f_pvalue
        if r2_adj:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared_adj
        else:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared
        # one-step-ahead forecast
        forcast_res = forcast_res.append(ols_result.predict(forcast_x))
    para_res = pd.DataFrame(para_res).T
    r_2_res = pd.Series(r_2_res)
    model_sig = pd.Series(model_sig)
    return para_res, r_2_res.mean(), model_sig, forcast_res
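A minimal usage sketch for rolling_ols, assuming module-level imports statsmodels.api as sm and statsmodels.formula.api as smf, and a pandas version where Series.append is still available (it was removed in pandas 2.0). The series below is synthetic.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range('2020-01-01', periods=120, freq='D')
data = pd.DataFrame({'x': rng.normal(size=120)}, index=idx)
data['y'] = 0.8 * data['x'] + rng.normal(0, 0.1, 120)
params, mean_r2, sig, forecasts = rolling_ols('y ~ x', data, window=30)
print(params.tail())  # rolling coefficient estimates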
def calibrate():
    (times, gaussian_means, gaussian_stds) = find_gaussians()
    weights = 1 / np.power(gaussian_stds, 2)

    # put x and y into a pandas DataFrame, and the weights into a Series
    ws = pd.DataFrame({
        'x': times,
        'y': gaussian_means,
        'yerr': [x * 1000 for x in gaussian_stds]
    })

    wls_fit = sm.wls('x ~ y', data=ws, weights=1 / weights).fit()

    return ((wls_fit.params['y'], wls_fit.params['Intercept']),
            (wls_fit.bse['y'], wls_fit.bse['Intercept']))
def init_model(df_train, df_test, model_txt, yield_type='rainfed',
               weight=False):
    if weight:
        results = smf.wls(model_txt, data=df_train, missing='drop',
                          weights=df_train['corn_percent']).fit()
    else:
        results = smf.ols(model_txt, data=df_train, missing='drop').fit()
    return results, df_test.copy().join(
        results.predict(df_test).to_frame(
            'Predicted_' + yield_type_dict[yield_type] + '_ana'))
def _fit_intercept(df, weighted=False):
    int_formula = 'upd ~ 1'
    try:
        if weighted:
            int_mod = smf.wls(formula=int_formula, data=df,
                              weights=df['ndays'])
        else:
            int_mod = smf.ols(formula=int_formula, data=df)
        int_res = int_mod.fit()
    except:  # TODO: catch specific error
        int_rsquared, int_qualified = 0, False
        int_formula, int_mod, int_res = None, None, None
    else:
        int_rsquared, int_qualified = 0, True

    return int_formula, int_mod, int_res, int_rsquared, int_qualified
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    def _get_hourly_model_formula(data):
        bin_occupancy_interactions = "".join(
            [" + {}:C(occupancy)".format(c) for c in data.columns
             if "bin" in c]
        )
        return "meter_value ~ C(hour_of_week) - 1{}".format(
            bin_occupancy_interactions)

    formula = _get_hourly_model_formula(segment_data)
    model = smf.wls(formula=formula, data=segment_data,
                    weights=segment_data.weight)
    model_params = {coeff: value
                    for coeff, value in model.fit().params.items()}
    warnings = []
    return SegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
def setup_class(cls):
    import statsmodels.formula.api as smf

    data = sm.datasets.cpunish.load_pandas()
    endog = data.endog
    data = data.exog
    data['EXECUTIONS'] = endog
    data['INCOME'] /= 1000
    aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2, 1])
    model = smf.glm(
        'EXECUTIONS ~ INCOME + SOUTH - 1', data=data,
        family=sm.families.Gaussian(link=sm.families.links.identity()),
        var_weights=aweights)
    wlsmodel = smf.wls('EXECUTIONS ~ INCOME + SOUTH - 1',
                       data=data, weights=aweights)
    cls.res1 = model.fit(rtol=1e-25, atol=1e-25)
    cls.res2 = wlsmodel.fit()
def armonic(t, m, f, merr):
    ws = pd.DataFrame({
        'x': m,
        'y1': np.sin(2 * np.pi * t * f),
        'y2': np.cos(2 * np.pi * t * f),
        'y3': np.sin(4 * np.pi * t * f),
        'y4': np.cos(4 * np.pi * t * f),
        'y5': np.sin(6 * np.pi * t * f),
        'y6': np.cos(6 * np.pi * t * f),
        'y7': np.sin(8 * np.pi * t * f),
        'y8': np.cos(8 * np.pi * t * f)
    })
    weights = pd.Series(merr)
    wls_fit = sm.wls('x ~ y1+y2+y3+y4+y5+y6+y7+y8-1', data=ws,
                     weights=1 / weights).fit()
    pred = wls_fit.predict()
    r = m - pred
    A = np.zeros(4)
    PH = np.zeros(4)
    # Amplitude of each harmonic from its sine/cosine coefficients
    A[0] = np.sqrt(wls_fit.params[0]**2 + wls_fit.params[1]**2)
    A[1] = np.sqrt(wls_fit.params[2]**2 + wls_fit.params[3]**2)
    A[2] = np.sqrt(wls_fit.params[4]**2 + wls_fit.params[5]**2)
    A[3] = np.sqrt(wls_fit.params[6]**2 + wls_fit.params[7]**2)
    # Phase of each harmonic, referenced to the fundamental
    PH[0] = np.arctan2(wls_fit.params[1], wls_fit.params[0]) - (
        1 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[1] = np.arctan2(wls_fit.params[3], wls_fit.params[2]) - (
        2 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[2] = np.arctan2(wls_fit.params[5], wls_fit.params[4]) - (
        3 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    PH[3] = np.arctan2(wls_fit.params[7], wls_fit.params[6]) - (
        4 * f / f) * np.arctan2(wls_fit.params[1], wls_fit.params[0])
    # Flag influential points via DFFITS and Cook's distance
    influence = inf.OLSInfluence(wls_fit)
    dffits = influence.dffits
    cook = influence.cooks_distance
    leverage = influence.hat_matrix_diag
    inf1 = np.where(dffits[0] > dffits[1])
    inf2 = np.where(cook[1] < 0.05)
    inffin = np.concatenate((inf1, inf2), axis=1)
    return pred, r, A, PH, inffin
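A minimal usage sketch for armonic, assuming module-level imports numpy as np, pandas as pd, statsmodels.formula.api as sm and statsmodels.stats.outliers_influence as inf; the synthetic light curve below is illustrative.

import numpy as np

t = np.linspace(0, 10, 300)    # observation times
f = 0.5                        # frequency, cycles per unit time
m = 1.2 * np.sin(2 * np.pi * t * f) + np.random.normal(0, 0.05, 300)
merr = np.full(300, 0.05)      # per-point uncertainties
pred, resid, A, PH, influential = armonic(t, m, f, merr)
print(A[0])  # amplitude of the fundamental, close to 1.2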
def lm_formula(data, xseq, **params):
    """
    Fit OLS / WLS using a formula
    """
    formula = params['formula']
    eval_env = params['enviroment']
    weights = data.get('weight', None)

    if weights is None:
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.ols(formula, data, eval_env=eval_env, **init_kwargs)
    else:
        if np.any(weights < 0):
            raise ValueError("All weights must be greater than zero.")
        init_kwargs, fit_kwargs = separate_method_kwargs(
            params['method_args'], sm.OLS, sm.OLS.fit)
        model = smf.wls(formula, data, weights=weights, eval_env=eval_env,
                        **init_kwargs)

    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        _, predictors = dmatrices(formula, data, eval_env=eval_env)
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(results, predictors,
                                               alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u
    return data
def forward_select_weighted(df, resp_str, maxk, counts):
    remaining = set(df.columns)
    remaining.remove(resp_str)
    selected = []
    numselected = 1
    score_crnt, score_new = 0.0, 0.0
    while remaining and score_crnt == score_new:
        score_array = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(
                resp_str, ' + '.join(selected + [candidate]))
            score = smf.wls(formula, df, weights=counts).fit().rsquared_adj
            score_array.append((score, candidate))
        score_array.sort()
        score_new, best_option = score_array.pop()
        if score_crnt < score_new and numselected <= maxk:
            remaining.remove(best_option)
            selected.append(best_option)
            score_crnt = score_new
            numselected += 1
    formula = "{} ~ {} + 1".format(resp_str, ' + '.join(selected))
    model = smf.ols(formula, df).fit()
    return model
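A minimal usage sketch for forward_select_weighted, assuming statsmodels.formula.api is imported as smf at module level; the candidate predictors and per-row counts are illustrative.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x1', 'x2', 'x3'])
df['y'] = 2 * df['x1'] - df['x2'] + rng.normal(0, 0.1, 200)
counts = np.ones(200)  # equal counts make the WLS scoring behave like OLS
model = forward_select_weighted(df, 'y', maxk=2, counts=counts)
print(model.model.formula)  # e.g. 'y ~ x1 + x2 + 1'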
def getBestColumns(df, columns, patsy_string_so_far, for_method,
                   includePripas1):
    best_columns = []
    for x in columns:
        if df[x].nunique() > 20 and for_method == 'ANOVA':
            continue
        # Remove future-looking columns
        if x in (['subsid', 'weight', 'priexp1'] +
                 ([] if includePripas1 else ['pripas1'])):
            continue
        if 'priexp' in x or 'genecon' in x or x == 'downchance':
            continue
        if 'exp' in x or 'brexit_' in x:
            continue
        if for_method in ['DT', 'SVM'] and 'age_grp' == x:
            continue
        formula = 'df.priexp1 ~ ' + patsy_string_so_far + 'C(' + x + ')'
        try:
            lm = wls(formula, df, weights=df.weight).fit()
            if lm.nobs > AT_LEAST_THIS_MANY_OBS:
                best_columns.append([lm.rsquared_adj, x, lm.params])
        except:
            pass  # don't handle
    best_columns.sort(reverse=True)
    return best_columns[:(5 if for_method == 'SVM' else 20)]
# <codecell>

m_regression_data[["PVI", "per_black", "per_hisp", "older_pop",
                   "average_income", "romney_give", "obama_give",
                   "educ_coll", "educ_hs"]].corr()

# <codecell>

(today - m_regression_data["poll_date"].astype('O'))

# <codecell>

time_weights = (today - m_regression_data["poll_date"]
                .astype('O')).apply(exp_decay)

# <codecell>

m_model = wls("m ~ PVI + per_hisp + per_black + average_income + educ_coll",
              data=m_regression_data, weights=time_weights).fit()
m_model.summary()

# <codecell>

state_resid = pandas.DataFrame(zip(m_model.resid, m_regression_data.State),
                               columns=["resid", "State"])

# <codecell>

state_resid_group = state_resid.groupby("State")

# <codecell>

fig, axes = plt.subplots(figsize=(12, 8),
                         subplot_kw={"ylabel": "Residual",
                                     "xlabel": "State"})
castle['lead5'] = castle['time_til'] == -5
castle['lead6'] = castle['time_til'] == -6
castle['lead7'] = castle['time_til'] == -7
castle['lead8'] = castle['time_til'] == -8
castle['lead9'] = castle['time_til'] == -9

castle['lag0'] = castle['time_til'] == 0
castle['lag1'] = castle['time_til'] == 1
castle['lag2'] = castle['time_til'] == 2
castle['lag3'] = castle['time_til'] == 3
castle['lag4'] = castle['time_til'] == 4
castle['lag5'] = castle['time_til'] == 5

formula = ("l_homicide ~ r20001 + r20002 + r20003 + "
           "r20011 + r20012 + r20013 + "
           "r20021 + r20022 + r20023 + "
           "r20031 + r20032 + r20033 + "
           "r20041 + r20042 + r20043 + "
           "r20051 + r20052 + r20053 + "
           "r20061 + r20062 + r20063 + "
           "r20071 + r20072 + r20073 + "
           "r20081 + r20082 + r20083 + "
           "r20091 + r20092 + r20093 + "
           "lead1 + lead2 + lead3 + lead4 + lead5 + "
           "lead6 + lead7 + lead8 + lead9 + "
           "lag1 + lag2 + lag3 + lag4 + lag5 + "
           "C(year) + C(state)")

event_study_formula = smf.wls(
    formula, data=castle,
    weights=castle['popwt']).fit(
        cov_type='cluster', cov_kwds={'groups': castle['sid']})

leads = [
    'lead9[T.True]', 'lead8[T.True]', 'lead7[T.True]', 'lead6[T.True]',
    'lead5[T.True]', 'lead4[T.True]', 'lead3[T.True]', 'lead2[T.True]',
    'lead1[T.True]'
]
lags = [
    'lag1[T.True]', 'lag2[T.True]', 'lag3[T.True]', 'lag4[T.True]',
    'lag5[T.True]'
]

leadslags_plot = pd.DataFrame({
    'sd': np.concatenate([
if False:
    formula_rhs = formula_rhs + " + " + " + ".join(gb_cols)
    formula_rhs = formula_rhs + " + " + " + ".join(elorange_cols)

# hey lets just use the elorange columns and see how they do
#formula_rhs = " + ".join(elorange_cols)

formula = "elo ~ " + " + ".join(rhs_cols)

msg("Fitting!")
weights = np.ones(train.shape[0])
do_statsmodels = True
if do_statsmodels:
    ols = sm.wls(formula=formula, data=train, weights=weights).fit()
    print(ols.summary())
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X, y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg(
    {'mean': np.mean, 'median': np.median, 'stdev': np.std})
print(insample_scores)
import os

import pandas as pd
import statsmodels.api as sm
from matplotlib import pyplot as plt
from scipy.stats import levene
from statsmodels.stats.anova import anova_lm
import seaborn as sns
import statsmodels.formula.api as smf

from variables import DIR_OUT

if __name__ == "__main__":
    path_df = os.path.join(DIR_OUT, "derived_tables",
                           "nb_streamlines_hemi_level.csv")
    df = pd.read_csv(path_df)
    # sns.lmplot(x='Mesh_Area', y='Nb_Streamlines_Hemi', hue='Hemisphere',
    #            data=df, truncate=True, robust=True)
    model = smf.wls("Nb_Streamlines_Hemi ~ Mesh_Area -1", data=df).fit()
    print(model.summary())
    df["Corrected_Nb_Streamlines_Hemi"] = model.resid
    # plt.scatter(df['Mesh_Area'].values, df['Nb_Streamlines_Hemi'])
    # plt.plot(df['Mesh_Area'].values,
    #          float(model.params) * (df['Mesh_Area'].values))
    # plt.show()

    # model = smf.ols('Corrected_Nb_Streamlines_Hemi ~ PP_CS_Coord_Iso',
    #                 data=df).fit()
    # print(model.summary())
    # anova = anova_lm(model)
    # print(anova)

    # model = smf.ols('Corrected_Nb_Streamlines_Hemi ~ '
    #                 'C(Hemisphere)*C(HandednessQ)*C(Gender)*C(AgeQ)',
    #                 data=df).fit()
    # print(model.summary())
plt.ylabel("log(Sales)")
plt.title("Log Transformation of y")
plt.scatter(adv.TV, np.log(adv.Sales), alpha=0.3)
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)

# View the residuals
plt.figure()
plt.scatter(est.predict(adv), est.resid, alpha=0.3)
plt.title("Residuals with Log Transformation of y")
plt.xlabel("Predicted log(Sales)")
plt.ylabel("Residuals")

#####
# Option #2: Weighted least squares

w = 1. / (adv.TV)
est_wls = smf.wls(formula='Sales ~ TV', data=adv, weights=w).fit()

# What is the difference?
est = smf.ols(formula='Sales ~ TV', data=adv).fit()
y_hat = est.predict(x_prime)
y_hat_wls = est_wls.predict(x_prime)

plt.xlabel("TV")
plt.ylabel("Sales")
plt.title("OLS (red) vs. WLS (blue)")
plt.scatter(adv.TV, adv.Sales, alpha=0.3)
plt.plot(x_prime, y_hat, 'r', linewidth=2, alpha=0.9)
plt.plot(x_prime, y_hat_wls, 'b', linewidth=2, alpha=0.9)

# What are the pros and cons of these approaches?
def fit_caltrack_hourly_model_segment(segment_name, segment_data):
    """ Fit a model for a single segment.

    Parameters
    ----------
    segment_name : :any:`str`
        The name of the segment.
    segment_data : :any:`pandas.DataFrame`
        A design matrix for caltrack hourly, of the form returned by
        :any:`eemeter.caltrack_hourly_prediction_feature_processor`.

    Returns
    -------
    segment_model : :any:`CalTRACKSegmentModel`
        A model that represents the fitted model.
    """

    def _get_hourly_model_formula(data):
        if (np.sum(data.loc[data.weight > 0].occupancy) == 0) or (
            np.sum(data.loc[data.weight > 0].occupancy)
            == len(data.loc[data.weight > 0].occupancy)
        ):
            bin_occupancy_interactions = "".join(
                [" + {}".format(c) for c in data.columns if "bin" in c]
            )
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions
            )
        else:
            bin_occupancy_interactions = "".join(
                [" + {}:C(occupancy)".format(c)
                 for c in data.columns if "bin" in c]
            )
            return "meter_value ~ C(hour_of_week) - 1{}".format(
                bin_occupancy_interactions
            )

    warnings = []
    if segment_data.dropna().empty:
        model = None
        formula = None
        model_params = None
        warnings.append(
            EEMeterWarning(
                qualified_name="eemeter.fit_caltrack_hourly_model_segment.no_nonnull_data",
                description="The segment contains either an empty dataset or all NaNs.",
                data={
                    "n_rows": segment_data.shape[0],
                    "n_rows_after_dropna": segment_data.dropna().shape[0],
                },
            )
        )
    else:
        formula = _get_hourly_model_formula(segment_data)
        model = smf.wls(formula=formula, data=segment_data,
                        weights=segment_data.weight)
        model_params = {coeff: value
                        for coeff, value in model.fit().params.items()}

    segment_model = CalTRACKSegmentModel(
        segment_name=segment_name,
        model=model,
        formula=formula,
        model_params=model_params,
        warnings=warnings,
    )
    if model:
        this_segment_data = segment_data[segment_data.weight == 1]
        predicted_value = pd.Series(model.fit().predict(this_segment_data))
        segment_model.totals_metrics = ModelMetrics(
            this_segment_data.meter_value, predicted_value,
            len(model_params))
    else:
        segment_model.totals_metrics = None
    return segment_model
t1 = (31 - 2)**0.5 * -0.247984 / (1 + 0.247984**2)**0.5
t.cdf(t1, df=29)

# The residual plot shows clear heteroscedasticity
# plt.scatter(res['地区生产总值'], res['residual'])
# plt.show()

# 1. Weighted least squares
# WLS requires constructing a weight variable. Python cannot search for
# a suitable exponent m automatically, so we choose the value with the
# best likelihood, typically trying m from -2 to 2 in steps of 0.5.
# Here we take the book's result of 2 directly; note that wls takes the
# reciprocal internally, hence x**-2.
data['w'] = data['地区生产总值'].apply(lambda x: x**-2)
model = smf.wls('财政收入~地区生产总值', data=data, weights=data['w'])
result = model.fit()
result.summary()
res = data
res['residual'] = result.resid * (model.weights**0.5)

# Weighted residual plot
# plt.scatter(res['地区生产总值'], res['residual'])
# plt.show()

# 2. Box-Cox transformation
data = pd.read_csv(r"D:/书籍资料整理/应用回归分析/表4-3.csv")
# Using lmbda=None gives results inconsistent with the book's
# description, so lmbda must be specified explicitly
x_norm = stats.boxcox(data['财政收入'], lmbda=0)
mae = mean_absolute_error(test6_df['salary'], test6_df['predicted_salary'])
print('Mean Absolute Error: {}'.format(mae))
rms = np.sqrt(
    mean_squared_error(test6_df['salary'], test6_df['predicted_salary']))
print('Root Mean Squared Error: {}'.format(rms))

## Model 7
## Model 6 using WLS
test7_df = test_df_nooutlines.copy()
train7_df = train_df_nooutlines.copy()

w = np.ones(len(train7_df))
model7 = str('salary ~ conference + wl_ratio + capacity')

train7_fit = statsform.wls(model7, data=train7_df,
                           weights=1. / (w**2)).fit()

train7_df['predicted_salary'] = train7_fit.fittedvalues
test7_df['predicted_salary'] = train7_fit.predict(test7_df)

test_variance7 = round(
    np.power(test7_df['salary'].corr(test7_df['predicted_salary']), 2), 3)
print('Test Set Variance Accounted for: ', test_variance7)

fit7 = statsform.wls(model7, data=train7_df, weights=1. / (w**2)).fit()
print(fit7.summary())

## Model 8
## Model 6 using GLS
test8_df = test_df_nooutlines.copy()
train8_df = train_df_nooutlines.copy()
__author__ = 'Yas'

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import pandas as pd

x_list = [1, 2, 3, 4, 5, 6, 7]
y_list = [1, 2, 3, 1, 5, 6, 7]
y_wts = [0.1, 0.1, 0.1, 0.001, 0.1, 0.1, 0.1]

# put x and y into a pandas DataFrame, and the weights into a Series
ws = pd.DataFrame({
    'x': x_list,
    'y': y_list
})
weights = pd.Series(y_wts)

wls_fit = sm.wls('x ~ y', data=ws, weights=1 / weights).fit()
ols_fit = sm.ols('x ~ y', data=ws).fit()

# show the fit summary by calling wls_fit.summary()
# wls fit r-squared is 0.754
# ols fit r-squared is 0.701

# let's plot our data
plt.clf()
fig = plt.figure()
ax = fig.add_subplot(111, facecolor='w')  # 'axisbg' was removed in matplotlib 2.x
ws.plot(
    kind='scatter',
    x='x',
    y='y',
    style='o',
formula_rhs = formula_rhs + " + " + " + ".join(material_features)
if False:
    formula_rhs = formula_rhs + " + " + " + ".join(gb_cols)
    formula_rhs = formula_rhs + " + " + " + ".join(elorange_cols)

# hey lets just use the elorange columns and see how they do
#formula_rhs = " + ".join(elorange_cols)

msg("Fitting!")
weights = np.ones(train.shape[0])

formula = "elo_avg ~ " + formula_rhs
ols_avg = sm.wls(formula=formula, data=train, weights=weights).fit()
print(ols_avg.summary())

formula = "elo_advantage ~ " + formula_rhs
ols_ea = sm.wls(formula=formula, data=train, weights=weights).fit()
print(ols_ea.summary())

msg("Making predictions for all playergames")
yy_df['ols_avg_prediction'] = ols_avg.predict(yy_df)
yy_df['ols_ea_prediction'] = ols_ea.predict(yy_df)

yy_df['ols_avg_error'] = (yy_df['ols_avg_prediction']
                          - yy_df['elo_avg']).abs()
yy_df['ols_ea_error'] = (yy_df['ols_ea_prediction']
                         - yy_df['elo_advantage']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_avg_error'].agg(
    {'mean': np.mean, 'median': np.median, 'stdev': np.std})
def optimise_combination(self):
    """
    Use multiple linear regression to determine the optimal weighted
    combination of the GEOGRAPHIC, GENETIC and FEATURE methods.
    """
    df = {}
    df["auth"] = self.common_auth_combo_vector
    names = ("geo", "gen", "feat")
    funcs = (distance.build_optimal_geographic_matrix,
             distance.build_optimal_genetic_matrix,
             distance.build_optimal_feature_matrix)
    for name, func in zip(names, funcs):
        austro_method = self.compute_method_vector(
            func(self.austrolangs), self.common_austro_langs,
            self.wals_austro_trans)
        indo_method = self.compute_method_vector(
            func(self.indolangs), self.common_indo_langs,
            self.wals_indo_trans)
        df[name] = np.concatenate([austro_method, indo_method])
    df = pd.DataFrame(df)
    df.to_csv("calibration_results/feature_data.csv")
    model = smf.wls('auth ~ geo + gen + feat', data=df,
                    weights=self.weights).fit()

    fp = open("calibration_results/optimal_combination_weights", "w")
    # fp.write("intercept\t%f\n" % model.params["Intercept"])
    fp.write("intercept\t%f\n" % 0.0)
    fp.write("geo\t%f\n" % model.params["geo"])
    fp.write("gen\t%f\n" % model.params["gen"])
    fp.write("feat\t%f\n" % model.params["feat"])
    fp.close()
    # return (model.params["Intercept"], model.params["geo"],
    #         model.params["gen"], model.params["feat"])

    combo_austro = distance.build_optimal_combination_matrix(self.austrolangs)
    combo_indo = distance.build_optimal_combination_matrix(self.indolangs)
    D, intt, mult = self.fit_models(combo_austro, combo_indo, "combo")
    print("best combo D: ", D)
    fp = open("calibration_results/optimal_combination_weights", "w")
    fp.write("intercept\t%f\n" % intt)
    print(intt)
    fp.write("geo\t%f\n" % (mult * model.params["geo"]))
    print(mult * model.params["geo"])
    fp.write("gen\t%f\n" % (mult * model.params["gen"]))
    print(mult * model.params["gen"])
    fp.write("feat\t%f\n" % (mult * model.params["feat"]))
    print(mult * model.params["feat"])
    fp.close()
    return

    # NOTE: everything below this point is unreachable dead code retained
    # from the original source.
    return (best_intercept, best_weights[0], best_weights[1],
            best_weights[2])
    old_D = 1000
    lowest_D = 1000
    weights = [1.0 / 3, 1.0 / 3, 1.0 / 3]
    best_weights = weights[:]
    intercept = 0.5
    best_intercept = 0.5
    for iterations in range(0, 10000):
        oldweights = weights[:]
        oldint = intercept
        # change params
        if random.randint(1, 100) == 42:
            # Go back to best so far
            weights = best_weights[:]
            intercept = best_intercept
        elif random.randint(1, 3) == 1:
            # shuffle weights
            random.shuffle(weights)
        elif random.randint(1, 3) == 2:
            # shift weights
            source, target = random.sample([0, 1, 2], 2)
            delta = random.sample([0.01, 0.05, 0.1, 0.2], 1)[0]
            if weights[source] > delta:
                weights[source] -= delta
                weights[target] += delta
        elif random.randint(1, 3) == 3:
            # shift intercept
            delta = random.sample([0.01, 0.05, 0.1, 0.2], 1)[0]
            if random.randint(1, 2) == 1 and intercept >= delta:
                intercept -= delta
            elif intercept <= 1.0 - delta:
                intercept += delta
        observations = [weights[0] * a + weights[1] * b + weights[2] * c
                        for a, b, c in zip(geo, gen, feat)]
        D, p = scipy.stats.kstest(observations, baselinecdf)
        if D < old_D or random.randint(1, 100) < 20:
            # We've improved, or it's a rare backward step
            old_D = D
        else:
            # Keep old value
            weights = oldweights[:]
            intercept = oldint
        if D < lowest_D:
            lowest_D = D
            best_weights = weights
            best_intercept = intercept
    # df = {}
    # df["auth"] = self.auth_combo_vector
    # df["geo"] = np.concatenate([geo_austro, geo_indo])
    # df["gen"] = np.concatenate([gen_austro, gen_indo])
    # df["feat"] = np.concatenate([feat_austro, feat_indo])
    # df = pd.DataFrame(df)
    # df.to_csv("calibration_results/combination_data.csv")
    # model = smf.ols('auth ~ geo + gen + feat', data=df).fit()
    # weights = [model.params[x] for x in ("geo", "gen", "feat")]
    fp = open("calibration_results/optimal_combination_weights", "w")
    fp.write("intercept\t%f\n" % best_intercept)
    fp.write("geo\t%f\n" % best_weights[0])
    fp.write("gen\t%f\n" % best_weights[1])
    fp.write("feat\t%f\n" % best_weights[2])
    fp.close()
    return (best_intercept, best_weights[0], best_weights[1],
            best_weights[2])
def optimise_feature(self):
    conn = sqlite3.connect("../WALS2SQL/wals.db")
    cursor = conn.cursor()
    cursor.execute('''PRAGMA cache_size = -25000''')
    wals2sql.compute_dense_features(conn, cursor, 25)
    dense_features = wals2sql.get_dense_features(conn, cursor)
    cursor.close()
    conn.close()

    comparators = distance.build_comparators()

    # Ugly hack
    langs_by_name = {}
    for lang in self.austrolangs:
        langs_by_name[lang.name] = lang
    for lang in self.indolangs:
        langs_by_name[lang.name] = lang

    # Identify good features
    good_features = []
    long_good_features = []
    for index, feature in enumerate(dense_features):
        if feature == bwo:
            continue
        for l1, l2 in itertools.chain(
                itertools.combinations(self.common_austro_langs, 2),
                itertools.combinations(self.common_indo_langs, 2)):
            l1 = langs_by_name[l1]
            l2 = langs_by_name[l2]
            useful_points = 0
            if feature in l1.data and feature in l2.data:
                useful_points += 1
        if useful_points > 0:
            good_features.append("feat%d" % index)
            long_good_features.append(feature)

    # Compute supermeans
    austromeans = {}
    austrosupermean = 0
    austrosupernorm = 0
    for feature in long_good_features:
        austromeans[feature] = 0
        norm = 0
        for l1, l2 in itertools.combinations(self.common_austro_langs, 2):
            l1 = langs_by_name[l1]
            l2 = langs_by_name[l2]
            # pdb.set_trace()
            if feature in l1.data and feature in l2.data:
                austromeans[feature] += comparators[feature](
                    l1.data[feature], l2.data[feature])
                norm += 1
        if norm:
            austromeans[feature] /= norm
            austrosupermean += austromeans[feature]
            austrosupernorm += 1
        else:
            austromeans[feature] = "NODATA"
    if austrosupernorm:
        austrosupermean /= austrosupernorm
    else:
        austrosupermean = 0.5
    for feature in austromeans:
        if austromeans[feature] == "NODATA":
            austromeans[feature] = austrosupermean

    indomeans = {}
    indosupermean = 0
    indosupernorm = 0
    for feature in long_good_features:
        indomeans[feature] = 0
        norm = 0
        for l1, l2 in itertools.combinations(self.common_indo_langs, 2):
            l1 = langs_by_name[l1]
            l2 = langs_by_name[l2]
            if feature in l1.data and feature in l2.data:
                indomeans[feature] += comparators[feature](
                    l1.data[feature], l2.data[feature])
                norm += 1
        if norm:
            indomeans[feature] /= norm
            indosupermean += indomeans[feature]
            indosupernorm += 1
        else:
            indomeans[feature] = "NODATA"
    if indosupernorm:
        indosupermean /= indosupernorm
    else:
        indosupermean = 0.5
    for feature in indomeans:
        if indomeans[feature] == "NODATA":
            indomeans[feature] = indosupermean

    # Actually compute raw data
    df = {}
    df["auth"] = self.common_auth_combo_vector
    for feature, long_feature in zip(good_features, long_good_features):
        if long_feature == bwo:
            continue
        df[feature] = []
        for l1, l2 in itertools.chain(
                itertools.combinations(self.common_austro_langs, 2),
                itertools.combinations(self.common_indo_langs, 2)):
            l1 = langs_by_name[l1]
            l2 = langs_by_name[l2]
            if long_feature in l1.data and long_feature in l2.data:
                df[feature].append(comparators[long_feature](
                    l1.data[long_feature], l2.data[long_feature]))
            else:
                if l1 in self.austrolangs:
                    df[feature].append(austromeans[long_feature])
                else:
                    df[feature].append(indomeans[long_feature])
    df = pd.DataFrame(df)
    df.to_csv("calibration_results/feature_data.csv")
    austrodf = df[0:len(self.common_auth_austro_vector)]
    indodf = df[len(self.common_auth_austro_vector):]

    # Optimise for a fixed length of time
    rank = []
    starttime = time.time()
    while (time.time() - starttime) < 30 * 60:
        # Generate a random binary vector indicating which
        # features are and are not in the model
        on_features = random.randint(1, len(good_features))
        feature_selectors = ([True, ] * on_features +
                             [False, ] * (len(good_features) - on_features))
        random.shuffle(feature_selectors)
        # Fit a model using the randomly selected features
        model_spec = "auth ~ " + " + ".join(
            [feat for feat, sel in zip(good_features, feature_selectors)
             if sel])
        model = smf.wls(model_spec, data=df, weights=self.weights).fit()
        # Compute correlations for the two families separately
        austrofit = model.fittedvalues[0:len(self.common_auth_austro_vector)]
        austroauth = austrodf["auth"]
        austro_correl = austroauth.corr(austrofit)
        indofit = model.fittedvalues[len(self.common_auth_austro_vector):]
        indoauth = indodf["auth"]
        indo_correl = indoauth.corr(indofit)
        # Record pertinent details in a big list
        min_correl = min(austro_correl, indo_correl)
        thingy = (min_correl, austro_correl, indo_correl,
                  feature_selectors.count(True), feature_selectors)
        rank.append(thingy)
        if len(rank) == 50000:
            # List is getting kind of long
            # Let's keep the best 10% and ditch the rest,
            # then keep going...
            rank.sort()
            rank.reverse()
            rank = rank[0:5000]

    # Find the highest min correlation
    rank.sort()
    rank.reverse()
    best_min_correl = rank[0][0]
    # Now, filter rank to include only those models with
    # a min correlation within 5% of the best possible
    # and rank them by number of features in model, finding
    # the highest filter count
    rank = [(c, m, a, i, s) for (m, a, i, c, s) in rank
            if m >= 0.95 * best_min_correl]
    rank.sort()
    rank.reverse()
    highest_count = rank[0][0]
    # Now, filter rank to include only those models with
    # the highest number of features, and rank them by
    # min correlation
    rank = [(m, a, i, c, s) for (c, m, a, i, s) in rank
            if c == highest_count]
    rank.sort()
    rank.reverse()
    # Take the best
    best_selectors = rank[0][-1]
    best_features = [feat for feat, sel in zip(good_features, best_selectors)
                     if sel]
    model_spec = "auth ~ " + " + ".join(best_features)
    model = smf.wls(model_spec, data=df, weights=self.weights).fit()

    weights = {}
    for index, feature in enumerate(dense_features):
        if "feat%d" % index in best_features:
            weights[feature] = model.params["feat%d" % index]
            print(index, weights[feature])
    func = distance.feature_matrix_factory(weights)
    D, intercept, mult = self.evaluate_method(func, "feat")
    fp = open("calibration_results/optimal_feature_weights", "w")
    fp.write("%f\tintercept\n" % (intercept))
    for index, feature in enumerate(dense_features):
        if "feat%d" % index in best_features:
            fp.write("%f\t%s\n" % (mult * model.params["feat%d" % index],
                                   feature))
    fp.close()
    print("Best feature D: ", D)
autoCorr.append(m.log(abs(np.real(
    densTimeSeries[(int(L / 2) + 2)][i] - avDens[(int(L / 2) + 2)][0]))))
trivWeights.append(1.0)
corrInfo.append(
    str(i * 10.0 / float(numTimeSlices - 1)) + " "
    + str(m.log(abs(np.real(
        densTimeSeries[(int(L / 2) + 2)][i]
        - avDens[(int(L / 2) + 2)][0])))) + "\n")

y_list = timeSeries
x_list = autoCorr
y_err = trivWeights

# put x and y into a pandas DataFrame, and the weights into a Series
ws = pd.DataFrame({
    'x': x_list,
    'y': y_list
})
weights = pd.Series(trivWeights)

wls_fit = sm.wls('x ~ y', data=ws, weights=1.0 / ((weights)**2)).fit()
#ols_fit = sm.ols('x ~ y', data=ws).fit()

#print(avDens)

#print("\nThe mean current should be:\n")
avCurr = cscCurrentMatrix.dot(vecsLR)
#print(avCurr)

with open(resultsPlace + 'eigenvalues.dat', 'w') as f:
    for eig in valsLR:
        f.write(str(np.real(eig)) + '\n')

#with open(resultsPlace + 'fullEigenvalues.dat', 'w') as f:
#    for eig in vals:
#        f.write(str(eig) + '\n')