def run_regressions(mydf):
    print "\n************ Regression Results ************\n"
    # run a very simple regression to estimate effects of regressors on outcome
    results = smf.ols('dollars_per_day ~ '
                      'C(week_day_name_posted) + day_posted + C(region) + maleness + '
                      'treat_cost + patient_age:smile_scale + '
                      'patient_age + smile_scale', data=mydf).fit()
    print results.summary()
    # smile scale is negative but lacks statistical significance

    # model after dropping insignificant terms (backwards selection process)
    results = smf.ols('dollars_per_day ~ '
                      'weekend_post + treat_cost + patient_age + smile_scale',
                      data=mydf).fit()
    print results.summary()
    # smile scale is negative with p-val < .1

    # run with smile categories (do not treat as a linear relationship)
    mydf = pd.read_csv(towrite_path)
    bins = [0, .45, .55, 1]
    smile_cat_names = ["negative", "neutral", "positive"]
    smile_dums = pd.get_dummies(pd.cut(mydf.smile_scale, bins, labels=smile_cat_names))
    mydf = pd.merge(mydf, smile_dums, left_index=True, right_index=True)
    results = smf.ols('dollars_per_day ~ '
                      'treat_cost + patient_age + '
                      'weekend_post + negative + positive', data=mydf).fit()
    print results.summary()
def run_anova(self): ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)] #ps_lm = mixedlm('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova, groups=ps_table_for_anova['Subject']).fit() ps_lm = ols('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova).fit() anova = anova_lm(ps_lm) self.pass_object('fvalue_rf', anova['F'].values[0:3]) self.pass_object('pvalue_rf', anova['PR(>F)'].values[0:3]) ps_table_for_anova_low = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([10,25])] print 'nsamples =', len(ps_table_for_anova_low) ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_low).fit() anova = anova_lm(ps_lm) self.pass_object('fvalue_rd_low', anova['F'].values[0:3]) self.pass_object('pvalue_rd_low', anova['PR(>F)'].values[0:3]) ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_low).fit() anova = anova_lm(ps_lm) self.pass_object('fvalue_ra_low', anova['F'].values[0:3]) self.pass_object('pvalue_ra_low', anova['PR(>F)'].values[0:3]) ps_table_for_anova_high = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([100,200])] print 'nsamples =', len(ps_table_for_anova_high) ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_high).fit() anova = anova_lm(ps_lm) self.pass_object('fvalue_rd_high', anova['F'].values[0:3]) self.pass_object('pvalue_rd_high', anova['PR(>F)'].values[0:3]) ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_high).fit() anova = anova_lm(ps_lm) self.pass_object('fvalue_ra_high', anova['F'].values[0:3]) self.pass_object('pvalue_ra_high', anova['PR(>F)'].values[0:3])
def test_statsmodels():
    statsmodels = import_module('statsmodels')  # noqa
    import numpy as np  # the formula below refers to np.log
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit()
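# A quick sketch (not part of the original smoke test) of inspecting the fitted result
# produced above, assuming the same Guerry dataset is reachable via get_rdataset:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

guerry = sm.datasets.get_rdataset("Guerry", "HistData").data
fit = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=guerry).fit()
print(fit.params)    # estimated coefficients, including the intercept
print(fit.rsquared)  # in-sample goodness of fit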
def model_formulas(): ''' Define models through formulas ''' # Get the data: # Development of world record times for the 100m Freestyle, for men and women. data = pd.read_csv('swim100m.csv') # Different models model1 = ols("time ~ sex", data).fit() # one factor model2 = ols("time ~ sex + year", data).fit() # two factors model3 = ols("time ~ sex * year", data).fit() # two factors with interaction # Model information print((model1.summary())) print((model2.summary())) print((model3.summary())) # ANOVAs print('----------------- Results ANOVAs: Model 1 -----------------------') print((anova_lm(model1))) print('--------------------- Model 2 -----------------------------------') print((anova_lm(model2))) print('--------------------- Model 3 -----------------------------------') model3Results = anova_lm(model3) print(model3Results) # Just to check the correct run return model3Results['F'][0] # should be 156.1407931415788
def regression(self): print self.people.head(n=1) self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True) # all_bios is the dataframe with the consolidated data. somehow it doesn't work if the class column is named "class" self.logfile.write( "\n\n Sum Temp Interest NegBinom") m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit() self.logfile.write( "\n AIC"+str(+m.aic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) self.logfile.write( "\n\n Sum Temp Interest OLS") m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit() self.logfile.write( "\n AIC"+str(+m.aic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) self.logfile.write( "\n\n Pos Temp Interest NegBinom") m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) #lim_people = self.people[self.people.timePosInterest>0] self.logfile.write( "\n\n Pos Temp Interest OLS") m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular())
def _do_analysis_no_cross_validation(self): """ Find the best model (fit) and create self.list_of_fits and self.fit """ self.list_of_fits = [] # first model is just the mean self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit()) # try to improve the model until no improvements can be found all_exog = self.list_of_exog[:] while all_exog: # try each x in all_exog and overwrite the best_fit if we find a better one # the first best_fit is the one from the previous round best_fit = deepcopy(self.list_of_fits[-1]) for x in all_exog: # make new_fit, compare with best found so far formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x) fit = fm.ols(formula=formula, data=self.df).fit() best_fit = self.find_best_bic([best_fit, fit]) # Sometimes, the obtained fit may be better, but contains unsignificant parameters. # Correct the fit by removing the unsignificant parameters and estimate again best_fit = self._prune(best_fit, p_max=self.p_max) # if best_fit does not contain more variables than last fit in self.list_of_fits, exit if best_fit.model.formula in self.list_of_fits[-1].model.formula: break else: self.list_of_fits.append(best_fit) all_exog.remove(x) self.fit = self.list_of_fits[-1]
def prepare_data(subdata): subdata.ix[notnull(subdata['share']),'cost']=subdata.ix[notnull(subdata['share']),'Pop'] subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost']) ### predicts missing water level data points formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel=sm.ols(formula,data=subdata).fit() predictions=olsmodel.predict(subdata) subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values] formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel2=sm.ols(formula,data=subdata).fit() res2=olsmodel2.params predictions2=olsmodel2.predict(subdata) subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values] ### predicts damages based on a few points using water level subdata['log{}'.format(varin1)]=np.log(subdata[varin1]) subdata['log{}'.format(varin2)]=np.log(subdata[varin2]) formula="costlog ~ log{}".format(varin1) damagemodel=sm.ols(formula,data=subdata).fit() predicted_damages=damagemodel.predict(subdata) subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values] subdata['popestimated']=np.exp(subdata['costlog']) return subdata
def main(): teams = pd.read_csv('../data/Teams.csv') teams = teams[teams['yearID'] >= 1985] teams = teams[['yearID', 'teamID', 'Rank', 'R', 'RA', 'G', 'W', 'H', 'BB', 'HBP', 'AB', 'SF', 'HR', '2B', '3B']] teams = teams.set_index(['yearID', 'teamID']) salaries = pd.read_csv('../data/Salaries.csv') salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum() teams = teams.join(salaries_by_yearID_teamID) plot_spending_wins(teams, 2001) teams['BA'] = teams['H']/teams['AB'] teams['OBP'] = (teams['H'] + teams['BB'] + teams['HBP']) / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF']) teams['SLG'] = (teams['H'] + teams['2B'] + (2*teams['3B']) + (3*teams['HR'])) / teams['AB'] #First Model runs_reg_model1 = sm.ols("R~OBP+SLG+BA",teams) runs_reg1 = runs_reg_model1.fit() #Second Model runs_reg_model2 = sm.ols("R~OBP+SLG",teams) runs_reg2 = runs_reg_model2.fit() #Third Model runs_reg_model3 = sm.ols("R~BA",teams) runs_reg3 = runs_reg_model3.fit() print runs_reg1.summary() print runs_reg2.summary() print runs_reg3.summary()
def RunModels(live): """Runs regressions that predict birth weight. live: DataFrame of pregnancy records """ columns = ['isfirst[T.True]', 'agepreg', 'agepreg2'] header = ['isfirst', 'agepreg', 'agepreg2'] rows = [] formula = 'totalwgt_lb ~ isfirst' results = smf.ols(formula, data=live).fit() rows.append(FormatRow(results, columns)) print(formula) SummarizeResults(results) formula = 'totalwgt_lb ~ agepreg' results = smf.ols(formula, data=live).fit() rows.append(FormatRow(results, columns)) print(formula) SummarizeResults(results) formula = 'totalwgt_lb ~ isfirst + agepreg' results = smf.ols(formula, data=live).fit() rows.append(FormatRow(results, columns)) print(formula) SummarizeResults(results) live['agepreg2'] = live.agepreg**2 formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2' results = smf.ols(formula, data=live).fit() rows.append(FormatRow(results, columns)) print(formula) SummarizeResults(results) PrintTabular(rows, header)
def fit_model(self, model=None, verbose=False): if model is None: # model = "Mach*B_field*Driving*Temperature" ## Full model = "M+B+k+T+M:B+M:T+B:T" #Fractional for i,(stat, vec, last) in enumerate(zip(self.statistics, \ self.respvecs, self.laststep_respvecs)): self.model_matrix["resp"] = Series(vec, index=self.model_matrix.index) self.model_matrix["laststep_resp"] = Series(last, index=self.model_matrix.index) fcn_model = sm.ols("".join(["resp~",model]), data=self.model_matrix) laststep_model = sm.ols("".join(["laststep_resp~",model]), data=self.model_matrix) results = fcn_model.fit() laststep_results = laststep_model.fit() self.fitparam.append(results.params[1:]) self.laststep_fitparam.append(laststep_results.params[1:]) if i==0: self.paramnames = fcn_model.exog_names[1:] # Set the names of the coefficients if verbose: print "Fits for "+ stat print results.summary() print laststep_results.summary() return self
def prepare_data(subdata,gdp_gr,gdp_per_capita_2013,pop_multiplier,pop_affected,endyear): def calccost(pop_exposed,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear): cost=pop_exposed*gdp_per_capita_2013*(1+gdp_gr)**(endyear-2013)*pop_multiplier return cost subdata.ix[notnull(subdata['share']),'cost']=calccost(pop_affected,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear) subdata.ix[notnull(subdata['share']),'costlog']=np.log(subdata.ix[notnull(subdata['share']),'cost']) ### predicts missing water level data points formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel=sm.ols(formula,data=subdata).fit() predictions=olsmodel.predict(subdata) subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values] formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff + bndconditions" olsmodel2=sm.ols(formula,data=subdata).fit() res2=olsmodel2.params predictions2=olsmodel2.predict(subdata) subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values] ### predicts damages based on a few points using water level subdata['log{}'.format(varin1)]=np.log(subdata[varin1]) subdata['log{}'.format(varin2)]=np.log(subdata[varin2]) formula="costlog ~ log{}".format(varin1) damagemodel=sm.ols(formula,data=subdata).fit() predicted_damages=damagemodel.predict(subdata) subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values] subdata['costestimated']=np.exp(subdata['costlog']) return subdata
def partial_correlation(df, x, y, measures): ''' A little (but hopefully quite useful) piece of code that calculates the partial correlation between x and y while covarying for the remaining measures in a list of measures. It requires a data frame, the names of x and y, and a list of measures (that don't need to, but can, contain x or y) This function returns r and p values ''' # Import the modules you need from scipy.stats import pearsonr from statsmodels.formula.api import ols # Your covars are all the measures you've selected # that aren't x and y covars = [ z for z in measures if not z == x and not z == y ] # Your formulae just set x and y to be a function # of all the other covariates formula_x = x + ' ~ ' + ' + '.join(covars) formula_y = y + ' ~ ' + ' + '.join(covars) # Fit both of these formulae lm_x = ols(formula_x, df).fit() lm_y = ols(formula_y, df).fit() # Save the residuals from the model res_x = lm_x.resid res_y = lm_y.resid r, p = pearsonr(res_x, res_y) return r, p
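# A minimal usage sketch for partial_correlation above, on synthetic data (the column
# names x, y, z1, z2 are made up for illustration):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_df = pd.DataFrame(rng.normal(size=(100, 4)), columns=['x', 'y', 'z1', 'z2'])
r, p = partial_correlation(demo_df, 'x', 'y', measures=['x', 'y', 'z1', 'z2'])
print('partial r = %.3f, p = %.3f' % (r, p))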
def linear_forward_selection(X_train, y_train):
    '''
    forward selection: optimize adjusted R-squared by adding the feature that helps
    the most, one at a time, until the score goes down or you run out of features

    not implemented yet. would only make sense for a linear model, not for
    categorical data. presently not called from within the module.
    '''
    # NOTE: `response` and `data` are assumed to exist at module level
    remaining = set(X_train.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
def test_patsy_lazy_dict(): class LazyDict(dict): def __init__(self, data): self.data = data def __missing__(self, key): return np.array(self.data[key]) data = cpunish.load_pandas().data data = LazyDict(data) res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit() res2 = res.predict(data) npt.assert_allclose(res.fittedvalues, res2) data = cpunish.load_pandas().data data['INCOME'].loc[0] = None data = LazyDict(data) data.index = cpunish.load_pandas().data.index res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit() res2 = res.predict(data) assert_equal(res.fittedvalues, res2) # Should lose a record assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))
def run_regressions(data, formulas): """ Run len(formulas) regressions on the clustered data. arguments: data -- Dataset, a dataset with the cdb field initialized to a DataFrame containing clusters and dep.vars. formulas -- a list of strings of the type 'dep_var ~ ex_var + ...'" see statsmodels documentation for details. returns: a list of RegressionResults objects each one containing the results of one regression model. See statsmodels documentation for additional info. """ results = [] # We need to create an additional dataset for the fragility dep.var. # because scores from some countries are missing (marked as 'NA') # if we feed the statsmodels.ols function data with nas, it throws # errors. c_frag = data[data['fragility'] != 'NA'] c_frag[['fragility']] = c_frag['fragility'].astype(float) for f in formulas: if 'fragility' in f: r = sm.ols(formula=f, data=c_frag).fit() else: r = sm.ols(formula=f, data=data).fit() results.append(r) return results
def model_formulas():
    ''' Define models through formulas '''
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')

    # Different models
    model1 = ols("time ~ sex", data).fit()         # one factor
    model2 = ols("time ~ sex + year", data).fit()  # two factors
    model3 = ols("time ~ sex * year", data).fit()  # two factors with interaction

    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())

    # ANOVAs
    print('-----------------------------------------------------------------')
    print(anova_lm(model1))
    print('-----------------------------------------------------------------')
    print(anova_lm(model2))
    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)

    # Just to check the correct run
    return model3Results['F'][0]  # should be 156.1407931415788
def multiple_linear_regression():
    '''Multiple linear regression chapter 6.3, p. 98'''
    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print model.summary()
    print anova_lm(model)

    # as GLM (use a new name so the glm function itself is not shadowed)
    model_glm = glm('carbohydrate ~ age + weight + protein',
                    family=Gaussian(), data=df).fit()
    print 'Same model, calculated with GLM'
    '''
    The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith): OLS uses a method that gives exact results,
    but only works in the special case where all the usual OLS criteria apply -
    iid Gaussian noise etc. GLM instead uses an approximate method which is correct
    asymptotically but may be off for small samples; the tradeoff you get in return
    is that this method works the same way for all GLM models, including those with
    non-Gaussian error terms and non-trivial link functions. So that's why they're
    different.
    '''
    print model_glm.summary()

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print model1.summary()
    print anova_lm(model1)
def backsel(df, response, alpha=0.1):
    '''
    Performs backward selection for regression.

    args:
        df       = data frame with response and covariates
        alpha    = a float indicating confidence level
        response = string that represents the response variable e.g. 'Y'
    attributes:
        summary = ols(formula, data).fit().summary()
    '''
    # initial assignments
    covariates = set(df.columns)
    covariates.remove(response)
    formula = '{} ~ {}'.format(response, ' + '.join(list(covariates)))

    while True:
        pvals = ols(formula, df).fit().pvalues
        candidates = pvals[pvals > alpha]
        if candidates.empty:
            break
        dropvar = candidates[candidates == max(candidates)].index[0]
        covariates.remove(dropvar)
        formula = '{} ~ {}'.format(response, ' + '.join(list(covariates)))

    print('The optimal model is {}'.format(formula))
    return ols(formula, df).fit().summary()
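# A hedged usage sketch for backsel above, on synthetic data; it assumes
# `from statsmodels.formula.api import ols` is already in scope for backsel:
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
demo = pd.DataFrame(rng.normal(size=(200, 3)), columns=['X1', 'X2', 'X3'])
demo['Y'] = 3.0 + 2.0 * demo['X1'] - 1.5 * demo['X3'] + rng.normal(scale=0.5, size=200)
print(backsel(demo, 'Y', alpha=0.1))  # X2 should be dropped, X1 and X3 retained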
def anova_interaction(data_lastDV): """ Two-way ANOVA and interaction analysis of given data http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html Note: 2way ANOVAs are for 2+ categorical independent/causal variables, with 2+ levels each :param data: data frame containing the independent variables in first two columns, dependent in the third :return: None """ col_names = data_lastDV.columns.values # get the columns' names factor_groups = data_lastDV[col_names].dropna() if len(col_names) < 3: print("ERROR in statsMOOC.py: Not enough columns in dataframe to do interaction analysis: " + len(col_names)) # two-way anova formula = col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ")" formula_interaction = formula.replace('+', '*') interaction_lm = ols(formula, data=factor_groups).fit() # linear model print(interaction_lm.summary()) print(FORMAT_LINE) print("- " + col_names[2] + " = " + col_names[0] + " * " + col_names[1] + " Interaction -") print(anova_lm(ols(formula_interaction, data=factor_groups).fit(), interaction_lm)) print(FORMAT_LINE) print("- " + col_names[2] + " = " + col_names[0] + " + " + col_names[1] + " ANOVA -") print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[0] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit())) print(FORMAT_LINE) print("- " + col_names[2] + " = " + col_names[1] + " + " + col_names[0] + " ANOVA -") print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[1] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))
def _do_analysis_cross_validation(self): """ Find the best model (fit) based on cross-valiation (leave one out) """ assert len(self.df) < 15, "Cross-validation is not implemented if your sample contains more than 15 datapoints" # initialization: first model is the mean, but compute cv correctly. errors = [] formula = "Q('{}') ~ 1".format(self.endog) for i in self.df.index: # make new_fit, compute cross-validation and store error df_ = self.df.drop(i, axis=0) fit = fm.ols(formula=formula, data=df_).fit() cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :]) errors.append(cross_prediction['predicted'] - cross_prediction[self.endog]) self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()] self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))] # try to improve the model until no improvements can be found all_exog = self.list_of_exog[:] while all_exog: # import pdb;pdb.set_trace() # try each x in all_exog and overwrite if we find a better one # at the end of iteration (and not earlier), save the best of the iteration better_model_found = False best = dict(fit=self.list_of_fits[-1], cverror=self.list_of_cverrors[-1]) for x in all_exog: formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x) # cross_validation, currently only implemented for monthly data # compute the mean error for a given formula based on leave-one-out. errors = [] for i in self.df.index: # make new_fit, compute cross-validation and store error df_ = self.df.drop(i, axis=0) fit = fm.ols(formula=formula, data=df_).fit() cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :]) errors.append(cross_prediction['predicted'] - cross_prediction[self.endog]) cverror = np.mean(np.abs(np.array(errors))) # compare the model with the current fit if cverror < best['cverror']: # better model, keep it # first, reidentify using all the datapoints best['fit'] = fm.ols(formula=formula, data=self.df).fit() best['cverror'] = cverror better_model_found = True if better_model_found: self.list_of_fits.append(best['fit']) self.list_of_cverrors.append(best['cverror']) else: # if we did not find a better model, exit break # next iteration with the found exog removed all_exog.remove(x) self.fit = self.list_of_fits[-1]
def test_one_column_exog(self):
    from statsmodels.formula.api import ols
    res = ols("y~var1-1", data=self.data).fit()
    plot_regress_exog(res, "var1")
    plt.close('all')
    res = ols("y~var1", data=self.data).fit()
    plot_regress_exog(res, "var1")
    plt.close('all')
def for_all_critics(critics, lim=50, num=5, genre="all", low="no", high="no", mpaa="no", pub="no"): """ Return a sorted list of critics with highest Correlations(r^2) of their reviews to average user reviews :param critics: raw critic data :param lim: filter by number of eligible reviews :param num: filter by this column number :param genre: filter by genre :param low: filter by runtime min :param high: filter by runtime max :param mpaa: filter by mpaa rating :param pub: filter by current critic publication :return: filtered and sorted pandas dataframe """ critics = clean_critic_data(critics, lim) info = [] if pub != "no": critic_list = [] for critic in critics: if critic[0][2][0] == pub: critic_list.append(critic) critics = critic_list for index, critic in enumerate(critics): df = pd.DataFrame(critic[1]) if genre != "all": df = df[df.genre.apply(lambda x: genre in x)] if low != "no": df = df[df['runtime'] > low] if high != "no": df = df[df['runtime'] < high] if mpaa != "no": df = df[df['mpaa_rating'] == mpaa] mean_score = df.mean().score try: mean_user = df.mean().user_review mean_meta = df.mean().metascore mean_meta_diff = mean_score - mean_meta r = df.corr().iloc[0,1] mean_diff = mean_score - mean_user x = smf.ols("score ~ user_review", df).fit() rsquared = x.rsquared rsquared_adj = x.rsquared_adj meta = smf.ols("score ~ metascore", df).fit() meta_rsquared_adj = meta.rsquared_adj row = [critic[0][0], critic[0][2][0], len(critic[1]), r, rsquared, rsquared_adj, mean_score, mean_user, mean_diff, mean_meta_diff, meta_rsquared_adj] info.append(row) except: pass info = sorted(info, key=itemgetter(num)) df = pd.DataFrame(info[::-1]) df.columns = ['Critic', 'Publication', 'Eligible Reviews', "Pearson's Coefficient", "R^2", "R^2 Adj.", "Avg. Critic Review", "Avg. User Review", "Avg. Diff", "Avg. Meta Diff", "Meta R^2 Adj."] return df
def permutation_ols(df, formula, n=500):
    '''
    INPUTS:
        df      - data frame
        formula - text string containing a patsy style formula
                  referring to columns in the data frame
        n       - number of permutations
                  default = 500

    RETURNS:
        t_values - a numpy array of n+1 t values (with the first being the true
                   t-value) for each of the regressors in the model.
                   Note that these t-values are tests of the partial correlation
                   between the regressor and the dependent variable *after* taking
                   into account any variance described by the other regressors
        p_values - the permutation test p-values for each regressor.
                   p < 0.05 --> significantly greater than the null
                   p > 0.95 --> significantly smaller than the null
    '''
    import pandas as pd
    from statsmodels.formula.api import ols
    import numpy as np

    # First calculate the true linear model
    lm_true = ols(formula, df).fit()

    # Write the values you need into numpy arrays
    t_values = np.copy(lm_true.tvalues)

    # Make a copy of the endog (y) and exog (x) values
    # (These are the data you sent to the linear model)
    x = np.copy(lm_true.model.exog)
    y = np.copy(lm_true.model.endog)

    for i in range(n):
        # Now shuffle y while keeping x the same
        np.random.shuffle(y)

        # Recombine your data into a data frame
        df_shuff = pd.DataFrame(np.append(y[..., None], x, 1))
        df_shuff.columns = [lm_true.model.endog_names] + lm_true.model.exog_names

        lm_shuff = ols(formula, df_shuff).fit()
        t_values = np.vstack([t_values, lm_shuff.tvalues])

    # Now calculate the permuted p value for each column in x
    # (use a separate loop variable so the exog array `x` is not clobbered)
    p_values = np.ones(t_values.shape[1])
    for col in range(x.shape[1]):
        p_values[col] = np.sum(t_values[1:, col] < t_values[0, col]) / float(n)

    return t_values, p_values
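# A small usage sketch for permutation_ols above; the column names and formula are
# invented for illustration only:
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
perm_df = pd.DataFrame({'score': rng.normal(size=60),
                        'age': rng.normal(size=60),
                        'group': rng.integers(0, 2, size=60)})
perm_df['score'] = perm_df['score'] + 0.8 * perm_df['age']
t_vals, p_vals = permutation_ols(perm_df, 'score ~ age + group', n=200)
print(p_vals)  # one permutation p-value per regressor (including the intercept)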
def regression(self): print self.people.head(n=1) self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True) # all_bios is the dataframe with the consolidated data. somehow it doesn't work if the class column is named "class" self.logfile.write( "\n\n Num Regions NegativeBinomial") m = glm("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century data=self.people, family=families.NegativeBinomial()).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) #lim_people = self.people[self.people.numRegions>0] self.logfile.write( "\n\n Num Regions OLS") m = ols("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century data=self.people).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) # we could use beta regression for normalized entropy #print "\n\n Region Entropy" #m = ols("entropy ~ C(gender,Treatment(reference='male')) ", #+ C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century # data=self.people).fit() #print m.summary() # <-- this gives you the table of coefficients with p-values, confidence intervals, and so on self.logfile.write( "\n\n Sum Temp Interest") m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit() self.logfile.write( "\n AIC"+str(+m.aic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) self.logfile.write( "\n\n Pos Temp Interest") m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular()) #lim_people = self.people[self.people.timePosInterest>0] self.logfile.write( "\n\n Pos Temp Interest OLS") m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit() self.logfile.write( "\n AIC "+str(m.aic)) self.logfile.write( "\n BIC "+str(m.bic)) for table in m.summary().tables: self.logfile.write(table.as_latex_tabular())
def anova_sm_func(dataframe, model):
    fit_full = ols('value' + ' ~ ' + model[0].fullmodel, data=dataframe).fit()
    fit_null = ols('value' + ' ~ ' + model[0].nullmodel, data=dataframe).fit()
    model_diff = sm.stats.anova_lm(fit_null, fit_full)
    # direction = np.sign(fit_full.tvalues[2])
    # statsout.pvalues[i] = model_diff.values[1, 5]
    # statsout.pvalues_signed[i] = direction*model_diff.values[1, 5]
    # statsout.tvalues[i] = fit_full.tvalues[2]
    # return direction*model_diff.values[1, 5]
    return fit_full.tvalues[2] / abs(fit_full.tvalues[2]) * model_diff.values[1, 5]
def stepwsel(df , response, alpha = 0.1): ''' Performs stepwise selection for regression. ARGS: DF = Data frame with response and covariates alpha = a float indicating confidence level response = string that represents the response variable e.g. 'Y' attributes: summary = ols(formula,data).fit().summary() ''' # initial assignments covariates = set(df.columns) # variables in dataframe covariates.remove(response) # remove Y candidates = [] dropvar =[] optmodelpvals = [0] while True: oldpval = alpha # initial value to enter adding variable if statement rejects = set() # space for variables not entered in model if any(optmodelpvals) > alpha: # drop non-significant dropvar = list(optmodelpvals[optmodelpvals > .1].index) if 'Intercept' in dropvar: dropvar.remove('Intercept') for variable in covariates: candidatesubset = candidates + [variable] [candidatesubset.remove(element) for element in dropvar] # remove variables in dropvar formula = '{} ~ {}'.format(response, ' + '.join(candidatesubset)) # create model based on subset pval = ols(formula,df).fit().pvalues # get pvalues if pval[-1] < oldpval: # if the pavalue of the variable just added is significant then considered to be added var2add = variable # place holder oldpval = pval[-1] # update optmodelpvals = pval optmodelvars = candidatesubset else: rejects.add(variable) # add to rejected if not significant candidates.append(var2add) if covariates == rejects: optmodelvars.remove(dropvar[0]) optmodel = '{} ~ {}'.format(response, ' + '.join(optmodelvars)) print 'The optimal model is {}'.format(optmodel) break covariates.remove(var2add) return ols(optmodel,df).fit().summary()
def getRegression():
    merged = pd.concat([total_amount_lent, total_loans, loans_per_member,
                        borrower_female_count, borrower_male_count], axis=1)
    result = sm.ols(formula="total_amount_lent ~ total_loans", data=merged).fit()
    print result.summary()
    result = sm.ols(formula="total_amount_lent ~ loans_per_member", data=merged).fit()
    print result.summary()
    result = sm.ols(formula="total_amount_lent ~ borrower_female_count", data=merged).fit()
    print result.summary()
    result = sm.ols(formula="total_amount_lent ~ borrower_male_count", data=merged).fit()
    print result.summary()
    result = sm.ols(formula="total_amount_lent ~ intro_len", data=impact).fit()
    print result.summary()
def factor_regression(self, market_return): # form pandas dataframe pars = {'date':[],'return':[], 'Mkt_Rf':[], 'SMB':[], 'HML':[], 'Excess_Return':[]} if self.factor_num == 5: pars['CMA'] = []; pars['RMW'] = [] for num, date in enumerate(sorted(self.returns)): if date not in self.factors: print "French factors haven't update to " + date break if num == 0: print "Starting time: ", date if num == len(self.returns) - 1: print "Ending time:", date pars['date'].append(date) pars['return'].append(self.returns[date].adjClose) pars["Mkt_Rf"].append(self.factors[date].Mkt_Rf) pars["SMB"].append(self.factors[date].SMB) pars["HML"].append(self.factors[date].HML) pars["Excess_Return"].append(self.returns[date].adjClose - self.factors[date].RF) if self.factor_num == 5: pars["CMA"].append(self.factors[date].CMA) pars["RMW"].append(self.factors[date].RMW) pd.DataFrame(pars) #pd.DataFrame(pars, index = dates, columns = ["Mkt_Rf", "SMB", "HML", "Excess_Return"]) if self.factor_num == 3: model = ols("Excess_Return ~ Mkt_Rf", pars).fit() print model.params market_risk_coef = model.params[1] market_risk_premium = 0.0727/12 required_rate_of_return = self.treasuryList["GB6"] / 12 + market_risk_coef * market_risk_premium else: model = ols("Excess_Return ~ Mkt_Rf + SMB + HML + CMA + RMW", pars).fit() # the other two risk premiums come from the average of the French's factors over the 1990 - now market_risk_premium, size_risk_premium, value_risk_premium, profit_risk_premium, \ invest_risk_premium = [0.055 / 12, 0.02 / 12, 0.043 / 12, 0.041 / 12, 0.03 / 12] market_risk_coef, size_risk_coef, value_risk_coef, profit_risk_coef, invest_risk_coef = model.params[1:] required_rate_of_return = self.treasuryList["GB6"] / 12 + market_risk_coef * market_risk_premium \ + size_risk_coef * size_risk_premium + value_risk_coef * value_risk_premium \ + profit_risk_coef * profit_risk_premium + invest_risk_coef * invest_risk_premium print model.summary() alpha = model.params[0] betas = model.params[1] #alpha = np.mean(np.array(pars['return'])) - model.params[1] * market_return var = np.var(np.array(pars['return'])) print "alpha: ", alpha ##########################################################################################################tmp print "beta", betas var = np.var(pars['return']) #print model.params print "Required_rate_of_return:", required_rate_of_return #print "annualized required_rate_of_return:", required_rate_of_return * 12 return key_index(required_rate_of_return, var, alpha, [betas, 0, 0])
def wu_test(form, data, variable):
    """
    Perform the Wu endogeneity test. This test is carried out in 3 steps:

    1. Regress the variable in question on all other exogenous variables
    2. Add the residuals from the aforementioned regression to the main model
    3. Examine the p-value associated with the residual term from the updated
       model from part 2. A statistically significant coeff indicates that the
       tested variable is indeed endogenous.

    Parameters
    ==========
    form : str
        The statsmodels (patsy) formula for the model
    data : pandas.DataFrame
        The pandas DataFrame holding the data for the regression
    variable : str
        The string naming the variable (column) for which to perform the test

    Returns
    =======
    fit : statsmodels.regression.linear_model.RegressionResultsWrapper
        The statsmodels fit object associated with the Wu test.
    """
    endog, exog = form.split("~")
    s2_form = form
    o_exog = list(map(str.strip, exog.split('+')))  # list() so .remove works on Python 3
    o_exog.remove(variable)
    s1_form = variable + ' ~ ' + " + ".join(o_exog)
    s1_fit = sm.ols(s1_form, data=data).fit()
    res_name = 'resid_%s' % (variable)
    data[res_name] = s1_fit.resid
    s2_form += " + %s" % (res_name)
    fit = sm.ols(s2_form, data=data).fit()
    p_val = fit.pvalues[res_name]  # was hard-coded to 'resid_EXP'
    endog_bool = 'not' if p_val >= 0.05 else 'is'
    msg = "WU TEST: The p_value of the added residual is %.4e"
    msg += "\n\t This %s significant at the alpha=0.05 level\n\n"
    print(msg % (p_val, endog_bool))
    return fit
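# A usage sketch for wu_test above, on a synthetic DataFrame; all column names here are
# hypothetical, and `sm` is assumed to be statsmodels.formula.api as in the function body:
import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
n_obs = 500
z = rng.normal(size=n_obs)
exp_var = 0.5 * z + rng.normal(size=n_obs)           # regressor suspected of endogeneity
wage = 1.0 + 2.0 * exp_var + rng.normal(size=n_obs)
wu_df = pd.DataFrame({'wage': wage, 'EXP': exp_var, 'Z': z})
wu_fit = wu_test('wage ~ EXP + Z', wu_df, 'EXP')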
def airline_regression(): #COST PER MILE??? ols_mat =pd.concat([pd.DataFrame(target_frequency),pca_data],axis=1) ols_mat['FLIGHT_COST_2'] =ols_mat['FLIGHT_COST']**2 ols_mat['MARKET_TOT_2'] =ols_mat['MARKET_TOT']**2 ols_mat['COST_DEMAND'] =ols_mat['FLIGHT_COST']*ols_mat['MARKET_TOT'] ols_mat['DEMAND_COMPETITORS'] = ols_mat['MARKET_COMPETITORS']*ols_mat['MARKET_TOT'] ols_mat['COST_COMPETITORS'] = ols_mat['MARKET_COMPETITORS']*ols_mat['FLIGHT_COST'] fit_base = sm.ols(formula="DAILY_FREQ ~ FLIGHT_COST + FLIGHT_COST_2 + MARKET_TOT + MARKET_TOT_2 + SEATS_PER_FLIGHT + MARKET_COMPETITORS", data = ols_mat).fit() fit_base.summary() fit_base = sm.ols(formula="DAILY_FREQ ~ FLIGHT_COST + FLIGHT_COST_2 + MARKET_TOT + DEMAND_COMPETITORS + MARKET_COMPETITORS", data = ols_mat).fit() fit_base.summary() preds =fit_base.predict() MAPE = sum(abs(target_frequency-preds))/sum(target_frequency)
# In[22]:
batting.dtypes

# In[395]:
clf = linear_model.Lasso(alpha=0.1)
clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
print(clf.coef_, clf.intercept_)

# In[382]:
# regression model #1 - OBP and OBP Against
results = smf.ols('Win_per ~ OBP + OBP_Against', data=batting99_03).fit().summary()
print(results)

# In[381]:

# In[56]:
# regression model #2 - SLG and SLG Against
#print(smf.ols('Win_per ~ SLG + SLG_Against', data=batting99_03).fit().summary())

# In[384]:
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 14:51:40 2017

@author: abhishek
"""
import numpy as np  # needed for np.set_printoptions below
import statsmodels.formula.api as sm
import pandas as pd
import seaborn as sns

np.set_printoptions(suppress=True)

df = pd.read_csv("Housing.csv")

model1 = sm.ols(
    formula='price ~ lotsize + bedrooms + bathrms + stories + driveway + recroom + '
            'fullbase + gashw + airco + garagepl + prefarea',
    data=df)
fitted1 = model1.fit()
summary = fitted1.summary()
print(fitted1.summary())

# visualize the relationship between the features and the response using scatterplots
#sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.7)

#############################################
from scipy import stats

x = np.random.random(10)
y = np.random.random(10)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
merged = table * 100.0 ports = ff5.join(mom).reset_index() ports = ports.merge(data[['yyyymm', 'qtr']], how='inner', left_on='date', right_on='yyyymm') del ports['yyyymm'], ports['date'] ports = ports.groupby('qtr').mean() merged = merged.join(ports) merged['MOM'] = merged['Mom '] del merged['Mom '] merged['exmkt'] = merged['Mkt-RF'] merged['mkt'] = merged['exmkt'] + merged['RF'] del merged['Mkt-RF'] reg = smf.ols('spread ~ exmkt', data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4}) reg = smf.ols('spread ~ exmkt + SMB + HML', data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4}) reg = smf.ols('spread ~ exmkt + SMB + HML + RMW + CMA', data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4}) reg = smf.ols('spread ~ exmkt + SMB + HML + RMW + CMA + MOM', data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4}) # Annualize the monthly Alpha print(reg.summary()) alpha = reg.params[0] / 100 print('Annualize Alpha: ' + str((((1 + alpha)**(4)) - 1) * 100) + ' with t-stat: ' + str(reg.tvalues[0])) #------------------------------------------------------------------------------ # TABLE 9 (8 and 9 from old draft) table = positive.pivot_table(index='qtr',
# This property is known as homoscedasticity. # ############################################################################### # Plot sns.violinplot("site", "gm_f", data=brain_vol1) ############################################################################### # Stats with scipy fstat, pval = scipy.stats.f_oneway(*[brain_vol1.gm_f[brain_vol1.site == s] for s in brain_vol1.site.unique()]) print("Oneway Anova gm_f ~ site F=%.2f, p-value=%E" % (fstat, pval)) ############################################################################### # Stats with statsmodels anova = smfrmla.ols("gm_f ~ site", data=brain_vol1).fit() # print(anova.summary()) print("Site explains %.2f%% of the grey matter fraction variance" % (anova.rsquared * 100)) print(sm.stats.anova_lm(anova, typ=2)) ############################################################################### # **2. Test the association between the age and gray matter atrophy** in the # control and patient population independently. ############################################################################### # Plot sns.lmplot("age", "gm_f", hue="group", data=brain_vol1) brain_vol1_ctl = brain_vol1[brain_vol1.group == "Control"]
#print(str(gradient[-1])+" "+str(flow[-1])+" "+str(flowErr[-1])) import pandas as pd import numpy as np import statsmodels.formula.api as sm y_list = gradient x_list = flow y_err = flowErr # put x and y into a pandas DataFrame, and the weights into a Series ws = pd.DataFrame({'x': x_list, 'y': y_list}) weights = pd.Series(y_err) wls_fit = sm.wls('x ~ y', data=ws, weights=1.0 / ((weights)**2)).fit() ols_fit = sm.ols('x ~ y', data=ws).fit() # show the fit summary by calling wls_fit.summary() # wls fit r-squared is 0.754 # ols fit r-squared is 0.701 with open(resultsPlace + "regressionData.dat", 'a') as f: f.writelines([ str(wls_fit.summary()) + "\n", str(wls_fit.params[0]) + " " + str(wls_fit.bse[0]) + "\n", str(wls_fit.params[1]) + " " + str(wls_fit.bse[1]) + "\n" ]) """ # This stuff works, but is far too slow =( for siteIndex in range(2, sysSize-2): changeTimes = []
def greedy(singles, data): """singles = list of individual candidates for parameters in linear model data = pandas dataframe with columns AGDIST and singles returns useful predictors""" global candidates, predictors # First we check if we need to continue from where we left off. if os.path.isfile("predictors/predictors_00.csv"): latest = 0 while True: if not os.path.isfile("predictors/predictors_%02d.csv" % (latest + 1)): break else: latest += 1 file = open("predictors/predictors_%02d.csv" % (latest), "r") predictors = file.read().split("\n") file.close() file = open("candidates/candidates_%02d.csv" % (latest), "r") candidates = file.read().split("\n") file.close() best_bic = smf.ols(formula="AGDIST ~ %s " % ('+'.join(predictors)), data=data).fit().bic loop = latest + 1 else: if not os.path.exists("predictors"): os.makedirs("predictors") if not os.path.exists("candidates"): os.makedirs("candidates") candidates = singles pairs = list(itertools.combinations(singles, 2)) pairs = [x + ":" + y for x, y in pairs] output_can = [] for x in candidates: if len(x) == 5: output_can.append(x) candidates = output_can # print (candidates) best_bic = 0 loop = 0 # Now we run the greedy algorithm using multithreading while not os.path.isfile("predictors.csv"): print("%d predictors, %d candidates" % (len(predictors), len(candidates))) best_candidate = [] bics = Parallel(n_jobs=num_cores)(delayed(get_bic)(candidate, data) for candidate in candidates) min_bic = min(bics) if min_bic < best_bic or best_bic == 0: best_bic = min_bic best_candidate = candidates[bics.index(min_bic)] candidates.remove(best_candidate) predictors.append(best_candidate) file = open("predictors/predictors_%02d.csv" % (loop), "w") file.write("\n".join(predictors)) file.close() file = open("candidates/candidates_%02d.csv" % (loop), "w") file.write("\n".join(candidates)) file.close() loop += 1 else: candidates = singles + pairs output_can2 = [] for i in candidates: if len(i) != 5: output_can2.append(i) candidates = output_can2 while not os.path.isfile("predictors.csv"): print("%d predictors, %d candidates" % (len(predictors), len(candidates))) best_candidate = [] bics = Parallel(n_jobs=num_cores)( delayed(get_bic)(candidate, data) for candidate in candidates) min_bic = min(bics) if min_bic < best_bic or best_bic == 0: best_bic = min_bic best_candidate = candidates[bics.index(min_bic)] candidates.remove(best_candidate) predictors.append(best_candidate) file = open("predictors/predictors_%02d.csv" % (loop), "w") file.write("\n".join(predictors)) file.close() file = open("candidates/candidates_%02d.csv" % (loop), "w") file.write("\n".join(candidates)) file.close() loop += 1 else: file = open("predictors.csv", "w") file.write("\n".join(predictors)) file.close() file = open("candidates.csv", "w") file.write("\n".join(candidates)) file.close() # Save results file = open("predictors.csv", "r") predictors = file.read().split("\n") file.close() print('Done') return predictors
# Using StatsModels # Let's run the same regression using SciPy and StatsModels, and confirm we get the same results. from scipy.stats import linregress import statsmodels.formula.api as smf # Run regression with linregress subset = brfss.dropna(subset=['INCOME2', '_VEGESU1']) xs = subset['INCOME2'] ys = subset['_VEGESU1'] res = linregress(xs,ys) print(res) # Run regression with StatsModels results = smf.ols('_VEGESU1 ~ INCOME2', data = brfss).fit() print(results.params) # Plot income and education # To get a closer look at the relationship between income and education, let's use the variable 'educ' to group the data, then plot mean income in each group. # Here, the GSS dataset has been pre-loaded into a DataFrame called gss. # Group by educ grouped = gss.groupby('educ') # Compute mean income in each group
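# Sanity check (a sketch): the slope from linregress and the INCOME2 coefficient from the
# formula fit above should agree when both are run on the same NaN-free subset; note the
# smf.ols call earlier was given the full `brfss` frame, which drops missing rows internally.
results_subset = smf.ols('_VEGESU1 ~ INCOME2', data=subset).fit()
print(results_subset.params['INCOME2'], res.slope)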
# Purpose: standardize the independent variables
import pandas as pd
from statsmodels.formula.api import ols
import sys

# Read the data set into a pandas DataFrame
wine = pd.read_csv('winequality-both.csv', sep=',', header=0)
wine.columns = wine.columns.str.replace(' ', '_')

my_formula = 'quality ~ alcohol + chlorides + citric_acid + density + fixed_acidity + free_sulfur_dioxide + pH + residual_sugar +' \
             'sulphates + total_sulfur_dioxide + volatile_acidity'

dependent_variable = wine['quality']  # dependent variable (the y values)
independent_variables = wine[wine.columns.difference(
    ['quality', 'type', 'in_sample'])]  # independent variables (the x values)
# difference: A.difference(B) -> the elements of A that are not in B

independent_variables_standardized = (
    independent_variables - independent_variables.mean()) / independent_variables.std()
wine_standardized = pd.concat(
    [dependent_variable, independent_variables_standardized], axis=1)
print(wine_standardized.head())

lm_standardized = ols(my_formula, data=wine_standardized).fit()
print(lm_standardized.summary())

wine_standardized.to_csv('revised_wine.csv', index=False)
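# An alternative sketch: patsy's standardize() transform can be applied inside the formula,
# which avoids building wine_standardized by hand (coefficients may differ very slightly
# because patsy and pandas use different ddof defaults for the standard deviation):
std_formula = ('quality ~ standardize(alcohol) + standardize(chlorides) + standardize(citric_acid) + '
               'standardize(density) + standardize(fixed_acidity) + standardize(free_sulfur_dioxide) + '
               'standardize(pH) + standardize(residual_sugar) + standardize(sulphates) + '
               'standardize(total_sulfur_dioxide) + standardize(volatile_acidity)')
lm_patsy = ols(std_formula, data=wine).fit()
print(lm_patsy.summary())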
def scatter_matrix( df, *, xs: Sequence[str] = None, ys: Sequence[str] = None, width=None, height=None, regression=True, **kwargs, ): assert len(df) > 0, 'TODO handle this' # FIXME handle empty df source = CDS(df) # TODO what about non-numeric stuff? xs = df.columns if xs is None else xs ys = df.columns if ys is None else ys ys = list(reversed( ys)) # reorder to move meaningful stuff to the top left corner isnum = lambda c: is_numeric_dtype(df.dtypes[c]) # reorder so non-numeric is in the back # todo mode to drop non-numeric? not sure.. definitely can drop 'error' and datetimish? xs = list(sorted(xs, key=isnum, reverse=True)) ys = list(sorted(ys, key=isnum, reverse=True)) from bokeh.models import Label # TODO not sure I wanna reuse axis? def make(xc: str, yc: str): p = figure(df=df) diag = xc == yc # todo handle properly # TODO not sure if I even want them... move to the very end? if isnum(xc) and isnum(yc): p.scatter(x=xc, y=yc, source=source, size=3) else: # TODO ugh, doesn't want to show the label without any points?? # p.circle(x=0.0, y=0.0) # FIXME how to make sure text fits into the plot?? add_text( p, x=0.0, y=0.0, text='Not numeric', text_color='red', ) p.xaxis.axis_label = xc p.yaxis.axis_label = yc return p grid = [[make(xc=x, yc=y) for x in xs] for y in ys] from bokeh.layouts import gridplot w1 = None if width is None else width // min(len(xs), len(ys)) h1 = None if height is None else height // min(len(xs), len(ys)) grid_res = gridplot(grid, plot_width=w1, plot_height=h1) # TODO might be useful to include/exclude specific cols (e.g. datetime) while keeping them in annotations # TODO add the presence of the grid to the 'visual tests' # but if I swith it to raw bokeh -- it has Grid class.. might need to mess with # also maybe add extra axis under each plot in the grid? easier for a huge matrix of plots # some code in old dashboard if not regression: return grid_res # todo this would be need for plotly as well? import statsmodels.formula.api as smf # type: ignore for plot in chain.from_iterable(grid): gs = plot.renderers if len(gs) == 0: # must be non-numeric? meh though continue [g] = gs xx = g.glyph.x yy = g.glyph.y if xx == yy: # diagonal thing, e.g. histogram. compute some stats?? continue with pd.option_context('mode.use_inf_as_null', True): # FIXME proper error handling, display number of dropped items? dd = df[[xx, yy]].dropna() # otherwise from_scatter fails # todo would be nice to display stats on the number of points dropped udd = dd.drop_duplicates() if len(udd) <= 1: # can't perform a reasonable regression then add_text( plot, x=0.0, y=0.0, text='ERROR: no points to correlate', text_color='red', ) continue res = smf.ols(f"{yy} ~ {xx}", data=dd).fit() intercept = res.params['Intercept'] slope = res.params[xx] r2 = res.rsquared ## TODO crap. is it really the best way to figure out relative position?? relx = 0.01 rely = 0.1 # todo highlight high enough R2? minx, maxx = min(dd[xx]), max(dd[xx]) miny, maxy = min(dd[yy]), max(dd[yy]) # todo font size dependent on width?? ugh. txt = f'R2 = {r2:.4f}\nY ~ {slope:.3f} X' # todo need to add various regression properties, like intercept, etc # TODO hopefuly this overlays correctly?? not sure about nans, again from bokeh.models import Slope sl = Slope(gradient=slope, y_intercept=intercept, line_color='green', line_width=3) plot.add_layout(sl) add_text( plot, text=txt, x=minx + (maxx - minx) * relx, y=miny + (maxy - miny) * rely, text_color=g.glyph.line_color, ) # TODO dynamic resizing would be nice return grid_res
# %% # quantiles = np.arange(.05, .96, .1) quantiles = [.1, .25, .5, .75, .9] def fit_model(q): res = mod.fit(q=q) return [q, res.params['Intercept'], res.params['ageL']] + \ res.conf_int().loc['ageL'].tolist() models = [fit_model(x) for x in quantiles] models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub']) ols = smf.ols('accumrateL ~ ageL', data).fit() ols_ci = ols.conf_int().loc['ageL'].tolist() ols = dict(a=ols.params['Intercept'], b=ols.params['ageL'], lb=ols_ci[0], ub=ols_ci[1]) print(models) print(ols) # %% x = np.arange(data.ageL.min(), data.ageL.max() + 0.05, .05) get_y = lambda a, b: 10**a * (10**x)**b fig, ax = plt.subplots(figsize=(8, 6))
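# The `mod` used in fit_model above is not defined in this excerpt; presumably it is a
# quantile-regression model constructed earlier. A plausible sketch, reusing the column
# names from the formulas above:
import statsmodels.formula.api as smf
mod = smf.quantreg('accumrateL ~ ageL', data)
print(mod.fit(q=0.5).params)  # median regression as a quick check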
def runRegression(y, x, data, cov_type='HC0'):
    print("Covariance type: %s" % cov_type)
    form = '{0} ~ {1}'.format(y, x)
    mod = smf.ols(formula=form, data=data)
    res = mod.fit(cov_type=cov_type)
    return res
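# A short usage sketch for runRegression above, with a synthetic DataFrame and a
# heteroscedasticity-robust covariance type; assumes `smf` (statsmodels.formula.api)
# is imported as in the function body:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
reg_demo = pd.DataFrame({'x': rng.normal(size=100)})
reg_demo['y'] = 1.0 + 0.5 * reg_demo['x'] + rng.normal(scale=0.3, size=100)
res = runRegression('y', 'x', reg_demo, cov_type='HC1')
print(res.summary())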
df.columns df.head() plt.hist(df.Salary) plt.boxplot(df.Salary,0,'rs',0) plt.plot(df.Salary,df.YearsExperience,'bo');plt.xlabel("Salary");plt.ylabel("Years of Experience") #To find the correlation df.Salary.corr(df.YearsExperience) np.corrcoef(df.Salary,df.YearsExperience) #Model building import statsmodels.formula.api as smf model=smf.ols("Salary~YearsExperience",data=df).fit() model pred=model.predict(df) pred model.params model.summary() #Data Vizualization plt.scatter(x=df['YearsExperience'],y=df['Salary'],color='red');plt.plot(df['YearsExperience'],pred,color='black');plt.xlabel("YearsExperience");plt.ylabel("Salary") pred.corr(df.Salary) #Tranforming variables for accuracy model1=smf.ols("Salary~np.log(YearsExperience)",data=df).fit() model1 model1.summary()
import matplotlib.mlab as mlab from pandas import DataFrame import seaborn as sns import statsmodels.formula.api as sm df = DataFrame({ "Treatment": np.repeat(["Kommerziell", "Vakuum", "Gemischt", "CO2"], [3, 3, 3, 3]), "steak_id":[7.66, 6.98, 7.80, 5.26, 5.44, 5.80, 7.41, 7.33, 7.04, 3.51, 2.91, 3.66] }) df = DataFrame({ "Treatment": np.repeat(["Kommerziell", "Vakuum", "XO2", "Yemischt"], [3, 3, 3, 3]), "steak_id":[7.66, 6.98, 7.80, 5.26, 5.44, 5.80, 7.41, 7.33, 7.04, 10.51, 10.91, 10.66] }) fit = sm.ols("steak_id~Treatment", data=df).fit() fit.summary() help(sm.ols) fit_pred = fit.get_prediction() fit_pred.conf_int() print(fit.summary()) print(fit.params) print("T.CO2: " + str(3.3600 + 0)) print("T.Gemischt: " + str(3.3600 + 3.9000)) print("T.Kommerziell: " + str(3.3600 + 4.1200)) print("T.Vakuum]: " + str(3.3600 + 2.1400))
def anova(df, formula):
    lm = ols(formula, data=df).fit()
    table = sm.stats.anova_lm(lm, typ=2)
    return table
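# A quick usage sketch for the anova helper above; assumes `ols` comes from
# statsmodels.formula.api and `sm` is statsmodels.api, and uses made-up data:
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
aov_demo = pd.DataFrame({'group': np.repeat(['a', 'b', 'c'], 20),
                         'value': rng.normal(size=60)})
print(anova(aov_demo, 'value ~ C(group)'))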
def coefplot(formula, data, fontsize=5): """ Plots coefficients of a regression model. formula = patsy-style formula for regression model data = pandas dataframe with columns for the variables in the formula fontsize = 5 by default returns figure, axes """ lm = smf.ols(formula, data=data).fit() lm0 = smf.ols(formula + "+ 0", data=data).fit() r.assign("data", data) r(""" trunc_reg <- truncreg(%s, data = data, point = 0, direction = 'left') summ <- summary(trunc_reg) coeffs <- trunc_reg$coefficients coeffs <- coeffs[names(coeffs) != "sigma"] coeffs_values <- as.vector(coeffs, mode="numeric") coeffs_bse <- head(summary(trunc_reg)$coefficients[,2], -1) """ % (formula)) params = pd.DataFrame(data=r("coeffs_values"), index=r("names(coeffs)")) params.index = [":".join(sorted(name.split(":"))) for name in params.index] params = params.drop("(Intercept)") params.columns = ["truncreg"] truncreg_bse = pd.DataFrame(data=r("coeffs_bse"), index=r("names(coeffs)")) lm_params = lm.params.drop("Intercept") lm_params.index = [ ":".join(sorted(name.split(":"))) for name in lm_params.index ] params["lm"] = lm_params lm0_params = lm0.params lm0_params.index = [ ":".join(sorted(name.split(":"))) for name in lm0_params.index ] params["lm0"] = lm0_params params = params.sort_values("lm") lm_bse = lm.bse lm_bse.index = [":".join(sorted(name.split(":"))) for name in lm_bse.index] lm0_bse = lm0.bse lm0_bse.index = [ ":".join(sorted(name.split(":"))) for name in lm0_bse.index ] fig, ax = plt.subplots() y = range(len(params.index)) ax.scatter(list(params["truncreg"]), y, color="g", s=2) ax.scatter(list(params["lm"]), y, color="r", s=2) ax.scatter(list(params["lm0"]), y, color="b", s=2) for y in range(len(params.index)): sub = params.index[y] x = params.lm[sub] se = lm_bse[sub] ax.plot([x - se, x + se], [y, y], color="red") x = params.lm0[sub] se = lm0_bse[sub] ax.plot([x - se, x + se], [y, y], color="blue") x = params.truncreg[sub] for perm in list(itertools.permutations(sub.split(":"))): s = ":".join(perm) try: se = truncreg_bse.loc[s] ax.plot([x - se, x + se], [y, y], color="green") except KeyError: pass red_patch = mpatches.Patch(color='red', label='Linear Model') blue_patch = mpatches.Patch(color='blue', label='Forced Zero Intercept') green_patch = mpatches.Patch(color='green', label='Truncated Regression') plt.legend(handles=[red_patch, blue_patch, green_patch], loc=2) plt.yticks(range(len(params.index)), params.index) ax.set_ylim([-1, len(params)]) ax.set_yticklabels(params.index, fontsize=fontsize) ax.set_ylabel("Substitutions") ax.set_xlabel("Coefficients") plt.title("Coefficient plot") plt.grid() fig.savefig("coefplot.png", dpi=200) file = open("lm_summary.txt", "w") file.write(str(lm.summary())) file.close() file = open("lm0_summary.txt", "w") file.write(str(lm0.summary())) file.close() file = open("truncreg_summary.txt", "w") file.write(str(r("summ"))) file.close() return fig, ax
ses_2_group[s] = 'HC' elif subjectNum > 100: ses_2_group[s] = 'MDD' ses_2_info = {} ses_2_info['FD'] = np.array(data2['FD']) ses_2_info['subj'] = list(data2['subj']) ses_2_info['ses'] = ses_2_col ses_2_info['group'] = ses_2_group ses_2_df = pd.DataFrame(data=ses_2_info) FULL_DF = pd.concat([ses_1_df, ses_2_df]) aovrm2way = AnovaRM(FULL_DF, 'FD', 'subj', within=['group']) res2way = aovrm2way.fit() print(res2way) model = ols('FD ~ group*ses', data=FULL_DF).fit() print( f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}" ) model.summary() aov_table = sm.stats.anova_lm(model, typ=2) aov_table # interaction not significant - repeat model = ols('FD ~ group + ses', data=FULL_DF).fit() print( f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}" ) model.summary() aov_table = sm.stats.anova_lm(model, typ=2)
plt.text(x + 0.5, -1, '%s' % array.columns[x], horizontalalignment='center', verticalalignment='center', fontsize=fontsize) plt.tick_params(axis='x', which='both', bottom='on', top='off') plt.tick_params(axis='y', which='both', left='on', right='off') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) ax.spines['left'].set_position('zero') ax.spines['bottom'].set_position('zero') ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) plt.title("%s=%.1f" % (predictor, coeffs[predictor])) fig.subplots_adjust(bottom=0.2) fig.savefig(predictor + "/" + predictor + ".png", dpi=500) return fig, ax if __name__ == "__main__": predictors = [ "KN145", "EK156", "LS157:NS193", "GK135", "DE190:IT214", "EG158_KR050", "LQ226" ] data = pd.read_csv("culled.csv", sep=" ") data = data.drop("Unnamed: 75", axis=1) data = collinify(data) model = smf.ols("AGDIST ~ %s" % ("+".join(predictors)), data=data) coefvis("EK156", model)
df = pd.merge(combined, params_df, on=['fips']) filter_ind = list(range(len(df))) new_filter_ind = np.array( [i for i, x in enumerate(df['statsmodels_p_value']) if x < 0.1]) filter_ind = [i for i in filter_ind if i in new_filter_ind] new_filter_ind = np.array( [i for i, x in enumerate(df['new_positive_cnt_7_day_avg']) if x > 3]) filter_ind = [i for i in filter_ind if i in new_filter_ind] df['Mar_temp_minus_67'] = df['Mar_temp'] - 67 model = smf.ols( formula= 'statsmodels_mean ~ Mar_temp_minus_67 + Mar_precip + frac_female + frac_black + frac_asian + frac_native + frac_hispanic + frac_25_to_44 + frac_45_to_64 + frac_over_64 + np.log(pop_density) + unemployment_rate + np.log(median_household_income)', data=df.iloc[filter_ind]) results = model.fit() print(f'\n#######\n# Offset: {offset}\n#######') print(results.summary()) map_offset_to_df[offset] = df map_offset_to_results[offset] = results df['log_pop_density'] = np.log(df['pop_density']) df['log_median_household_income'] = np.log(df['median_household_income']) corr_dict = dict() for plot_param_name in [ 'Mar_temp_minus_67', 'Mar_precip', 'frac_male', 'frac_female', 'frac_white', 'frac_black', 'frac_asian', 'frac_native',
print('\n\nOut of sample Data and Prediction')
x1n = np.linspace(20.5, 25, 10)
Xnew = np.column_stack((x1n, np.sin(x1n), (x1n - 5)**2))
Xnew = sm.add_constant(Xnew)
ynewpred = olsres.predict(Xnew)  # predict out of sample
print(ynewpred)

import matplotlib.pyplot as plt

# now plot the comparison
fig, ax = plt.subplots()
ax.plot(x1, y, 'o', label="Data")
ax.plot(x1, y_true, 'b-', label="True")
ax.plot(np.hstack((x1, x1n)), np.hstack((ypred, ynewpred)), 'r',
        label="OLS prediction")
ax.legend(loc="best")
plt.show()

# Prediction with formulas, same as above
from statsmodels.formula.api import ols

data = {"x1": x1, "y": y}
res = ols("y ~ x1 + np.sin(x1) + I((x1-5)**2)", data=data).fit()
res.params
res.predict(exog=dict(x1=x1n))
df_origins[df_origins.ARR_DELAY > 0].ARR_DELAY.hist(by=df_origins.ORIGIN, sharex=True)

### plot delay times (y axis) and time of day (x axis)
colors = np.where(df_origins.CARRIER == "AA", "r", "b")
df_origins.plot(x="DEP_TIME", y="ARR_DELAY", kind="scatter", c=colors, alpha=.3)

### plot delay times over the course of the entire time
# np.percentile needs a q argument, so use quantile()/describe() for per-origin summaries
df_origins.groupby("ORIGIN")["ARR_DELAY"].quantile([.25, .5, .75])
df_origins.groupby("ORIGIN")["ARR_DELAY"].describe()

est_s = smf.ols(formula='ARR_DELAY ~ C(ORIGIN)', data=df_origins).fit()  # + C(CARRIER)
est_s.summary()

## just look at time
est_s = smf.ols(
    formula="ARR_DELAY ~ DAY_OF_WEEK + DEP_TIME + AIR_TIME + DISTANCE + SECURITY_DELAY + TOTAL_ADD_GTIME",
    data=df_origins).fit()
est_s.summary()

## CARRIER  # DEP_TIME,
    'cluster3_precentral_postcentral_gyrus', 'cluster4_frontal_pole',
    'cluster5_temporal_pole', 'cluster6_left_hippocampus_amygdala',
    'cluster7_left_caudate_putamen', 'cluster8_left_thalamus',
    'cluster9_right_thalamus', 'cluster10_middle_temporal_gyrus'
]
features = pop_all[features_name].values  # .as_matrix() has been removed from pandas

df = pd.DataFrame()
df["age"] = pop_all["age"].values
df["sex"] = pop_all["sex_num"].values
df["site"] = pop_all["site_num"].values

# replace each feature by its residual after regressing out age, sex and site
for i, f in enumerate(features_name):
    df[f] = features[:, i]
    mod = ols("%s ~ age+sex+C(site)" % f, data=df).fit()
    df[f] = mod.resid
    print(mod.summary())

features_corr = df[[
    'age', 'sex', 'site', 'cluster1_cingulate_gyrus',
    'cluster2_right_caudate_putamen',
    'cluster3_precentral_postcentral_gyrus', 'cluster4_frontal_pole',
    'cluster5_temporal_pole', 'cluster6_left_hippocampus_amygdala',
    'cluster7_left_caudate_putamen', 'cluster8_left_thalamus',
    'cluster9_right_thalamus', 'cluster10_middle_temporal_gyrus'
]]
features_corr.to_csv(
plt.show()

# 3. Apply ANOVA
fvalue, pvalue = stats.f_oneway(df.S_cars, df.M_cars, df.X_cars)
print(fvalue, pvalue)

# get ANOVA table as R like output
# reshape the dataframe so it is suitable for the statsmodels package
df_melt = pd.melt(df.reset_index(), id_vars=['index'],
                  value_vars=['S_cars', 'M_cars', 'X_cars'])
# replace column names
df_melt.columns = ['index', 'cars', 'value']

# Ordinary Least Squares (OLS) model
model = ols('value ~ C(cars)', data=df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)
# The p-value is statistically significant (p < 0.05), so we can conclude
# that there is a meaningful difference between the car types.

# perform multiple pairwise comparison (Tukey HSD)
m_comp = pairwise_tukeyhsd(endog=df_melt['value'], groups=df_melt['cars'], alpha=0.05)
print(m_comp)
# Except for X_cars vs. M_cars, all other pairwise comparisons reject H0
# and indicate a statistically significant difference.

# Levene's test: are the variances equal?
w, pvalue = stats.levene(df.S_cars, df.M_cars, df.X_cars)
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])
# note: categorical_features is only available in older scikit-learn releases
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()

# Avoiding the dummy variable trap (this is done automatically):
# keep all columns starting from index 1
X = X[:, 1:]

# # feature scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
# (y_pred may not appear in the variable explorer until this step has run)
y_pred = regressor.predict(X_test)

# Building the optimal model using backward elimination.
# The array-based OLS lives in statsmodels.api (sm.OLS), not in the formula API.
import statsmodels.api as sm

# add b0 to the model: a column of 50 ones, cast to int
# (axis=0 appends rows, axis=1 appends columns)
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
# X_opt: start with all candidate variables
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
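# A minimal backward-elimination sketch (an assumed helper, not part of the original
# script): refit, drop the column with the highest p-value, and stop once every
# remaining p-value is below the significance level. The intercept column can be
# dropped too if it becomes the least significant term.
def backward_elimination(X, y, sl=0.05):
    X_opt = X.copy()
    while True:
        model = sm.OLS(endog=y, exog=X_opt).fit()
        if model.pvalues.max() <= sl or X_opt.shape[1] == 1:
            return model, X_opt
        worst = int(np.argmax(model.pvalues))  # index of the least significant column
        X_opt = np.delete(X_opt, worst, axis=1)

# regressor_OLS, X_opt = backward_elimination(X, y, sl=0.05)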
    ]
})

fig = plt.figure(1, figsize=(12, 4))
fig.suptitle('')
ax1, ax2 = fig.subplots(1, 2)
ax1.plot(regress['Temperatur'], regress['Spannung'], 'bo')
ax1.axis([0, 100, 2.5, 4.5])
ax1.set_xlabel('Temperatur $T$ / °C')
ax1.set_ylabel('Spannung $U$ / V')
ax1.set_title('Lineare Regression')
ax1.grid(True)

""" Define and fit the linear regression model """
# model = ols("Spannung ~ Temperatur", regress).fit()
# model = ols("Spannung ~ Temperatur + I(Temperatur**2) + I(Temperatur**3)", regress).fit()
model = ols("Spannung ~ I(Temperatur**2) + I(Temperatur**3)", regress).fit()
print(model.summary())
st, data, ss2 = summary_table(model, alpha=0.05)

""" Plot the regression fit together with the original data """
regress['Fit'] = data[:, 2]
# regress['Resid'] = data[:, 3]
ax1.plot(regress['Temperatur'], regress['Fit'], 'b')

""" Compute and plot the residuals """
ax2.stem(regress['Temperatur'], model.resid, 'r',
         use_line_collection=True, markerfmt='ro')
ax2.axis([0, 100, -0.2, 0.2])
#!/usr/bin/env python # coding: utf-8 # In[18]: import pandas as pd Location = "C:/gradedata.csv" df = pd.read_csv(Location) df.head() df.corr() # In[19]: import statsmodels.formula.api as sm result = sm.ols(formula='grade ~ exercise + hours + gender', data=df).fit() result.summary() # In[ ]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import faraway.utils  # attaches the abbreviated .sumary() display used below
#
import faraway.datasets.statedata
statedata = faraway.datasets.statedata.load()
statedata.index = statedata['State']
statedata = statedata.drop('State', axis=1)
statedata.head()
#
lmod = smf.ols('LifeExp ~ Population + Income + Illiteracy + \
    Murder + HSGrad + Frost + Area', statedata).fit()
lmod.sumary()
#
lmod.pvalues.idxmax(), lmod.pvalues.max()
#
lmod = smf.ols('LifeExp ~ Population + Income + Illiteracy + \
    Murder + HSGrad + Frost', statedata).fit()
lmod.pvalues.idxmax(), lmod.pvalues.max()
#
# Dropping the 41 items that can't decide whether they're in a basement or not
srrs2 = srrs2[(srrs2['floor'] == 0) | (srrs2['floor'] == 1)]
srrs2 = srrs2[srrs2['activity'] > 0]
srrs2['uranium_in_county'] = srrs2['county'].map(
    lambda c: list(cty[cty['cty'] == c]['Uppm'])[0])
srrs2['jitter'] = np.random.normal(0, 0.01, len(srrs2))

lacquiparle = srrs2[srrs2['county'] == 'lacquiparle']
lacquiparle_uranium = list(lacquiparle['uranium_in_county'])[0]

# Model is like: y_i = gamma_0 + gamma_1 * u * G + beta * x_i
# log(radon) = g0 + g1 * log(uranium_in_county) + g2 * in_basement
pooled = smf.ols('np.log(activity) ~ floor', data=srrs2).fit()
print(pooled.summary())

unpooled = smf.ols('np.log(activity) ~ floor + county - 1', data=srrs2).fit()
print(unpooled.summary())

# Values from lmer(logradon ~ floor + (1 | county), srrs2) in R
def partial_predict_lacquiparle(floor):
    return 1.9169895 + -0.640674 * floor

plt.figure()
plt.grid()
plt.scatter(srrs2['floor'] + srrs2['jitter'], np.log(srrs2['activity']))
plt.plot([-0.1, 1.1],
         [pooled.predict({'floor': -0.1}),
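# Hedged sketch (an addition, not from the original script): the partially pooled
# model that lmer(logradon ~ floor + (1 | county)) estimates in R corresponds to a
# random intercept per county in statsmodels.
partial = smf.mixedlm('np.log(activity) ~ floor', data=srrs2,
                      groups=srrs2['county']).fit()
print(partial.summary())
# partial.fe_params holds the fixed intercept and floor slope that the hard-coded
# values in partial_predict_lacquiparle approximate.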
#drop null values df_supervised = df_supervised.dropna().reset_index(drop=True) #create dataframe for transformation from time series to supervised df_supervised = df_diff.drop(['prev_sales'], axis=1) #adding lags for inc in range(1, 13): field_name = 'lag_' + str(inc) df_supervised[field_name] = df_supervised['diff'].shift(inc) #drop null values df_supervised = df_supervised.dropna().reset_index(drop=True) # Import statsmodels.formula.api import statsmodels.formula.api as smf # Define the regression formula model = smf.ols(formula='diff ~ lag_1', data=df_supervised) # Fit the regression model_fit = model.fit() # Extract the adjusted r-squared regression_adj_rsq = model_fit.rsquared_adj print(regression_adj_rsq) #import MinMaxScaler and create a new dataframe for LSTM model from sklearn.preprocessing import MinMaxScaler df_model = df_supervised.drop(['sales', 'date'], axis=1) #split train and test set train_set, test_set = df_model[:500].values, df_model[500:].values #apply Min Max Scaler scaler = MinMaxScaler(feature_range=(-1, 1)) scaler = scaler.fit(train_set)
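# Sketch (an addition, not from the original): the diff ~ lag_1 model above is a
# benchmark; the same idea extends to checking how adjusted R-squared changes as
# more of the lag_1..lag_12 features created earlier are included.
for n_lags in range(1, 13):
    lag_terms = ' + '.join('lag_%d' % i for i in range(1, n_lags + 1))
    fit = smf.ols(formula='diff ~ ' + lag_terms, data=df_supervised).fit()
    print(n_lags, round(fit.rsquared_adj, 4))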
    names=[
        '0', '1', 'decimal date', 'average', 'interpolated', 'trend', '#days'
    ])

# time (t) goes on the x-axis and co2 goes on the y-axis
co2 = data['interpolated']
t = data['decimal date']

# the formula interface adds the intercept automatically, so add_constant() is not
# needed here; squaring the time term provides the 'quadratic' aspect
tsquared = t**2

model = smf.ols(formula='co2 ~ t + tsquared', data=data).fit()
results = model.fittedvalues
print(model.summary())

plt.figure(1)
plt.subplot(211)
plt.plot(t, co2)
plt.plot(t, results)
plt.title('Quadratic Model of Atmospheric CO2 at Mauna Loa Observatory')
plt.ylabel('CO2 Concentration (ppmv)')

residuals = model.resid
# c1 = np.cos(2*math.pi*t)
# s1 = np.sin(2*math.pi*t)
# residualModel = smf.ols(formula='residuals ~ t + tsquared + c1 + s1', data=data).fit()
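# Hedged sketch of the seasonal follow-up hinted at by the commented-out lines above:
# add annual harmonics on top of the quadratic trend (assumes t, tsquared, co2 and
# data as defined above).
import numpy as np

seasonal = smf.ols(
    formula='co2 ~ t + tsquared + np.cos(2*np.pi*t) + np.sin(2*np.pi*t)',
    data=data).fit()
print(seasonal.summary())

plt.subplot(212)
plt.plot(t, co2)
plt.plot(t, seasonal.fittedvalues)
plt.ylabel('CO2 Concentration (ppmv)')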
if df[cols][i]: if df[cols][i] == str(df[cols][i]): cats.append(cols) break else: i += 1 # In[ ]: #Choose the related categorical columns and create the final columns list get_ipython().run_line_magic('matplotlib', 'inline') import statsmodels.api as sm from statsmodels.formula.api import ols choice = [] for cat in cats: mod = ols('SalePrice ~ {}'.format(cat), data=df).fit() aov_table = sm.stats.anova_lm(mod, typ=2) choice.append([cat, aov_table['PR(>F)'][0]]) def Sort(sub_li): sub_li.sort(key=lambda x: x[1]) return sub_li choice = Sort(choice) choice = [choice[x][0] for x in range(0, 5)] print(choice) #Merge the columns final_columns = np.array(choice + pd.Series.tolist(categories))