def run_regressions(mydf):

    print("\n************ Regression Results ************\n")

    #run a very simple regression to estimate effects of regressors on outcome
    results = smf.ols('dollars_per_day ~ \
                      C(week_day_name_posted) + day_posted + C(region) + maleness + \
                      treat_cost + patient_age:smile_scale + \
                        patient_age + smile_scale', data=mydf).fit()
    print(results.summary())
    # smile_scale is negative but lacks statistical significance


    # model after dropping insignificant terms (backwards selection process)
    results = smf.ols('dollars_per_day ~ \
                      weekend_post + treat_cost + patient_age + smile_scale', data=mydf).fit()
    print(results.summary())
    # smile_scale is negative with p < .1

    
    # run with smile categories (do not treat as linear relationship)
    mydf = pd.read_csv(towrite_path)
    bins = [0, .45, .55, 1]
    smile_cat_names = ["negative","neutral","positive"]
    smile_dums = pd.get_dummies(pd.cut(mydf.smile_scale, bins, labels=smile_cat_names))
    mydf = pd.merge(mydf,smile_dums,left_index=True,right_index=True)
    results = smf.ols('dollars_per_day ~ \
                      treat_cost + patient_age + \
                      weekend_post + negative + positive', data=mydf).fit()
    print(results.summary())
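
# Illustrative usage sketch (synthetic data, not from the original source):
# the same bin-then-dummy pattern used above, shown end to end with smf.ols.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
demo = pd.DataFrame({'smile_scale': rng.uniform(0, 1, 200),
                     'dollars_per_day': rng.normal(50, 10, 200)})
dums = pd.get_dummies(pd.cut(demo.smile_scale, [0, .45, .55, 1],
                             labels=['negative', 'neutral', 'positive']))
demo = demo.join(dums.astype(int))
print(smf.ols('dollars_per_day ~ negative + positive', data=demo).fit().summary())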
Example #2
    def run_anova(self):
        ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)]

        #ps_lm = mixedlm('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova, groups=ps_table_for_anova['Subject']).fit()
        ps_lm = ols('prob_diff ~ C(Area) * C(Pulse_Frequency)', data=ps_table_for_anova).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rf', anova['F'].values[0:3])
        self.pass_object('pvalue_rf', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_low = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([10,25])]
        print('nsamples =', len(ps_table_for_anova_low))

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_low', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_low', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_low).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_low', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_low', anova['PR(>F)'].values[0:3])

        ps_table_for_anova_high = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin([100,200])]
        print('nsamples =', len(ps_table_for_anova_high))

        ps_lm = ols('prob_diff ~ C(Area) * C(Duration)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_rd_high', anova['F'].values[0:3])
        self.pass_object('pvalue_rd_high', anova['PR(>F)'].values[0:3])

        ps_lm = ols('prob_diff ~ C(Area) * C(Amplitude)', data=ps_table_for_anova_high).fit()
        anova = anova_lm(ps_lm)
        self.pass_object('fvalue_ra_high', anova['F'].values[0:3])
        self.pass_object('pvalue_ra_high', anova['PR(>F)'].values[0:3])
Example #3
def test_statsmodels():

    statsmodels = import_module('statsmodels')  # noqa
    import numpy as np  # np.log is evaluated inside the patsy formula below
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit()
def model_formulas():
    ''' Define models through formulas '''
    
    # Get the data:
    # Development of world record times for the 100m Freestyle, for men and women.
    data = pd.read_csv('swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())
    
    # ANOVAs
    print('----------------- Results ANOVAs: Model 1 -----------------------')
    print(anova_lm(model1))
    
    print('--------------------- Model 2 -----------------------------------')
    print(anova_lm(model2))
    
    print('--------------------- Model 3 -----------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
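
# A possible extension (a sketch on synthetic data, with the swim100m column
# names assumed): nested fits can also be compared directly with anova_lm,
# which F-tests whether the sex:year interaction improves on the additive model.
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

rng = np.random.default_rng(1)
demo = pd.DataFrame({'sex': rng.choice(['M', 'F'], 100),
                     'year': rng.integers(1905, 2010, 100)})
demo['time'] = 60 - 0.1 * (demo.year - 1905) + 5 * (demo.sex == 'F') \
               + rng.normal(0, 2, 100)
m_add = ols("time ~ sex + year", demo).fit()
m_int = ols("time ~ sex * year", demo).fit()
print(anova_lm(m_add, m_int))  # F-test for the added interaction term
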
    def regression(self):

        print(self.people.head(n=1))
        # all_bios is the dataframe with the consolidated data; the regression
        # fails if the column is named "class", so rename it first.
        self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)


        self.logfile.write( "\n\n Sum Temp Interest NegBinom")
        m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        self.logfile.write( "\n\n Sum Temp Interest OLS")
        m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        self.logfile.write( "\n\n Pos Temp Interest NegBinom")
        m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        #lim_people = self.people[self.people.timePosInterest>0]
        self.logfile.write( "\n\n Pos Temp Interest OLS")
        m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())
Example #6
    def _do_analysis_no_cross_validation(self):
        """
        Find the best model (fit) and create self.list_of_fits and self.fit

        """

        self.list_of_fits = []
        # first model is just the mean
        self.list_of_fits.append(fm.ols(formula="Q('{}') ~ 1".format(self.endog), data=self.df).fit())
        # try to improve the model until no improvements can be found
        all_exog = self.list_of_exog[:]
        while all_exog:
            # try each x in all_exog and overwrite the best_fit if we find a better one
            # the first best_fit is the one from the previous round
            best_fit = deepcopy(self.list_of_fits[-1])
            for x in all_exog:
                # make new_fit, compare with best found so far
                formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
                fit = fm.ols(formula=formula, data=self.df).fit()
                best_fit = self.find_best_bic([best_fit, fit])

            # Sometimes the obtained fit is better overall but contains
            # insignificant parameters; prune those and re-estimate.
            best_fit = self._prune(best_fit, p_max=self.p_max)

            # if best_fit does not contain more variables than last fit in self.list_of_fits, exit
            if best_fit.model.formula in self.list_of_fits[-1].model.formula:
                break
            else:
                self.list_of_fits.append(best_fit)
                all_exog.remove(x)
        self.fit = self.list_of_fits[-1]
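
# Minimal standalone sketch (synthetic data) of the forward-selection-by-BIC
# idea used in the method above; a single greedy pass rather than the full
# loop, with fm assumed to be statsmodels.formula.api.
import numpy as np
import pandas as pd
import statsmodels.formula.api as fm

rng = np.random.default_rng(2)
demo = pd.DataFrame(rng.normal(size=(50, 2)), columns=['x1', 'x2'])
demo['y'] = 2 * demo.x1 + rng.normal(size=50)

formula = "y ~ 1"
best = fm.ols(formula=formula, data=demo).fit()
for x in ['x1', 'x2']:
    trial = fm.ols(formula=formula + " + " + x, data=demo).fit()
    if trial.bic < best.bic:  # lower BIC is better
        best, formula = trial, formula + " + " + x
print(best.model.formula)
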
def prepare_data(subdata):

	subdata.loc[notnull(subdata['share']),'cost']=subdata.loc[notnull(subdata['share']),'Pop']
	subdata.loc[notnull(subdata['share']),'costlog']=np.log(subdata.loc[notnull(subdata['share']),'cost'])

	### predicts missing water level data points

	formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel=sm.ols(formula,data=subdata).fit()
	predictions=olsmodel.predict(subdata)
	subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values]

	formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel2=sm.ols(formula,data=subdata).fit()
	res2=olsmodel2.params
	predictions2=olsmodel2.predict(subdata)
	subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values]

	### predicts damages based on a few points using water level
	subdata['log{}'.format(varin1)]=np.log(subdata[varin1])
	subdata['log{}'.format(varin2)]=np.log(subdata[varin2])

	formula="costlog ~ log{}".format(varin1)
	damagemodel=sm.ols(formula,data=subdata).fit()
	predicted_damages=damagemodel.predict(subdata)
	subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values]
	subdata['popestimated']=np.exp(subdata['costlog'])
	return subdata
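
# Condensed sketch (synthetic data) of the fill-missing-with-OLS-predictions
# pattern used above; sm is assumed to be statsmodels.formula.api.
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

rng = np.random.default_rng(3)
demo = pd.DataFrame({'x': rng.normal(size=30)})
demo['y'] = 1.5 * demo.x + rng.normal(scale=0.1, size=30)
demo.loc[demo.index[:5], 'y'] = np.nan          # pretend some y are missing
fit = sm.ols('y ~ x', data=demo).fit()          # rows with NaN are dropped
missing = demo['y'].isnull()
demo.loc[missing, 'y'] = fit.predict(demo)[missing]
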
def main():	

	teams = pd.read_csv('../data/Teams.csv')

	teams = teams[teams['yearID'] >= 1985]
	teams = teams[['yearID', 'teamID', 'Rank', 'R', 'RA', 'G', 'W', 'H', 'BB', 'HBP', 'AB', 'SF', 'HR', '2B', '3B']]

	teams = teams.set_index(['yearID', 'teamID'])

	salaries = pd.read_csv('../data/Salaries.csv')

	salaries_by_yearID_teamID = salaries.groupby(['yearID', 'teamID'])['salary'].sum()

	teams = teams.join(salaries_by_yearID_teamID)

	plot_spending_wins(teams, 2001)

	teams['BA'] = teams['H']/teams['AB']
	teams['OBP'] = (teams['H'] + teams['BB'] + teams['HBP']) / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF'])
	teams['SLG'] = (teams['H'] + teams['2B'] + (2*teams['3B']) + (3*teams['HR'])) / teams['AB']

	#First Model
	runs_reg_model1 = sm.ols("R~OBP+SLG+BA",teams)
	runs_reg1 = runs_reg_model1.fit()
	#Second Model
	runs_reg_model2 = sm.ols("R~OBP+SLG",teams)
	runs_reg2 = runs_reg_model2.fit()
	#Third Model
	runs_reg_model3 = sm.ols("R~BA",teams)
	runs_reg3 = runs_reg_model3.fit()


	print(runs_reg1.summary())
	print(runs_reg2.summary())
	print(runs_reg3.summary())
Example #9
def RunModels(live):
    """Runs regressions that predict birth weight.

    live: DataFrame of pregnancy records
    """
    columns = ['isfirst[T.True]', 'agepreg', 'agepreg2']
    header = ['isfirst', 'agepreg', 'agepreg2']

    rows = []
    formula = 'totalwgt_lb ~ isfirst'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)

    formula = 'totalwgt_lb ~ agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    formula = 'totalwgt_lb ~ isfirst + agepreg'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    live['agepreg2'] = live.agepreg**2
    formula = 'totalwgt_lb ~ isfirst + agepreg + agepreg2'
    results = smf.ols(formula, data=live).fit()
    rows.append(FormatRow(results, columns))
    print(formula)
    SummarizeResults(results)
    
    PrintTabular(rows, header)
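
# Side note (a runnable sketch on synthetic data, not from the source): the
# quadratic term can also be written inline in the patsy formula with I(),
# avoiding the extra agepreg2 column.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(4)
demo = pd.DataFrame({'agepreg': rng.uniform(15, 45, 300),
                     'isfirst': rng.choice([True, False], 300)})
demo['totalwgt_lb'] = (6 + 0.08 * demo.agepreg - 0.001 * demo.agepreg**2
                       - 0.1 * demo.isfirst + rng.normal(0, 0.5, 300))
res = smf.ols('totalwgt_lb ~ isfirst + agepreg + I(agepreg**2)',
              data=demo).fit()
print(res.params)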
Example #10
    def fit_model(self, model=None, verbose=False):

        if model is None:
            # model = "Mach*B_field*Driving*Temperature" ## Full
            model = "M+B+k+T+M:B+M:T+B:T" #Fractional

        for i,(stat, vec, last) in enumerate(zip(self.statistics, \
                        self.respvecs, self.laststep_respvecs)):

            self.model_matrix["resp"] = Series(vec, index=self.model_matrix.index)
            self.model_matrix["laststep_resp"] = Series(last, index=self.model_matrix.index)

            fcn_model = sm.ols("".join(["resp~",model]), data=self.model_matrix)
            laststep_model = sm.ols("".join(["laststep_resp~",model]), data=self.model_matrix)

            results = fcn_model.fit()
            laststep_results = laststep_model.fit()

            self.fitparam.append(results.params[1:])
            self.laststep_fitparam.append(laststep_results.params[1:])

            if i==0:
                self.paramnames = fcn_model.exog_names[1:] # Set the names of the coefficients

            if verbose:
                print("Fits for " + stat)
                print(results.summary())
                print(laststep_results.summary())
        return self
def prepare_data(subdata,gdp_gr,gdp_per_capita_2013,pop_multiplier,pop_affected,endyear):

	def calccost(pop_exposed,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear):
		cost=pop_exposed*gdp_per_capita_2013*(1+gdp_gr)**(endyear-2013)*pop_multiplier
		return cost

	subdata.loc[notnull(subdata['share']),'cost']=calccost(pop_affected,gdp_gr,gdp_per_capita_2013,pop_multiplier,endyear)
	subdata.loc[notnull(subdata['share']),'costlog']=np.log(subdata.loc[notnull(subdata['share']),'cost'])

	### predicts missing water level data points

	formula=varin1+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel=sm.ols(formula,data=subdata).fit()
	predictions=olsmodel.predict(subdata)
	subdata.loc[subdata[varin1].isnull().values,varin1]=predictions[subdata[varin1].isnull().values]

	formula=varin2+" ~ return_period + wetland_loss + climate_change + runoff  + bndconditions"
	olsmodel2=sm.ols(formula,data=subdata).fit()
	res2=olsmodel2.params
	predictions2=olsmodel2.predict(subdata)
	subdata.loc[subdata[varin2].isnull().values,varin2]=predictions2[subdata[varin2].isnull().values]

	### predicts damages based on a few points using water level
	subdata['log{}'.format(varin1)]=np.log(subdata[varin1])
	subdata['log{}'.format(varin2)]=np.log(subdata[varin2])

	formula="costlog ~ log{}".format(varin1)
	damagemodel=sm.ols(formula,data=subdata).fit()
	predicted_damages=damagemodel.predict(subdata)
	subdata.loc[subdata['costlog'].isnull().values,'costlog']=predicted_damages[subdata['costlog'].isnull().values]
	subdata['costestimated']=np.exp(subdata['costlog'])
	return subdata
def partial_correlation(df, x, y, measures):
    '''
    A little (but hopefully quite useful) piece of code that calculates
    the partial correlation between x and y while covarying for the
    remaining measures in a list of measures.
    
    It requires a data frame, the names of x and y, and a list of measures
    (that don't need to, but can, contain x or y)
    
    This function returns r and p values
    '''
    # Import the modules you need
    from scipy.stats import pearsonr
    from statsmodels.formula.api import ols

    # Your covars are all the measures you've selected
    # that aren't x and y
    covars = [ z for z in measures if not z == x and not z == y ]
                                
    # Your formulae just set x and y to be a function
    # of all the other covariates
    formula_x = x + ' ~ ' + ' + '.join(covars)
    formula_y = y + ' ~ ' + ' + '.join(covars)

    # Fit both of these formulae
    lm_x = ols(formula_x, df).fit()
    lm_y = ols(formula_y, df).fit()
        
    # Save the residuals from the model
    res_x = lm_x.resid
    res_y = lm_y.resid
            
    r, p = pearsonr(res_x, res_y)
    
    return r, p
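
# Usage sketch for partial_correlation (synthetic data, hypothetical column
# names): z1 drives both x and y, so the partial r should be near zero even
# though the raw correlation is not.
import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
demo = pd.DataFrame(rng.normal(size=(100, 4)), columns=['x', 'y', 'z1', 'z2'])
demo['x'] += demo.z1
demo['y'] += demo.z1
r, p = partial_correlation(demo, 'x', 'y', ['x', 'y', 'z1', 'z2'])
print('partial r = %.3f, p = %.3f' % (r, p))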
Example #13
def linear_foward_selection(data, response):
    '''
    Forward selection to optimize adjusted R-squared: add the feature that
    helps the most, one at a time, until the score stops improving or the
    features run out. Only makes sense for a linear model with numeric
    regressors; presently not called from within this module.
    '''
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model
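
# Usage sketch (synthetic data) for the forward-selection helper above; smf is
# assumed to be statsmodels.formula.api, imported in the same module.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(6)
demo = pd.DataFrame(rng.normal(size=(80, 3)), columns=['x1', 'x2', 'x3'])
demo['y'] = 3 * demo.x1 - demo.x2 + rng.normal(size=80)
best = linear_foward_selection(demo, 'y')
print(best.model.formula)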
Example #14
def test_patsy_lazy_dict():
    class LazyDict(dict):
        def __init__(self, data):
            self.data = data

        def __missing__(self, key):
            return np.array(self.data[key])

    data = cpunish.load_pandas().data
    data = LazyDict(data)
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    npt.assert_allclose(res.fittedvalues, res2)

    data = cpunish.load_pandas().data
    data['INCOME'].loc[0] = None

    data = LazyDict(data)
    data.index = cpunish.load_pandas().data.index
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()

    res2 = res.predict(data)
    assert_equal(res.fittedvalues, res2)  # Should lose a record
    assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))
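
# Related sketch: patsy formulas accept any dict-like mapping of names to
# arrays, which is what the LazyDict above exploits.
import numpy as np
from statsmodels.formula.api import ols

rng = np.random.default_rng(7)
demo = {'y': rng.normal(size=20), 'x': rng.normal(size=20)}
print(ols('y ~ x', data=demo).fit().params)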
def run_regressions(data, formulas):
    """
    Run len(formulas) regressions on the clustered data.

    arguments:
    data -- Dataset, a dataset with the cdb field initialized to
            a DataFrame containing clusters and dep.vars.
    formulas -- a list of strings of the type 'dep_var ~ ex_var + ...';
                 see statsmodels documentation for details.

    returns:
    a list of RegressionResults objects each one containing the results of
    one regression model. See statsmodels documentation for additional info.
    """
    results = []

    # We need to create an additional dataset for the fragility dep. var.
    # because scores from some countries are missing (marked as 'NA');
    # feeding the statsmodels ols function data with NAs throws errors.
    c_frag = data[data['fragility'] != 'NA'].copy()
    c_frag['fragility'] = c_frag['fragility'].astype(float)

    for f in formulas:
        if 'fragility' in f:
            r = sm.ols(formula=f, data=c_frag).fit()
        else:
            r = sm.ols(formula=f, data=data).fit()
        results.append(r)

    return results
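
# Usage sketch (synthetic data; the 'NA' strings in fragility are exactly the
# case the function special-cases; sm is assumed to be
# statsmodels.formula.api, imported in the same module).
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

rng = np.random.default_rng(8)
demo = pd.DataFrame({'gdp': rng.normal(size=40)})
demo['fragility'] = (2 * demo.gdp + rng.normal(size=40)).round(2).astype(str)
demo.loc[demo.index[:3], 'fragility'] = 'NA'
for r in run_regressions(demo, ['fragility ~ gdp']):
    print(r.params)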
Example #16
def model_formulas():
    ''' Define models through formulas '''
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')
    
    # Different models
    model1 = ols("time ~ sex", data).fit()  # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction
    
    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())
    
    # ANOVAs
    print('-----------------------------------------------------------------')
    print(anova_lm(model1))
    
    print('-----------------------------------------------------------------')
    print(anova_lm(model2))
    
    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)
    
    # Just to check the correct run
    return model3Results['F'][0] # should be 156.1407931415788
Example #17
def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''
    
    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)
    
    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    model_glm = glm('carbohydrate ~ age + weight + protein',
            family=Gaussian(), data=df).fit()  # avoid shadowing the glm function
    print('Same model, calculated with GLM')
    ''' The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith):
    OLS uses a method that gives exact results, but only works in the special
    case where all the usual OLS criteria apply - iid Gaussian noise etc. GLM
    instead uses an approximate method which is correct asymptotically but may
    be off for small samples; the tradeoff you get in return is that this method
    works the same way for all GLM models, including those with non-Gaussian
    error terms and non-trivial link functions. So that's why they're different.
    '''

    print(model_glm.summary())
    
    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
Example #18
def backsel(df, response, alpha = 0.1):
    '''
    Performs backward selection for regression.
    args:
        df = data frame with response and covariates
        alpha = a float indicating confidence level
        response = string that represents the response variable
            e.g. 'Y'
    attributes:
        summary = ols(formula,data).fit().summary()
    '''
    # initial assignments
    covariates = set(df.columns)
    covariates.remove(response)
    formula = '{} ~ {}'.format(response,' + '.join(list(covariates)))
    
    while True:
        
        # p-values of the current fit; the intercept is never dropped
        pvals = ols(formula,df).fit().pvalues.drop('Intercept', errors='ignore')
        candidates = pvals[pvals > alpha]
        
        if candidates.empty:
            break
            
        dropvar = candidates.idxmax()  # least significant covariate
        covariates.remove(dropvar)
        
        formula = '{} ~ {}'.format(response,' + '.join(list(covariates)))
    
    print('The optimal model is {}'.format(formula))
    
    return ols(formula,df).fit().summary()
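
# Usage sketch (synthetic data) for backward selection; ols from
# statsmodels.formula.api is assumed to be in scope for backsel.
import numpy as np
import pandas as pd

rng = np.random.default_rng(9)
demo = pd.DataFrame(rng.normal(size=(60, 3)), columns=['x1', 'x2', 'x3'])
demo['Y'] = 2 * demo.x1 + rng.normal(scale=0.5, size=60)
print(backsel(demo, 'Y'))
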
def anova_interaction(data_lastDV):
    """
    Two-way ANOVA and interaction analysis of given data
    http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html

    Note: two-way ANOVAs are for 2+ categorical independent/causal variables, with 2+ levels each
    :param data_lastDV: data frame containing the independent variables in the first two columns, dependent in the third
    :return: None
    """

    col_names = data_lastDV.columns.values  # get the columns' names
    factor_groups = data_lastDV[col_names].dropna()
    if len(col_names) < 3:
        print("ERROR in statsMOOC.py: not enough columns in dataframe to do interaction analysis: " + str(len(col_names)))

    # two-way anova
    formula = col_names[2] + " ~ C(" + col_names[0] + ") + C(" + col_names[1] + ")"
    formula_interaction = formula.replace('+', '*')
    interaction_lm = ols(formula, data=factor_groups).fit()  # linear model
    print(interaction_lm.summary())

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " * " + col_names[1] + " Interaction -")
    print(anova_lm(ols(formula_interaction, data=factor_groups).fit(), interaction_lm))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[0] + " + " + col_names[1] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[0] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))

    print(FORMAT_LINE)
    print("- " + col_names[2] + " = " + col_names[1] + " + " + col_names[0] + " ANOVA -")
    print(anova_lm(ols(col_names[2] + " ~ C(" + col_names[1] + ")", data=factor_groups).fit(), ols(col_names[2] +" ~ C("+col_names[0]+") + C(" + col_names[1]+", Sum)", data=factor_groups).fit()))
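
# Usage sketch (synthetic two-factor data; ols, anova_lm and FORMAT_LINE are
# assumed to be available at module level, as in the function above).
import numpy as np
import pandas as pd

FORMAT_LINE = '-' * 60
rng = np.random.default_rng(10)
demo = pd.DataFrame({'f1': rng.choice(['a', 'b'], 120),
                     'f2': rng.choice(['lo', 'hi'], 120)})
demo['score'] = (1.0 * (demo.f1 == 'a') + 0.5 * (demo.f2 == 'hi')
                 + rng.normal(0, 0.3, 120))
anova_interaction(demo[['f1', 'f2', 'score']])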
Example #20
    def _do_analysis_cross_validation(self):
        """
        Find the best model (fit) based on cross-validation (leave one out)

        """
        assert len(self.df) < 15, "Cross-validation is not implemented if your sample contains more than 15 datapoints"

        # initialization: first model is the mean, but compute cv correctly.
        errors = []
        formula = "Q('{}') ~ 1".format(self.endog)
        for i in self.df.index:
            # make new_fit, compute cross-validation and store error
            df_ = self.df.drop(i, axis=0)
            fit = fm.ols(formula=formula, data=df_).fit()
            cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
            errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])

        self.list_of_fits = [fm.ols(formula=formula, data=self.df).fit()]
        self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]

        # try to improve the model until no improvements can be found
        all_exog = self.list_of_exog[:]
        while all_exog:
            # try each x in all_exog and overwrite if we find a better one
            # at the end of iteration (and not earlier), save the best of the iteration
            better_model_found = False
            best = dict(fit=self.list_of_fits[-1], cverror=self.list_of_cverrors[-1])
            for x in all_exog:
                formula = self.list_of_fits[-1].model.formula + "+Q('{}')".format(x)
                # cross_validation, currently only implemented for monthly data
                # compute the mean error for a given formula based on leave-one-out.
                errors = []
                for i in self.df.index:
                    # make new_fit, compute cross-validation and store error
                    df_ = self.df.drop(i, axis=0)
                    fit = fm.ols(formula=formula, data=df_).fit()
                    cross_prediction = self._predict(fit=fit, df=self.df.loc[[i], :])
                    errors.append(cross_prediction['predicted'] - cross_prediction[self.endog])
                cverror = np.mean(np.abs(np.array(errors)))
                # compare the model with the current fit
                if cverror < best['cverror']:
                    # better model, keep it
                    # first, reidentify using all the datapoints
                    best['fit'] = fm.ols(formula=formula, data=self.df).fit()
                    best['cverror'] = cverror
                    better_model_found = True

            if better_model_found:
                self.list_of_fits.append(best['fit'])
                self.list_of_cverrors.append(best['cverror'])
            else:
                # if we did not find a better model, exit
                break

            # next iteration with the found exog removed
            all_exog.remove(x)

        self.fit = self.list_of_fits[-1]
    def test_one_column_exog(self):
        from statsmodels.formula.api import ols
        res = ols("y~var1-1", data=self.data).fit()
        plot_regress_exog(res, "var1")
        plt.close('all')
        res = ols("y~var1", data=self.data).fit()
        plot_regress_exog(res, "var1")
        plt.close('all')
Example #22
def for_all_critics(critics, lim=50, num=5, genre="all", low="no", high="no", mpaa="no", pub="no"):
    """
    Return a list of critics sorted by the correlation (r^2) between their reviews and average user reviews
    :param critics: raw critic data
    :param lim: filter by number of eligible reviews
    :param num: filter by this column number
    :param genre: filter by genre
    :param low: filter by runtime min
    :param high: filter by runtime max
    :param mpaa: filter by mpaa rating
    :param pub: filter by current critic publication
    :return: filtered and sorted pandas dataframe
    """
    critics = clean_critic_data(critics, lim)
    info = []
    if pub != "no":
        critic_list = []
        for critic in critics:
            if critic[0][2][0] == pub:
                critic_list.append(critic)
        critics = critic_list

    for index, critic in enumerate(critics):
        df = pd.DataFrame(critic[1])
        if genre != "all":
            df = df[df.genre.apply(lambda x: genre in x)]
        if low != "no":
            df = df[df['runtime'] > low]
        if high != "no":
            df = df[df['runtime'] < high]
        if mpaa != "no":
            df = df[df['mpaa_rating'] == mpaa]
        mean_score = df.mean().score
        try:
            mean_user = df.mean().user_review
            mean_meta = df.mean().metascore
            mean_meta_diff = mean_score - mean_meta
            r = df.corr().iloc[0,1]
            mean_diff = mean_score - mean_user
            x = smf.ols("score ~ user_review", df).fit()
            rsquared = x.rsquared
            rsquared_adj = x.rsquared_adj
            meta = smf.ols("score ~ metascore", df).fit()
            meta_rsquared_adj = meta.rsquared_adj
            row = [critic[0][0], critic[0][2][0], len(critic[1]),
                   r, rsquared, rsquared_adj, mean_score,
                   mean_user, mean_diff, mean_meta_diff, meta_rsquared_adj]
            info.append(row)
        except Exception:
            # some critics lack user reviews or metascores; skip them
            pass
    info = sorted(info, key=itemgetter(num))
    df = pd.DataFrame(info[::-1])
    df.columns = ['Critic', 'Publication', 'Eligible Reviews',
              "Pearson's Coefficient", "R^2", "R^2 Adj.",
              "Avg. Critic Review", "Avg. User Review",
              "Avg. Diff", "Avg. Meta Diff", "Meta R^2 Adj."]
    return df
def permutation_ols(df, formula, n=500):
    '''
    INPUTS:
        df      - data frame
        formula - text string containing a patsy-style formula
                  referring to columns in data frame
        n       - number of permutations
                    default = 500
                    
    RETURNS:
        t_values - a numpy array of n+1 t values (with the first being
                   the true t-value) for each of the regressors in the model
                   Note that these t-values are tests of the partial correlation
                   between the regressor and the dependent variable *after*
                   taking into account any variance described by the other
                   regressors
        p_values - the permutation test p-values for each regressor.
                     p < 0.05 --> significantly greater than the null
                     p > 0.95 --> significantly smaller than the null

    '''
    import pandas as pd
    from statsmodels.formula.api import ols
    import numpy as np
    
    # First calculate the true linear model
    lm_true = ols(formula, df).fit()
    
    # Write the values you need into numpy arrays
    t_values = np.copy(lm_true.tvalues)

    # Make a copy of the endog (y) and exog (x) values
    # (These are the data you sent to the linear model)
    x = np.copy(lm_true.model.exog)
    y = np.copy(lm_true.model.endog)
    
    for i in range(n):
        # Now shuffle y while keeping x the same
        np.random.shuffle(y)
        
        # Recombine your data into a data frame
        df_shuff = pd.DataFrame(np.append(y[..., None], x, 1))
        df_shuff.columns = [lm_true.model.endog_names] + lm_true.model.exog_names
        
        lm_shuff = ols(formula, df_shuff).fit()
        
        t_values = np.vstack([t_values, lm_shuff.tvalues])
        
    # Now calculate the permuted p value for each column in x.
    p_values = np.ones(t_values.shape[1])
    
    for x in range(x.shape[1]):
        p_values[x] = np.sum(t_values[1:,x] < t_values[0,x]) / float(n)
        
    return t_values, p_values
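
# Usage sketch (synthetic data) for the permutation test above; columns of the
# returned arrays follow lm_true.model.exog_names (Intercept first).
import numpy as np
import pandas as pd

rng = np.random.default_rng(11)
demo = pd.DataFrame({'x': rng.normal(size=50)})
demo['y'] = 0.8 * demo.x + rng.normal(scale=0.5, size=50)
t_vals, p_vals = permutation_ols(demo, 'y ~ x', n=100)
print(p_vals)
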
    def regression(self):

        print(self.people.head(n=1))
        # all_bios is the dataframe with the consolidated data; the regression
        # fails if the column is named "class", so rename it first.
        self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

        self.logfile.write( "\n\n Num Regions NegativeBinomial")
        m = glm("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
                data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        #lim_people = self.people[self.people.numRegions>0]
        self.logfile.write( "\n\n Num Regions OLS")
        m = ols("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
                data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())



        # we could use beta regression for normalized entropy
        #print "\n\n Region Entropy"
        #m = ols("entropy ~ C(gender,Treatment(reference='male')) ", #+ C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
        #        data=self.people).fit()
        #print m.summary() # <-- this gives you the table of coefficients with p-values, confidence intervals, and so on



        self.logfile.write( "\n\n Sum Temp Interest")
        m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        self.logfile.write( "\n\n Pos Temp Interest")
        m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        #lim_people = self.people[self.people.timePosInterest>0]
        self.logfile.write( "\n\n Pos Temp Interest OLS")
        m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())
Example #25
def anova_sm_func(dataframe, model):

    fit_full = ols('value' + ' ~ ' + model[0].fullmodel, data=dataframe).fit()
    fit_null = ols('value' + ' ~ ' + model[0].nullmodel, data=dataframe).fit()

    model_diff = sm.stats.anova_lm(fit_null, fit_full)
    # direction = np.sign(fit_full.tvalues[2])
    # statsout.pvalues[i] = model_diff.values[1, 5]
    # statsout.pvalues_signed[i] = direction*model_diff.values[1, 5]
    # statsout.tvalues[i] = fit_full.tvalues[2]
    # return direction*model_diff.values[1, 5]
    # p-value of the model comparison, signed by the direction of the effect
    return fit_full.tvalues[2]/abs(fit_full.tvalues[2])*model_diff.values[1, 5]
Example #26
def stepwsel(df , response, alpha = 0.1):
    '''
    Performs stepwise selection for regression.
    ARGS:
        DF = Data frame with response and covariates
        alpha = a float indicating confidence level
        response = string that represents the response variable
            e.g. 'Y'
    attributes:
        summary = ols(formula,data).fit().summary()
    '''
    # initial assignments
    covariates = set(df.columns) # variables in dataframe
    covariates.remove(response) # remove Y
    candidates = []
    dropvar = []
    optmodelpvals = None  # p-values of the current best model
    
    while True:
        
        oldpval = alpha # initial threshold for adding a variable
        rejects = set() # variables not entered in the model this round
        
        if optmodelpvals is not None and (optmodelpvals > alpha).any(): # drop non-significant terms
            dropvar = list(optmodelpvals[optmodelpvals > alpha].index)
            if 'Intercept' in dropvar:
                dropvar.remove('Intercept')
        
        for variable in covariates:
            candidatesubset = candidates + [variable]
            [candidatesubset.remove(element) for element in dropvar] # remove variables in dropvar
            formula = '{} ~ {}'.format(response, ' + '.join(candidatesubset)) # create model based on subset
            pval = ols(formula,df).fit().pvalues # get p-values
            
            if pval.iloc[-1] < oldpval: # if the p-value of the variable just added is significant, consider adding it
                var2add = variable # place holder
                oldpval = pval.iloc[-1] # update
                optmodelpvals = pval
                optmodelvars = candidatesubset
            else:
                rejects.add(variable) # add to rejected if not significant
              
        candidates.append(var2add)
        
        if covariates == rejects:
            if dropvar and dropvar[0] in optmodelvars:
                optmodelvars.remove(dropvar[0])
            optmodel = '{} ~ {}'.format(response, ' + '.join(optmodelvars))
            print('The optimal model is {}'.format(optmodel))
            break
            
        covariates.remove(var2add)
            
    return ols(optmodel,df).fit().summary()
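
# Usage sketch (synthetic data) for stepwise selection; ols from
# statsmodels.formula.api is assumed to be in scope for stepwsel.
import numpy as np
import pandas as pd

rng = np.random.default_rng(12)
demo = pd.DataFrame(rng.normal(size=(80, 3)), columns=['x1', 'x2', 'x3'])
demo['Y'] = demo.x1 + 2 * demo.x2 + rng.normal(scale=0.5, size=80)
print(stepwsel(demo, 'Y'))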
Example #27
def getRegression():
	merged = pd.concat([total_amount_lent,total_loans, loans_per_member, borrower_female_count,borrower_male_count],axis = 1)
	result = sm.ols(formula="total_amount_lent ~ total_loans", data = merged).fit()
	print(result.summary())
	result = sm.ols(formula="total_amount_lent ~ loans_per_member", data = merged).fit()
	print(result.summary())
	result = sm.ols(formula="total_amount_lent ~ borrower_female_count", data = merged).fit()
	print(result.summary())
	result = sm.ols(formula="total_amount_lent ~ borrower_male_count", data = merged).fit()
	print(result.summary())
	result = sm.ols(formula = "total_amount_lent ~ intro_len", data = impact).fit()
	print(result.summary())
	def factor_regression(self, market_return):
		# form pandas dataframe
		pars = {'date':[],'return':[], 'Mkt_Rf':[], 'SMB':[], 'HML':[], 'Excess_Return':[]} 
		if self.factor_num == 5:
			pars['CMA'] = []; pars['RMW'] = []

		for num, date in enumerate(sorted(self.returns)):
			if date not in self.factors:
				print("French factors haven't been updated to " + date)
				break
			if num == 0: print("Starting time: ", date)
			if num == len(self.returns) - 1: print("Ending   time:", date)
			pars['date'].append(date)
			pars['return'].append(self.returns[date].adjClose)
			pars["Mkt_Rf"].append(self.factors[date].Mkt_Rf)
			pars["SMB"].append(self.factors[date].SMB)
			pars["HML"].append(self.factors[date].HML)
			pars["Excess_Return"].append(self.returns[date].adjClose - self.factors[date].RF)
			if self.factor_num == 5:
				pars["CMA"].append(self.factors[date].CMA)
				pars["RMW"].append(self.factors[date].RMW)
		pars = pd.DataFrame(pars)  # convert the dict of lists to a DataFrame
		#pd.DataFrame(pars, index = dates, columns = ["Mkt_Rf", "SMB", "HML", "Excess_Return"])
		if self.factor_num == 3:
			model = ols("Excess_Return ~ Mkt_Rf", pars).fit()
			print(model.params)
			market_risk_coef = model.params[1]
			market_risk_premium = 0.0727/12
			required_rate_of_return = self.treasuryList["GB6"] / 12 + market_risk_coef * market_risk_premium
		else:
			model = ols("Excess_Return ~ Mkt_Rf + SMB + HML + CMA + RMW", pars).fit()
			# the other two risk premiums come from the average of the French's factors over the 1990 - now
			market_risk_premium, size_risk_premium, value_risk_premium, profit_risk_premium, \
			invest_risk_premium = [0.055 / 12, 0.02 / 12, 0.043 / 12, 0.041 / 12, 0.03 / 12]
			market_risk_coef, size_risk_coef, value_risk_coef, profit_risk_coef, invest_risk_coef = model.params[1:]
			required_rate_of_return = self.treasuryList["GB6"] / 12 + market_risk_coef * market_risk_premium \
			+ size_risk_coef * size_risk_premium + value_risk_coef * value_risk_premium \
			+ profit_risk_coef * profit_risk_premium + invest_risk_coef * invest_risk_premium
		print(model.summary())
		alpha = model.params[0]
		betas = model.params[1]
		#alpha = np.mean(np.array(pars['return'])) - model.params[1] * market_return
		var = np.var(np.array(pars['return']))
		print("alpha: ", alpha)
		print("beta", betas)
		print("Required_rate_of_return:", required_rate_of_return)

		#print "annualized required_rate_of_return:", required_rate_of_return * 12
		return key_index(required_rate_of_return, var, alpha, [betas, 0, 0])
Example #29
def wu_test(form, data, variable):
    """
    Perform the Wu endogeneity test. This test is carried out in 3
    steps:

    1. Regress the variable in question on all other exogenous variables
    2. Add the residuals from the aforementioned regression to the main
       model
    3. Examine the p-value associated with the residual term from the
       updated model from part 2. A statistically significant coeff
       indicates that the tested variable is indeed endogenous.

    Parameters
    ==========
    form : str
        The statsmodels (patsy) formula for the model

    data : pandas.DataFrame
        The pandas DataFrame holding the data for the regression

    variable : str
        The string naming the variable (column) for which to perform
        the test

    Returns
    =======
    fit : statsmodels.regression.linear_model.RegressionResultsWrapper
        The statsmodels fit object associated with the Wu test.
    """
    endog, exog = form.split("~")
    s2_form = form

    o_exog = [s.strip() for s in exog.split('+')]
    o_exog.remove(variable)

    s1_form = variable + ' ~ ' + " + ".join(o_exog)
    s1_fit = sm.ols(s1_form, data=data).fit()
    res_name = 'resid_%s' % (variable)
    data[res_name] = s1_fit.resid
    s2_form += " + %s" % (res_name)

    fit = sm.ols(s2_form, data=data).fit()

    p_val = fit.pvalues[res_name]
    endog_bool = 'is not' if p_val >= 0.05 else 'is'
    msg = "WU TEST: The p_value of the added residual is %.4e"
    msg += "\n\t This %s significant at the alpha=0.05 level\n\n"
    print(msg % (p_val, endog_bool))

    return fit
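
# Usage sketch (synthetic data with an endogenous regressor; sm is assumed to
# be statsmodels.formula.api). The unobserved confounder u makes x endogenous,
# so the residual term should come out significant.
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

rng = np.random.default_rng(13)
n = 200
u = rng.normal(size=n)                       # unobserved confounder
demo = pd.DataFrame({'z': rng.normal(size=n)})
demo['x'] = demo.z + 0.8 * u + rng.normal(scale=0.3, size=n)
demo['y'] = demo.x + u + rng.normal(scale=0.3, size=n)
wu_fit = wu_test('y ~ x + z', demo, 'x')
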
def airline_regression():
    #COST PER MILE??? 
    ols_mat =pd.concat([pd.DataFrame(target_frequency),pca_data],axis=1)
    ols_mat['FLIGHT_COST_2'] =ols_mat['FLIGHT_COST']**2
    ols_mat['MARKET_TOT_2'] =ols_mat['MARKET_TOT']**2
    ols_mat['COST_DEMAND'] =ols_mat['FLIGHT_COST']*ols_mat['MARKET_TOT']
    ols_mat['DEMAND_COMPETITORS'] = ols_mat['MARKET_COMPETITORS']*ols_mat['MARKET_TOT']
    ols_mat['COST_COMPETITORS']  = ols_mat['MARKET_COMPETITORS']*ols_mat['FLIGHT_COST']
    fit_base = sm.ols(formula="DAILY_FREQ ~  FLIGHT_COST + FLIGHT_COST_2 + MARKET_TOT + MARKET_TOT_2 + SEATS_PER_FLIGHT +  MARKET_COMPETITORS", data = ols_mat).fit()
    print(fit_base.summary())
    fit_base = sm.ols(formula="DAILY_FREQ ~  FLIGHT_COST + FLIGHT_COST_2 + MARKET_TOT + DEMAND_COMPETITORS + MARKET_COMPETITORS", data = ols_mat).fit()
    print(fit_base.summary())
    preds =fit_base.predict()
    MAPE = sum(abs(target_frequency-preds))/sum(target_frequency)
    return MAPE
Example #31
# In[22]:

batting.dtypes


# In[395]:

clf = linear_model.Lasso(alpha=0.1)
clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
print(clf.coef_, clf.intercept_)


# In[382]:

# regression model #1 -  OBP and OBP Against
results=smf.ols('Win_per ~ OBP + OBP_Against', data=batting99_03).fit().summary()
print(results)


# In[56]:

# regression model #2 - SLG and SLG Against
#print(smf.ols('Win_per ~ SLG + SLG_Against', data=batting99_03).fit().summary())


# In[384]:
Example #32
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 14:51:40 2017

@author: abhishek
"""

import numpy as np
import statsmodels.formula.api as sm
import pandas as pd
import seaborn as sns

np.set_printoptions(suppress=True)

df = pd.read_csv("Housing.csv")
model1 = sm.ols(
    formula=
    'price ~ lotsize + bedrooms + bathrms + stories + driveway + recroom + fullbase + gashw + airco + garagepl + prefarea',
    data=df)
fitted1 = model1.fit()
summary = fitted1.summary()
print(fitted1.summary())

# visualize the relationship between the features and the response using scatterplots
#sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.7)

#############################################

from scipy import stats
import numpy as np
x = np.random.random(10)
y = np.random.random(10)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
merged = table * 100.0
ports = ff5.join(mom).reset_index()
ports = ports.merge(data[['yyyymm', 'qtr']],
                    how='inner',
                    left_on='date',
                    right_on='yyyymm')
del ports['yyyymm'], ports['date']
ports = ports.groupby('qtr').mean()
merged = merged.join(ports)
merged['MOM'] = merged['Mom   ']
del merged['Mom   ']
merged['exmkt'] = merged['Mkt-RF']
merged['mkt'] = merged['exmkt'] + merged['RF']
del merged['Mkt-RF']

reg = smf.ols('spread ~ exmkt', data=merged).fit(cov_type='HAC',
                                                 cov_kwds={'maxlags': 4})
reg = smf.ols('spread ~ exmkt + SMB + HML',
              data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
reg = smf.ols('spread ~ exmkt + SMB + HML + RMW + CMA',
              data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
reg = smf.ols('spread ~ exmkt + SMB + HML + RMW + CMA + MOM',
              data=merged).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
# Annualize the quarterly Alpha
print(reg.summary())
alpha = reg.params[0] / 100
print('Annualized Alpha: ' + str((((1 + alpha)**(4)) - 1) * 100) +
      ' with t-stat: ' + str(reg.tvalues[0]))
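
# Standalone sketch of cov_type='HAC' (Newey-West) standard errors, the same
# option used in the fits above (synthetic data):
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(14)
demo = pd.DataFrame({'exmkt': rng.normal(size=120)})
demo['spread'] = 0.5 * demo.exmkt + rng.normal(size=120)
hac = smf.ols('spread ~ exmkt', data=demo).fit(cov_type='HAC',
                                               cov_kwds={'maxlags': 4})
print(hac.bse)  # HAC standard errors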

#------------------------------------------------------------------------------
# TABLE 9 (8 and 9 from old draft)
table = positive.pivot_table(index='qtr',
Example #34
#   This property is known as homoscedasticity.
#

###############################################################################
# Plot
sns.violinplot(x="site", y="gm_f", data=brain_vol1)

###############################################################################
# Stats with scipy
fstat, pval = scipy.stats.f_oneway(*[brain_vol1.gm_f[brain_vol1.site == s]
                                   for s in brain_vol1.site.unique()])
print("Oneway Anova gm_f ~ site F=%.2f, p-value=%E" % (fstat, pval))

###############################################################################
# Stats with statsmodels
anova = smfrmla.ols("gm_f ~ site", data=brain_vol1).fit()
# print(anova.summary())
print("Site explains %.2f%% of the grey matter fraction variance" %
      (anova.rsquared * 100))

print(sm.stats.anova_lm(anova, typ=2))

###############################################################################
# **2. Test the association between the age and gray matter atrophy** in the
# control and patient population independently.

###############################################################################
# Plot
sns.lmplot(x="age", y="gm_f", hue="group", data=brain_vol1)

brain_vol1_ctl = brain_vol1[brain_vol1.group == "Control"]
Example #35

import pandas as pd
import numpy as np
import statsmodels.formula.api as sm

y_list = gradient
x_list = flow
y_err = flowErr

# put x and y into a pandas DataFrame, and the weights into a Series
ws = pd.DataFrame({'x': x_list, 'y': y_list})
weights = pd.Series(y_err)

wls_fit = sm.wls('x ~ y', data=ws, weights=1.0 / ((weights)**2)).fit()
ols_fit = sm.ols('x ~ y', data=ws).fit()

# show the fit summary by calling wls_fit.summary()
# wls fit r-squared is 0.754
# ols fit r-squared is 0.701

with open(resultsPlace + "regressionData.dat", 'a') as f:
    f.writelines([
        str(wls_fit.summary()) + "\n",
        str(wls_fit.params[0]) + " " + str(wls_fit.bse[0]) + "\n",
        str(wls_fit.params[1]) + " " + str(wls_fit.bse[1]) + "\n"
    ])
def greedy(singles, data):
    """singles = list of individual candidates for parameters in linear model
    data = pandas dataframe with columns AGDIST and singles
    returns useful predictors"""
    global candidates, predictors
    #  First we check if we need to continue from where we left off.
    if os.path.isfile("predictors/predictors_00.csv"):
        latest = 0
        while True:
            if not os.path.isfile("predictors/predictors_%02d.csv" %
                                  (latest + 1)):
                break
            else:
                latest += 1
        file = open("predictors/predictors_%02d.csv" % (latest), "r")
        predictors = file.read().split("\n")
        file.close()
        file = open("candidates/candidates_%02d.csv" % (latest), "r")
        candidates = file.read().split("\n")
        file.close()
        best_bic = smf.ols(formula="AGDIST ~ %s " % ('+'.join(predictors)),
                           data=data).fit().bic
        loop = latest + 1
    else:
        if not os.path.exists("predictors"):
            os.makedirs("predictors")
        if not os.path.exists("candidates"):
            os.makedirs("candidates")
        candidates = singles
        pairs = list(itertools.combinations(singles, 2))
        pairs = [x + ":" + y for x, y in pairs]
        output_can = []
        for x in candidates:
            if len(x) == 5:
                output_can.append(x)
        candidates = output_can
        # print (candidates)
        best_bic = 0
        loop = 0
    #  Now we run the greedy algorithm using multithreading
    while not os.path.isfile("predictors.csv"):
        print("%d predictors, %d candidates" %
              (len(predictors), len(candidates)))
        best_candidate = []
        bics = Parallel(n_jobs=num_cores)(delayed(get_bic)(candidate, data)
                                          for candidate in candidates)
        min_bic = min(bics)
        if min_bic < best_bic or best_bic == 0:
            best_bic = min_bic
            best_candidate = candidates[bics.index(min_bic)]
            candidates.remove(best_candidate)
            predictors.append(best_candidate)
            file = open("predictors/predictors_%02d.csv" % (loop), "w")
            file.write("\n".join(predictors))
            file.close()
            file = open("candidates/candidates_%02d.csv" % (loop), "w")
            file.write("\n".join(candidates))
            file.close()
            loop += 1
        else:
            candidates = singles + pairs
            output_can2 = []
            for i in candidates:
                if len(i) != 5:
                    output_can2.append(i)
            candidates = output_can2
            while not os.path.isfile("predictors.csv"):
                print("%d predictors, %d candidates" %
                      (len(predictors), len(candidates)))
                best_candidate = []
                bics = Parallel(n_jobs=num_cores)(
                    delayed(get_bic)(candidate, data)
                    for candidate in candidates)
                min_bic = min(bics)
                if min_bic < best_bic or best_bic == 0:
                    best_bic = min_bic
                    best_candidate = candidates[bics.index(min_bic)]
                    candidates.remove(best_candidate)
                    predictors.append(best_candidate)
                    file = open("predictors/predictors_%02d.csv" % (loop), "w")
                    file.write("\n".join(predictors))
                    file.close()
                    file = open("candidates/candidates_%02d.csv" % (loop), "w")
                    file.write("\n".join(candidates))
                    file.close()
                    loop += 1
                else:
                    file = open("predictors.csv", "w")
                    file.write("\n".join(predictors))
                    file.close()
                    file = open("candidates.csv", "w")
                    file.write("\n".join(candidates))
                    file.close()
    #  Save results
    file = open("predictors.csv", "r")
    predictors = file.read().split("\n")
    file.close()
    print('Done')
    return predictors
Example #37
# Using StatsModels
# Let's run the same regression using SciPy and StatsModels, and confirm we get the same results.


from scipy.stats import linregress
import statsmodels.formula.api as smf

# Run regression with linregress
subset = brfss.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']
res = linregress(xs,ys)
print(res)

# Run regression with StatsModels
results = smf.ols('_VEGESU1 ~ INCOME2', data = brfss).fit()
print(results.params)




# Plot income and education
# To get a closer look at the relationship between income and education, let's use the variable 'educ' to group the data, then plot mean income in each group.

# Here, the GSS dataset has been pre-loaded into a DataFrame called gss.


# Group by educ
grouped = gss.groupby('educ')

# Compute mean income in each group
Example #38
# Purpose: standardize the independent variables
import pandas as pd
from statsmodels.formula.api import ols

# Read the data set into a pandas DataFrame
wine = pd.read_csv('winequality-both.csv', sep=',', header=0)
wine.columns = wine.columns.str.replace(' ', '_')

my_formula = 'quality ~ alcohol + chlorides + citric_acid + density + fixed_acidity + free_sulfur_dioxide + pH + residual_sugar +' \
              'sulphates + total_sulfur_dioxide + volatile_acidity'

dependent_variable = wine['quality']  # dependent variable (range of y)
independent_variables = wine[wine.columns.difference(
    ['quality', 'type', 'in_sample'])]  # independent variables (range of x)
# difference: A.difference(B) -> the elements of A that are not in B
independent_variables_standardized = (
    independent_variables -
    independent_variables.mean()) / independent_variables.std()
wine_standardized = pd.concat(
    [dependent_variable, independent_variables_standardized], axis=1)
print(wine_standardized.head())
lm_standardized = ols(my_formula, data=wine_standardized).fit()
print(lm_standardized.summary())

wine_standardized.to_csv('revised_wine.csv', index=False)
Example #39
def scatter_matrix(
    df,
    *,
    xs: Sequence[str] = None,
    ys: Sequence[str] = None,
    width=None,
    height=None,
    regression=True,
    **kwargs,
):
    assert len(df) > 0, 'TODO handle this'

    # FIXME handle empty df
    source = CDS(df)
    # TODO what about non-numeric stuff?

    xs = df.columns if xs is None else xs
    ys = df.columns if ys is None else ys
    ys = list(reversed(
        ys))  # reorder to move meaningful stuff to the top left corner

    isnum = lambda c: is_numeric_dtype(df.dtypes[c])
    # reorder so non-numeric is in the back
    # todo mode to drop non-numeric? not sure.. definitely can drop 'error' and datetimish?
    xs = list(sorted(xs, key=isnum, reverse=True))
    ys = list(sorted(ys, key=isnum, reverse=True))

    from bokeh.models import Label

    # TODO not sure I wanna reuse axis?
    def make(xc: str, yc: str):
        p = figure(df=df)
        diag = xc == yc  # todo handle properly
        # TODO not sure if I even want them... move to the very end?
        if isnum(xc) and isnum(yc):
            p.scatter(x=xc, y=yc, source=source, size=3)
        else:
            # TODO ugh, doesn't want to show the label without any points??
            # p.circle(x=0.0, y=0.0)
            # FIXME how to make sure text fits into the plot??
            add_text(
                p,
                x=0.0,
                y=0.0,
                text='Not numeric',
                text_color='red',
            )
        p.xaxis.axis_label = xc
        p.yaxis.axis_label = yc
        return p

    grid = [[make(xc=x, yc=y) for x in xs] for y in ys]
    from bokeh.layouts import gridplot
    w1 = None if width is None else width // min(len(xs), len(ys))
    h1 = None if height is None else height // min(len(xs), len(ys))
    grid_res = gridplot(grid, plot_width=w1, plot_height=h1)

    # TODO might be useful to include/exclude specific cols (e.g. datetime) while keeping them in annotations

    # TODO add the presence of the grid to the 'visual tests'
    # but if I swith it to raw bokeh -- it has Grid class.. might need to mess with
    # also maybe add extra axis under each plot in the grid? easier for a huge matrix of plots
    # some code in old dashboard
    if not regression:
        return grid_res

    # todo this would be need for plotly as well?
    import statsmodels.formula.api as smf  # type: ignore

    for plot in chain.from_iterable(grid):
        gs = plot.renderers
        if len(gs) == 0:
            # must be non-numeric? meh though
            continue
        [g] = gs
        xx = g.glyph.x
        yy = g.glyph.y

        if xx == yy:
            # diagonal thing, e.g. histogram. compute some stats??
            continue

        with pd.option_context('mode.use_inf_as_null', True):
            # FIXME proper error handling, display number of dropped items?
            dd = df[[xx, yy]].dropna()  # otherwise from_scatter fails
        # todo would be nice to display stats on the number of points dropped

        udd = dd.drop_duplicates()
        if len(udd) <= 1:
            # can't perform a reasonable regression then
            add_text(
                plot,
                x=0.0,
                y=0.0,
                text='ERROR: no points to correlate',
                text_color='red',
            )
            continue

        res = smf.ols(f"{yy} ~ {xx}", data=dd).fit()
        intercept = res.params['Intercept']
        slope = res.params[xx]
        r2 = res.rsquared

        ## TODO crap. is it really the best way to figure out relative position??
        relx = 0.01
        rely = 0.1

        # todo highlight high enough R2?
        minx, maxx = min(dd[xx]), max(dd[xx])
        miny, maxy = min(dd[yy]), max(dd[yy])
        # todo font size dependent on width?? ugh.
        txt = f'R2 = {r2:.4f}\nY ~ {slope:.3f} X'

        # todo need to add various regression properties, like intercept, etc
        # TODO hopefully this overlays correctly?? not sure about nans, again
        from bokeh.models import Slope
        sl = Slope(gradient=slope,
                   y_intercept=intercept,
                   line_color='green',
                   line_width=3)
        plot.add_layout(sl)
        add_text(
            plot,
            text=txt,
            x=minx + (maxx - minx) * relx,
            y=miny + (maxy - miny) * rely,
            text_color=g.glyph.line_color,
        )

    # TODO dynamic resizing would be nice
    return grid_res
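# Hypothetical usage sketch (assumes this module's helpers -- figure, CDS,
# add_text -- are importable; the dataframe below is made up):
import numpy as np
import pandas as pd
from bokeh.io import show

demo = pd.DataFrame({
    'x': np.arange(100.0),
    'y': np.arange(100.0) + np.random.normal(size=100),
})
show(scatter_matrix(demo, width=800, height=800, regression=True))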
Exemple #40
0
# %%

# quantiles = np.arange(.05, .96, .1)
quantiles = [.1, .25, .5, .75, .9]


# `mod` is not defined in this snippet; it is presumably a quantile regression
# fit elsewhere, e.g. mod = smf.quantreg('accumrateL ~ ageL', data)
def fit_model(q):
    res = mod.fit(q=q)
    return [q, res.params['Intercept'], res.params['ageL']] + \
            res.conf_int().loc['ageL'].tolist()


models = [fit_model(x) for x in quantiles]
models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub'])

ols = smf.ols('accumrateL ~ ageL', data).fit()
ols_ci = ols.conf_int().loc['ageL'].tolist()
ols = dict(a=ols.params['Intercept'],
           b=ols.params['ageL'],
           lb=ols_ci[0],
           ub=ols_ci[1])

print(models)
print(ols)

# %%
x = np.arange(data.ageL.min(), data.ageL.max() + 0.05, .05)
get_y = lambda a, b: 10**a * (10**x)**b

fig, ax = plt.subplots(figsize=(8, 6))
def runRegression(y, x, data, cov_type='HC0'):
    print("Covariance type: %s" % cov_type)
    form = '{0} ~ {1}'.format(y, x)
    mod = smf.ols(formula=form, data=data)
    res = mod.fit(cov_type=cov_type)
    return res
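# Usage sketch for runRegression, using the Salary data explored just below
# and heteroskedasticity-robust HC3 errors instead of the HC0 default:
res = runRegression('Salary', 'YearsExperience', df, cov_type='HC3')
print(res.summary())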
df.columns
df.head()
plt.hist(df.Salary)
plt.boxplot(df.Salary,0,'rs',0)
plt.plot(df.Salary,df.YearsExperience,'bo');plt.xlabel("Salary");plt.ylabel("Years of Experience")

#To find the correlation

df.Salary.corr(df.YearsExperience)

np.corrcoef(df.Salary,df.YearsExperience)

#Model building

import statsmodels.formula.api as smf
model=smf.ols("Salary~YearsExperience",data=df).fit()
model
pred=model.predict(df)
pred
model.params
model.summary()
#Data Visualization

plt.scatter(x=df['YearsExperience'],y=df['Salary'],color='red');plt.plot(df['YearsExperience'],pred,color='black');plt.xlabel("YearsExperience");plt.ylabel("Salary")

pred.corr(df.Salary)
#Transforming variables for accuracy

model1=smf.ols("Salary~np.log(YearsExperience)",data=df).fit()
model1
model1.summary()
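# A related transformation sometimes tried next (sketch, not in the original
# snippet): model log(Salary) instead, i.e. an exponential relationship.
model2 = smf.ols("np.log(Salary) ~ YearsExperience", data=df).fit()
print(model2.summary())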
import matplotlib.mlab as mlab
from pandas import DataFrame
import seaborn as sns
import statsmodels.formula.api as sm

df = DataFrame({
        "Treatment": np.repeat(["Kommerziell", "Vakuum", "Gemischt", "CO2"], [3, 3, 3, 3]),
        "steak_id":[7.66, 6.98, 7.80, 5.26, 5.44, 5.80, 7.41, 7.33, 7.04, 3.51, 2.91, 3.66]
        })

df = DataFrame({
        "Treatment": np.repeat(["Kommerziell", "Vakuum", "XO2", "Yemischt"], [3, 3, 3, 3]),
        "steak_id":[7.66, 6.98, 7.80, 5.26, 5.44, 5.80, 7.41, 7.33, 7.04, 10.51, 10.91, 10.66]
        })

fit = sm.ols("steak_id~Treatment", data=df).fit()
fit.summary()

help(sm.ols)

fit_pred = fit.get_prediction()
fit_pred.conf_int()

print(fit.summary())
print(fit.params)


print("T.CO2: " + str(3.3600 + 0))
print("T.Gemischt: " + str(3.3600 + 3.9000))
print("T.Kommerziell: " + str(3.3600 + 4.1200))
print("T.Vakuum]: " + str(3.3600 + 2.1400))
Exemple #44
0
def anova(df, formula):
    lm = ols(formula, data=df).fit()
    table = sm.stats.anova_lm(lm, typ=2)
    return table
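# Usage sketch for the anova helper above (the dataframe here is made up;
# assumes pandas as pd plus the ols/sm imports this snippet relies on):
demo = pd.DataFrame({
    'outcome': [1.0, 2.0, 1.5, 3.0, 2.5, 3.5],
    'group': ['a', 'a', 'b', 'b', 'c', 'c'],
})
print(anova(demo, 'outcome ~ C(group)'))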
def coefplot(formula, data, fontsize=5):
    """ Plots coefficients of a regression model.
        formula = patsy-style formula for regression model
        data = pandas dataframe with columns for the variables in the formula
        fontsize = 5 by default
        returns figure, axes
    """
    lm = smf.ols(formula, data=data).fit()
    lm0 = smf.ols(formula + "+ 0", data=data).fit()
    r.assign("data", data)
    r("""
    trunc_reg <- truncreg(%s,
                          data = data,
                          point = 0,
                          direction = 'left')
    summ <- summary(trunc_reg)
    coeffs <- trunc_reg$coefficients
    coeffs <- coeffs[names(coeffs) != "sigma"]
    coeffs_values <- as.vector(coeffs, mode="numeric")
    coeffs_bse <- head(summary(trunc_reg)$coefficients[,2], -1)
    """ % (formula))
    params = pd.DataFrame(data=r("coeffs_values"), index=r("names(coeffs)"))
    params.index = [":".join(sorted(name.split(":"))) for name in params.index]
    params = params.drop("(Intercept)")
    params.columns = ["truncreg"]
    truncreg_bse = pd.DataFrame(data=r("coeffs_bse"), index=r("names(coeffs)"))
    lm_params = lm.params.drop("Intercept")
    lm_params.index = [
        ":".join(sorted(name.split(":"))) for name in lm_params.index
    ]
    params["lm"] = lm_params
    lm0_params = lm0.params
    lm0_params.index = [
        ":".join(sorted(name.split(":"))) for name in lm0_params.index
    ]
    params["lm0"] = lm0_params
    params = params.sort_values("lm")
    lm_bse = lm.bse
    lm_bse.index = [":".join(sorted(name.split(":"))) for name in lm_bse.index]
    lm0_bse = lm0.bse
    lm0_bse.index = [
        ":".join(sorted(name.split(":"))) for name in lm0_bse.index
    ]
    fig, ax = plt.subplots()
    y = range(len(params.index))
    ax.scatter(list(params["truncreg"]), y, color="g", s=2)
    ax.scatter(list(params["lm"]), y, color="r", s=2)
    ax.scatter(list(params["lm0"]), y, color="b", s=2)
    for y in range(len(params.index)):
        sub = params.index[y]
        x = params.lm[sub]
        se = lm_bse[sub]
        ax.plot([x - se, x + se], [y, y], color="red")
        x = params.lm0[sub]
        se = lm0_bse[sub]
        ax.plot([x - se, x + se], [y, y], color="blue")
        x = params.truncreg[sub]
        for perm in list(itertools.permutations(sub.split(":"))):
            s = ":".join(perm)

            try:
                se = truncreg_bse.loc[s]
                ax.plot([x - se, x + se], [y, y], color="green")
            except KeyError:
                pass
    red_patch = mpatches.Patch(color='red', label='Linear Model')
    blue_patch = mpatches.Patch(color='blue', label='Forced Zero Intercept')
    green_patch = mpatches.Patch(color='green', label='Truncated Regression')
    plt.legend(handles=[red_patch, blue_patch, green_patch], loc=2)
    plt.yticks(range(len(params.index)), params.index)
    ax.set_ylim([-1, len(params)])
    ax.set_yticklabels(params.index, fontsize=fontsize)
    ax.set_ylabel("Substitutions")
    ax.set_xlabel("Coefficients")
    plt.title("Coefficient plot")
    plt.grid()
    fig.savefig("coefplot.png", dpi=200)
    file = open("lm_summary.txt", "w")
    file.write(str(lm.summary()))
    file.close()
    file = open("lm0_summary.txt", "w")
    file.write(str(lm0.summary()))
    file.close()
    file = open("truncreg_summary.txt", "w")
    file.write(str(r("summ")))
    file.close()
    return fig, ax
        ses_2_group[s] = 'HC'
    elif subjectNum > 100:
        ses_2_group[s] = 'MDD'
ses_2_info = {}
ses_2_info['FD'] = np.array(data2['FD'])
ses_2_info['subj'] = list(data2['subj'])
ses_2_info['ses'] = ses_2_col
ses_2_info['group'] = ses_2_group
ses_2_df = pd.DataFrame(data=ses_2_info)

FULL_DF = pd.concat([ses_1_df, ses_2_df])
aovrm2way = AnovaRM(FULL_DF, 'FD', 'subj', within=['group'])
res2way = aovrm2way.fit()
print(res2way)

model = ols('FD ~ group*ses', data=FULL_DF).fit()
print(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}"
)
model.summary()
aov_table = sm.stats.anova_lm(model, typ=2)
aov_table

# the interaction is not significant, so refit with main effects only

model = ols('FD ~ group + ses', data=FULL_DF).fit()
print(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}"
)
model.summary()
aov_table = sm.stats.anova_lm(model, typ=2)
        plt.text(x + 0.5,
                 -1,
                 '%s' % array.columns[x],
                 horizontalalignment='center',
                 verticalalignment='center',
                 fontsize=fontsize)
    plt.tick_params(axis='x', which='both', bottom=True, top=False)
    plt.tick_params(axis='y', which='both', left=True, right=False)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.spines['left'].set_position('zero')
    ax.spines['bottom'].set_position('zero')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.title("%s=%.1f" % (predictor, coeffs[predictor]))
    fig.subplots_adjust(bottom=0.2)
    fig.savefig(predictor + "/" + predictor + ".png", dpi=500)
    return fig, ax


if __name__ == "__main__":
    predictors = [
        "KN145", "EK156", "LS157:NS193", "GK135", "DE190:IT214", "EG158_KR050",
        "LQ226"
    ]
    data = pd.read_csv("culled.csv", sep=" ")
    data = data.drop("Unnamed: 75", axis=1)
    data = collinify(data)
    model = smf.ols("AGDIST ~ %s" % ("+".join(predictors)), data=data)
    coefvis("EK156", model)
Exemple #48
0
    df = pd.merge(combined, params_df, on=['fips'])

    # keep rows with a significant fit (p < 0.1) and enough cases
    # (7-day average of new positives > 3)
    mask = ((df['statsmodels_p_value'] < 0.1) &
            (df['new_positive_cnt_7_day_avg'] > 3))
    filter_ind = np.flatnonzero(mask.to_numpy())

    df['Mar_temp_minus_67'] = df['Mar_temp'] - 67

    model = smf.ols(
        formula=
        'statsmodels_mean ~ Mar_temp_minus_67 + Mar_precip + frac_female + frac_black + frac_asian + frac_native + frac_hispanic + frac_25_to_44 + frac_45_to_64 + frac_over_64 + np.log(pop_density) + unemployment_rate + np.log(median_household_income)',
        data=df.iloc[filter_ind])
    results = model.fit()
    print(f'\n#######\n# Offset: {offset}\n#######')
    print(results.summary())

    map_offset_to_df[offset] = df
    map_offset_to_results[offset] = results

    df['log_pop_density'] = np.log(df['pop_density'])
    df['log_median_household_income'] = np.log(df['median_household_income'])

    corr_dict = dict()
    for plot_param_name in [
            'Mar_temp_minus_67', 'Mar_precip', 'frac_male', 'frac_female',
            'frac_white', 'frac_black', 'frac_asian', 'frac_native',
Exemple #49
0
print('\n\nOut of sample Data and Prediction')
x1n = np.linspace(20.5, 25, 10)
Xnew = np.column_stack((x1n, np.sin(x1n), (x1n - 5)**2))
Xnew = sm.add_constant(Xnew)
ynewpred = olsres.predict(Xnew)  # predict out of sample
print(ynewpred)

import matplotlib.pyplot as plt
# now plot the comparison
fig, ax = plt.subplots()
ax.plot(x1, y, 'o', label="Data")
ax.plot(x1, y_true, 'b-', label="True")
ax.plot(np.hstack((x1, x1n)),
        np.hstack((ypred, ynewpred)),
        'r',
        label="OLS prediction")
ax.legend(loc="best")

plt.show()

# Prediction with formulas, same as above
from statsmodels.formula.api import ols

data = {"x1": x1, "y": y}

res = ols("y ~ x1 + np.sin(x1) + I((x1-5)**2)", data=data).fit()

res.params

res.predict(exog=dict(x1=x1n))
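# Sanity check (sketch): the formula interface re-applies np.sin and
# I((x1-5)**2) to new data itself, so this should match the manual
# Xnew/add_constant prediction above up to numerical precision.
np.testing.assert_allclose(res.predict(exog=dict(x1=x1n)), ynewpred, rtol=1e-6)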
Exemple #50
0
df_origins[df_origins.ARR_DELAY > 0].ARR_DELAY.hist(by=df_origins.ORIGIN,
                                                    sharex=True)

### plot delay times (y axis) and time of day (x axis)
colors = np.where(df_origins.CARRIER == "AA", "r", "b")

df_origins.plot(x="DEP_TIME",
                y="ARR_DELAY",
                kind="SCATTER",
                c=colors,
                alpha=.3)

### summarize delay times by origin over the whole period

df_origins.groupby("ORIGIN").agg(np.percentile)["ARR_DELAY"]

df_origins.groupby("ORIGIN")["ARR_DELAY"].describe()

est_s = smf.ols(formula='ARR_DELAY ~ C(ORIGIN) ',
                data=df_origins).fit()  #+ C(CARRIER)
est_s.summary()

## just look at time
est_s = smf.ols(
    formula=
    "ARR_DELAY ~ DAY_OF_WEEK + DEP_TIME + AIR_TIME + DISTANCE + SECURITY_DELAY + TOTAL_ADD_GTIME",
    data=df_origins).fit()
est_s.summary()
## CARRIER

#DEP_TIME,
    'cluster3_precentral_postcentral_gyrus', 'cluster4_frontal_pole',
    'cluster5_temporal_pole', 'cluster6_left_hippocampus_amygdala',
    'cluster7_left_caudate_putamen', 'cluster8_left_thalamus',
    'cluster9_right_thalamus', 'cluster10_middle_temporal_gyrus'
]
features = pop_all[features_name].to_numpy()  # .as_matrix() was removed in pandas 1.0

df = pd.DataFrame()
df["age"] = pop_all["age"].values
df["sex"] = pop_all["sex_num"].values
df["site"] = pop_all["site_num"].values

i = 0
for f in features_name:
    df[f] = features[:, i]
    # regress each feature on age, sex and site, then replace the feature
    # with its residuals, i.e. the values adjusted for these covariates
    mod = ols("%s ~ age+sex+C(site)" % f, data=df).fit()
    df[f] = mod.resid
    print(mod.summary())
    i = i + 1

features_corr = df[[
    'age', 'sex', 'site', 'cluster1_cingulate_gyrus',
    'cluster2_right_caudate_putamen', 'cluster3_precentral_postcentral_gyrus',
    'cluster4_frontal_pole', 'cluster5_temporal_pole',
    'cluster6_left_hippocampus_amygdala', 'cluster7_left_caudate_putamen',
    'cluster8_left_thalamus', 'cluster9_right_thalamus',
    'cluster10_middle_temporal_gyrus'
]]

features_corr.to_csv(
plt.show()

# 3. Apply ANOVA (assumes earlier imports: from scipy import stats and
# from statsmodels.stats.multicomp import pairwise_tukeyhsd)
fvalue, pvalue = stats.f_oneway(df.S_cars, df.M_cars, df.X_cars)
print(fvalue, pvalue)

# get an ANOVA table as R-like output
# reshape the dataframe into a form suitable for the statsmodels package
df_melt = pd.melt(df.reset_index(), id_vars = ['index'], 
                  value_vars = ['S_cars', 'M_cars', 'X_cars'])

# replace column names
df_melt.columns = ['index', 'cars', 'value']

# Ordinary Least Squares (OLS) model
model = ols('value ~ C(cars)', data = df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ = 2)
print(anova_table)

# The p-value is statistically significant (p < 0.05), so we can conclude
# that there are significant differences between the car types.

# perform multiple pairwise comparison (Tukey HSD)
m_comp = pairwise_tukeyhsd(endog = df_melt['value'], groups = df_melt['cars'], alpha = 0.05)
print(m_comp)

# Except for X_cars vs M_cars, every other pairwise comparison rejects H0
# and indicates a statistically significant difference.

# Levene's test: are the group variances equal?
w, pvalue = stats.levene(df.S_cars, df.M_cars, df.X_cars)
print(w, pvalue)
# One-hot encode the categorical column at index 3; drop='first' avoids the
# dummy variable trap (sketch using the current sklearn API --
# OneHotEncoder(categorical_features=...) was removed in sklearn 0.22)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('onehot', OneHotEncoder(drop='first'), [3])],
                       remainder='passthrough', sparse_threshold=0)
X = ct.fit_transform(X)
# # features scaling
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

# Fitting Multiple linear Regression to the Training set
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)
# Predicting the Test set results
# recheck here: y_pred doesn't show up in the variable explorer until this runs
y_pred = regressor.predict(X_test)
# building the optimal model using backward elimination
import statsmodels.api as sm  # the array-based OLS below lives in statsmodels.api

# add the intercept term b0 for statsmodels: prepend a column of 50 ones,
# cast to int (axis=0 means rows, axis=1 means columns)
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
# X_opt only the highly significant variables
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
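# Sketch of the full backward-elimination loop the comment above describes
# (significance level 0.05 assumed; X is expected to include the intercept
# column added earlier):
def backward_elimination(y, X, sl=0.05):
    cols = list(range(X.shape[1]))
    while True:
        model = sm.OLS(y, X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= sl:
            return model, cols
        del cols[worst]

best_model, kept_cols = backward_elimination(y, X)
print(best_model.summary())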
Exemple #54
0
    ]
})
fig = plt.figure(1, figsize=(12, 4))
fig.suptitle('')
ax1, ax2 = fig.subplots(1, 2)
ax1.plot(regress['Temperatur'], regress['Spannung'], 'bo')
ax1.axis([0, 100, 2.5, 4.5])
ax1.set_xlabel('Temperature $T$ / °C')
ax1.set_ylabel('Voltage $U$ / V')
ax1.set_title('Linear regression')
ax1.grid(True)
""" Lineares Regressionsmodell definieren und berechnen """

# model = ols("Spannung ~ Temperatur", regress).fit()
# model = ols("Spannung ~ Temperatur + I(Temperatur**2) + I(Temperatur**3)" , regress).fit()
model = ols("Spannung ~ I(Temperatur**2) + I(Temperatur**3)", regress).fit()
print(model.summary())
st, data, ss2 = summary_table(model, alpha=0.05)
""" Darstellung der Regressionsfunktion zusammen mit Originaldaten """

regress['Fit'] = data[:, 2]
# regress['Resid'] = data[:,3]
ax1.plot(regress['Temperatur'], regress['Fit'], 'b')
""" Berechnung und Darstellung der Residuen """

ax2.stem(regress['Temperatur'],
         model.resid,
         'r',
         use_line_collection=True,
         markerfmt='ro')
ax2.axis([0, 100, -0.2, 0.2])
Exemple #55
0
#!/usr/bin/env python
# coding: utf-8

# In[18]:

import pandas as pd
Location = "C:/gradedata.csv"
df = pd.read_csv(Location)
df.head()
df.corr()

# In[19]:

import statsmodels.formula.api as sm
result = sm.ols(formula='grade ~ exercise + hours + gender', data=df).fit()
result.summary()

# In[ ]:
Exemple #56
0
import statsmodels.api as sm
import statsmodels.formula.api as smf
import faraway.utils

#

import faraway.datasets.statedata
statedata = faraway.datasets.statedata.load()
statedata.index = statedata['State']
statedata = statedata.drop('State', axis=1)
statedata.head()

#

lmod = smf.ols(
    'LifeExp ~ Population + Income + Illiteracy + \
    Murder + HSGrad + Frost + Area', statedata).fit()
lmod.sumary()

#

lmod.pvalues.idxmax(), lmod.pvalues.max()

#

lmod = smf.ols(
    'LifeExp ~ Population + Income + Illiteracy + \
    Murder + HSGrad + Frost', statedata).fit()
lmod.pvalues.idxmax(), lmod.pvalues.max()

#
Exemple #57
0
# Dropping the 41 items that can't decide whether they're in a basement or not
srrs2 = srrs2[(srrs2['floor'] == 0) | (srrs2['floor'] == 1)]
srrs2 = srrs2[srrs2['activity'] > 0]

srrs2['uranium_in_county'] = srrs2['county'].map(
    lambda c: list(cty[cty['cty'] == c]['Uppm'])[0])

srrs2['jitter'] = np.random.normal(0, 0.01, len(srrs2))
lacquiparle = srrs2[srrs2['county'] == 'lacquiparle']
lacquiparle_uranium = list(lacquiparle['uranium_in_county'])[0]

# Model is like: y_i = gamma_0 + gamma_1 * u * G + beta * x_i
# log(radon) = g0 + g1 * log(uranium_in_county) + g2 * in_basement

pooled = smf.ols('np.log(activity) ~ floor', data=srrs2).fit()
print(pooled.summary())
unpooled = smf.ols('np.log(activity) ~ floor + county - 1', data=srrs2).fit()
print(unpooled.summary())


# Values from lmer(logradon ~ floor + (1 | county), srrs2) in R
def partial_predict_lacquiparle(floor):
    return 1.9169895 + -0.640674 * floor


plt.figure()
plt.grid()
plt.scatter(srrs2['floor'] + srrs2['jitter'], np.log(srrs2['activity']))
plt.plot([-0.1, 1.1],
         [pooled.predict({'floor': -0.1}),
Exemple #58
0
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)

#create dataframe for transformation from time series to supervised
df_supervised = df_diff.drop(['prev_sales'], axis=1)
#adding lags
for inc in range(1, 13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)

# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_1', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)
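# Sketch: repeat the adjusted R-squared check for each lag feature to see how
# much each lag explains on its own (uses the lag_1..lag_12 columns built above).
for inc in range(1, 13):
    lag_fit = smf.ols(formula='diff ~ lag_%d' % inc, data=df_supervised).fit()
    print('lag_%d adj. R^2: %.4f' % (inc, lag_fit.rsquared_adj))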

#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['sales', 'date'], axis=1)
#split train and test set
train_set, test_set = df_model[:500].values, df_model[500:].values

#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
                   names=[
                       '0', '1', 'decimal date', 'average', 'interpolated',
                       'trend', '#days'
                   ])

# time (t) goes on the x-axis and co2 goes on the y-axis

co2 = data['interpolated']

t = data['decimal date']
tsquared = t**2  # squaring provides the 'quadratic' aspect

# Note: sm.add_constant is only needed with the array-based sm.OLS interface;
# the formula interface below adds the intercept automatically.
model = smf.ols(formula='co2 ~ t + tsquared', data=data).fit()
results = model.fittedvalues
print(model.summary())

plt.figure(1)
plt.subplot(211)
plt.plot(t, co2)
plt.plot(t, results)
plt.title('Quadratic Model of Atmospheric CO2 at Mauna Loa Observatory')
plt.ylabel('CO2 Concentration (ppmv)')

residuals = model.resid
# c1 = np.cos(2*math.pi*t)
# s1 = np.sin(2*math.pi*t)
# residualModel = smf.ols(formula = 'residuals ~ T + Tsquared + c1 + s1', data = residuals).fit()
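# A working sketch of the commented-out seasonal residual model above (the
# original passes a bare Series as `data`; assembling a DataFrame is safer;
# pd/np are assumed imported as elsewhere in this snippet):
import math
seasonal = pd.DataFrame({
    'resid': model.resid,
    't': t,
    'c1': np.cos(2 * math.pi * t),
    's1': np.sin(2 * math.pi * t),
})
residual_model = smf.ols('resid ~ t + I(t**2) + c1 + s1', data=seasonal).fit()
print(residual_model.summary())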
        if df[cols][i]:
            if df[cols][i] == str(df[cols][i]):
                cats.append(cols)
                break
        else:
            i += 1

# In[ ]:

#Choose the related categorical columns and create the final columns list
get_ipython().run_line_magic('matplotlib', 'inline')
import statsmodels.api as sm
from statsmodels.formula.api import ols
choice = []
for cat in cats:
    mod = ols('SalePrice ~ {}'.format(cat), data=df).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    choice.append([cat, aov_table['PR(>F)'][0]])


def Sort(sub_li):
    sub_li.sort(key=lambda x: x[1])
    return sub_li


choice = Sort(choice)

choice = [choice[x][0] for x in range(0, 5)]
print(choice)
#Merge the columns
final_columns = np.array(choice + pd.Series.tolist(categories))