def reduce_multi_model(orig_fitted, base_string, res, df, fit=None):
    """orig_fitted = an object returned from calling .fit() on a statsmodels logit model
    base_string = the right hand side of the formula used to estimate orig_fitted
    res = The string for the column name in df that has the classes.
    df = the pandas dataframe from which orig_fitted was estimated
    ==========
    Returns a fitted logistic regression model, and the base string used to estimate
    the model.
    
    If at least one variable has a p-value > 0.05, this function removes
    the variable with the worst p-value, estimates a new logistic regression,
    and repeats the process until no insignificant variables remain."""
    
    #Check the class of the function inputs
    assert isinstance(base_string, str)
    assert isinstance(res, str)
    assert isinstance(df, pd.DataFrame)
    
    #Try to reduce the number of variables in the original model
    new_bvars = whittle_multi_model_vars(orig_fitted, base_string)
    #Initialize a variable for the smallest model
    small_model = orig_fitted
    #Initialize a variable for the smallest model base_string
    small_base = base_string
    
    node_variables = isolate_node_cols(df)
    
    while new_bvars is not None: #If a reduced set of variables has been found
        #new_base = " + ".join(["0"] + new_bvars) #Create a new base_string
        #new_fstring = res + " ~ " + new_base #Create a new statsmodels formula string
        
        model_vars = combat_multi_collinearity(df, new_bvars, node_variables, max_cond=2000)
        new_base = " + ".join(model_vars) #Create a string of all variables using in the multivariate regression
        new_fstring = res + " ~ " + "0 + " + new_base #Create the new formula string
        
        try: #Try to fit a new logistic regression model
            #Use the if...else statement to accommodate various optimization methods
            if fit is None:
                new_model = smf.logit(new_fstring, data=df).fit(maxiter=2000, disp=False)
            else:
                new_model = smf.logit(new_fstring, data=df).fit(method=fit, maxiter=2000, disp=False)
            #Assign small_base to the smallest identified set of base variables so far
            small_base = " + ".join(new_bvars)
            #Assign small_model to the model with the smallest set of base variables so far
            small_model = new_model
            #Search for new base variables
            new_bvars = whittle_multi_model_vars(new_model, new_base)
        except Exception as inst: #If the model could not be fit, stop the search
            #print("Estimating logit model failed when using formula: {}".format(new_fstring))
            #Note the line below is un-tested, but I added it because it seemed
            #that an infinite loop would result without it.
            #print(inst)
            new_bvars = None

    #Print the model results of the most reduced model.            
    #print "="*10
    #print "The reduced model results are:"
    #print small_model.summary()
    
    return small_model, small_base
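For reference, here is a minimal, self-contained sketch of the same backward-elimination idea. It assumes plain (non-categorical) predictor names and does not use the whittle_multi_model_vars or combat_multi_collinearity helpers above, so it is an illustration, not the function's actual logic.

import statsmodels.formula.api as smf

def backward_eliminate(df, res, candidates, alpha=0.05):
    """Drop the least significant predictor until all p-values are <= alpha."""
    kept = list(candidates)
    model = smf.logit(res + " ~ " + " + ".join(kept), data=df).fit(disp=False)
    while len(kept) > 1:
        pvals = model.pvalues.drop("Intercept", errors="ignore")
        worst = pvals.idxmax()
        if pvals[worst] <= alpha:
            break  # every remaining predictor is significant
        kept.remove(worst)  # assumes each term name matches a candidate name
        model = smf.logit(res + " ~ " + " + ".join(kept), data=df).fit(disp=False)
    return model, " + ".join(kept)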
Example #2
def RunLogisticModels(live):
    """Runs regressions that predict sex.

    live: DataFrame of pregnancy records
    """
    #live = linear.ResampleRowsWeighted(live)

    df = live[live.prglngth>30].copy()
    # df = JoinFemResp(df)

    df['boy'] = (df.babysex==1).astype(int)
    df['isyoung'] = (df.agepreg<20).astype(int)
    df['isold'] = (df.agepreg>35).astype(int)
    df['season'] = (((df.datend+1) % 12) / 3).astype(int)

    # run the simple model
    model = smf.logit('boy ~ agepreg', data=df)    
    results = model.fit()
    print('nobs', results.nobs)
    print(type(results))
    SummarizeResults(results)

    # run the complex model
    model = smf.logit('boy ~ agepreg + hpagelb + birthord + C(race)', data=df)
    results = model.fit()
    print('nobs', results.nobs)
    print(type(results))
    SummarizeResults(results)

    # make the scatter plot
    exog = pandas.DataFrame(model.exog, columns=model.exog_names)
    endog = pandas.DataFrame(model.endog, columns=[model.endog_names])
    
    xs = exog['agepreg']
    lo = results.fittedvalues
    o = np.exp(lo)
    p = o / (o+1)

    #thinkplot.Scatter(xs, p, alpha=0.1)
    #thinkplot.Show()

    # compute accuracy
    actual = endog['boy']
    baseline = actual.mean()

    predict = (results.predict() >= 0.5)
    true_pos = predict * actual
    true_neg = (1 - predict) * (1 - actual)

    acc = (sum(true_pos) + sum(true_neg)) / len(actual)
    print(acc, baseline)

    columns = ['agepreg', 'hpagelb', 'birthord', 'race']
    new = pandas.DataFrame([[35, 39, 3, 1]], columns=columns)
    y = results.predict(new)
    print(y)
Example #3
def calculate_odds_ratio(genotypes, phen_vector1, phen_vector2, reg_type, covariates, response='',
                         phen_vector3=''):  # diff - done
    """
    Runs the regression for a specific phenotype vector relative to the genotype data and covariates.

    :param genotypes: a DataFrame containing the genotype information
    :param phen_vector1: an array containing the phenotype vector
    :param phen_vector2: an array containing the MaxAgeAtCPT values
    :param covariates: a string containing all desired covariates
    :type genotypes: pandas DataFrame
    :type phen_vector1: numpy array
    :type phen_vector2: numpy array
    :type covariates: string

    .. note::
        The covariates must be a string that is delimited by '+', not a list.
        If you are using a list of covariates and would like to convert it to the pyPhewas format, use the following::

            l = ['genotype', 'age'] # a list of your covariates
            covariates = '+'.join(l) # pyPhewas format

        The covariates that are listed here *must* be headers to your genotype CSV file.
    """

    data = genotypes
    data['y'] = phen_vector1
    data['MaxAgeAtCPT'] = phen_vector2
    # f='y~'+covariates
    if response:
        f = response + '~ y + genotype +' + covariates
        if phen_vector3.any():
            data['phe'] = phen_vector3
            f = response + '~ y + phe + genotype +' + covariates
    else:
        f = 'genotype ~ y +' + covariates
        if phen_vector3.any():
            data['phe'] = phen_vector3
            f = 'genotype ~ y + phe +' + covariates
    try:
        if reg_type == 0:
            logreg = smf.logit(f, data).fit(method='bfgs', disp=False)
            p = logreg.pvalues.y
            odds = logreg.params.y
            conf = logreg.conf_int()
            od = [-math.log10(p), logreg.params.y, '[%s,%s]' % (conf[0]['y'], conf[1]['y'])]
        else:
            # note: despite the 'linreg' name, this branch also fits a logit model
            linreg = smf.logit(f, data).fit(method='bfgs', disp=False)
            p = linreg.pvalues.y
            odds = linreg.params.y
            conf = linreg.conf_int()
            od = [-math.log10(p), linreg.params.y, '[%s,%s]' % (conf[0]['y'], conf[1]['y'])]
    except Exception:
        odds = 0
        p = np.nan
        od = [np.nan, np.nan, np.nan]
    return (odds, p, od)
Example #4
def LogisticRegressionExample():
    """Runs a simple example of logistic regression and prints results.
    """
    y = np.array([0, 1, 0, 1])
    x1 = np.array([0, 0, 0, 1])
    x2 = np.array([0, 1, 1, 1])

    beta = [-1.5, 2.8, 1.1]

    log_o = beta[0] + beta[1] * x1 + beta[2] * x2 
    print(log_o)

    o = np.exp(log_o)
    print(o)

    p = o / (o+1)
    print(p)

    like = y * p + (1-y) * (1-p)
    print(like)
    print(np.prod(like))

    df = pandas.DataFrame(dict(y=y, x1=x1, x2=x2))
    results = smf.logit('y ~ x1 + x2', data=df).fit()
    print(results.summary())
Example #5
 def logistic_regression(self, use_glm=True):
     """
     (b) it seems the only statistically significant predictor variable is Lag2. How disappointing...
     """
     formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
     model = (
         smf.glm(formula, data=self.df, family=sm.families.Binomial())
         if use_glm
         else smf.logit(formula, data=self.transformedDF)
     )
     result = model.fit()
     if use_glm:
         probs = result.fittedvalues
         """Beware the prob here is the index 0's prob, so we should use the lambda function below"""
         pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
     else:
         """The probability of being 1"""
         probs = Series(result.predict(sm.add_constant(self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
         pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
     """
     (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
         Weeks the market goes up, the logistic regression is right most of the time: 557/(557+48) = 92.1%.
         Weeks the market goes down, it is right only 54/(430+54) = 11.2% of the time.
     """
     tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
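The percentages in (c) fall straight out of a 2x2 confusion table. A minimal stand-in for tp.output_table (which is project code), using made-up toy arrays:

import numpy as np
import pandas as pd

pred = np.array([1, 1, 0, 1, 0])
actual = np.array([1, 0, 0, 1, 1])
tab = pd.crosstab(pred, actual, rownames=['predicted'], colnames=['actual'])
accuracy = np.trace(tab.values) / tab.values.sum()  # (TN + TP) / total
print(tab)
print(accuracy)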
def logistic_model(data, explanatory_variables, response_variable, 
                   maxiter = 35, verbose = True):
    explanatory_vars = ' + '.join(explanatory_variables)
    formula = response_variable + ' ~ ' + explanatory_vars

    try:
        model = smf.logit(formula = formula, data = data).fit(maxiter = maxiter)
    except Exception:
        print('Error "' + str(sys.exc_info()[1]) + '" while processing model', formula)
        model = None
    
    if verbose and model is not None:
        print()
        print('MODEL:', formula, '\n')
        print(model.summary())
        print()

        # odds ratios with 95% confidence intervals
        print ("Odds Ratios")
        params = model.params
        conf = model.conf_int()
        conf['OR'] = params
        conf.columns = ['Lower CI', 'Upper CI', 'Odds Ratios']
        print (numpy.exp(conf))
        
    return model
Example #7
    def _corr(self, sel, suffix):
        formula = str('model_accuracy ~ human_accuracy')
        logreg = smf.logit(formula=formula, data=sel).fit()
        summ = logreg.summary()
        if self.html is None:
            print(summ)
        else:
            summ = summ.as_html().replace('class="simpletable"',
                                          'class="simpletable table"')

        sel = sel.rename(columns={'human_accuracy': 'human accuracy',
                                  'model_accuracy': 'model accuracy'})

        sns.lmplot('human accuracy', 'model accuracy', data=sel, x_jitter=.01,
                    y_jitter=.05, logistic=True, truncate=True)

        bins = np.digitize(sel['human accuracy'], np.arange(.05,1,.1))
        #bins[bins==11] = 10
        count = sel['model accuracy'].groupby(bins).count()
        mean = sel['model accuracy'].groupby(bins).mean()
        sns.plt.scatter(.1*mean.index, mean, s=10*count, c='.15',
                        linewidths=0, alpha=.8)
        sns.plt.title(models.NICE_NAMES[self.model_name])
        sns.plt.xlim([-.1, 1.1])
        sns.plt.ylim([-.1, 1.1])
        self.show(pref='corr_sil', suffix=self.model_name + '_' + suffix,
                  caption=suffix + summ)
Example #8
def run_logits(grouped, formula, var):
    for code, group in grouped:
        country = get_country(code).ljust(14)
        model = smf.logit(formula, data=group)    
        results = model.fit(disp=False)
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(country, nobs, '%0.3g'%param, stars, arrow, sep='\t')
def log_reg(formula, df):
    try:
        model1 = smf.logit(formula = formula, data=df).fit()
        print(model1.summary())
    except Exception:
        print("+" * 40)
        print("bad formula")
        print("+" * 40)
def logistic_regression_test():
  df = pandas.read_csv('./generated_logistic_data.csv', index_col=0)

  generated_model = smf.logit('y ~ variable_a + variable_b + variable_c', df)
  generated_fit = generated_model.fit()
  roc_data = sklearn.metrics.roc_curve(df['y'], generated_fit.predict(df))
  auc = sklearn.metrics.auc(roc_data[0], roc_data[1])
  print(generated_fit.summary())
  print("AUC score: {0}".format(auc))
  assert auc > .8, 'AUC should be significantly above random'
Example #11
def fit_model(formula, model_file):
    """
    Saves a model
    :param formula: formula for the model
    :param model_file: name of file to save the model to
    """
    data = load_data()
    model = logit(formula=formula, data=data)
    fitted = model.fit()
    fitted.save(model_file)
def generate_model(df):
    '''
    Create a logistic regression model from loans data based on fields
    FICO.score, Interest.Rate, and Interest.below12
    :param df: a dataframe with fields for the independent vars fico and interest
    and the dependent var discrete_rate
    :return: a fitted logistic model
    '''
    model = smf.logit(formula='discrete_rate  ~ fico + interest', data=df)
    fitted_model = model.fit()
    return fitted_model
Example #13
File: Stats.py Project: alanhdu/Dex
    def logRegR(self, event):
        # would have to mess with Patsy formula parser to get more powerful...
        # too much work
        dlg = wx.TextEntryDialog(self.parent, "Enter the linear regression formula")
        if dlg.ShowModal() == wx.ID_OK:
            model = smf.logit(formula=dlg.GetValue(), data=self.parent.data.data)
            results = model.fit()
            self.parent.write("\n" + str(results.summary()) + "\n")
            sns.regplot(results.predict(), model.endog, ci=False, y_jitter=0.2)
            plt.show()

        dlg.Destroy()
def check_initial_specification(dataframe, result_string, new_var, min_specification, fit_word=None):
    assert isinstance(dataframe, pd.DataFrame) #Make sure dataframe is a pandas dataframe.
    assert isinstance(result_string, str) #Make sure the result_string is actually a string
    assert isinstance(new_var, list) #Make sure new_var is a list
    assert isinstance(min_specification, str) #Make sure the min_specification is a string
    
    base_vars = min_specification.split(" + ") #Extract the variables used in the minimum specification
    if "0" in base_vars: #Remove any zeros from the variables used in the minimum specification
        base_vars.remove("0")
    
    #Initialize starting values for the optimization
    start_vals = np.random.rand(len(base_vars + new_var))
    
    #Create the formula string for the logistic regression
    fString = result_string + " ~ " + min_specification + " + " + " + ".join(new_var)
    
    #Make sure the matrix for the logistic regression is invertible
    if not check_full_rank(dataframe, base_vars + new_var):
        #If not, raise an error
        raise Exception("The base model plus {} is not of full rank.".format(new_var))
    
    #Fit the logistic regression
    if fit_word is None:
        model = smf.logit(fString, data=dataframe).fit(start_params = start_vals, maxiter=2000, disp=False)
    else:
        model = smf.logit(fString, data=dataframe).fit(method=fit_word, start_params = start_vals, maxiter=2000, disp=False)
        
    if not model.mle_retvals["converged"]: #Check if the model converged
        #If it did not, raise an error
        raise Exception("The model for {} did not converge".format(new_var))
        
    lowest_pval = model.pvalues[new_var[0]] #Initialize a value for the lowest p-value
    for orig_var in new_var: #Iterate through the new variables
        current_pval = model.pvalues[orig_var]
        #If the current variable's p-value is less than the lowest p-value
        if current_pval < lowest_pval:
            #Keep track of this number
            lowest_pval = current_pval
    return lowest_pval
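Since model.pvalues is a pandas Series, the search loop above can be written as a one-liner (under the same assumption that every entry of new_var appears as a term name in the fitted model):

lowest_pval = model.pvalues[new_var].min()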
def fit_model(y, formula, df):
    from statsmodels.formula.api import ols, logit

    # If you have a dichotomous variable then
    # we're going to run a logistic regression
    if df[y].nunique() == 2:
        lm = logit(formula, df).fit()
    # otherwise we'll run an ordinary least
    # squares regression
    else:
        lm = ols(formula, df).fit()

    return lm
def log_regression(wine_set):
    # # examining the data before recoding
    # print(wine_set["sulphates"].describe())
    # wine_set["sulphates_c"] = pd.qcut(wine_set["sulphates"], 4)
    # print(wine_set.groupby("sulphates_c").size())
    # print()
    # #
    # print(wine_set["alcohol"].describe())
    # wine_set["alcohol_c"] = pd.qcut(wine_set["alcohol"], 4)
    # print(wine_set.groupby("alcohol_c").size())
    # print()
    #
    # print(wine_set["quality"].describe())
    # wine_set["quality_c"] = pd.qcut(wine_set["quality"], 3)
    # print(wine_set.groupby("quality_c").size())
    # print()


    # recode quality into 2 groups: 0:{3,4,5,6}, 1:{7,8,9}
    recode = {3: 0, 4: 0, 5:0, 6:0, 7:1, 8:1, 9:1}
    wine_set['quality_c'] = wine_set['quality'].map(recode)

    # recode sulphates into 2 groups: 0: <= mean, 1: > mean
    def sulphates_to_cat(x):
        if x['sulphates'] <= wine_set['sulphates'].mean():
            return 0
        else:
            return 1
    wine_set['sulphates_c'] = wine_set.apply(sulphates_to_cat, axis=1)

    # recode alcohol into 2 groups: 0: <= mean , 1: > mean
    def alcohol_to_cat(x):
        if x['alcohol'] <= wine_set['alcohol'].mean():
            return 0
        else:
            return 1
    wine_set['alcohol_c'] = wine_set.apply(alcohol_to_cat, axis=1)
    # print(wine_set.head(10))

    # logistic regression for sulphates+alcohol -> quality
    print ("Logistic regression model for the association between wine's quality and sulphates&alcohol")
    model1 = smf.logit(formula="quality_c ~ sulphates_c + alcohol_c", data=wine_set)
    results1 = model1.fit()
    print(results1.summary())

    # odds ratios with 95% confidence intervals
    print("\nConfidence intervals")
    conf = results1.conf_int()
    conf['Odds ratio'] = results1.params
    conf.columns = ['Lower conf.int.', 'Upper conf.int.', 'Odds ratio']
    print(numpy.exp(conf))
Example #17
def report_logitreg(formula, data, verbose=True):
    """Fit logistic regression, print a report, and return the fit object."""
    results = smf.logit(formula, data=data).fit()
    summary = results.summary()
    margeff = results.get_margeff().summary()


    if verbose:
        report = """
{summary}\n\n
{margeff}\n""".format(summary=summary,margeff=margeff)

        print(report)

    return results
def test_log_regression():
	"""Tests the results of logistic regression.
	Explore on the beta coefficient

	"""
	run = load_in_dataframe(2)
	run_added = add_gainlossratio(run)
	run_final = organize_columns(run_added)

	#fit the logistic regression line
	fitted = logit("respcat ~ gain + loss", run_final).fit()
	#get the parameters
	fitted_params = fitted.params.to_numpy()
	test_fitted_params = log_regression(run_final).to_numpy()
	assert_array_equal(fitted_params,test_fitted_params)
Example #19
 def logistic_fit(self, glm_fit=True):
     '''
     The logit function reports an error when y (Direction) is not transformed to 0/1,
     so glm looks easier to use
     '''
     formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
     if glm_fit is True:
         model = smf.glm(formula, data=self.df, family=sm.families.Binomial())
     else:
         # In fact, this function has the wrong fittedvalues, but its predict() values are still right.
         model = smf.logit(formula, data=self.df)
     result = model.fit()
     print(result.summary())
     # In logit fit there are errors here. Not sure why...
     if glm_fit:
         self.output_binary_table(result, result.fittedvalues, model.endog.astype(int), glm_fit)
Example #20
    def test_compare_logit(self):

        vs = Independence()
        family = Binomial()

        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = GEE.from_formula("Y ~ X1 + X2 + X3", D, None, groups=groups, family=family, covstruct=vs).fit()

        sml = sm.logit("Y ~ X1 + X2 + X3", data=D).fit()

        assert_almost_equal(sml.params.values, md.params, decimal=10)
Example #21
    def test_compare_logit(self):

        vs = Independence()
        family = Binomial()

        Y = 1*(np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = sm.logit("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params, decimal=10)
def log_regression(run_final):
	"""Do logistic regression on train cols to predict the subject's decision
	
	Parameters
    ----------
    run_final : data.frame
        behavioral data frame with organized columns

    Returns
    -------
    logit_pars : logistic regression result summary
        the logistic regression result summary

    """
	# do logistic regression
	x = logit("respcat ~ gain + loss", run_final).fit()

	# check the summary
	print(x.summary())

	#store the parameters of logistic regression
	logit_pars = x.params

	return logit_pars
    else:
        auc_value = auc(-final_sample.loc[:, i], final_sample.default_flag)
    gini_coeff = 2 * auc_value - 1
    print("The Gini Coefficient for transformed factor %s is %s" % (i,
                                                                    gini_coeff))

regf1 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + cur_rto_tran'
regf2 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + net_margin_rto_tran'
regf3 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + debt_to_ebitda_rto_tran + debt_to_tnw_rto_tran + cur_rto_tran + net_margin_rto_tran'
#regf4 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran'   # need some transformation
regf4 = 'default_flag ~ dsc_tran + tot_sales_amt_tran + yrs_in_bus_tran_bin + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran'  # use woe transform for yrs_in_bus
regf5 = 'default_flag ~ dsc_tran + C(_tot_sales2) + yrs_in_bus_tran_bin + debt_to_ebitda_rto_tran + net_margin_rto_tran + cur_rto_tran'  # use woe transform for yrs_in_bus

# final sample exclude s2014 data

regm1 = smf.logit(formula=str(regf1), data=final_sample).fit()
auc1 = auc(regm1.predict(), final_sample.default_flag)
auc_preddata = auc(regm1.predict(f121314), f121314.default_flag)
print("The AUC for current model m1 is: %s, and AUC for OOT data is %s" % (
    auc1, auc_preddata))

regm2 = smf.logit(formula=str(regf2), data=final_sample).fit()
auc2 = auc(regm2.predict(), final_sample.default_flag)
auc_preddata = auc(regm2.predict(f121314), f121314.default_flag)
print("The AUC for current model m2 is: %s, and AUC for OOT data is %s" % (
    auc2, auc_preddata))

regm3 = smf.logit(formula=str(regf3), data=final_sample).fit()
auc3 = auc(regm3.predict(), final_sample.default_flag)
auc_preddata = auc(regm3.predict(f121314), f121314.default_flag)
print("The AUC for current model m3 is: %s, and AUC for OOT data is %s" % (
    auc3, auc_preddata))
Example #24
from sklearn.linear_model import LogisticRegression

train_data.columns

X = train_data.loc[: , 'Age': 'Married']
y = train_data.loc[: , 'Defaulter_Flag']


model = LogisticRegression()
model = model.fit(X,y)


import statsmodels.formula.api as smf
train_data.columns 

logitfit = smf.logit(formula = 'Defaulter_Flag ~ Age + YOE + Gender + Married', data = train_data).fit()

logitfit.summary()       # summary of the model
logitfit.predict()            # predict
logitfit.pred_table()          # confusion matrix


threshold = 0.5

predicted = logitfit.predict(test_data.loc[: , 'Age': 'Married']) 

predicted_choice = (predicted > threshold).astype(int)

Confusion_matrix = pd.crosstab(test_data.Defaulter_Flag, predicted_choice, rownames=['Defaulter_Flag_count'], colnames=["Predicted_count"])
Confusion_matrix
Accuracy = (Confusion_matrix[0][0] + Confusion_matrix[1][1])/ (Confusion_matrix[0][0] + Confusion_matrix[1][1] + Confusion_matrix[0][1] + Confusion_matrix[1][0])
""" LOGISTIC_BANK """

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

bank = pd.read_csv(
    "C:\\EXCELR\\NOTES WRITTEN\\SOLVING_ASSIGNMENTS\\Logistic Regression\\solution\\bank_data.csv"
)
bank.head(5)
bank.shape
bank.isnull().sum()

import statsmodels.formula.api as smf
# note: 'claimants' is a separate DataFrame (insurance-claims data), not the 'bank' frame above
logit_model = smf.logit('ATTORNEY~CLMAGE+LOSS+CLMINSUR+CLMSEX+SEATBELT',
                        data=claimants).fit()
logit_model.summary()

corrmat = bank.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(10, 10))
#plot heat map
g = sns.heatmap(bank[top_corr_features].corr(), annot=True, cmap="RdYlGn")
bank["pdays"] = 1
bank.pdays
#################################################################################################
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
Example #26

# recode the primary-vote variable to binary
def obamaORomney3(x):
    if x == 1:
        return 1
    else:
        return 0


result2_sorted['prevote_primvwho'] = result2_sorted["prevote_primvwho"].apply(obamaORomney3)

# logistic regression for primary vote choice
lreg1 = smf.logit(
    formula='prevote_primvwho ~ econ_opnion_c + incgroup_prepost_c',
    data=result2_sorted).fit()
print(lreg1.summary())
# odds ratios
print("Odds Ratios")
print(numpy.exp(lreg1.params))

# odd ratios with 95% confidence intervals
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(numpy.exp(conf))
# ----------------------------------------------------------------------------#
# Decision Tree
Example #27
        select_col = np.where(select_col == -99999, floor_value,
                              np.where(select_col == 99999, cap_value,
                                       select_col))  # replace -99999/99999
        select_col_after_fc = np.where(select_col <= floor_value, floor_value,
                                       np.where(select_col >= cap_value,
                                                cap_value,
                                                select_col))  #floor cap
        select_col_after_fc_impute = np.where(np.isnan(select_col_after_fc),
                                              impute_value,
                                              select_col_after_fc)  #impute
        select_col_after_fc_impute_normalized = (select_col_after_fc_impute -
                                                 mean_value) / std_value
        indata[var_x + '_tran'] = select_col_after_fc_impute
        indata[var_x + '_normalized'] = select_col_after_fc_impute_normalized
    tran_vars = [x for x in list(indata) if '_tran' in x]
    normalized_vars = [x for x in list(indata) if '_normalized' in x]
    return indata.loc[:, tran_vars + normalized_vars].describe()


## applying the function to f121314 to do transformation and normalization
gcca_test_tran_summary = normalize_test(gcca_test, normalized_summary_matrix)

tran_vars = [x + '_tran' for x in model_factors]
f = 'df ~ ' + ' + '.join(tran_vars)

f = 'df ~ net_mrgn_rto_tran + debt_srvc_cov_rto_tran + C(yrs_in_business_b) + debt_to_ebitda_rto_tran + cur_rto_tran'
m1 = smf.logit(formula=str(f), data=gcca_dev).fit()

print(auc(m1.predict(gcca_dev), gcca_dev.df) * 2 - 1)
print(auc(m1.predict(gcca_test), gcca_test.df) * 2 - 1)
Example #28
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  9 13:59:07 2016

@author: emg
"""
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from variable_manipulation import df
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
df = pd.read_csv(
    '/Users/emg/Google Drive/MSc SRMS/MSc Disseration/ass_reddit_comments_may_2015/practice_5_25.csv'
)

# make subreddits numeric
df['subreddit'] = pd.to_numeric(df['subreddit'],
                                errors='coerce')  # doesn't work
subnums = {"N/A": 0.0, "AskSocialScience": 1.0, "AskStatistics": 2.0}
df['sub_num'] = df['subreddit'].apply(subnums.get)

prac = df.loc[:, ['mod', 'score', 'sub_num']]
prac = df.loc[:, ['mod', 'score', 'subreddit']]

lreg1 = smf.logit(formula='mod ~ sub_num', data=df).fit()
print(lreg1.summary())
# print(np.exp(clf.coef_)) #odds ratio
# print(clf.coef_) #relationship

print('method 2')

# logit only accepts 0/1 as target values
df_train.BOOKED.replace([1, 2], [0, 1], inplace=True)

est = sm.Logit(y, X)
est2 = est.fit()
print(est2.summary())

params = est2.params
conf = est2.conf_int()
conf['Odds Ratio'] = params
conf.columns = ['5%', '95%', 'Odds Ratio']
print(np.exp(conf))

'''
alternate https://pythonfordatascience.org/logistic-regression-python/
'''
model = smf.logit(
    formula="BOOKED~ C(PSYYR2)+ C(IRSEX)+ C(EDUCCAT2)+ C(IRMARIT)+ C(CATAG3)+ C(NEWRACE2)+ C(GOVTPROG)+ C(EMPSTATY)+ C(HVYDRK2)+ C(MJOFLAG)+ C(SUMFLAG)",
    data=df_train).fit()
model.summary()

model_odds = pd.DataFrame(np.exp(model.params), columns=['OR'])
model_odds['z-value'] = model.pvalues
model_odds[['2.5%', '97.5%']] = np.exp(model.conf_int())
print(model_odds)
Example #30
plt.boxplot(df_affairs["rating"])   # No outlier is present

######Need log transform age for outlier###########
x=np.log(df_affairs["age"])
plt.boxplot(x)
#####Split the data into train and test#############
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(df_affairs,test_size=0.3)
train_data=train_data.reset_index()
test_data=test_data.reset_index()
train_data=train_data.drop(["index"],axis=1)
test_data=test_data.drop(["index"],axis=1)
########Building the model############
import statsmodels.formula.api as sm
train_data.isnull().sum()
m1=sm.logit("AF~np.log(age)+yearsmarried+religiousness+education+occupation+rating+gender_female+gender_male+children_no+children_yes", data=train_data).fit()
m1.summary()
m1.summary2()
#AIC=486

train_pred=m1.predict(train_data)


from scipy import stats
import scipy.stats as st
st.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

train_data["train_pred"]=np.zeros(420)

train_data.loc[train_pred>=0.5,"train_pred"]=1
Example #31
listtrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_list_train.csv')  # coupon_id_hash level
listtrain.columns = [x.lower() for x in listtrain.columns]

# coupon_area_train.csv - the coupon listing area for the training set coupons
areatrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_area_train.csv')
areatrain.columns = [x.lower() for x in areatrain.columns]

# coupon_detail_train.csv - the purchase log of users buying coupons during the training set time period. You are not provided this table for the test set period.
detailtrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_detail_train.csv') # purchaseid level
detailtrain.columns = [x.lower() for x in detailtrain.columns]

# coupon_visit_train.csv - the viewing log of users browsing coupons during the training set time period. You are not provided this table for the test set period.
visittrain = pd.read_csv(r'H:\python\kaggle\06coupons\coupon_visit_train.csv')
visittrain.columns = [x.lower() for x in visittrain.columns]

capsule = pd.read_excel(r'H:\python\kaggle\06coupons\jpn_2_english.xlsx', sheetname = 'capsule')
capsule.columns = [x.lower() for x in capsule.columns]

genre = pd.read_excel(r'H:\python\kaggle\06coupons\jpn_2_english.xlsx', sheetname = 'genre')
genre.columns = [x.lower() for x in genre.columns]

# user list file
userlist = pd.read_csv(r'H:\python\kaggle\06coupons\user_list.csv')
userlist.columns = [x.lower() for x in userlist.columns]

# merge visit table with coupon info
train_data = pd.merge(listtrain, visittrain, left_on = 'coupon_id_hash', right_on = 'view_coupon_id_hash', how = 'inner')

f = 'purchase_flg ~ C(genre_name) + price_rate + discount_price + C(usable_date_mon)'
logfit = smf.logit(formula = str(f), data = train_data).fit()
            marker='o',
            edgecolors='r',
            facecolors='none')
plt.ylim([0, 80000])
plt.xlim([0, 2800])
plt.legend(('default', 'no default'), loc='upper right')

# 6 - What can you infer from this plot?
# It appears that the balance is more correlated with default than income
'''
PART II - LOGISTIC REGRESSION
'''

# 1 - Run a logistic regression on the balance variable
# 2 - Is the beta value associated with balance significant?
balance = smf.logit('default ~ balance', data=train).fit()
balance.summary()
np.exp(balance.params.balance)

# Beta is significant!
# 3 - Predict the probability of default for someone with a balance of $1.2k and $2k
prob = balance.predict({'balance': [1200, 2000]})

# What does beta mean? Let's create some plots to find out!
x = np.linspace(test.balance.min(), test.balance.max(), 500)
beta = [balance.params.Intercept, balance.params.balance]

y = np.exp(beta[0] + beta[1] * x) / (1 + np.exp(beta[0] + beta[1] * x))
odds = np.exp(beta[0] + beta[1] * x)
log_odds = beta[0] + beta[1] * x
Example #33
data.shape
new_data['y'] = y
new_data.shape
new_data.columns
"""
(['education', 'age', 'rating', 'religiousness', 'occupation',
       'yearsmarried', 'gender', 'children', 'y'],
      dtype='object')
"""
from sklearn.model_selection import train_test_split
train, test = train_test_split(new_data, test_size=0.3)

import statsmodels.formula.api as smf
logit_model = smf.logit(
    'y~education+age+rating+religiousness+occupation+yearsmarried+gender+children',
    data=train).fit()
logit_model.summary()
"""
LLR p-value:                     0.000
Log-Likelihood:                -200.07
LL-Null:                       -225.77
LLR p-value:                 2.201e-08

=================================================================================
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         2.4451      1.091      2.241      0.025       0.307       4.583
education        -0.0456      0.062     -0.731      0.465      -0.168       0.077
age              -0.0558      0.023     -2.443      0.015      -0.101      -0.011
rating           -0.4970      0.113     -4.411      0.000      -0.718      -0.276
###############################################################################
"""
Running an smf.logit regression to understand the coefficient weights of the
features chosen for the models tested previously. Here I identified the
following variables that increase the chances of survival (they get negative
coefficients, since the response is isDead): bk5_only, isMarried, book4,
dateOfBirth. This gives a better idea of what is good and what is bad for
survival.
"""

log_got_p = smf.logit(formula="""isDead ~ 
                                                 got_df['alive_by_age'] 
                                                + got_df['hm_books'] 
                                                + got_df['popularity'] 
                                                + got_df['numDeadRelations'] 
                                                + got_df['bk5_only'] 
                                                + got_df['isNoble'] 
                                                + got_df['isMarried'] 
                                                + got_df['book4'] 
                                                + got_df['bk1_only'] 
                                                + got_df['dateOfBirth']""",
                      data=got_df)

results_logistic_full = log_got_p.fit()

results_logistic_full.summary()

###############################################################################
##################### BEST MODEL IN TERMS OF AUC ##############################
###############################################################################
"""
This is the model I created with specific features that gave me the best Test
Example #35
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
df = pd.read_csv('nes.dat', sep=r'\s+')
df = df[['presvote', 'year', 'income', 'black']]
df = df[df['presvote'] < 3]  # keep only the two parties' votes
# recode the 1,2 votes to 0,1: a yes/no for whether the vote
# went to the Republican
df['vote'] = df['presvote'].map(lambda x: x - 1)
df = df.drop('presvote', axis=1)
df = df.dropna()

df2 = df[df['year'] == 1992]
mdlm = smf.logit("vote ~ income", df2)
mdlmf = mdlm.fit()
print(mdlmf.summary())
Example #36
def predict_ci(fitted, df, alpha=0.05):
    """
    Compute predicted probabilities with confidence intervals based on a
    logistic regression model

    Parameters
    ----------
    fitted  A logistic regression model fitted using the statsmodels formula interface
    df      A pandas dataframe with input data for prediction
    alpha   Significance level (0-1). Default is 0.05

    Returns
    -------
    A dataframe with probability predictions and lower and upper confidence bounds

    Example
    -------
    import numpy as np
    import matplotlib.pyplot as plt
    import statsmodels.formula.api as smf
    import pandas as pd

    # simulate data
    np.random.seed(1)
    x1 = np.arange(100)
    x2 = pd.Series(["a", "b", "c", "a"], dtype="category").sample(100, replace=True).reset_index(drop=True)
    y = (x1 * 0.5 + np.random.normal(size=100, scale=10) > 30).astype(int)
    df = pd.DataFrame({"y": y, "x1": x1, "x2": x2})

    # estimate the model
    model = smf.logit(formula="y ~ x1 + x2", data=df).fit()
    model.summary()
    pred = predict_ci(model, df)

    plt.clf()
    plt.plot(x1, pred["prediction"])
    plt.plot(x1, pred["2.5%"], color='black', linestyle="--", linewidth=0.5)
    plt.plot(x1, pred["97.5%"], color='black', linestyle="--", linewidth=0.5)
    plt.show()
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("alpha must be a numeric value between 0 and 1")

    # generate prediction
    prediction = fitted.predict(df)

    # adding a fake endogenous variable
    df = df.copy()  # making a full copy
    df["__endog__"] = 1
    form = "__endog__ ~ " + fitted.model.formula.split("~", 1)[1]
    df = smf.logit(formula=form, data=df).exog

    low, high = [alpha / 2, 1 - (alpha / 2)]
    Xb = np.dot(df, fitted.params)
    se = np.sqrt((df.dot(fitted.cov_params()) * df).sum(-1))
    me = norm.ppf(high) * se
    lb = np.exp(Xb - me)
    ub = np.exp(Xb + me)

    return pd.DataFrame({
        "prediction": prediction,
        f"{low*100}%": lb / (1 + lb),
        f"{high*100}%": ub / (1 + ub),
    })
Example #37
affairs.affairs.value_counts()

#for categorizing in 0 and 1 for logistic_regression
affairs["Att_val"] = np.zeros(601)
# converting the affairs to binary variable
affairs.loc[affairs.affairs >= 1,"Att_val"] = 1
affairs.drop(["affairs"],axis=1,inplace=True)

#encoding the string values
affairs.iloc[:,0:1].columns
affairs["gender"] = pd.get_dummies(affairs["gender"])
affairs["children"] = pd.get_dummies(affairs["children"])

#model for logistics regression
import statsmodels.formula.api as sm
logit_model = sm.logit('Att_val~age+yearsmarried+religiousness+rating',data = affairs).fit()

logit_model.summary()
y_pred = logit_model.predict(affairs)

y_pred_val = y_pred

affairs["y_pred"]=y_pred
plt.hist(y_pred)
affairs.loc[y_pred>=0.5,"y_pred"] = 1
affairs.loc[y_pred<0.5,"y_pred"] = 0

from sklearn.metrics import classification_report
print(classification_report(affairs.Att_val, affairs.y_pred))
#classification report
def sm_logit(
    df,
    f=None,
    features=None,
    outcome='outcome_field',
    add_constant=True,
    categorical=None,
    maxiter=35,
    reg_method=None,
    reg_alpha=10,
    missing='raise',
    #log_trans=None,
    #sort_by='z',
    #outcome_behavior=None,
    subset=None,
    method='newton',
):
    """reg_method: regularization method. None (default), 'l1' or 'l1_cvxopt_cp'.

    reg_alpha: weight to apply regularization penalty. Default: 1.0.
    higher alpha = more coeff equal to zero

    missing: what to do with rows with missing values. 'raise' (default) or 'drop'.
    """
    if features is not None:
        df = df[features]
    else:
        features = df.columns.tolist()

    #if add_constant:
    #    df = sm.tools.add_constant(df, prepend=False, has_constant='raise')

    if f is None:
        these_features = [x for x in features if x != outcome]
        if categorical is not None:
            f = '{} ~ '.format(outcome) + ' + '.join([
                'C({})'.format(x) if x in categorical else x
                for x in these_features
            ])
        else:
            f = '{} ~ '.format(outcome) + ' + '.join(
                [x for x in these_features])
    # debug
    print(f)

    if reg_method is not None:
        # if subset is not None:
        #     df = df.loc[subset, :]
        # y, X = patsy.dmatrices(f, df, return_type='dataframe')

        # reg_alpha = reg_alpha * np.ones(X.shape[1])
        # reg_alpha[X.columns.tolist().index('Intercept')] = 0

        # results_log = sm.Logit(y, X, missing=missing).fit_regularized(method=reg_method, alpha=reg_alpha)

        results_log = smf.logit(f, df, subset=subset,
                                missing=missing).fit_regularized(
                                    method=reg_method, alpha=reg_alpha)
    else:
        #results_log = sm.Logit.from_formula(f, df, missing='raise').fit(maxiter=maxiter)
        results_log = smf.logit(f, df, subset=subset,
                                missing=missing).fit(maxiter=maxiter,
                                                     method=method)

    #print_sm_logit_results(results_log, sort_by=sort_by, log_trans=log_trans, outcome_behavior=outcome_behavior)

    return results_log
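A hypothetical call, just to show how the pieces fit together (the column names here are made up; reg_method='l1' routes the call through fit_regularized as described in the docstring):

results = sm_logit(
    df,
    outcome='outcome_field',
    features=['age', 'bmi', 'smoker', 'outcome_field'],
    categorical=['smoker'],
    reg_method='l1',  # L1 penalty via fit_regularized
    reg_alpha=10,
)
print(results.summary())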
Example #39

featuresdf['status_group_relabel'] = featuresdf.apply(lambda row: relabel(row),
                                                      axis=1)

# do a table for status group and show the mean population
pivot1 = featuresdf.pivot_table(
    index="status_group",
    values=["amount_tsh", "population", "gps_height"],
    aggfunc=[np.mean, np.median, statistics.mode])

ax = pivot1.plot.bar(rot=0)
plt.show()

for i in yvals:
    lreg1 = smf.logit(formula=i + '~ amount_tsh + population',
                      data=featuresdf).fit()

    print(lreg1.summary())
    print('')

    # odds ratios
    print("Odds Ratios")
    print(np.exp(lreg1.params))

    # odd ratios with 95% confidence intervals
    params = lreg1.params
    conf = lreg1.conf_int()
    conf['OR'] = params
    conf.columns = ['Lower CI', 'Upper CI', 'OR']
    print(np.exp(conf))
Example #40
def regression(df, rf):
    model = smf.logit(data=df, formula=rf).fit()
    print(model.summary())
bank1 = pd.concat([ag_log,bank1],axis=1)
#concatenating the log_age column with the dataset
bank1.drop(["age"],inplace = True, axis=1)
#since age column is insignificant and not needed 





x = bank1.iloc[:,0:42] # creating a new object taking only the features (input variables)

#x.drop(["y"],inplace= True , axis=1)


model1= sm.logit("y_yes~x", data = bank1 ).fit()
model1.summary()
model1.summary2()
## AIC:  21644.8803



#Removing all the insignificant columns which are not needed

## The most insignificant columns are removed, to check whether the remaining variables become significant

x.iloc[:,23].name
x0= x.drop(["default_yes"],axis=1)

model2= sm.logit("y_yes~x0", data = bank1).fit()
model2.summary()
data = {"x1":x1.flatten(), "x2":x2.flatten(), 
        "y_discrete":y_discrete.flatten(), "y_auxiliar":y_auxiliar.flatten()}
df = pd.DataFrame(data)

# Plot the data. What type is it? What should we expect from it?
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['x1'], df['x2'],  df['y_discrete'], label="y measured", color="k")
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_zlabel('y')
plt.legend(loc="upper left", fontsize=10, numpoints=1)
plt.show()

# Logistic regression via the formula API
model = smf.logit(formula="y_discrete ~ x1 + x2", data=df)
fitted_model = model.fit()
coeffs = fitted_model.params
print(fitted_model.summary())
print("The model obtained is y = 1./(1 + exp(-({0} + {1}*x1 + {2}*x2)))".format(*coeffs))
print(coeffs)

# Plot the data. What type is it? What should we expect from it?
y_model = fitted_model.predict(df[["x1","x2"]])
y_prediction = np.where(y_model<0.5, 0, 1)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['x1'], df['x2'],  df['y_discrete'], label="y measured", color="k")
ax.plot_trisurf(df['x1'], df['x2'],  y_model, label="Model", color="w", alpha=0.25)
ax.scatter(df['x1'], df['x2'],  y_prediction + 0.05, label="Prediction", color="r")
ax.set_xlabel('x1')
Example #43
## Generating auc_score based on the optimal KNN model
print('knn_auc_train Score', knn_train_auc_score.round(4))
#score - 0.852
print('knn_auc_test Score', knn_test_auc_score.round(4))
#score - 0.828

###############################################################################
## Classification with logistic
###############################################################################
logistic_full = smf.logit(formula="""isAlive ~ 
                       got['male']+
                       got['age']+
                       got['isMarried']+
                       
                       got['isNoble']+
                       got['book1_A_Game_Of_Thrones']+
                       got['book3_A_Storm_Of_Swords']+
                       got['book4_A_Feast_For_Crows']+
                       got['popularity']+
                       got['house_alive_pct']+
                       got['title_alive_pct']""",
                          data=got)

results_logistic_full = logistic_full.fit()

results_logistic_full.summary()

results_logistic_full.pvalues

###############################################################################
# Hyperparameter Tuning with Logistic Regression
Example #44
# RainTomorrow: dependent variable; the remaining columns: independent variables

# split into train / test datasets to guard against overfitting

train , test = train_test_split(data2, test_size = 0.3, random_state = 42) # shuffle the data and hold out 30%
print(data.shape,train.shape,test.shape)

# classification model
my_formula = 'RainTomorrow ~ MinTemp + MaxTemp + Rainfall....'
col_select = '+'.join(train.columns.difference(['RainTomorrow']))
my_formula = 'RainTomorrow ~' + col_select
print(my_formula)

# build the learning model for classification
#model = smf.glm(formula = my_formula, data = train, family = sm.families.Binomial()).fit() # fit on the training set
model = smf.logit(formula = my_formula, data = train).fit() # fit on the training set

#print(model.summary())
#print(model.params())
print('predicted:' , np.rint(model.predict(test)[:5])) # predict on the test set
print('actual:' , test['RainTomorrow'][:5])

# classification accuracy

conf_mat = model.pred_table()
print(conf_mat)
print((conf_mat[0][0] + conf_mat[1][1]) / len(train))

from sklearn.metrics import accuracy_score
pred = model.predict(test)
print('classification accuracy: ', accuracy_score(test['RainTomorrow'],np.around(pred)))
Example #45
def binning(oldDF, dataSeries, binList, newColName, labelList, deleteOldColumn):
    columnForBins = pandas.cut(x = dataSeries, bins = binList, labels = labelList).to_frame()
    columnForBins.columns = [newColName]
    df_new = pandas.concat([oldDF,columnForBins],axis = 1)
    if deleteOldColumn is True:
        df_new = df_new.drop(dataSeries.name,axis=1)
    return df_new


dataForFrame = {'IPP': data['incomeperperson'], 'IUR': data['internetuserate'], 'LE': data['lifeexpectancy']}
df = pandas.DataFrame(data = dataForFrame).dropna()
df = binning(df, df['LE'], [0, 70, 100], 'BinnedLE', ['0-70','70-100'], True)
df = df.replace(to_replace={'BinnedLE' : {'0-70' : 0, '70-100' : 1}})

# logistic regression with Income per Person
lreg1 = smf.logit(formula = 'BinnedLE ~ IPP', data = df).fit()
print (lreg1.summary())

# odd ratios with 95% confidence intervals
params = lreg1.params
conf1 = lreg1.conf_int()
conf1['OR'] = params
conf1.columns = ['Lower CI', 'Upper CI', 'OR']
print (numpy.exp(conf1))

# logistic regression with IUR
lreg2 = smf.logit(formula = 'BinnedLE ~ IUR', data = df).fit()
print (lreg2.summary())

# odd ratios with 95% confidence intervals
params = lreg2.params
sub1['hh_income_c'] = sub1['hh_income'] - sub1['hh_income'].mean()
sub1['hh_income_c'].describe()

sub1['hh_income_c'] = sub1['hh_income_c'] / 10000
sub1['hh_income_c'].describe()

reg1 = smf.ols('neg_outlook ~ hh_income_c', data=sub1).fit()
print(reg1.summary())  #Adj. R-squared = -0.000 #household income insignificant

reg2 = smf.ols('neg_outlook ~ soc_class', data=sub1).fit()
print(reg2.summary())  #Adj. R-squared = 0.002

reg3 = smf.ols('neg_outlook ~ soc_class + C(ethnicity)', data=sub1).fit()
print(reg3.summary())  #Adj. R-squared = 0.002 -> 0.045

lreg1 = smf.logit(formula='neg_outlook ~ hh_income_c', data=sub1).fit()
print(lreg1.summary())  #household income insignificant
print("Odds Ratios")
print(numpy.exp(lreg1.params))

lreg2 = smf.logit(formula='neg_outlook ~ soc_class', data=sub1).fit()
print(lreg2.summary())

params = lreg2.params
conf = lreg2.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(numpy.exp(conf))

lreg3 = smf.logit(formula='neg_outlook ~ soc_class + C(ethnicity)',
                  data=sub1).fit()
election.iloc[:,2:] = election.iloc[:,2:].apply(lambda x:x.fillna(x.mean()))

election.PR=election.PR.fillna(election.PR.mean())
election.iloc[:,3:] = election.iloc[:,3:].apply(lambda x:x.fillna(x.mean()))

# Checking if we have na values or not 
election.isnull().sum() # No null values
from scipy import stats
import scipy.stats as st
st.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)


#Model building 

import statsmodels.formula.api as sm
logit_model=sm.logit('Result~Year+AS+PR',data=election).fit()


#summary
logit_model.summary()
y_pred = logit_model.predict(election)

election["pred_prob"] = y_pred
# Creating new column for storing predicted class of Attorney

# filling all the cells with zeroes
election["Att_val"] = 0

# taking threshold value as 0.5 and above the prob value will be treated 
# as correct value 
election.loc[y_pred>=0.5,"Att_val"] = 1
Example #48
import statsmodels.api as sm
from statsmodels.formula.api import logit, probit, poisson, ols

print(sm.datasets.fair.SOURCE)

print(sm.datasets.fair.NOTE)

dta = sm.datasets.fair.load_pandas().data

dta['affair'] = (dta['affairs'] > 0).astype(float)
print(dta.head(10))

print(dta.describe())

affair_mod = logit(
    "affair ~ occupation + educ + occupation_husb"
    "+ rate_marriage + age + yrs_married + children"
    " + religious", dta).fit()

print(affair_mod.summary())

# How well are we predicting?

affair_mod.pred_table()

# The coefficients of the discrete choice model do not tell us much. What we're after is marginal effects.

mfx = affair_mod.get_margeff()
print(mfx.summary())

respondent1000 = dta.iloc[1000]
print(respondent1000)
#We now apply these equations to our variables to get new columns

#Note: we don't include longitude as our linear regression model showed no strong association
#between it and the crater diameter

data2['LATITUDE_BIN'] = data2['LATITUDE'].apply(lambda x: georegion(x))
data2['CRATER_SIZE_BIN'] = data2['DIAMETER'].apply(lambda x: cratersize(x))
data2['NUMBER_LAYERS_BIN'] = data2['NUMBER_LAYERS'].apply(lambda x: layers(x))
data2['DEPTH_BIN'] = data2['DEPTH'].apply(lambda x: depth(x))
data2.head(5)

#now we'll look at our logistic regression just using our primary variable

print('Modelling between crater size and latitude bin')
model1 = smf.logit(formula='CRATER_SIZE_BIN ~ LATITUDE_BIN',data=data2).fit()
print(model1.summary())
print('Odds Ratios')
print(numpy.exp(model1.params))

#odds ratios with 95% confidence intervals
params = model1.params
conf = model1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
numpy.exp(conf)

print('Modelling between crater size and latitude bin and layers')
model2 = smf.logit(formula='CRATER_SIZE_BIN ~ LATITUDE_BIN + NUMBER_LAYERS_BIN',data=data2).fit()
print(model2.summary())
print('Odds Ratios')
Example #50
plt.plot(mean_income_by_educ, 'o', alpha = 0.5)

# Plot the predictions
pred = results.predict(df)
plt.plot(df['educ'], pred, label='Age 30')

# Label axes
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
plt.legend()
plt.show()


# Predicting a binary variable
# Let's use logistic regression to predict a binary variable. Specifically, we'll use age, sex, and education level to predict support for legalizing cannabis (marijuana) in the U.S.

# In the GSS dataset, the variable grass records the answer to the question "Do you think the use of marijuana should be made legal or not?"


# Recode grass
gss['grass'].replace(2, 0, inplace=True)

# Run logistic regression
results = smf.logit('grass~ age + age2 + educ + educ2 + C(sex)', data = gss).fit()
results.params





Example #51
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats

# set the random seed:
np.random.seed(1234567)

y = stats.binom.rvs(1, 0.5, size=100)
x = stats.norm.rvs(0, 1, size=100) + 2 * y
sim_data = pd.DataFrame({'y': y, 'x': x})

# estimation:
reg_lin = smf.ols(formula='y ~ x', data=sim_data)
results_lin = reg_lin.fit()
reg_logit = smf.logit(formula='y ~ x', data=sim_data)
results_logit = reg_logit.fit(disp=0)
reg_probit = smf.probit(formula='y ~ x', data=sim_data)
results_probit = reg_probit.fit(disp=0)

# calculate partial effects:
PE_lin = np.repeat(results_lin.params['x'], 100)

xb_logit = results_logit.fittedvalues
factor_logit = stats.logistic.pdf(xb_logit)
PE_logit = results_logit.params['x'] * factor_logit

xb_probit = results_probit.fittedvalues
factor_probit = stats.norm.pdf(xb_probit)
PE_probit = results_probit.params['x'] * factor_probit
Example #52
new_data.columns
"""
we need to add the y output variable to the new data set
"""

data.shape
new_data['y'] = y
new_data.shape
new_data.columns
from sklearn.model_selection import train_test_split
train, test = train_test_split(new_data, test_size=0.3)

import statsmodels.formula.api as smf
logit_model = smf.logit(
    'y~duration+balance+pdays+poutsuccess+previous+campaign+con_unknown+housing+con_cellular+joretired',
    data=train).fit()
logit_model.summary()
"""
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                31647
Model:                          Logit   Df Residuals:                    31636
Method:                           MLE   Df Model:                           10
Date:                Sat, 07 Dec 2019   Pseudo R-squ.:                  0.2941
Time:                        06:05:38   Log-Likelihood:                -8074.2
converged:                       True   LL-Null:                       -11438.
                                        LLR p-value:                     0.000
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Example #53
File: que5.py Project: chinadd/GWAS

def lr_pvalue(null, full, df):
    # compare two models, null and full
    # df should be the difference in the number of parameters
    # return the p-value for the deviance (2 * difference in log-likelihood)
    lrstat = -2 * null.llf + 2 * full.llf
    return stats.chi2.sf(lrstat, df=df)  # using chi-square


if __name__ == "__main__":

    # subsample
    samplesize = 1541
    thres = float(0.05) / float(samplesize)
    p_lr = []
    # load SNP data
    df = pandas.read_csv("mycsv.csv")
    df = df.loc[np.random.choice(df.index, samplesize, replace=False)]

    # null-model with just sex
    nullmodel = logit("PHENOTYPE ~ SEX", data=df).fit(disp=False)

    for i in range(0, 1110):
        mymodel = logit("PHENOTYPE ~ snp_%d + SEX" % i, data=df).fit(disp=False)
        p = lr_pvalue(nullmodel, mymodel, 1)
        if p < thres:
            p_lr.append(['snp_%d' % i, 'p-value %.5g' % p])

    with open('question5.txt', 'wt') as fout:
        for line in p_lr:
            fout.writelines(str(line))
            fout.write("\n")
Example #54
#from sklearn.preprocessing import Normalizer
#x=bank.iloc[:,:].values
#x
#z=pd.DataFrame(x)
#labelencoder_x=LabelEncoder()
#x[:,1]=labelencoder_x.fit_transform(x[:,1])

#z=pd.DataFrame(x)
#onehotencoder=OneHotEncoder(categorical_features=[1])
#x= onehotencoder.fit_transform(x).toarray()
bank.columns
bank.corr()
banks=pd.DataFrame(bank)
banks.shape
import statsmodels.formula.api as smf
logit_model = smf.logit('yy~age+balance+day+duration+campaign+pdays+previous+jobs+maritals+educations+defaults+housings+loans+contacts+months', data=banks).fit()
logit_model.summary()
# second model without the jobs term
logit_model2 = smf.logit('yy~age+balance+day+duration+campaign+pdays+previous+maritals+educations+defaults+housings+loans+contacts+months', data=banks).fit()
logit_model2.summary()
y_pred = logit_model2.predict(banks)
y_pred
banks['pred_prob'] = y_pred
banks
# classify with a 0.5 probability cutoff
banks["Att_val"] = 0
banks.loc[y_pred >= 0.5, "Att_val"] = 1
banks.Att_val
from sklearn.metrics import classification_report
print(classification_report(banks.yy, banks.Att_val))
confusion_matrix = pd.crosstab(banks['yy'], banks.Att_val)
confusion_matrix
accuracy = (39150 + 1112) / (39150 + 772 + 4177 + 1112)
print(accuracy)
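
# The hand-typed accuracy above can be reproduced directly (a small
# sketch, using banks.yy and banks.Att_val as built above):
from sklearn.metrics import accuracy_score
print(accuracy_score(banks.yy, banks.Att_val))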
Example #55
# tail of a longer SQL query; the opening of the string is not shown here
    sum(so) as p_stout
  from pitching
  group by playerid) p
  on h.playerid = p.playerid;'''
  
new = pandas.read_sql(data, con)
con.close()




##############################################################
###  Start with considering as many explanatory variables  ###
##############################################################

model1 = logit('inducted ~ p_wins + p_loss + p_shout + p_saves + p_stout', data=new).fit()  # model1 is our fitted model
print(model1.summary())


#==============================================================================
#                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
#------------------------------------------------------------------------------
#Intercept     -2.3188      0.310     -7.477      0.000        -2.927    -1.711
#p_wins         0.0273      0.006      4.377      0.000         0.015     0.039
#p_loss        -0.0346      0.007     -4.957      0.000        -0.048    -0.021
#p_shout        0.0193      0.022      0.886      0.376        -0.023     0.062
#p_saves    -5.344e-06      0.004     -0.001      0.999        -0.008     0.008
#p_stout        0.0005      0.000      1.430      0.153        -0.000     0.001
#==============================================================================
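
# Given the p-values above, a natural next step (a sketch, not the original
# author's code) is to refit with only the significant predictors:
model2 = logit('inducted ~ p_wins + p_loss', data=new).fit()
print(model2.summary())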

Example #56
data['posipoli'] = data.apply(lambda row: POSIPOLI(row), axis=1)

# check the new positive polity score variable
print('Check positive polity score counts:')
posipolicheck = data['posipoli'].value_counts(sort=False, dropna=False)
print (posipolicheck)
print()

# center quantitative IVs for regression analysis
data['incomeperperson_c'] = (data['incomeperperson'] - data['incomeperperson'].mean())
data['employrate_c'] = (data['employrate'] - data['employrate'].mean())

# logistic regression with posipoli
print('Logistic regression for positive polity score and urban rate:')
lreg1 = smf.logit(formula = 'posipoli ~ urbanrate', data = data).fit()
print (lreg1.summary())
print()
# odds ratios
print ("Odds Ratios for Positive Polity Score and Urban Rate:")
print (numpy.exp(lreg1.params))
print()

# odd ratios with 95% confidence intervals
print('Odds Ratios for Positive Polity Score and Urban Rate with 95% Confidence Intervals:')
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print (numpy.exp(conf))
print()
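
# Because a logit model is linear in the log-odds, exponentiating the
# coefficients and their confidence bounds converts them to odds ratios;
# that is why numpy.exp is applied to params and conf_int above. A minimal
# standalone sketch of the same pattern (helper name is ours, not the
# original author's):
def odds_ratio_table(fitted_logit):
    """Return odds ratios with 95% CIs from a fitted statsmodels logit."""
    conf = fitted_logit.conf_int()    # bounds on the log-odds scale
    conf['OR'] = fitted_logit.params  # point estimates on the log-odds scale
    conf.columns = ['Lower CI', 'Upper CI', 'OR']
    return numpy.exp(conf)            # exponentiate onto the odds scale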
print(urbanization_threshold)

# Set binary flag that urbanization rate is greater than the threshold
def urbanrate_higher_than_threshold(urbanrate):
    if urbanrate > urbanization_threshold:
        return 1
    else:
        return 0

subset['high_urbanrate'] = subset['urbanrate'].apply(urbanrate_higher_than_threshold)

counts = subset.groupby('high_urbanrate').size()
print(counts)

# logistic regression with society type
lreg1 = smf.logit(formula = 'high_income ~ full_democracy', data = subset).fit()
print (lreg1.summary())

# odd ratios with 95% confidence intervals
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print (np.exp(conf))

# logistic regression with society type and urbanization rate
lreg2 = smf.logit(formula = 'high_income ~ full_democracy + high_urbanrate', data = subset).fit()
print (lreg2.summary())

# odd ratios with 95% confidence intervals
params = lreg2.params
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

import statsmodels.api as sm
import statsmodels.formula.api as smf
# Logit (the array interface) lives in statsmodels.api, not formula.api
logit_model = sm.Logit(y_train, X_train)
results = logit_model.fit()
print(results.summary2())
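
# Aside (not in the original tutorial): unlike the formula interface,
# sm.Logit does not add an intercept automatically. If one is wanted,
# add it explicitly:
X_train_const = sm.add_constant(X_train)
results_const = sm.Logit(y_train, X_train_const).fit(disp=0)
print(results_const.summary2())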

# Column names for reference above
df_column_name = pd.DataFrame(list(df_full_data.drop(['Attrition'], axis=1).columns.values))
df_column_name.index = np.arange(1, len(df_column_name) + 1)
df_column_name

# note: the new clustering dataset has slightly different columns
#model= smf.logit(formula="Attrition~ ClusterSegment	+ Age + DailyRate + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + NumCompaniesWorked + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + OverTime + PerformanceRating + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single + DistanceFromHomeRange_1_4 + DistanceFromHomeRange_5_9 + DistanceFromHomeRange_10_19 + DistanceFromHomeRange_20_30 + DistanceFromHomeRange_Over30 + NumCompaniesWorkedRange_0_2 + NumCompaniesWorkedRange_3_5 + NumCompaniesWorkedRange_6_10 + NumCompaniesWorkedRange_10over + YearsAtCompanyRange_0_2 + YearsAtCompanyRange_3_5 + YearsAtCompanyRange_6_10 + YearsAtCompanyRange_10over", data= df_full_data).fit(method='lbfgs')
model= smf.logit(formula="Attrition~ Age + DailyRate + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + OverTime + PerformanceRating + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single", data= df_full_data).fit()
model.summary()

# GETTING THE ODDS RATIOS, P-VALUES, AND 95% CI
model_odds = pd.DataFrame(np.exp(model.params), columns=['OR'])
model_odds['p-value'] = model.pvalues
model_odds[['2.5%', '97.5%']] = np.exp(model.conf_int())
model_odds

"""# Model 6: Neural Network"""

# Random seeds
np.random.seed(123)
rn.seed(123)
tf.set_random_seed(123)  # TF1 API; in TF2 this is tf.random.set_seed(123)
data['bin2alcohol'] = data.apply(lambda row: bin2alcohol(row), axis=1)

# create binary female employee rate
def bin2femalemployee(row):
    if row['femaleemployrate'] <= 50:
        return 0
    elif row['femaleemployrate'] > 50:
        return 1

# Apply the new variable bin2femalemployee to the gapmind dataset
data['bin2femalemployee'] = data.apply(lambda row: bin2femalemployee(row), axis=1)

##############################################################################
#                    LOGISTIC REGRESSION
##############################################################################
# logistic regression with binary breast cancer per 100th women
lreg1 = smf.logit(formula = 'bin2cancer ~ bin2alcohol',
                  data = data).fit()
print (lreg1.summary())
# odds ratios
print ("Odds Ratios")
print (np.exp(lreg1.params))

# odd ratios with 95% confidence intervals
print ('Logistic regression with binary alcohol consumption')
print ('Odd ratios with 95% confidence intervals')
params = lreg1.params
conf = lreg1.conf_int()
conf['OR'] = params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print (np.exp(conf))
print "\n-----------------------------\n"
# logistic regression with binary income per person and binary alcohol consumption
# df.info()
# df.Species.unique()

# filtering for two species

# df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica")].copy()
# print(df_subset.Species.unique())
#
# df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0})
#
# df_subset.rename(columns={"Sepal.Length": "Sepal_Length", "Sepal.Width": "Sepal_Width",
#                           "Petal.Length": "Petal_Length", "Petal.Width": "Petal_Width"}, inplace=True)
# # creating a model
# model = smf.logit("Species ~ Petal_Length + Petal_Width",data=df_subset)
# result = model.fit()
# # print(result.summary())
#
# # predicting response values
#
# df_new = pd.DataFrame({"Petal_Length": np.random.randn(20)*0.5 + 5,
#                        "Petal_Width": np.random.randn(20)*0.5 + 1.7})
# df_new["P-Species"] = result.predict(df_new)
# df_new["P-Species"].head(3)

dataset = sm.datasets.get_rdataset("biopsy", package="MASS").data
dataset.rename(columns={"class": "Class"}, inplace=True)
dataset.Class = dataset.Class.map({"benign": 0, "malignant": 1})
model = smf.logit("Class ~ V1", data=dataset)
result = model.fit()

print(result.prsquared)
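
# For context: prsquared is McFadden's pseudo R-squared, 1 - llf / llnull.
# A quick check of that identity:
import numpy as np
manual_prsq = 1 - result.llf / result.llnull
assert np.isclose(manual_prsq, result.prsquared)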