Example #1
def gmm_logit_test(Y, y_pred, sample_condition_labels, condition_labels):
    Y = [sample_condition_labels[i] for i in Y]
    y_pred = np.append(y_pred, [0, 0, 1, 1])
    Y.extend([
        condition_labels[0], condition_labels[1], condition_labels[0],
        condition_labels[1]
    ])
    Y = pd.get_dummies(Y)
    Y['intercept'] = 1
    logit = dm.Logit(y_pred, Y[['intercept', condition_labels[1]]])
    with warnings.catch_warnings():
        warnings.filterwarnings('error')
        try:
            logit_mod = logit.fit(disp=0)
            logit_pvalue, logit_coef = logit_mod.pvalues[1], logit_mod.params[
                1]
        except ConvergenceWarning:
            logit_mod, logit_pvalue, logit_coef = "NC", 1, "NC"
    if logit_pvalue == 0:
        logit_pvalue = np.finfo(float).tiny
    logit_results = {
        'pvalue': logit_pvalue,
        'coef': logit_coef,
        'model': logit_mod
    }
    return (logit_results)
Example #2
    def _fit_logit(self, X, y, keep_significant, sig_value=0.05):
        f = sm.Logit(y, X).fit()
        if keep_significant:
            sig_cols = list(f.params[f.pvalues <= sig_value].index)
            return f, sig_cols
        else:
            return f
Example #3
def fit_model_disaster(df, model):
    '''Fit a logistic regression model for whether a disaster occurred.

       Parameters
       ----------
       df: dataframe
        Contains columns of outcome and predictors.
       model: str
        A formula string used to construct a design matrix.
        Ex. 'has_disaster ~ C(iso3) + C(region_id) + year_id + cz_prop'

       Returns
       -------
       modeled_df: dataframe
        Contains draws of predicted events.
        1 indicates event occurs; 0 indicates otherwise.

    '''
    # Return matrix of y and X based on model formula.
    y, X = dmatrices(model, df, return_type='dataframe')
    # Construct logit model.
    logit = sm.Logit(y, X)
    # Fit model
    result = logit.fit()
    modeled_df = df.copy()
    # Return predicted probability based on fitted model.
    modeled_df['predicted_prob'] = result.predict()
    # Create columns to store the sampled Bernoulli draws.
    modeled_df = modeled_df.reindex(columns=list(modeled_df.columns) +
                                    DRAW_COLS)
    # Draw random samples from Bernoulli distribution
    # based on predicted probability.
    modeled_df[DRAW_COLS] = np.array([bernoulli.rvs(p, size=DRAW) \
                                for p in modeled_df.predicted_prob.values])
    return modeled_df
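A minimal usage sketch follows; the DataFrame, the formula, and the module-level constants DRAW and DRAW_COLS (referenced by the function above but not shown in this excerpt) are all assumptions for illustration.

import numpy as np
import pandas as pd
from patsy import dmatrices
from scipy.stats import bernoulli
import statsmodels.api as sm

# Assumed module-level constants used by fit_model_disaster (not in the original excerpt).
DRAW = 100
DRAW_COLS = ['draw_{}'.format(i) for i in range(DRAW)]

# Illustrative data: a binary outcome and two numeric predictors.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'has_disaster': rng.integers(0, 2, size=200),
    'year_id': rng.integers(0, 30, size=200),   # e.g. years since 1990
    'cz_prop': rng.random(200),
})

modeled_df = fit_model_disaster(df, 'has_disaster ~ year_id + cz_prop')
print(modeled_df[['predicted_prob'] + DRAW_COLS[:3]].head())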
Example #4
def backward_selection(df, dv, regression=True, alpha=.05):
    flag = 0
    cols_dropped = [dv]
    if regression:
        while flag == 0:
            model = lm.OLS(endog=np.array(df[dv]),
                           exog=np.array(df.drop(cols_dropped, axis=1)))
            results = model.fit()
            pvalues = list(results.pvalues)
            drop_index = pvalues.index(max(pvalues))
            col_drop = df.drop(cols_dropped, axis=1).columns[drop_index]
            print(col_drop + '-' + str(pvalues[drop_index]))
            if pvalues[drop_index] > alpha:
                cols_dropped.append(col_drop)
            else:
                flag = 1
    else:
        while flag == 0:
            model = sm.Logit(endog=np.array(df[dv]),
                             exog=np.array(df.drop(cols_dropped, axis=1)))
            results = model.fit()
            pvalues = list(results.pvalues)
            drop_index = pvalues.index(max(pvalues))
            col_drop = df.drop(cols_dropped, axis=1).columns[drop_index]
            print(col_drop + '-' + str(pvalues[drop_index]))
            if pvalues[drop_index] > alpha:
                cols_dropped.append(col_drop)
            else:
                flag = 1

    cols_dropped.remove(dv)
    return cols_dropped
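For orientation, a hedged usage sketch with synthetic data; it assumes backward_selection is available alongside the `lm`/`sm` aliases it relies on (e.g. statsmodels.regression.linear_model as lm and statsmodels.api as sm).

import numpy as np
import pandas as pd

# Synthetic data: y depends on x1 and x2 only; x3 and x4 are noise.
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(300, 4)), columns=['x1', 'x2', 'x3', 'x4'])
df['y'] = 2.0 * df['x1'] - 1.5 * df['x2'] + rng.normal(scale=0.5, size=300)

dropped = backward_selection(df, 'y', regression=True, alpha=0.05)
kept = [c for c in df.columns if c not in dropped + ['y']]
print('dropped:', dropped, 'kept:', kept)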
Example #5
    def __init__(self):
        from .results.results_glm import Lbw
        self.res2 = Lbw()
        self.res1 = GLM(self.res2.endog, self.res2.exog,
                family=sm.families.Binomial()).fit()

        modd = discrete.Logit(self.res2.endog, self.res2.exog)
        self.resd = modd.fit(start_params=self.res1.params * 0.9, disp=False)
Example #6
def standard_ci_sm(X, y, active, leftout_indices, alpha=0.1):
    XE = X[:, active]
    X2, y2 = XE[leftout_indices, :], y[leftout_indices]
    import statsmodels.discrete.discrete_model as sm
    logit = sm.Logit(y2, X2)
    result = logit.fit(disp=0)
    LU = result.conf_int(alpha=alpha)
    return LU.T
Example #7
    def check_betas(data):
        empirical_betas = []
        for i in range(data.shape[1] - 1):
            model = sm.Logit(data.iloc[:, 0],
                             sm.tools.add_constant(data.iloc[:, i + 1]))
            result = model.fit(disp=0)
            empirical_betas.append(result.params[1])

        return np.array(empirical_betas)
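A hedged sketch of the data layout check_betas expects (outcome in the first column, one predictor per remaining column), assuming `sm` is statsmodels.api and the function is available at module scope; the effect sizes are illustrative.

import numpy as np
import pandas as pd

# Two predictors with slopes of roughly 0.8 and -0.5 on the logit scale.
rng = np.random.RandomState(0)
n = 1000
x1 = rng.normal(size=n)
x2 = rng.normal(size=n)
p = 1.0 / (1.0 + np.exp(-(0.8 * x1 - 0.5 * x2)))
data = pd.DataFrame({'outcome': rng.binomial(1, p), 'x1': x1, 'x2': x2})

print(check_betas(data))  # array of the two fitted slopes, roughly recovering 0.8 and -0.5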
Example #8
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog, family=families.Binomial())
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Logit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Example #9
    def fit(self, X, Y):
        self.X = sm.add_constant(X)
        self.Y = Y
        self.model = dis_mod.Logit(self.Y.values, self.X)
        try:
            self.fitted_model = self.model.fit()
            return
        except np.linalg.LinAlgError:
            print('uh oh!!! Some linear algebra broke; ignore this set and move on')
            return -1
Example #10
    def fit_logit_model(self, X_df, y):
        '''Fit a logit model.
        
        Args:
            | X_df: the independent variables (covariates and or aggregated genotypes) for 1 gene. 
            | y (numpy.ndarray): values are 0/1 (unaffected/affected). 
        
        Returns:
            logit_result (statsmodels.discrete.discrete_model.BinaryResultsWrapper): contains results from fitting logit regression model.
        '''

        logit_model = sm.Logit(y, X_df.transpose())
        logit_result = logit_model.fit(method='bfgs')
        return logit_result
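The transpose in the call above implies that X_df is laid out with covariates as rows and samples as columns; here is a hedged, self-contained sketch of that layout with the same Logit/BFGS call (row names and data are hypothetical).

import numpy as np
import pandas as pd
import statsmodels.api as sm

n_samples = 100
rng = np.random.RandomState(1)
# Rows = covariates, columns = samples, matching the X_df.transpose() call above.
X_df = pd.DataFrame(
    np.vstack([np.ones(n_samples),                       # intercept row
               rng.normal(size=n_samples),                # e.g. an ancestry covariate
               rng.binomial(2, 0.3, size=n_samples)]),    # aggregated genotype counts
    index=['intercept', 'covariate_pc1', 'aggregated_genotype'],
    columns=['sample_{}'.format(i) for i in range(n_samples)])
y = rng.binomial(1, 0.5, size=n_samples)  # 0/1 unaffected/affected status

logit_result = sm.Logit(y, X_df.transpose()).fit(method='bfgs')
print(logit_result.params)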
Example #11
def regression(cps_alldata):
    cps_alldata['intercept'] = np.ones(len(cps_alldata))
    model = sm.Logit(endog=cps_alldata.ss_indicator,
                     exog=cps_alldata[[
                         'Aged_yn', 'Disabled_yn', 'Widowed_yn', 'ssi_yn',
                         'sur_yn', 'vet_yn', 'paw_yn', 'hed_yn', 'hcsp_yn',
                         'hfdval', 'mcare', 'mcaid', 'uc_yn', 'wc_yn',
                         'intercept'
                     ]])

    results = model.fit()
    print(results.summary())
    ypred = results.predict()
    cps_alldata['Prob_Received'] = ypred

    return cps_alldata
Example #12
def q2():
    print()
    print()
    print()

    data = pd.read_csv(
        "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
    )

    # Turn string attribute values into 0 or 1
    dummy_ranks = pd.get_dummies(data["Sex"], prefix="Sex")

    # Save class labels
    classLabels = data["Survived"]

    # Remove attributes that don't help with classification or have already been turned into an int (sex)
    data = data.drop([
        "PassengerId", "Name", "Ticket", "Cabin", "Embarked", "Sex", "Survived"
    ], axis=1)
    data = data.join(dummy_ranks[['Sex_female']])

    # Create training and testing data (80/20 split)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        data, classLabels, test_size=0.2)

    # Create and train logistic model
    logit = discrete_model.Logit(y_train, x_train, missing='drop')
    fitted = logit.fit()

    # Predict testing data
    result = fitted.predict(x_test)

    result = np.asarray(result)
    y_test = np.asarray(y_test)

    total = 0
    correct = 0
    for i in range(0, len(result)):
        if not math.isnan(result[i]):
            total += 1
            if round(result[i]) == y_test[i]:
                correct += 1

    accuracy = correct / float(total)
    print("Accuracy of Logistic Regression: " + str(accuracy))
Example #13
    def logistic_with_interactions(self,
                                   treatment_var,
                                   interaction_vars,
                                   y_var,
                                   other_vars=None,
                                   use_bootstrapped=False):
        """
        Uses statamodels packages to do logisitic regressions on the treatment variable
        and up to two interaction terms and optionally other covariates. Uses either the downsampled
        or the bootstrapped data.
        Inputs:
              treatment_var = string with treatment var name
              interaction_vars = a list of up to two variable to interact with the treatment
              y_var = outcome variable (labels)
              other_vars = list of other covariates to add to the specification.
              use_bootstrapped = default = False, uses the .df_downsampled data, if True uses
                      .bootstrapped
        Outputs:
            The summary table with the regression results.
        """
        if use_bootstrapped:
            df = self.bootstrapped.copy()
        else:
            df = self.df_downsampled.copy()

        if len(interaction_vars) == 2:
            for i in range(len(interaction_vars) - 1):
                df[interaction_vars[i] + 'x' + interaction_vars[i + 1]] = df[
                    interaction_vars[i]] * df[interaction_vars[i + 1]]
                interaction_vars.append(interaction_vars[i] + 'x' +
                                        interaction_vars[i + 1])
        treatment_ints = [treatment_var]
        for i in interaction_vars:
            df['Tx' + i] = df[treatment_var] * df[i]
            treatment_ints.append('Tx' + i)
        if other_vars is not None:
            X = df[[*treatment_ints, *interaction_vars, *other_vars]]
        else:
            X = df[[*treatment_ints, *interaction_vars]]
        y = df[[y_var]]
        LogitSM = sm.Logit(np.asarray(y.astype(int)), X.astype(int))

        return LogitSM.fit().summary()
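To make the interaction construction concrete, here is a self-contained sketch of the same design-matrix logic outside the class, with illustrative column names and `sm` assumed to be statsmodels.api (note that, like the method above, it fits without an explicit intercept).

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'T': rng.binomial(1, 0.5, 500),          # treatment indicator
    'age': rng.randint(20, 60, 500),
    'female': rng.binomial(1, 0.5, 500),
    'converted': rng.binomial(1, 0.5, 500),   # outcome
})

# Same construction as logistic_with_interactions with two interaction_vars:
# the pairwise interaction of the interaction vars, plus treatment x each of them.
df['agexfemale'] = df['age'] * df['female']
df['Txage'] = df['T'] * df['age']
df['Txfemale'] = df['T'] * df['female']
df['Txagexfemale'] = df['T'] * df['agexfemale']

X = df[['T', 'Txage', 'Txfemale', 'Txagexfemale', 'age', 'female', 'agexfemale']]
y = df['converted']
print(sm.Logit(y.astype(int), X.astype(int)).fit(disp=0).summary())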
Example #14
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# read in the data & create matrices
df = pd.read_csv('https://stats.idre.ucla.edu/stat/data/binary.csv')
y, X = dmatrices('admit ~ gre + gpa + C(rank)', df, return_type = 'dataframe')
#y = ravel.column_or_1d(y, warn=True)
#y=y.reshape(-1,1)
y
X
# sklearn output
model = LogisticRegression(fit_intercept = True, C = 1e9)  # very large C ~ no regularization, so coefficients should closely match the sm.Logit fit below
mdl = model.fit(X, np.ravel(y))
model.coef_

# sm
logit = sm.Logit(y, X)
logit.fit().params



#%%

import statsmodels.api as sm
from sklearn.datasets import make_blobs

x, y = make_blobs(n_samples=50, n_features=2, cluster_std=5.0,
                  centers=[(0,0), (2,2)], shuffle=False, random_state=12)
x
logit_model = sm.Logit(y, sm.add_constant(x)).fit()
print( logit_model.summary2())
Example #15
CPS_dataset.disability = np.where(CPS_dataset.pediseye == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisout == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 'Yes', 1,
                                  CPS_dataset.disability)

#Regression
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['indicator'] = CPS_dataset.WIC_infant
CPS_dataset['infant'] = np.where(CPS_dataset.a_age < 1, 1, 0)
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[[
                     'intercept', 'hfdval', 'cov_hi', 'ch_mc', 'infant',
                     'fwsval'
                 ]])
logit_res = model.fit()
print(logit_res.summary())
probs = logit_res.predict()
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}
CPS_dataset['WIC_val'] = 0

for fip in Admin_totals.Fips:
    this_state = (CPS_dataset.gestfips == fip)
    CPS_dataset.loc[this_state & (CPS_dataset.indicator == 1),
Example #16
def analyze_data(dataset, ysets, xsets, type='LinR', normalize=False):
    #dataset = fix_dataset(dset[ysets+xsets[0]])
    regre_type = ''
    if type == 'LinR':
        regre_type = 'Linear Regression'
    elif type == 'LogR':
        regre_type = 'Logistic Regression'
    else:
        print('Error: unknown regression method {:s}'.format(type))
        quit()
    old_rsqr = 0
    old_fstat = 10e20
    del_rsqr = 0
    del_fstat = 0
    num_sig = 0
    for y in ysets:
        print(
            '##############################################################################'
        )
        print('\t\t\t\t\t\t', y)
        print(
            '##############################################################################'
        )
        cnt = 0
        for x in xsets:
            # my method up above to take care of missing or unusable values
            dmodel = fix_dataset(dataset[[y] + x])
            Y = dmodel[y]
            print()
            print()
            print(
                '################################################################################'
            )
            print(
                '#####################################    Testing x set {:d}'.
                format(cnt + 1))
            print(
                '#####################################    Using {:s} on dependent variable {:s}'
                .format(regre_type, y))
            print(
                '################################################################################'
            )
            print('\t\tX or independent variables:\n', x)
            print(
                '################################################################################'
            )
            print(
                '################################################################################'
            )
            print(
                '################################################################################'
            )
            print()
            X = dmodel[x]
            #print('+++++++++++++++++++++++++++++++++++++++++Before: ', X[0,0])
            if normalize:
                #X = pd.DataFrame(minmaxscale(X, axis=1), columns=x)
                X = pd.DataFrame(minmaxscale(X, axis=0),
                                 columns=x,
                                 index=dmodel.index)
            #print('+++++++++++++++++++++++++++++++++++++++++After: ', X.iloc[0,0])
            #print(X['per_capita_income'])
            #X.loc[:, 'per_capita_income'] = (dmodel['per_capita_income'].values - dmodel['per_capita_income'].mean())/dmodel['per_capita_income'].std()
            #print(X['per_capita_income'])
            #X = dataset.loc[:, x]
            X2 = sm.add_constant(X)
            if type == 'LinR':
                est = sm.OLS(Y, X2)
                print('\n\nThe basic dirs are\n', dir(est))
                est2 = est.fit()
                print('\n\nThe fitted dirs are\n', dir(est2))
                rsqr = est2.rsquared
                if rsqr > old_rsqr:
                    old_rsqr = rsqr
                pvals = est2.pvalues
                fval = est2.fvalue
                ftest = est2.f_test
                print('R-squared:', rsqr)
                print('P-values:\n', pvals)
                find_significant(x, pvals)
                print('Fvalue\n', fval)
                print(est2.summary())
                print('\n\nThe summary dirs are:\n', dir(est2.summary()))
                vif = calculate_vif(X2)
                print('VIF:\n', vif)
            elif type == 'LogR':
                #clf = LogisticRegression(solver='lbfgs',max_iter=1000).fit(X2, Y)
                #params = clf.coef_
                #log_like = np.log(np.abs(params))
                #print(params)
                #print(log_like)
                model = dis_mod.Logit(Y, X2)
                model2 = model.fit()
                loglikly = calculate_log_like(x, model2.params)
                print(dir(model))
                print(model.df_model)
                print(model2.summary())
                print('model 2', dir(model2))
                print('R squared:', model2.prsquared)
                #print(dir(model2.summary().tables))
                print('The log likelihoods are:')
                show_labeled_list(loglikly, x)
                print('pvalue for {:s}: {:f}'.format(x[0],
                                                     model2.pvalues.loc[x[0]]))
                y_pred = model2.predict(X2, linear=True)
                #print(y_pred)
                yp = list()
                for e in y_pred:
                    if e > 0:
                        yp.append(1)
                    else:
                        yp.append(0)
                #print(model.loglikeobs(x))
                #df_confusion = pd.crosstab(Y, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
                plot_confusion_matrix(
                    Y,
                    yp,
                    classes=['A', 'NA'],
                    title='Confusion matrix, without normalization')
                #plot_confusion_matrix(df_confusion)
                #vif = pd.Series([VIF(X2.values, i)
                #           for i in range(X2.shape[1])],
                #          index=X2.columns)
                vif = calculate_vif(X2)
                print('VIF:\n', vif)
                plt.show()
            cnt += 1
            print()
            print()
    return
Example #17
#

# In[21]:

import statsmodels.discrete.discrete_model as sm
import statsmodels.formula.api as smf
import statsmodels.api as sma
from sklearn.model_selection import train_test_split

x = sma.add_constant(x)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=1)
results = sm.Logit(y_train, x_train).fit()
y_pred = pd.DataFrame(results.predict(x_test))
compare = pd.concat([y_test, y_pred], axis=1)
compare.columns = ['y_test', 'y_pred']
compare.loc[compare['y_pred'] >= 0.5, 'y_pred_r'] = 1
compare.loc[compare['y_pred'] < 0.5, 'y_pred_r'] = 0
compare['match'] = compare['y_pred_r'] == compare['y_test']
print(results.summary())
print('')
print('This model predicted default with ' +
      str(len(compare[compare['match'] == True]) / len(compare) * 100) +
      '% accuracy')
print('')
print(results.params)

# In[22]:
plt.hist(y_train)
## explore skewness
plt.title('Histogram for labels')
plt.xlabel('Class')
plt.ylabel('frequency')
plt.show()
print("in the train set, we have ", y_train.sum(), "Class 1, ", len(y_train) - y_train.sum(), "Class 0. The skewness is not severe")
## logistic regression
### The first model we want to try is logistic regression. Logistic regression
### is simple and it can give us a test of how our features work.
### scale data set at first
from sklearn import preprocessing
x_train_s = preprocessing.scale(x_train)
x_test_s = preprocessing.scale(x_test)
import statsmodels.discrete.discrete_model as sm
logit2=sm.Logit(y_train, x_train_s).fit(maxiter=200)
print(logit2.summary())
print(logit2.pvalues)
[(e1,e2) for (e1, e2) in zip(logit2.pvalues, x_train.columns)]
from sklearn.linear_model import LogisticRegression
lor = LogisticRegression()
lor.get_params()
param = {'C' : [0.1, 1, 5, 50, 100]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(lor, param_grid = param, cv = 10)
grid_search.fit(x_train_s, y_train)
lor_cv = grid_search.cv_results_
lor_cv_score_mean = lor_cv['mean_test_score']
lor_cv_score_sd = lor_cv['std_test_score']
lor_cv_c = ["0.1", "1", "5", "50", "100"]
plt.plot(lor_cv_c, lor_cv_score_mean, color = 'blue')
Example #19
    skLogitModel.fit(X_trainData, y_trainData)
    skYPredict = skLogitModel.predict(X_trainData)

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_trainData, skYPredict)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)
    crosstab = pd.crosstab(y_trainData,
                           skYPredict,
                           rownames=['True'],
                           colnames=['Predicted'],
                           margins=True)
    print("-----------\"Confusion Matrix\"-------------")
    print(crosstab)
    print("-----------\"Params\"-------------")
    statLogitModel = sm.Logit(y_trainData, X_trainData).fit_regularized()
    print(statLogitModel.params)
    print("-----------\"P-values\"-------------")
    print(statLogitModel.pvalues)

    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
             label='ROC' + str(area))
    plt.plot([0, 1], [0, 1], linestyle='dotted')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC & AUC')
    plt.legend()
    plt.show()
Example #20
    def model(self):
        y, X = dmatrices('Y~X', self.data, return_type='dataframe')
        logit = sm.Logit(y, X)
        result = logit.fit()
        print(result.summary())
Example #21
CPS_dataset.cov_hi = CPS_dataset.cov_hi.astype(int)

#Regression
dummies = pd.get_dummies(CPS_dataset['a_mjind'], drop_first=True)
CPS_dataset = pd.concat([CPS_dataset, dummies], axis=1)
# Make it so most likely age to receive is mid forties
CPS_dataset['age_squared'] = ((CPS_dataset.a_age - 47) *
                              (CPS_dataset.a_age - 47)) * -1
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['indicator'] = CPS_dataset.wc_yn
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[[
                     'intercept', 'Armed Forces', 'Construction',
                     'Educational and health services', 'Financial activities',
                     'Information', 'Leisure and hospitality', 'Manufacturing',
                     'Mining', 'Other services',
                     'Professional and business services',
                     'Public administration', 'Transportation and utilities',
                     'Wholesale and retail trade', 'age_squared', 'dis_cs',
                     'dis_hp', 'finc_dis', 'cov_hi', 'a_sex'
                 ]])
logit_res = model.fit()
probs = logit_res.predict()
targets = CPS_dataset.wc_yn
print('Accuracy score for logistic regression estimation', accuracy(
    targets, probs))
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits

CPS_totalb = (CPS_dataset.wc_val[CPS_dataset.indicator == 1] *
Example #22
pd.Series(boots).hist()
pd.Series(f1).hist()
pd.Series(f1_sub).hist()
plt.title("F1 distribution of selected features")


#### find out the p-values of the log regression

### matrix outputs NaN because features are correlated
import statsmodels.discrete.discrete_model as sm
y_train=pd.DataFrame(y_train)
y_train.reset_index(inplace=True)
y_train=y_train.squeeze()["Survived"]


logit = sm.Logit(y_train,X_trans[feature_subset].dropna())
f = logit.fit()
print(f.params)
print(f.summary())
#### based on the p-values I should definitely exclude:
# - small_fam (1) --> highly correlated with large family (skeptical)
#  - 1class (1)   --> skeptical 
# - B cabin (0.9)
# - E_S (embarked at S) (0.7)
# - No cabin (0.2)
# fem-family (1)
# F- cabin (0.2)

###Fare_family, 1class
feature_2=['Age',   'large_fam', 
'female',   '3class', "1class",
Example #23
y_train = np.array(y_train)
X_test = np.array(x_test)
y_test = np.array(y_test)

clf = LogisticRegression()
clf.fit(X_train, y_train)

y_true = y_test
y_pred = clf.predict(X_test)
y_hat = clf.predict_proba(X_test)[:, 1]

y_hat = np.ravel(y_hat)

fprs, tprs, thresholds = metrics.roc_curve(y_true, y_hat)

plt.plot(fprs, tprs, 'k--', lw=2)
plt.scatter(fprs, tprs, c='k', marker='x', s=50)
plt.plot(np.arange(-.05, 1.05, .01),
         np.arange(-.05, 1.05, .01),
         '--',
         color='lightgray')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0 - .05, 1 + .05])
plt.ylim([0 - .05, 1 + .05])
plt.show()
print(metrics.roc_auc_score(y_test, y_hat))
print(metrics.confusion_matrix(y_true, y_pred))
res = sm.Logit(y_train, X_train).fit(method='bfgs')
print(res.summary())
Example #24
#===========================6.1 Load the dataset============================
data = loadtxt(trainName, delimiter=',')
X = data[:, 0:2]
y = data[:, 2]
#================6.2 Separate positive and negative examples================
pos = where(y == 1)
neg = where(y == 0)
m, n = X.shape
y.shape = (m, 1)
#=======================6.3 Calculate the parameters========================
paramFile = open(paramName, 'w')
#6.3.1 Calculate based on sequencing depth and breadth===========================
it = np.ones(shape=(m, n + 1))
it[:, 1:n + 1] = X
try:
    logit = sm.Logit(y, it)
    theta = logit.fit().params
#except:
#	iniTheta = np.ones(shape=(m, n+1))
#	iniTheta[:, 1:n+1] = X
#	theta=decorated_cost(it, y, n)
except:
    sys.stderr.write(
        'Warning! Perfect separation found. Could not optimize the regression parameters.\n'
    )
    sys.stderr.write(
        '       The target genome is too different from the training genomes; to avoid errors, consider training with another set.\n'
    )
    theta = [-34.738273, 550.229, 1080.350]
paramFile.write(
    str(theta[0]) + "," + str(theta[1]) + "," + str(theta[2]) + "\n")
Example #25
        recon = [x_te_recon, y_te_recon, z_te_recon]
        # group counterfactual reconstruction data
        recon_cf = [x_te_recon_cf, y_te_recon_cf, z_te_recon_cf]

        # sample and numpy
        x_te_recon, y_te_recon, z_te_recon = [torch_sample_np(data) for data in [x_te_recon, y_te_recon, z_te_recon]]
        x_te_recon_cf, y_te_recon_cf, z_te_recon_cf = [torch_sample_np(data) for data in
                                                       [x_te_recon_cf, y_te_recon_cf, z_te_recon_cf]]

        # show graph for to evaluate reconstruction output
        # compare_recon(x_te, np.array(np.concatenate([y_te_recon_cf, a_te[:, np.newaxis], z_te_recon_cf, x_te_recon], 1)))

        # Fit models using train setRF_ypred_te
        input_tr = np.concatenate((np.ones(len(a_tr))[:, np.newaxis], x_tr, a_tr[:, np.newaxis]), 1)
        # logistic regression
        LR = sm.Logit(y_tr, input_tr)
        LR_fit = LR.fit()

        RF = RandomForestClassifier(n_estimators=10, max_depth=4)
        RF.fit(input_tr, y_tr)

        # ------------------ test model accuracy on test set
        input_te = np.concatenate((np.ones(len(a_te))[:, np.newaxis], x_te, a_te[:, np.newaxis]), 1)

        def get_accuracies(input):
            # normal LR
            LR_ypred_te = LR_fit.predict(input)
            # normal LR with fixed a
            input_te_adjusted = input.copy()
            input_te_adjusted[:, -1] = 0
            LR_adj_ypred_te = LR_fit.predict(input_te_adjusted)
Example #26
from patsy import dmatrices
dftrain, dftest = train_test_split(dataframenew, test_size=0.2)
y_train1, x_train1 = dmatrices(formula1, data=dftrain, return_type='dataframe')
y_test1, x_test1 = dmatrices(formula1, data=dftest, return_type='dataframe')
y_train1num = np.squeeze(y_train1)
x_train1num = np.squeeze(x_train1)
y_test1num = np.squeeze(y_test1)
x_test1num = np.squeeze(x_test1)
x_train1.columns

# Use this Train Test Split without changing Features

# In[598]:

import statsmodels.discrete.discrete_model as sm
model1 = sm.Logit(y_train1, x_train1)
res = model1.fit()
res.summary()

# In[599]:

print(res.summary().as_latex())

# In[600]:

from statsmodels.nonparametric.kde import KDEUnivariate
kde_res1 = KDEUnivariate(res.predict())
kde_res1.fit()
plt.figure(figsize=(9, 6))
plt.plot(kde_res1.support, kde_res1.density)
plt.fill_between(kde_res1.support, kde_res1.density, alpha=0.2)
Example #27
# plotting

fig, ax = plt.subplots()
plt.scatter(failures_freq.index, failures_freq, c='red', s=20)
plt.scatter(no_failures_freq.index,
            np.zeros(len(no_failures_freq)),
            c='blue',
            s=40)
plt.xlabel('X: Temperature')
plt.ylabel('Number of Failures')
ax.grid()

#get the data in correct format
y, X = dmatrices('Y ~ X', data, return_type='dataframe')
#build the model
logit = sm.Logit(y, X)
result = logit.fit()

# summarize the model
print(result.summary(), '\n')

print('Parameters: ', result.params, '\n')

yhat = logit.predict(
    result.params, exog=None, linear=False
)  #Predict response variable of a model given exogenous variables.
yhatsum = yhat**5
fig, ax = plt.subplots()
plt.plot(yhat, c='red')
ax.set_ylabel('Probability of failures')
ax.set_xlabel('Temperature')
Example #28
df.dropna(axis=0, inplace=True)
df.shape
df.isnull().sum(axis=0)

#model
df['Diabetes'] = np.where(df.Diabetes == 2, 1, 0)
df['Gender'] = np.where(df.Gender == 'Male', 1, 0)
y = df.Diabetes
x = df.iloc[:, :9]
df.head()
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.3,
                                                    random_state=0)
model = sm.Logit(y_train, x_train)
result = model.fit()
result.summary2()

#deleting the insignificant column
del x_train['Alkphos']
del x_test['Alkphos']
model1 = sm.Logit(y_train, x_train)
result1 = model1.fit()
result1.summary2()

#parameters of the model
result1.params

#odds ratio
np.exp(result1.params)
Example #29
CPS_dataset.disability = np.where(CPS_dataset.pedisphy == 'Yes', 1,
                                  CPS_dataset.disability)
CPS_dataset.disability = np.where(CPS_dataset.pedisrem == 'Yes', 1,
                                  CPS_dataset.disability)

#Regression
CPS_dataset['ptotval'] = np.where(
    CPS_dataset.ptotval > 5200, 1,
    0)  # Did you make enough money during your base period?
CPS_dataset['intercept'] = np.ones(len(CPS_dataset))
CPS_dataset['fmoop'] = np.where(CPS_dataset.fmoop > 0, 1, 0)
CPS_dataset['F_MV'] = np.where(CPS_dataset.f_mv_fs > 0, 1, 0)
CPS_dataset['indicator'] = CPS_dataset.uc_yn
model = sm.Logit(endog=CPS_dataset.indicator,
                 exog=CPS_dataset[[
                     'intercept', 'weuemp', 'ptotval', 'pruntype', 'a_explf',
                     'lkweeks', 'lkstrch', 'F_MV', 'disability'
                 ]])
logit_res = model.fit()
probs = logit_res.predict()
targets = CPS_dataset.uc_yn
print('Accuracy score for logistic regression estimation', accuracy(
    targets, probs))
CPS_dataset['probs'] = probs

# CPS total benefits and Administrative total benefits
state_benefit = {}
state_recipients = {}

for fip in Admin_totals.Fips:
    this_state = (CPS_dataset.gestfips == fip)
Example #30
for ret in ret_list:
    sig = Data.loc[:, retention] == ret
    x = Data.loc[sig, ad_cols].fillna(0)
    y = Data.loc[sig, target]
    zero_cols = x.columns[x.sum() == 0]
    x.drop(columns = zero_cols , inplace = True)
    x['intercept'] = 1
    if is_sklearn == True:
        model = LR(**lr_d)
        model.fit(x , y)
        score = model.score(x,y)
        aic_list.append((ret, -score , model))
        print('Score of model is %s' %score)
    else:
        try:
            model = sm.Logit(y, x).fit()
        except:
            model = sm.Logit(y, x).fit(method='powell')
        aic_list.append((ret, model.aic, model))
        print('F1 Score is: ', f1_score(y, model.predict() >= 0.5))

ret, aic, model = min(aic_list, key=operator.itemgetter(1))
print(ret, aic)

# Slicing Based on Retention Period and Cleaning Columns
ret_Data_df = Data.loc[(Data.loc[:, retention] == ret) | Data.retention.isna(), :].fillna(0)

rm_cols = set(pk_cols + useless_cols).intersection(ret_Data_df.columns)
ret_Data_df.drop(columns = list(rm_cols) , inplace = True)

org_cols = list(set(ret_Data_df.columns).intersection(nmws.columns))