def forward_selected(train_data, target):

    remaining = set(train_data.columns)
    remaining.remove(target)
    remaining.remove('intercept')

    selected = ['intercept']
    current_score, best_new_score = float("inf"), float("inf")

    while remaining and current_score == best_new_score:
        scores_candidates = []
        for candidate in remaining:
            #formula = "{} ~ {} + 1".format(target,  ' + '.join(selected + [candidate]))
            score = smf.Logit(train_data[target],
                              train_data[selected + [candidate]]).fit().bic
            #score = smf.logit(formula, train_data).fit().bic

            scores_candidates.append((score, candidate))

        scores_candidates.sort(reverse=True)
        print(scores_candidates)

        best_new_score, best_candidate = scores_candidates.pop()

        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score

    #formula = "{} ~ {} + 1".format(target, ' + '.join(selected))
    model = smf.Logit(train_data[target], train_data[selected]).fit()

    return model
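# A minimal usage sketch for forward_selected (illustrative data, not from the
# original source): it assumes the frame carries an explicit column of ones named
# 'intercept' plus a binary target, and that `smf` is bound so smf.Logit resolves.
import numpy as np
import pandas as pd
import statsmodels.api as smf  # so that smf.Logit inside forward_selected resolves

rng = np.random.default_rng(0)
n = 500
demo = pd.DataFrame({'x1': rng.normal(size=n),
                     'x2': rng.normal(size=n),
                     'noise': rng.normal(size=n)})
demo['intercept'] = 1.0  # forward_selected expects this column to exist
# binary target driven mainly by x1
demo['y'] = (rng.random(n) < 1 / (1 + np.exp(-(0.5 + 2.0 * demo['x1'])))).astype(int)

selected_model = forward_selected(demo, 'y')
print(selected_model.summary())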
Example #2
def logRegress(logger, df):
    '''Performs logistic regression
    
    This function gets the logistic regression coefficients for a dataframe that is passed in.
    
    Decorators:
        lD.log
    
    Arguments:
        logger {logging.Logger} -- logs error information
        df {dataframe} -- input dataframe where first column is 'sud'
    '''

    resultsDF = None

    try:

        print("Performing Logistic Regression...")

        train_cols = df.columns[1:]
        logit = sm.Logit(df['sud'], df[train_cols])
        result = logit.fit()

        # Get odds, which are assessed by coeff[race/agebin/sex/setting]
        params = result.params
        conf = result.conf_int()
        conf['OR'] = params

        conf.columns = ['2.5%', '97.5%', 'OR']
        CI_OR_df = np.exp(conf)
        resultsDF = CI_OR_df[['OR']].join(CI_OR_df.loc[:, :'97.5%'])

    except Exception as e:
        logger.error('logRegress failed because of {}'.format(e))

    return resultsDF
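# A hedged usage sketch for logRegress with made-up column names: the first
# column must be the binary 'sud' outcome, and the returned frame holds the
# odds ratio of each remaining column with its 2.5% / 97.5% bounds.
import logging
import numpy as np
import pandas as pd
import statsmodels.api as sm  # logRegress uses sm.Logit internally

example_logger = logging.getLogger(__name__)
rng = np.random.default_rng(1)
n = 300
toy = pd.DataFrame({'sud': rng.integers(0, 2, size=n),
                    'intercept': 1.0,
                    'age': rng.normal(size=n)})
odds_table = logRegress(example_logger, toy)
print(odds_table)  # one row per predictor: OR, 2.5%, 97.5%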
def predict_winner_looser_housing(df_hh):

    df_hh['hh_income_2'] = df_hh['hh_income'] ** 2
    df_hh['natural_gas:accommodation_size'] = df_hh['accommodation_size'] * df_hh['natural_gas']
    df_hh['domestic_fuel:accommodation_size'] = df_hh['accommodation_size'] * df_hh['domestic_fuel']

    df_hh['winner'] = 0 + 1 * (df_hh['housing_expenditures_increase'] < 55 * df_hh['nb_beneficiaries']) # weird to put 55, but works better

    variables = ['hh_income', 'hh_income_2', 'consumption_units', 'nb_beneficiaries', 'natural_gas', #'domestic_fuel:accommodation_size', 'natural_gas:accommodation_size',
        'domestic_fuel', 'accommodation_size', 'age_18_24', 'age_25_34', 'age_35_49', 'age_50_64']
    variables_ols = ['natural_gas', 'domestic_fuel', 'accommodation_size', 'age_18_24', 'age_25_34', 'age_35_49', 'age_50_64']

    logit = smf.Logit(df_hh['winner'], df_hh[variables]).fit()
    
    probit = smf.Probit(df_hh['winner'], df_hh[variables]).fit()

    ols = smf.ols(formula = 'winner ~ \
        natural_gas + domestic_fuel + accommodation_size + \
        age_18_24 + age_25_34 + age_35_49 + age_50_64',
        data = df_hh).fit()
#        natural_gas * accommodation_size + domestic_fuel * accommodation_size + \

    clf = tree.DecisionTreeClassifier(max_depth=3)
    clf = clf.fit(df_hh[variables], df_hh['winner'])
#    regr = tree.DecisionTreeRegressor(max_depth=3)
#    regr.fit(df_hh[variables], df_hh['winner'])
    
    return logit, probit, ols, clf
def CreateLogReg(x, y):
    """
    Returns the logistic regression
    """
    X2 = add_constant(x)
    est = sm.Logit(y, X2)
    est2 = est.fit()
    return est2
Example #5
def Backward_Elimination(result, threshold, train_y, train_x):
    rem = []  # names of the columns dropped so far
    while np.amax(result.pvalues) > threshold:
      value_name=pd.Series(list(result.pvalues.values),index=result.pvalues.index)
      rem.append(value_name.idxmax())
      train_xx=train_x.drop(rem,axis=1)
      model=sm.Logit(train_y,train_xx)
      result=model.fit()
    return result
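# Usage sketch for Backward_Elimination on synthetic data (all names below are
# illustrative): fit a first Logit on every column, then let the routine drop the
# least significant column until all p-values fall under the threshold.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(2)
n = 400
train_x = pd.DataFrame({'const': 1.0,
                        'x1': rng.normal(size=n),
                        'x2': rng.normal(size=n),
                        'junk': rng.normal(size=n)})
train_y = (rng.random(n) < 1 / (1 + np.exp(-(0.5 + 1.5 * train_x['x1'])))).astype(int)

first_fit = sm.Logit(train_y, train_x).fit(disp=0)
reduced_fit = Backward_Elimination(first_fit, 0.05, train_y, train_x)
print(reduced_fit.summary())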
Example #6
def fit_firth(y, X, start_vec=None, step_limit=1000, convergence_limit=0.0001):

    logit_model = smf.Logit(y, X)

    if start_vec is None:
        start_vec = np.zeros(X.shape[1])

    beta_iterations = []
    beta_iterations.append(start_vec)
    for i in range(0, step_limit):
        pi = logit_model.predict(beta_iterations[i])
        W = np.diagflat(np.multiply(pi, 1 - pi))
        var_covar_mat = np.linalg.pinv(
            -logit_model.hessian(beta_iterations[i]))

        # build hat matrix
        rootW = np.sqrt(W)
        H = np.dot(np.transpose(X), np.transpose(rootW))
        H = np.matmul(var_covar_mat, H)
        H = np.matmul(np.dot(rootW, X), H)

        # penalised score
        U = np.matmul(np.transpose(X),
                      y - pi + np.multiply(np.diagonal(H), 0.5 - pi))
        new_beta = beta_iterations[i] + np.matmul(var_covar_mat, U)

        # step halving
        j = 0
        while firth_likelihood(new_beta, logit_model) > firth_likelihood(
                beta_iterations[i], logit_model):
            new_beta = beta_iterations[i] + 0.5 * (new_beta -
                                                   beta_iterations[i])
            j = j + 1
            if (j > step_limit):
                raise Exception('Firth regression failed')

        beta_iterations.append(new_beta)
        if i > 0 and (
                np.linalg.norm(beta_iterations[i] - beta_iterations[i - 1]) <
                convergence_limit):
            break

    return_fit = None
    if np.linalg.norm(beta_iterations[i] -
                      beta_iterations[i - 1]) >= convergence_limit:
        raise Exception('Firth regression failed')
    else:
        # Calculate stats
        fitll = -firth_likelihood(beta_iterations[-1], logit_model)
        beta = beta_iterations[-1]
        bse = np.sqrt(np.diagonal(-logit_model.hessian(beta_iterations[-1])))

        # Wald test
        pvalues = 2 * (1 - stats.norm.cdf(np.abs(beta / bse)))
        return_fit = beta, bse, fitll, pvalues, i

    return return_fit
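# A synthetic-data sketch for fit_firth. The firth_likelihood helper it calls is
# not shown in this snippet, so a plausible stand-in (negative log-likelihood
# minus half the log-determinant of the information matrix) is defined here; the
# original project may implement it differently.
import numpy as np
import statsmodels.api as smf   # bound as `smf` so fit_firth's smf.Logit call resolves
from scipy import stats         # fit_firth's Wald test uses stats.norm


def firth_likelihood(beta, logit):
    # hypothetical stand-in for the helper assumed by fit_firth
    return -(logit.loglike(beta) +
             0.5 * np.log(np.linalg.det(-logit.hessian(beta))))


rng = np.random.default_rng(3)
n = 200
X = np.column_stack([np.ones(n), rng.normal(size=n)])  # intercept + one covariate
y = (rng.random(n) < 0.3).astype(float)

beta, bse, fitll, pvalues, n_iter = fit_firth(y, X)
print('coefficients:', beta)
print('p-values:', pvalues)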
Example #7
def test_mode():
    df = get_data()
    data = df.drop(columns = ['PassengerId','Ticket','Name','Cabin'])
    y, X = dmatrices("Survived ~ Pclass+Sex+Age+SibSp+Parch+Fare+Embarked", df, return_type = 'dataframe')
    
    #logit = sm.Logit(data["Survived"],data[])
    logit = sm.Logit(y,X)
    result = logit.fit()
    print(result.summary2())
 def regressor(y, X, model_type=model_type):
     if model_type == "linear":
         regressor = sm.OLS(y, X).fit()
     elif model_type == "logistic":
         regressor = sm.Logit(y, X).fit()
     else:
          print("\nWrong Model Type : " + model_type +
                "\nLinear model type is selected.")
         model_type = "linear"
         regressor = sm.OLS(y, X).fit()
     return regressor
Example #9
 def test_firth_likelihood(self):
     p = np.loadtxt(P_BINARY)
     m = np.loadtxt(M)
     firth_vars = np.loadtxt(FIRTH_VARS)
     mod = smf.Logit(p, m)
     fll = firth_likelihood(firth_vars, mod)
     self.assertAlmostEqual(fll, 97.13375906431875)
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         fll = firth_likelihood(firth_vars + 100, mod)
     self.assertAlmostEqual(fll, np.inf)
def backwardElimination(x,y, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.Logit(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
def binarylogit(df):
    """
    Specifies a binary logistic model which aims simply to determine whether or 
    not the room is occupied. 
    """

    print("""
    BINARY LOGISTIC MODEL 
    ___________________________________
    """)

    # Stores the time for output.
    query_time = df.counts_time.max()

    # Trains a logit model. Prints the coefficients and their significance levels.
    y = df["counts_truth_is_occupied"]
    X = np.array(df["counts_associated"]).reshape(-1, 1)
    log = sm.Logit(y, X).fit()
    print(log.summary())

    log = LogisticRegression()
    log.fit(X, y)

    pseudo_r = log.score(X, y)
    print("Model score: ", pseudo_r)

    # Splits the dataset into 60% training and 40% testing.
    df_train, df_test = train_test_split(df, test_size=0.4, random_state=5)

    # Trains model on the training set.
    y_train = df_train["counts_truth_is_occupied"]
    X_train = np.array(df_train["counts_associated"]).reshape(-1, 1)
    log_train = LogisticRegression()
    log_train.fit(X_train, y_train)

    # Tests model on the test set.
    y_test = df_test["counts_truth_is_occupied"]
    X_test = np.array(df_test["counts_associated"]).reshape(-1, 1)
    predicted = log_train.predict(X_test)
    probs = log_train.predict_proba(X_test)

    # Prints accuracy score, confusion matrix, classification report and MSE.
    ascore = metrics.accuracy_score(y_test, predicted)
    print("Accuracy score (1 = perfect prediction) ", ascore)
    print("Confusion matrix:\n", metrics.confusion_matrix(y_test, predicted))
    print("Residual sum of squares: %.2f" % np.mean(
        (log_train.predict(X_test) - y_test)**2))

    return [pseudo_r, 0, ascore, query_time]
Example #12
def forward_select(data, response):
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates=[]
        for candidate in remaining:
            aic = smf.Logit(data[response],data[selected+[candidate]]).fit().aic
            aic_with_candidates.append((aic, candidate))
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate=aic_with_candidates.pop()
        if current_score > best_new_score: 
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print('aic is {}, continuing!'.format(current_score))
        else:        
            print ('forward selection over!')
            break
    return selected
 def trained_pipeline(self):
     np.random.seed(44)
     log_reg = sm.Logit(self.y_train, self.X_train)
     param_array = np.zeros(len(self.independent_variables) + 1)
     iterations = 5
     for i in range(0, iterations):
         model = log_reg.fit(maxiter=5000,
                             avextol=.0001,
                             epsilon=.1,
                             full_output=1,
                             disp=0)
         params = list(model.params)
         param_array = [a + b for a, b in zip(param_array, params)]
     start = [p / iterations for p in param_array]
     model = log_reg.fit(start_params=start,
                         maxiter=5000,
                         avextol=.0001,
                         epsilon=.1,
                         full_output=1,
                         disp=1)
     return model
Example #14
def fit_lineage_effect(lin, c, k):
    """Fits the model `k ~ Wa` using binomial error with logit link.
    W are the lineages (either a projection of samples, or cluster indicators)
    and covariates.
    Returns the index of the most significant lineage

    Args:
        lin (numpy.array)
            Population structure matrix or lineage association
            binary matrix (n, k)
        c (numpy.array)
            Covariates matrix (n, j)
        k (numpy.array)
            Variant presence-absence vector (n, 1)

    Returns:
        max_lineage (int or None)
            Index of the most significant lineage
            or None if the model could not be fit
    """
    if c.shape[0] == lin.shape[0]:
        X = np.concatenate((np.ones(lin.shape[0]).reshape(-1, 1), lin, c),
                           axis=1)
    else:
        X = np.concatenate((np.ones(lin.shape[0]).reshape(-1, 1), lin), axis=1)

    lineage_mod = smf.Logit(k, X)
    try:
        lineage_res = lineage_mod.fit(method='newton', disp=False)

        wald_test = np.divide(np.absolute(lineage_res.params), lineage_res.bse)
        # excluding intercept and covariates
        max_lineage = np.argmax(wald_test[1:lin.shape[1] + 1])
    # In case regression fails
    except (statsmodels.tools.sm_exceptions.PerfectSeparationError,
            np.linalg.LinAlgError):
        max_lineage = None

    return max_lineage
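# A shape-oriented sketch for fit_lineage_effect with synthetic inputs
# (illustrative only): lineage matrix (n, k), covariates (n, j), variant
# presence/absence vector (n,). `smf` is bound here so that smf.Logit resolves.
import numpy as np
import statsmodels.tools.sm_exceptions   # referenced in the except clause above
import statsmodels.api as smf            # so smf.Logit resolves

rng = np.random.default_rng(4)
n = 300
lin = rng.integers(0, 2, size=(n, 3)).astype(float)   # 3 lineage indicators
c = rng.normal(size=(n, 2))                            # 2 covariates
# make the variant more common in lineage 0
k = (rng.random(n) < np.where(lin[:, 0] == 1, 0.7, 0.2)).astype(float)

print(fit_lineage_effect(lin, c, k))  # index of the most significant lineage (likely 0 here)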
Example #15
def trained_model(train_features, train_outcomes):
    np.random.seed(44)
    log_reg = sm.Logit(train_outcomes, train_features)
    param_array = np.zeros(len(train_features) + 1)
    iterations = 5
    for i in range(0, iterations):
        model = log_reg.fit(maxiter=5000,
                            avextol=0.0001,
                            epsilon=0.1,
                            full_output=1,
                            disp=0)
        params = list(model.params)
        param_array = [a + b for a, b in zip(param_array, params)]
        start = [p / iterations for p in param_array]
        model = log_reg.fit(
            start_params=start,
            maxiter=5000,
            avextol=0.0001,
            epsilon=0.1,
            full_output=1,
            disp=1,
        )
    return model
Example #16
 def test_fit_firth(self):
     p = np.loadtxt(P_BINARY)
     m = np.loadtxt(M)
     mod = smf.Logit(p, m)
     start_vec = np.zeros(m.shape[1])
     start_vec[0] = np.log(np.mean(p) / (1 - np.mean(p)))
     (intercept, kbeta, beta, bse, fitll) = fit_firth(mod, start_vec, m, p)
     self.assertAlmostEqual(intercept, 0.13954805021495864)
     self.assertAlmostEqual(kbeta, -0.31901219992017243)
     tbeta = [
         1.9588025, 0.7251749, -0.5605268, -0.5396909, 0.0594742,
         -0.2001795, -1.4873298, 0.5050208
     ]
     self.assertTrue(abs((np.array(beta) - np.array(tbeta)).max()) < 1E-7)
     self.assertAlmostEqual(bse, 2.848207537910185)
     self.assertAlmostEqual(fitll, -58.249948818380204)
     fitll = fit_firth(mod,
                       start_vec,
                       m,
                       p,
                       step_limit=10,
                       convergence_limit=1E-10)
     self.assertEqual(fitll, None)
Example #17
def logit(RV, df_norm, keys):
    #%%
    '''
    X contains all precursor data, incl train and test
    X_train, y_train are split up by TrainIsTrue
    Prediction is made for the whole time series
    '''
    if keys is None:
        no_data_col = ['TrainIsTrue', 'RV_mask', 'fit_model_mask']
        keys = df_norm.columns
        keys = [k for k in keys if k not in no_data_col]

    X = df_norm[keys]
    X = add_constant(X)
    y = RV.RV_bin_fit
    # Get training years
    TrainIsTrue = df_norm['TrainIsTrue']
    # Get mask to make only prediction for RV_mask dates
    pred_mask = df_norm['RV_mask']

    model_set = sm.Logit(y[TrainIsTrue], X[TrainIsTrue])
    try:
        model = model_set.fit(disp=0, maxfun=60)
        prediction = model.predict(X[pred_mask])
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            model = model_set.fit(method='bfgs', disp=0)
            prediction = model.predict(X[pred_mask])
        else:
            raise
    except Exception as e:
        print(e)
        model = model_set.fit(method='bfgs', disp=0)
        prediction = model.predict(X)
    #%%
    return prediction, model
data.replace({'diagnosis': 'M'}, 1, inplace=True)
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()
data[data.columns] = minmax.fit_transform(data.values)

## Model Building

features = [i for i in data.columns if i != 'diagnosis']
x = data[features]
y = pd.DataFrame(data['diagnosis'])

import statsmodels.discrete.discrete_model as sm

model = sm.Logit(y, x)
result = model.fit(method='ncg')
print(result.summary())

#ROC Curve
from sklearn.metrics import roc_curve, auc

x['predict'] = result.predict(x)
fpr, tpr, thresholds = roc_curve(y, x['predict'])
roc_auc = auc(fpr, tpr)
print("area under the ROC curve:%f" % roc_auc)

# Optimal Cutoff
i = np.arange(len(tpr))
roc = pd.DataFrame({
    'fpr': pd.Series(fpr, index=i),
Example #19
predict2 = lr.predict(age2)
predict2
##### For age=105 the predicted value is greater than one.

####### From this linear regression, we cannot tell whether a person buys or not

################ Lab: Logistic Regression ######################

#Dataset: Product Sales Data/Product_sales.csv
sales = pd.read_csv(
    "C:\\Koti\\data science\\DS_batch1\\datasets\\Product_sales.csv")

# Build a logistic Regression line between Age and buying
import statsmodels.formula.api as sm

logit = sm.Logit(sales['Bought'], sales['Age'])
logit
result = logit.fit()
result
result.summary2()

###coefficients Interval of each coefficient
print(result.conf_int())

#One more way of fitting the model
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(sales[["Age"]], sales["Bought"])

#A 4 years old customer, will he buy the product?
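# One way to answer that, as a sketch: ask the fitted sklearn model for the
# purchase probability of a 4 year old (0.5 is just the conventional cut-off).
import pandas as pd

new_customer = pd.DataFrame({"Age": [4]})
print("P(buy | age=4):", logistic.predict_proba(new_customer)[0, 1])
print("Predicted class:", logistic.predict(new_customer)[0])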
Example #20
trainingSet = data1
cat_vars=['Sex','Embarked']
data_vars=trainingSet.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]
data_final=trainingSet[to_keep]
data_final.columns.values
data_final_vars=data_final.columns.values.tolist()
y=['Survived']

X=[i for i in data_final_vars if i not in y]
cols=["Age", "Sex_male","Sex_female"]
X=data_final[cols]
y=data_final['Survived']


model =  sm.Logit(y,X)
result=model.fit()
print(result.summary())


logreg = LogisticRegression()
logreg.fit(X, y)
y_pred = logreg.predict(X)
print(y_pred)
print('Accuracy of logistic regression classifier on training set: {:.2f}'.format(logreg.score(X, y)))


#Transform categorical data into dummies

cat_vars=['Sex','Embarked']
for var in cat_vars:
Example #21
dataset = df[df['gender'] == 2]
dataset_data = np.array(dataset[['loan', 'months']])

predicted_2 = model.predict(dataset_data)
df_gender = list(df.gender)

j = 0
for i in range(0, len(df_gender) - 1):
    if df_gender[i] == 2:
        df_gender[i] = predicted_2[j]
        j += 1
df.gender = df_gender

#SM requires that intercept be manually entered
intercept = [1] * len(df)
df['intercept'] = intercept

x = df[[
    'months', 'group', 'female', 'Asia', 'North_America', 'South_America',
    'Europe', 'Africa', 'intercept'
]]
y = np.array(df[['expired']])
y = y.ravel()

#To determine % change from coefficients
#coefs = sm.Logit(y, x).fit().params.values
#for i in coefs:
#    print abs(math.exp(i) - 1)

print(sm.Logit(y, x).fit().summary2())
Example #22
calc_prob = model.predict_proba(X_holdout)[:, 1]
Final_Output = df_holdout
Final_Output['Probability_of_Attrition'] = calc_prob
Final_Output['Final_Prediction'] = final
Final_Output.head()

# Save Dataframe to CSV
Final_Output.to_csv("Final_Output.csv")

# Exploration of Log Reg Coefficients
# Reference used:
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

import statsmodels.api as sm
import statsmodels.formula.api as smf
logit_model=smf.Logit(y_train, X_train)
results=logit_model.fit()
print(results.summary2())

# Column names for reference above
df_column_name = pd.DataFrame(list(df_full_data.drop(['Attrition'], axis=1).columns.values))
df_column_name.index = np.arange(1, len(df_column_name) + 1)
df_column_name

import statsmodels.api as sm
import statsmodels.formula.api as smf
#logit_model=smf.Logit(y_train, X_train)
#results=logit_model.fit()
#print(results.summary2())

model= smf.logit(formula="Attrition~ Age + DailyRate + DistanceFromHome + EnvironmentSatisfaction + JobInvolvement + JobSatisfaction + NumCompaniesWorked + RelationshipSatisfaction + TotalWorkingYears + TrainingTimesLastYear + WorkLifeBalance + YearsAtCompany + MaritalStatus_Divorced + MaritalStatus_Married + MaritalStatus_Single + OverTime_No + OverTime_Yes", data= df_full_data).fit()
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
plt.contourf(X1,
             X2,
             classifier.predict(np.array([X1.ravel(),
                                          X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75,
             cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0],
                X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i),
                label=j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

import statsmodels.formula.api as sm

model = sm.Logit(y_train, X_train)

result = model.fit()
Example #24
# Estimate a standard random utility model

# Description

from __future__ import division

import statsmodels.formula.api as smf

from sc_4_1_build_final_data import select_variables_final_dataset

data = select_variables_final_dataset(weekend=False, selection=0)

data_suburb = data.query('option_dt == 0')
data_suburb['excess_cost_vp'] = data_suburb['cost_vp'] - data_suburb['cost_tc']
data_suburb['excess_consumption_vp'] = data_suburb['income'] - data_suburb[
    'excess_cost_vp']

variables = ['excess_consumption_vp']

logit = smf.Logit(data_suburb['option_vp'], data_suburb[variables]).fit()
print(logit.summary())
params = logit.params

print(logit.get_margeff().summary())

#probit = smf.Probit(data['option_downtown'], data[variables]).fit()
#print probit.summary()

# Work on a nested logit (suburb vs. city centre, then vp vs. tc)
plot_roc(lr_prob)





#STATSMODELS 


import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=5)
model = sm.Logit(y_train, add_constant(x_train)).fit()
model.summary()


# Spark 






from __future__ import print_function
import sys
from pyspark.sql import SparkSession

print("X:", type(X))
print(X.columns)
model = smf.OLS(y, X)
result = model.fit()
result.summary()
model = smf.OLS.from_formula('quality ~ alcohol', data=dataset)
results = model.fit()
print(results.params)
#Classification using stats model.
dataset['rate_code'] = (dataset['quality'] > 4).astype(np.float32)

y, X = dmatrices('rate_code ~ alcohol', data=dataset)
sns.distplot(X[y[:, 0] > 0, 1])
sns.distplot(X[y[:, 0] == 0, 1])

model = smf.Logit(y, X)
result = model.fit()
result.summary2()

yhat = result.predict(X)
sns.distplot(yhat[y[:, 0] > 0])
sns.distplot(yhat[y[:, 0] == 0])

yhat = result.predict(X) > 0.955
print(sklearn.metrics.classification_report(y, yhat))

#Classification using sklrean logistic regression.
model = sklearn.linear_model.LogisticRegression()
y, X = dmatrices(
    'rate_code ~ alcohol + sulphates + citric_acid + fixed_acidity',
    data=dataset)
Example #27
    # auc  
    print('auc : %.3f' % auc(fpr,tpr))
    # ROC curve
    plt.plot(fpr,tpr,'k--',label='ROC (area = %.3f)' % auc(fpr,tpr))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()
    
dat4x = dat4.drop(['churn'],axis=1)
dat4y = dat4.churn

X_train,X_test,y_train,y_test = train_test_split(dat4x,dat4y,test_size=0.3)

model4 = smf.Logit(y_train,X_train).fit()
y_test_proba_4 = model4.predict(X_test)

# The AUC is as high as 0.913. Is something wrong?
FuncScore(y_test,y_test_proba_4)
#%%
# 2.2.4  Using LASSO and Ridge
# Check the class balance: 4:3, close to 1:1
print(dat3.churn.value_counts())
# Grid search with cross-validation
# Train / test split
from sklearn.model_selection import GridSearchCV

dat5x = dat3.drop(['churn'],axis=1)
dat5y = dat3.churn
dummy = pd.get_dummies(x['Gender'])  #########

x = pd.concat((x[['Age', 'EstimatedSalary']], dummy[['Female']]), axis=1)
# add constant!
x = statsmodels.api.add_constant(x)  #####################

# split into training and validation data
import sklearn.model_selection

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, Y, train_size=0.8)

# logistic regression
import statsmodels.formula.api as sm

model = sm.Logit(y_train, x_train)  # upper case
result = model.fit()  # all the fitted parameters are in result
result.summary()  # 0.42

y_pre = result.predict(x_test)


def check(x):
    if x >= 0.5:
        i = 1
    else:
        i = 0
    return (i)


y_pred = y_pre.map(check)
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
df=pd.read_excel('Final_Fluview_Practical_dataset.xlsx')
df_regress=df[['Virus Strain','Age','Gender','Hospitalized?','Swine Contact?','Attended Agricultural Event?']]
print(df_regress[df_regress.isna().any(axis=1)])
df_regress=df_regress.dropna()
for column in df_regress:
    print(column,df_regress[column].unique())
df_regress['Virus Strain']=df_regress['Virus Strain'].map({'Influenza A H3N2v':1,'Influenza A H1N1v':0,'Influenza A H1N2v':0,'Influenza A H7N2':0})
df_regress['Age']=df_regress['Age'].map({'<18 Years':0,'>=18 Years':1})
df_regress['Gender']=df_regress['Gender'].map({'Male':0,'male':0,'Female':1,'female':1})
df_regress['Hospitalized?']=df_regress['Hospitalized?'].map({'No':0,'no':0,'Yes':1,'yes':1})
df_regress['Swine Contact?']=df_regress['Swine Contact?'].map({'No':0,'no':0,'Yes':1,'yes':1})
df_regress['Attended Agricultural Event?']=df_regress['Attended Agricultural Event?'].map({'No':0,'no':0,'Yes':1,'yes':1})
for column in df_regress:
    print(column,df_regress[column].unique())
endog=df_regress['Virus Strain']
exog=df_regress[['Age',
'Gender',
'Hospitalized?',
'Swine Contact?',
'Attended Agricultural Event?']]
exog = sm.add_constant(exog)
endog=endog.values
exog=exog.values
print(sum(endog))
logit=smf.Logit(endog,exog)
result=logit.fit()
print(result.summary())
# Create boxplots showing team salary distribution by world series wins
box_data = binary[['std_salary', 'WSWin_Y']]
bp = box_data.boxplot(by='WSWin_Y')
ax = plt.gca()
plt.title(
    'Salary Distribution For \n World Series Winners (1) vs. Non Winners (0)')
plt.suptitle("")
ax.set_ylabel('Standardized Salary')
ax.set_xlabel('World Series Winner? 1 = Yes, 0 = No')

# To quantify the relationship between standardized salary and world series wins, I ran a logistic regression, which is the appropriate model for a situation where the dependent variable is binary. The model yields a positive coefficient for standardized salary, but interestingly, the coefficient is not statistically significant at the 95% confidence level. It might be significant at the 90% confidence level, but it is not at 95%. Contrast that with the winning percentage OLS regression above, where the coefficient was statistically significant. To me, this says that the case for salary as a determinant of world series championships is perhaps weaker than it is for winning percentage.

# In[56]:

# Logistic regression of WS Win on Standardized Salary
logit = sm.Logit(binary['WSWin_Y'], binary['std_salary'])
log_result = logit.fit()
print(log_result.summary())
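# To read the significance claim above straight off the fitted model (a small
# hedged addition; std_salary is the only regressor, so there is no intercept):
import numpy as np

salary_pval = log_result.pvalues['std_salary']
salary_odds_ratio = np.exp(log_result.params['std_salary'])
print('p-value: {:.3f}, odds ratio: {:.3f}'.format(salary_pval, salary_odds_ratio))
print('Significant at the 95% level?', salary_pval < 0.05)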

# (4) Conclusion
#
# At the beginning of this project, I set out to understand the relationship between salary compensation and performance in baseball, ultimately using winning percentage and world series wins as measures of performance. Using Python pandas and inferential statistics, I discovered that there is in fact a significant relationship between a standardized salary metric and winning percentage, though not one as strong as I had hypothesized. There is also a nominally positive relationship between salary paid and world series wins, although it is not significant at the 95% level.
#
# My advice for general managers in baseball, based on this data, is that to have a shot at the world series you want to have a salary that is above the average compared to other teams in the league (see the box plot above). After that, it's hard to say what improves your odds as having an even higher salary level doesn't necessarily do so. Further research might look at key injuries in a season, the relationship between player performance and salary, or other factors.

# (5) Potential Limitations
#
# There are definitely limitations to this analysis which prevent it from *proving* that salary drives winning. First, as mentioned above, the inferential statistics methods I used do indicate 'statistical significance' at a 95% level. But at a higher level of confidence, we might fail to reject the null hypothesis that the coefficient for salary in this specification is actually zero. That is even clearer in the second exercise where I looked at salary as a driver of world series wins. In that case, we could not reject the null hypothesis.
#
# As my first reviewer also pointed out, there is also the limitation in statistics that "correlation does not imply causation." It is possible that instead of salary driving performance, the causal link could run the other direction: teams that start winning have to pay players more to keep them. You could imagine a team of young players that becomes successful, and as a result payroll becomes very expensive. We did not look within the dataset to see if this was true, and there may be data limitations which prevent us from doing so.
#