def weight_generating(x, y, param=0):
    """Generate Heckman-style weights from a probit selection equation.

    param=0 fits the probit on the full sample; param=1 fits it on a random
    half-sample, retrying until the fit does not degenerate to NaN parameters.
    """
    print('---------- Generating weights ----------')
    if param == 0:
        model = Probit(y, add_constant(x), missing='drop')
        probit_model = model.fit()
    elif param == 1:
        while True:
            # Draw a random boolean mask and refit until the estimates are finite
            random_index = [random.choice([True, False]) for _ in range(len(y))]
            X = x[random_index]
            Y = y[random_index]
            model = Probit(Y, add_constant(X), missing='drop')
            probit_model = model.fit()
            if not np.isnan(probit_model.params.iloc[0]):
                break
    else:
        raise ValueError('param must be 0 or 1')
    IM_list = []
    for count, _ in tqdm(enumerate(y), total=len(y),
                         desc='Computing inverse Mills ratios', ncols=100):
        if param == 0:
            # fittedvalues is the probit linear predictor X @ params
            IM_list.append(im(probit_model.fittedvalues[count]))
        else:
            # Rebuild the row with its constant and score it with the half-sample fit
            tmp = [1]
            tmp.extend(list(x.iloc[count, ]))
            IM_list.append(im(dot(tmp, probit_model.params)))
    # Normalise so the weights average to one
    return np.true_divide(IM_list, np.mean(IM_list))
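The function above relies on an `im` helper that is not defined in the snippet. A minimal sketch of the standard inverse Mills ratio, assuming `im` takes the probit linear predictor (the index x'b, not a probability):

import numpy as np
from scipy.stats import norm

def im(xb):
    # Inverse Mills ratio: lambda(xb) = phi(xb) / Phi(xb)
    return norm.pdf(xb) / norm.cdf(xb)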
Example #2
    def estimate_recession_probability(cls,
                                       macro_data=None,
                                       macro_indicator=None,
                                       start_date=None):
        import scipy
        from statsmodels.discrete.discrete_model import Logit, Probit
        recession_data, recession_start_dates, recession_end_dates = \
            cls.retrieve_recession_data(macro_data=macro_data,
                                        start_date=start_date)

        # Settings for the analysis
        recession_prediction_window = 252
        windows = [1, 3, 6, 9, 12, 18, 24]
        halflifes = [0.5, 1, 2, 3, 6, 9, 12]

        r2 = []
        ll = []
        recession_probs = pd.DataFrame(index=macro_indicator.index,
                                       columns=halflifes)
        recession_in_window = pd.DataFrame(index=macro_indicator.index)

        tmp = recession_data.rolling(window=recession_prediction_window,
                                     center=False).sum()
        tmp2 = tmp.shift(-recession_prediction_window)
        recession_in_window['RIW'] = np.minimum(1, tmp2)
        logits = []
        regs = []

        for i in range(0, len(windows)):
            logit_iv = pd.DataFrame(index=recession_data.index)
            logit_iv['Recession'] = recession_data
            logit_iv['RIW'] = recession_in_window['RIW']

            # Exponential moving average of the macro indicator
            window_ma = macro_indicator.ewm(halflife=halflifes[i], ignore_na=False,
                                            min_periods=0, adjust=True).mean()

            logit_iv['F1M'] = window_ma.copy()
            logit_iv['LF1'] = window_ma.shift(windows[i])
            logit_iv['DF1'] = window_ma.diff(int(halflifes[i] * 2))  # diff periods must be an int
            logit_iv['INT'] = logit_iv['F1M'] * logit_iv['DF1']
            logit_iv = logit_iv[(~np.isnan(logit_iv['DF1']))
                                & (~np.isnan(logit_iv['F1M']))]
            logit_iv.loc[np.isnan(logit_iv['RIW']), 'RIW'] = 0  # avoid chained assignment

            # tmp = Logit(logit_iv['RIW'], logit_iv[['F1M', 'DF1', 'INT']])
            tmp = Probit(logit_iv['RIW'], logit_iv['F1M'])
            # tmp = linear_model.OLS(logit_iv['RIW'], pd.DataFrame(logit_iv['F1M']))
            result = tmp.fit()
            predictions = result.fittedvalues[~np.isnan(result.fittedvalues)]
            logits.append(result)

            ll.append(result.llr)
            # recession_probs[str(i)] = np.exp(predictions) / (1 + np.exp(predictions))
            recession_probs.loc[logit_iv.index, halflifes[i]] \
                = scipy.stats.norm.cdf(predictions)
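The snippet is truncated here, but the step above is the probit link inversion: statsmodels' `fittedvalues` for a discrete model is the linear index X @ params, so probabilities come from the standard normal CDF. A quick sanity check of that mapping:

import numpy as np
from scipy.stats import norm

xb = np.array([-1.5, 0.0, 1.5])                 # example probit indices
probs = norm.cdf(xb)                            # ~ [0.067, 0.5, 0.933]
assert np.allclose(probs + norm.cdf(-xb), 1.0)  # CDF symmetry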
Example #3
def probit_parameterized_ranked(x,
                                y,
                                arrival_choice,
                                cut_choice,
                                cap_mode_choice=2,
                                save=True):
    """
    Run probit for specified choices
    Args:
        x (pd.DataFrame): Features
        y (pd.DataFrame): Target
        cap_mode_choice (int, optional): eco + 0 for slot capacity only, 1 for slot + daily average, 2 for all
        arrival_choice (int): Choice of arrival day. -1 for all.
        cut_choice: (int): -1 for all, 0 for before cut 1, 2 for before cut 2 and after and 2 for both 0 and 1
        save (bool): If true, saves output in text file

    Returns: Model
    """
    x = sm.add_constant(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=0)

    model = Probit(y_train, x_train)
    probit_model = model.fit()
    print(probit_model.summary())
    file_name = 'probit_arrival-{}_cut-{}_cap-{}.txt'.format(
        arrival_choice, cut_choice, cap_mode_choice)
    with open(os.path.join(results_path, zone, 'ranked', file_name),
              "w") as text_file:
        text_file.write(str(probit_model.summary()))

    r2 = r2_score(y_test, probit_model.predict(x_test))

    if save:
        output = {
            'mode': cap_mode_choice,
            'arrival_choice': arrival_choice,
            'cut_choice': cut_choice
        }
        for key, value in probit_model.params.items():
            output[key] = value
        output['r2'] = r2
        save_result_to_file(pd.DataFrame([output]), 'probit.csv',
                            os.path.join(results_path, zone, 'ranked'))
    print('R2 score = ', r2)
    return probit_model
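A note on the metric: `r2_score` against a 0/1 target is an unusual fit measure for a probit. A hedged alternative, reusing the names above, is classification accuracy plus the McFadden pseudo R-squared that statsmodels already reports:

import numpy as np

y_pred = (probit_model.predict(x_test) > 0.5).astype(int)   # threshold the predicted probabilities
accuracy = np.mean(y_pred == np.asarray(y_test).ravel())
print('Accuracy =', accuracy)
print('McFadden pseudo R2 =', probit_model.prsquared)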
Example #4
    def setup_class(cls):
        df = data_bin
        mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial(link=families.links.probit()))
        res = mod.fit(method="newton", tol=1e-10)
        from statsmodels.discrete.discrete_model import Probit
        mod2 = Probit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
        res2 = mod2.fit(method="newton", tol=1e-10)

        cls.infl1 = MLEInfluence(res)  # res.get_influence()
        cls.infl0 = res2.get_influence()
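Since a GLM with a probit link and `Probit` maximize the same likelihood, the two fits above should agree up to solver tolerance; a sketch of the check such a test class would plausibly run (assumed, not taken from the original test):

import numpy as np

np.testing.assert_allclose(res.params, res2.params, rtol=1e-6)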
Example #5
def probitfn(dataset):
    """Fit a probit of f on H and return the (negative) inverse Mills ratio."""
    from scipy.stats import norm

    Y = dataset.RawData['f']
    X = dataset.RawData['H']
    model = Probit(Y, X.astype(float))
    probit_model = model.fit()
    # Probit.pdf/cdf expect the linear predictor, not (Y, X); evaluate the
    # standard normal density and CDF at the fitted index X @ params instead.
    xb = probit_model.fittedvalues
    inverseM = -1 * norm.pdf(xb) / norm.cdf(xb)
    return inverseM
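On sign conventions: phi(xb)/Phi(xb) is the inverse Mills ratio for selected observations (f = 1); for non-selected ones the standard Heckman term is -phi(xb)/Phi(-xb). A hedged two-sided sketch:

import numpy as np
from scipy.stats import norm

def inverse_mills(xb, selected):
    # phi/Phi where selected == 1, -phi/(1 - Phi) where selected == 0
    xb = np.asarray(xb, dtype=float)
    return np.where(np.asarray(selected) == 1,
                    norm.pdf(xb) / norm.cdf(xb),
                    -norm.pdf(xb) / norm.cdf(-xb))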
Example #6
def probit():
    '''
    Simulate a labor-supply panel and run a probit of participation (LFP) on Z.
    '''
    p = {}
    p['γ'] = 0.8
    p['β'] = 1
    p['a'] = 0
    p['ρ'] = 1
    p['η'] = 0.2
    p['δ'] = -0.2
    p['δ0'] = -0.1
    p['ν'] = 0.5
    simdata_a0 = LaborSupplySim(p, 10000)
    simdata_a0.simulate()
    simdata_a0.generate_panel()

    Y = simdata_a0.PanelData['LFP']
    X = simdata_a0.PanelData['Z']
    model = Probit(Y, X.astype(float))
    probit_model = model.fit()
    print(probit_model.summary())
Example #7
def probit(df, y_var, X_vars, add_intercept=True):
    """
    Replicates Stata's `probit` command.
    There must be at least one fixed-effect variable, and at most two.
    The model is only meaningful when the dependent variable y is a 0-1 variable.

    Inputs
    ---------
    df : pd.DataFrame, the data for the probit regression.
    y_var : str, the column name of the dependent variable; y should be a 0-1 variable.
    X_vars : list of str, the list of explanatory variable names.

    Outputs
    ---------
    res : fitted results object

    """
    new_df = df.copy()
    new_df = new_df.dropna()
    y = new_df[y_var]

    if add_intercept:
        new_df['intercept'] = 1.0
        X = new_df[['intercept'] + X_vars]
    else:
        X = new_df[X_vars]

    probit_mod = Probit(endog=y, exog=X, check_rank=True, missing="drop")
    res = probit_mod.fit(start_params=None,
                         method='newton',
                         maxiter=35,
                         full_output=1,
                         disp=1,
                         callback=None)

    return res
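A hypothetical usage sketch for the helper above, on invented toy data (assumes pandas, numpy, and the Probit import from this example are in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({'x1': rng.normal(size=500), 'x2': rng.normal(size=500)})
toy['y'] = (0.5 * toy['x1'] - 0.3 * toy['x2'] + rng.normal(size=500) > 0).astype(int)
res = probit(toy, 'y', ['x1', 'x2'])
print(res.summary())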
Example #8
import pandas as pd
import numpy as np
from preprocessor import Preprocessor
from statsmodels.discrete.discrete_model import Probit

pd.set_option('display.max_columns', 10)
df = pd.read_excel("Project 2 - Data.xls")

preprocessor = Preprocessor(df)
x_train, y_train, x_test, y_test = preprocessor.combine()

model = Probit(y_train, x_train)
probit_model = model.fit()
predict_proba = probit_model.predict(x_test)


def predict(predict_proba):
    prediction = []
    for probability in predict_proba:
        if probability > 0.5:
            prediction.append(1)
        else:
            prediction.append(0)
    return prediction


def model_score(prediction, target):
    score = 0
    for i in range(len(prediction)):
        if prediction[i] == target[i]:
            score += 1
    return score / len(prediction)  # fraction of correct predictions
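The two helpers above reimplement what scikit-learn already provides; an equivalent one-liner, assuming the arrays from this example:

import numpy as np
from sklearn.metrics import accuracy_score

y_pred = (np.asarray(predict_proba) > 0.5).astype(int)
print('Accuracy:', accuracy_score(y_test, y_pred))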
Example #9
def table1_reg(df_reg, disp_it):
    """Create the tables for the first probit models.

        Args:
        df_reg: DataFrame containing the categorical variables as dummies and the interaction terms
        disp_it: boolean value indicating whether information about iterations should be displayed

        Returns:
        -------
        A table containing the regression output of the first 4 model specifications.
    """
    #first model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970']]
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y, X)
    probit_model1 = model1.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model1.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeff1 = probit_model1.get_margeff()
    #probit_margeff1.summary()

    #second model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y, X)
    probit_model2 = model2.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model2.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeff2 = probit_model2.get_margeff()
    probit_margeff2.summary()

    #third model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',\
                'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','_Catholic' ,'_CatholicX1970',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970', \
                'd_ed_cat16X1970', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', 'd_hinccat1X1970', 'd_hinccat2X1970',
                'd_hinccat3X1970', 'd_hinccat4X1970']]
    X['int'] = np.repeat(1, len(Y))
    model3 = Probit(Y, X)
    probit_model3 = model3.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model3.summary())

    #compute margins (get_margeff)
    probit_margeff3 = probit_model3.get_margeff()
    #probit_margeff3.summary()

    #fourth model
    Y = df_reg['_oral']
    X = df_reg[['sales', 'd1970', 'dsalesX1970', '_Phys', 'd_PhysX1970', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'any', 'anyX1970','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35',\
                'd_agecat20X1970', 'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','_Catholic' ,'_CatholicX1970',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', 'd_ed_cat13X1970', \
                'd_ed_cat16X1970', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', 'd_hinccat1X1970', 'd_hinccat2X1970',
                'd_hinccat3X1970', 'd_hinccat4X1970', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970']]
    X['int'] = np.repeat(1, len(Y))
    model4 = Probit(Y, X)
    probit_model4 = model4.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model4.summary())

    #compute margins (get_margeff)
    probit_margeff4 = probit_model4.get_margeff()
    #print(probit_margeff4.summary())

    table = pd.DataFrame({'(1)': [], '(2)': [], '(3)': [], '(4)': []})
    table[' '] = ['Sales ban', '','p-value', 'Sales ban x 1(1970)', ' ','p-value', 'Observations', 'Log Likelihood', \
                         'Additional Covariates', 'Legal Variables']
    table = table.set_index(' ')
    table['(1)'] = [round(probit_margeff1.margeff[0],3), '({})'.format(round(probit_margeff1.margeff_se[0],3)), round(probit_margeff1.pvalues[0],3), round(probit_margeff1.margeff[2],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[2],3)), round(probit_margeff1.pvalues[2],3), round(probit_margeff1.results.nobs,3), round(probit_margeff1.results.llf,3),\
                        'R','PX' ]
    table['(2)'] = [round(probit_margeff2.margeff[0],3), '({})'.format(round(probit_margeff2.margeff_se[0],3)), round(probit_margeff2.pvalues[0],3), round(probit_margeff2.margeff[2],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[2],3)), round(probit_margeff2.pvalues[2],3), round(probit_margeff2.results.nobs,3), round(probit_margeff2.results.llf,3),\
                        'R','PX, AD' ]
    table['(3)'] = [round(probit_margeff3.margeff[0],3), '({})'.format(round(probit_margeff3.margeff_se[0],3)), round(probit_margeff3.pvalues[0],3), round(probit_margeff3.margeff[2],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[2],3)), round(probit_margeff3.pvalues[2],3), round(probit_margeff3.results.nobs,3), round(probit_margeff3.results.llf,3),\
                        'R,A,C,E,I','PX, AD' ]
    table['(4)'] = [round(probit_margeff4.margeff[0],3), '({})'.format(round(probit_margeff4.margeff_se[0],3)), round(probit_margeff4.pvalues[0],3), round(probit_margeff4.margeff[2],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[2],3)), round(probit_margeff4.pvalues[2],3), round(probit_margeff4.results.nobs,3), round(probit_margeff4.results.llf,3),\
                        'R,A,C,E,I','PX, AD, K' ]

    return table, model1, model2, model3, model4
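The four fit blocks above differ only in the regressor list, so they could be collapsed with a small helper; a sketch under that assumption (the helper name is hypothetical):

def fit_probit_cluster(df_reg, y_var, x_vars, disp_it):
    # Probit with an intercept and region-clustered standard errors
    Y = df_reg[y_var]
    X = df_reg[x_vars].copy()
    X['int'] = 1.0
    model = Probit(Y, X)
    result = model.fit(cov_type='cluster',
                       cov_kwds={'groups': df_reg['_region']},
                       disp=disp_it)
    return model, result, result.get_margeff()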
Example #10
def table2_reg(df_reg, disp_it):
    """Create the tables for the second probit models.

        Args:
        df_reg: DataFrame containing the categorical variables as dummies and the interaction terms
        disp_it: boolean value indicating whether information about iterations should be displayed

        Returns:
        -------
        A table containing the regression output of the 8 model specifications for the second table.
    """
    #1. _everuse_d as dependent variable
    #first model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965']]
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y, X)
    probit_model1 = model1.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model1.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeff1 = probit_model1.get_margeff()
    #probit_margeff1.summary()

    #second model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970', 'anyX1965']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y, X)
    probit_model2 = model2.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model2.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeff2 = probit_model2.get_margeff()
    probit_margeff2.summary()

    #third model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965']]
    X['int'] = np.repeat(1, len(Y))
    model3 = Probit(Y, X)
    probit_model3 = model3.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model3.summary())

    #compute margins (get_margeff)
    probit_margeff3 = probit_model3.get_margeff()
    probit_margeff3.summary()

    #fourth model
    Y = df_reg['_everuse_d']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970', 'd_idealcat2X1965', \
                'd_idealcat3X1965', 'd_idealcat4X1965', 'd_idealcat5X1965']]

    X['int'] = np.repeat(1, len(Y))
    model4 = Probit(Y, X)
    probit_model4 = model4.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model4.summary())

    #compute margins (get_margeff)
    probit_margeff4 = probit_model4.get_margeff()
    probit_margeff4.summary()

    #store results
    model1_help = model1
    model2_help = model2
    model3_help = model3
    model4_help = model4

    #2. _barrier as dependent variable
    #first model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965']]
    X['int'] = np.repeat(1, len(Y))
    model1 = Probit(Y, X)
    probit_model1 = model1.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model1.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeffb1 = probit_model1.get_margeff()
    probit_margeffb1.summary()

    #second model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970', 'dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970', 'anyX1965']]
    X['int'] = np.repeat(1, len(Y))
    model2 = Probit(Y, X)
    probit_model2 = model2.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model2.summary()) #got same results as paper

    #compute margins (get_margeff)
    probit_margeffb2 = probit_model2.get_margeff()
    probit_margeffb2.summary()

    #third model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965']]
    X['int'] = np.repeat(1, len(Y))
    model3 = Probit(Y, X)
    probit_model3 = model3.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model3.summary())

    #compute margins (get_margeff)
    probit_margeffb3 = probit_model3.get_margeff()
    probit_margeffb3.summary()

    #fourth model
    Y = df_reg['_barrier']
    X = df_reg[['sales', 'd1970','d1965', 'dsalesX1970','dsalesX1965', '_Phys', 'd_PhysX1970', 'd_PhysX1965', 'dreg2', 'dreg3', 'dreg4', \
                'dreg2X1970', 'dreg3X1970', 'dreg4X1970','dreg2X1965', 'dreg3X1965', 'dreg4X1965', 'any', 'anyX1970',
                'anyX1965','d_agecat20', 'd_agecat25', 'd_agecat30', 'd_agecat35', 'd_agecat20X1970', \
                'd_agecat25X1970', 'd_agecat30X1970', 'd_agecat35X1970','d_agecat20X1965', 'd_agecat25X1965', \
                'd_agecat30X1965', 'd_agecat35X1965','_Catholic' ,'_CatholicX1970', '_CatholicX1965',\
                'd_ed_cat9', 'd_ed_cat12', 'd_ed_cat13', 'd_ed_cat16', 'd_ed_cat9X1970', 'd_ed_cat12X1970', \
                'd_ed_cat13X1970',  'd_ed_cat9X1965', 'd_ed_cat12X1965', 'd_ed_cat13X1965', \
                'd_ed_cat16X1970','d_ed_cat16X1965', 'd_hinccat1', 'd_hinccat2', 'd_hinccat3', 'd_hinccat4', \
                'd_hinccat1X1970', 'd_hinccat2X1970', \
                'd_hinccat3X1970', 'd_hinccat4X1970',  'd_hinccat1X1965', 'd_hinccat2X1965', 'd_hinccat3X1965', \
                'd_hinccat4X1965', 'd_idealcat2', 'd_idealcat3', 'd_idealcat4', 'd_idealcat5', 'd_idealcat2X1970', \
                'd_idealcat3X1970', 'd_idealcat4X1970', 'd_idealcat5X1970', 'd_idealcat2X1965', \
                'd_idealcat3X1965', 'd_idealcat4X1965', 'd_idealcat5X1965']]

    X['int'] = np.repeat(1, len(Y))
    model4 = Probit(Y, X)
    probit_model4 = model4.fit(cov_type='cluster',
                               cov_kwds={'groups': df_reg['_region']},
                               disp=disp_it)
    #print(probit_model4.summary())

    #compute margins (get_margeff)
    probit_margeffb4 = probit_model4.get_margeff()
    probit_margeffb4.summary()

    #3. create table for output

    table = pd.DataFrame({'(1)': [], '(2)': [], '(3)': [], '(4)': []})
    table[' '] = ['Ever used Pill','Sales ban', '','p-value', 'Sales ban x 1(1965)', ' ','p-value', 'Sales ban x 1(1970)', ' ','p-value',\
                  'Observations', 'Log Likelihood', ' ', 'Ever used barrier', 'Sales ban', '','p-value', 'Sales ban x 1(1965)', ' ',\
                  'p-value', 'Sales ban x 1(1970)', ' ','p-value',\
                  'Observations', 'Log Likelihood', \
                  'Additional Covariates', 'Legal Variables']
    table = table.set_index(' ')
    table['(1)'] = [' ', round(probit_margeff1.margeff[0],3), '({})'.format(round(probit_margeff1.margeff_se[0],3)),\
                    round(probit_margeff1.pvalues[0],3), round(probit_margeff1.margeff[4],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[4],3)), round(probit_margeff1.pvalues[4],3),\
                    round(probit_margeff1.margeff[3],3), \
                    '({})'.format(round(probit_margeff1.margeff_se[3],3)), round(probit_margeff1.pvalues[3],3),\
                    round(probit_margeff1.results.nobs,3), round(probit_margeff1.results.llf,3),\
                    ' ', ' ', round(probit_margeffb1.margeff[0],3), '({})'.format(round(probit_margeffb1.margeff_se[0],3)),\
                    round(probit_margeffb1.pvalues[0],3), round(probit_margeffb1.margeff[4],3), \
                    '({})'.format(round(probit_margeffb1.margeff_se[4],3)), round(probit_margeffb1.pvalues[4],3),\
                    round(probit_margeffb1.margeff[3],3), '({})'.format(round(probit_margeffb1.margeff_se[3],3)),\
                    round(probit_margeffb1.pvalues[3],3), round(probit_margeffb1.results.nobs,3),\
                    round(probit_margeffb1.results.llf,3), 'R','PX']

    table['(2)'] = [' ', round(probit_margeff2.margeff[0],3), '({})'.format(round(probit_margeff2.margeff_se[0],3)),\
                    round(probit_margeff2.pvalues[0],3), round(probit_margeff2.margeff[4],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[4],3)), round(probit_margeff2.pvalues[4],3),\
                    round(probit_margeff2.margeff[3],3), \
                    '({})'.format(round(probit_margeff2.margeff_se[3],3)), round(probit_margeff2.pvalues[3],3),\
                    round(probit_margeff2.results.nobs,3), round(probit_margeff2.results.llf,3),\
                    ' ', ' ', round(probit_margeffb2.margeff[0],3), '({})'.format(round(probit_margeffb2.margeff_se[0],3)),\
                    round(probit_margeffb2.pvalues[0],3), round(probit_margeffb2.margeff[4],3), \
                    '({})'.format(round(probit_margeffb2.margeff_se[4],3)), round(probit_margeffb2.pvalues[4],3),\
                    round(probit_margeffb2.margeff[3],3), '({})'.format(round(probit_margeffb2.margeff_se[3],3)),\
                    round(probit_margeffb2.pvalues[3],3), round(probit_margeffb2.results.nobs,3),\
                    round(probit_margeffb2.results.llf,3), \
                    'R','PX, AD' ]

    table['(3)'] = [' ', round(probit_margeff3.margeff[0],3), '({})'.format(round(probit_margeff3.margeff_se[0],3)),\
                    round(probit_margeff3.pvalues[0],3), round(probit_margeff3.margeff[4],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[4],3)), round(probit_margeff3.pvalues[4],3),\
                    round(probit_margeff3.margeff[3],3), \
                    '({})'.format(round(probit_margeff3.margeff_se[3],3)), round(probit_margeff3.pvalues[3],3),\
                    round(probit_margeff3.results.nobs,3), round(probit_margeff3.results.llf,3),\
                    ' ', ' ', round(probit_margeffb3.margeff[0],3), '({})'.format(round(probit_margeffb3.margeff_se[0],3)),\
                    round(probit_margeffb3.pvalues[0],3), round(probit_margeffb3.margeff[4],3), \
                    '({})'.format(round(probit_margeffb3.margeff_se[4],3)), round(probit_margeffb3.pvalues[4],3),\
                    round(probit_margeffb3.margeff[3],3), '({})'.format(round(probit_margeffb3.margeff_se[3],3)),\
                    round(probit_margeffb3.pvalues[3],3), round(probit_margeffb3.results.nobs,3),\
                    round(probit_margeffb3.results.llf,3),
                    'R,A,C,E,I','PX, AD' ]

    table['(4)'] = [' ', round(probit_margeff4.margeff[0],3), '({})'.format(round(probit_margeff4.margeff_se[0],3)),\
                    round(probit_margeff4.pvalues[0],3), round(probit_margeff4.margeff[4],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[4],3)), round(probit_margeff4.pvalues[4],3),\
                    round(probit_margeff4.margeff[3],3), \
                    '({})'.format(round(probit_margeff4.margeff_se[3],3)), round(probit_margeff4.pvalues[3],3),\
                    round(probit_margeff4.results.nobs,3), round(probit_margeff4.results.llf,3),\
                    ' ', ' ', round(probit_margeffb4.margeff[0],3), '({})'.format(round(probit_margeffb4.margeff_se[0],3)),\
                    round(probit_margeffb4.pvalues[0],3), round(probit_margeffb4.margeff[4],3), \
                    '({})'.format(round(probit_margeffb4.margeff_se[4],3)), round(probit_margeffb4.pvalues[4],3),\
                    round(probit_margeffb4.margeff[3],3), '({})'.format(round(probit_margeffb4.margeff_se[3],3)),\
                    round(probit_margeffb4.pvalues[3],3), round(probit_margeffb4.results.nobs,3),\
                    round(probit_margeffb4.results.llf,3),
                    'R,A,C,E,I','PX, AD, K' ]

    return table, model1, model2, model3, model4, model1_help, model2_help, model3_help, model4_help
Example #11
#https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.Probit.html

from statsmodels.discrete.discrete_model import Probit
p = Probit(df.child.map({'yes': 1, 'no': 0}), df[['age']])
a = p.fit()
a.summary2()
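Note that this call omits an intercept, so the latent index is forced through the origin. A hedged variant with a constant, using statsmodels' usual helper:

import statsmodels.api as sm

X = sm.add_constant(df[['age']])   # prepend an intercept column
res = Probit(df.child.map({'yes': 1, 'no': 0}), X).fit()
print(res.summary2())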
Example #12
def core(tsy, dfx, label_map={0: 'female', 1: 'male'}):

    assert set(tsy.unique()) == {0, 1}, 'Y must be 0 or 1'

    dfx = sm.add_constant(dfx, prepend=True)
    dfx = dfx.rename(columns={'const': 'intercept'})
    p = Probit(tsy, dfx)
    res = p.fit()
    #summary2 = res.summary2()

    #predict result
    prediction_probs = res.predict()
    prediction_bins = pd.Series(
        [1 if i >= 0.5 else 0 for i in prediction_probs],
        name='predicted_bins',
        index=tsy.index)
    tsy_predict = prediction_bins
    tsy_predict.name = 'predicted_' + tsy.name
    df_predict_result = pd.concat([tsy, tsy_predict], axis=1)

    #confusion matrix (label order pinned so rows/columns match sklearn's output)
    labels = sorted(tsy.unique())
    df_confusion_matrix = pd.DataFrame(confusion_matrix(tsy, tsy_predict, labels=labels),
                                       index=labels,
                                       columns=labels)
    df_confusion_matrix.index = df_confusion_matrix.index.map(label_map)
    df_confusion_matrix.columns = df_confusion_matrix.columns.map(label_map)

    #report
    df_report = pd.DataFrame(list(
        precision_recall_fscore_support(tsy, tsy_predict)),
                             index=['precision', 'recall', 'F1', 'support']).T.round(5)

    df_report.index = df_report.index.map(label_map)

    #roc
    fpr, tpr, thresholds = roc_curve(tsy, prediction_probs)
    roc_auc = auc(fpr, tpr)
    #logging.info("Area under the ROC curve : %f" % roc_auc)
    i = np.arange(len(tpr))  # index for df
    df_roc = pd.DataFrame({
        'false positive rate': pd.Series(fpr, index=i),
        'true positive rate': pd.Series(tpr, index=i)
    })

    #model description
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)
    dfinfo1 = dfinfo1.T.set_index('Variables').T
    dfinfo1.index.name = 'term'
    dfinfo1.columns.name = 'parameter'
    dfinfo1.columns = ['coef', 'std err', 'z', 'p-value', '95% CI (lower)', '95% CI (upper)']
    dfinfo1['OR'] = np.exp(res.params)  # exp(coef); strictly an odds ratio only under logit
    df_description = dfinfo1

    df_report = pd.concat([df_report, df_report.sum().to_frame(name='total/avg').T])
    df_report.loc['total/avg', 'recall'] = df_report.loc['total/avg', 'recall'] / 2
    df_report.loc['total/avg', 'F1'] = df_report.loc['total/avg', 'F1'] / 2
    df_report = df_report.T
    df_report['name'] = ['model performance', 'model performance', 'model performance', 'sample size']

    df_confusion_matrix = pd.concat([
        df_confusion_matrix,
        df_confusion_matrix.sum().to_frame(name='total/avg').T])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix['name'] = ['confusion matrix', 'confusion matrix']
    df_confusion_matrix = pd.concat([df_confusion_matrix,
                                     df_report]).reset_index().set_index(['name', 'index'])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix.columns.names = [None, None]

    df_predict_result = df_predict_result.round(5)
    df_confusion_matrix = df_confusion_matrix.round(5)
    df_roc = df_roc.round(5)
    df_description = df_description.round(5)

    #self._debug = df_confusion_matrix
    return [{
        'tables': [
            {
                'table_info': 'Binary probit regression results summary',
                'table_json': '{}',
                'table_html': df_description.to_html(),
                'chart': ['line', 'bar']
            },
            {
                'table_info': 'Binary probit prediction performance summary:',
                'table_json': df_confusion_matrix.T.reset_index().to_json(),
                'table_html': df_confusion_matrix.to_html(),
                'chart': []
            },
            {
                'table_info': "ROC曲线(曲线下面积:%0.3f)" % roc_auc,
                'table_json': df_roc.to_json(),
                'table_html': df_roc.to_html(),
                'chart': ['scatter']
            },
        ]
    }, [{
        'table_df': df_predict_result,
        'label': 'Actual vs. predicted values'
    }]]
Example #13
def probit_reg(x, y):
    """Univariate probit regression with an intercept."""
    x = np.append(np.ones(len(x)).reshape(-1, 1), x.reshape(-1, 1), axis=1)
    pm = Probit(y, x)
    return pm.fit().params
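A hypothetical call on simulated data (names invented; assumes numpy and the Probit import are in scope):

import numpy as np

rng = np.random.default_rng(1)
x = rng.normal(size=200)
y = (0.8 * x + rng.normal(size=200) > 0).astype(int)
print(probit_reg(x, y))   # [intercept, slope]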
Example #14
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.discrete.discrete_model import Probit  # needed for the fit below
'''Probit analysis plus plotting 3D graph of hit rate distribution with respect to delta and theta'''

from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Receive Data
data = pd.read_csv("HL.csv")
print(data)
col = ["delta", "epsilon", "cross_term"]
dep_var = data["hits"].tolist()
X = data[col]
theta = data["theta"]

probit_mod = Probit(dep_var, X)
result = probit_mod.fit()
print(result.summary())

z = np.array(data["hits"].tolist())
x = np.array(data["epsilon"].tolist())
y = np.array(data["delta"].tolist())
print(z)
ax.scatter(x, y, z, s=1, c=None, depthshade=True)
plt.show()
Example #15
def main():
    # Magic numbers
    dMux = 0
    dSigmax = 1
    dMuepsilon = 0
    dSigmaepsilon = 1
    dMueta = 0
    dSigmaeta = 1
    iNobs = 1000
    vdBeta = np.array([1, 2])
    vdZeta = np.array([3, 4])
    vdDezinho = np.array([0])
    iSeed = 6969
    iNgroups = 11
    iIter = 1000

    # Initialisation
    np.random.seed(iSeed)
    vdBeta = np.array(vdBeta).reshape(-1, 1)
    vdZeta = np.array(vdZeta).reshape(-1, 1)
    iLenbeta = len(vdBeta)

    # Start the iterations
    ## Create objects to store the ATE, variance, test statistics and R-Squares
    dvATE = np.ones(iIter)
    dvVar = np.ones(iIter)
    dvTtest = np.ones(iIter)
    dvRsquared = np.ones(iIter)

    for i in range(iIter - 1):
        mdX = fnGenX(iNobs, iLenbeta, dMux, dSigmax)
        iLenX = mdX.shape[1]
        vdEpsilon = fnGenError(iNobs, dMuepsilon, dSigmaepsilon)
        vdPstar = fnGenPstar(mdX, vdBeta, vdEpsilon)
        vdD = fnGenTreat(vdPstar)
        vdEta = fnGenError(iNobs, dMueta, dSigmaeta)
        vdY = fnGenY(vdD, vdDezinho, mdX, vdZeta, vdEta)

        ## Create a dataframe with everything together
        ### This is not good because of the names, if we change the size of X then we need to manually change this, but I can check later how to make this better if needed
        dfData = pd.DataFrame(np.hstack([vdY, vdD, mdX]),
                              columns=['vdY', 'vdD', 'vdX1', 'vdX2'])
        dfData["vdD"] = dfData["vdD"] == 1
        ### Can work out later in a better layout for these descriptives
        #print dfData.groupby('vdD').describe().unstack(1).reset_index()

        # Estimation
        model = Probit(dfData['vdD'],
                       dfData[dfData.columns[-mdX.shape[1]:]].copy())
        probit_model = model.fit()
        #print(probit_model.summary())
        dRsquare = probit_model.prsquared
        # Get the predicted probabilities
        vdProbs = probit_model.predict(
            dfData[dfData.columns[-mdX.shape[1]:]].copy())

        ## Looking at the estimated probabilities
        #plt.figure(figsize=[10,8])
        #n, bins, patches = plt.hist(x=vdProbs, bins=8, color='#0504aa',alpha=0.7, rwidth=0.85)
        #plt.grid(axis='y', alpha=0.75)
        #plt.xlabel('Value',fontsize=15)
        #plt.ylabel('Frequency',fontsize=15)
        #plt.xticks(fontsize=15)
        #plt.yticks(fontsize=15)
        #plt.ylabel('Frequency',fontsize=15)
        #plt.title('Propensity Score Histogram',fontsize=15)
        #plt.show()

        ## Building the groups
        vdGroups = np.linspace(0, 1, iNgroups)
        ## Putting back Y, treatment and the propensity score
        dfFinalData = pd.DataFrame(np.hstack(
            [vdY, vdD, vdProbs.reshape(-1, 1)]),
                                   columns=['vdY', 'vdD', 'vdPS'])

        #dfGroup1  = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[0]) & (dfFinalData['vdPS'] < vdGroups[1])]
        dfGroup2 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[1])
                                   & (dfFinalData['vdPS'] < vdGroups[2])]
        dfGroup3 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[2])
                                   & (dfFinalData['vdPS'] < vdGroups[3])]
        dfGroup4 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[3])
                                   & (dfFinalData['vdPS'] < vdGroups[4])]
        dfGroup5 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[4])
                                   & (dfFinalData['vdPS'] < vdGroups[5])]
        dfGroup6 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[5])
                                   & (dfFinalData['vdPS'] < vdGroups[6])]
        dfGroup7 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[6])
                                   & (dfFinalData['vdPS'] < vdGroups[7])]
        dfGroup8 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[7])
                                   & (dfFinalData['vdPS'] < vdGroups[8])]
        dfGroup9 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[8])
                                   & (dfFinalData['vdPS'] < vdGroups[9])]
        #dfGroup10 = dfFinalData.loc[(dfFinalData['vdPS'] >= vdGroups[9]) & (dfFinalData['vdPS'] < vdGroups[10])]

        #dMean1 = dfGroup1.groupby('vdD').mean().iloc[1, 0] - dfGroup1.groupby('vdD').mean().iloc[0, 0]
        dMean2 = (dfGroup2.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup2.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup2.shape[0] / float(iNobs))
        dMean3 = (dfGroup3.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup3.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup3.shape[0] / float(iNobs))
        dMean4 = (dfGroup4.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup4.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup4.shape[0] / float(iNobs))
        dMean5 = (dfGroup5.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup5.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup5.shape[0] / float(iNobs))
        dMean6 = (dfGroup6.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup6.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup6.shape[0] / float(iNobs))
        dMean7 = (dfGroup7.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup7.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup7.shape[0] / float(iNobs))
        dMean8 = (dfGroup8.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup8.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup8.shape[0] / float(iNobs))
        dMean9 = (dfGroup9.groupby('vdD').mean().iloc[1, 0] -
                  dfGroup9.groupby('vdD').mean().iloc[0, 0]) * (
                      dfGroup9.shape[0] / float(iNobs))
        #dMean10 = dfGroup10.groupby('vdD').mean().iloc[1, 0] - dfGroup10.groupby('vdD').mean().iloc[0, 0]

        dATE = dMean2 + dMean3 + dMean4 + dMean5 + dMean6 + dMean7 + dMean8 + dMean9

        # Add an extra column with the mean of the corresponding treatment or no treatment inside the same block
        dfGroup2['vdYmean'] = dfGroup2.groupby("vdD")["vdY"].transform('mean')
        dfGroup3['vdYmean'] = dfGroup3.groupby("vdD")["vdY"].transform('mean')
        dfGroup4['vdYmean'] = dfGroup4.groupby("vdD")["vdY"].transform('mean')
        dfGroup5['vdYmean'] = dfGroup5.groupby("vdD")["vdY"].transform('mean')
        dfGroup6['vdYmean'] = dfGroup6.groupby("vdD")["vdY"].transform('mean')
        dfGroup7['vdYmean'] = dfGroup7.groupby("vdD")["vdY"].transform('mean')
        dfGroup8['vdYmean'] = dfGroup8.groupby("vdD")["vdY"].transform('mean')
        dfGroup9['vdYmean'] = dfGroup9.groupby("vdD")["vdY"].transform('mean')

        # Take the difference between the individual Y and the average of the corresponding group (by treated and non-treated)
        dfGroup2['dvDiffSquared'] = (dfGroup2['vdY'] - dfGroup2['vdYmean'])**2
        dfGroup3['dvDiffSquared'] = (dfGroup3['vdY'] - dfGroup3['vdYmean'])**2
        dfGroup4['dvDiffSquared'] = (dfGroup4['vdY'] - dfGroup4['vdYmean'])**2
        dfGroup5['dvDiffSquared'] = (dfGroup5['vdY'] - dfGroup5['vdYmean'])**2
        dfGroup6['dvDiffSquared'] = (dfGroup6['vdY'] - dfGroup6['vdYmean'])**2
        dfGroup7['dvDiffSquared'] = (dfGroup7['vdY'] - dfGroup7['vdYmean'])**2
        dfGroup8['dvDiffSquared'] = (dfGroup8['vdY'] - dfGroup8['vdYmean'])**2
        dfGroup9['dvDiffSquared'] = (dfGroup9['vdY'] - dfGroup9['vdYmean'])**2

        # For each line, add the number of individuals in the same treatment (or no treatment) group
        dfGroup2['iSizeGroup'] = dfGroup2.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup3['iSizeGroup'] = dfGroup3.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup4['iSizeGroup'] = dfGroup4.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup5['iSizeGroup'] = dfGroup5.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup6['iSizeGroup'] = dfGroup6.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup7['iSizeGroup'] = dfGroup7.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup8['iSizeGroup'] = dfGroup8.groupby("vdD")["vdY"].transform(
            'count')
        dfGroup9['iSizeGroup'] = dfGroup9.groupby("vdD")["vdY"].transform(
            'count')

        # Divide the squared difference by the square of the size of the corresponding group
        dfGroup2['dvDiffSquaredDivided'] = dfGroup2[
            'dvDiffSquared'] / dfGroup2['iSizeGroup']**2
        dfGroup3['dvDiffSquaredDivided'] = dfGroup3[
            'dvDiffSquared'] / dfGroup3['iSizeGroup']**2
        dfGroup4['dvDiffSquaredDivided'] = dfGroup4[
            'dvDiffSquared'] / dfGroup4['iSizeGroup']**2
        dfGroup5['dvDiffSquaredDivided'] = dfGroup5[
            'dvDiffSquared'] / dfGroup5['iSizeGroup']**2
        dfGroup6['dvDiffSquaredDivided'] = dfGroup6[
            'dvDiffSquared'] / dfGroup6['iSizeGroup']**2
        dfGroup7['dvDiffSquaredDivided'] = dfGroup7[
            'dvDiffSquared'] / dfGroup7['iSizeGroup']**2
        dfGroup8['dvDiffSquaredDivided'] = dfGroup8[
            'dvDiffSquared'] / dfGroup8['iSizeGroup']**2
        dfGroup9['dvDiffSquaredDivided'] = dfGroup9[
            'dvDiffSquared'] / dfGroup9['iSizeGroup']**2

        # Sum the V term for treated and non-treated individuals and multiply by the size of the block divided by population squared
        dVGroup2 = (dfGroup2.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup2.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup2.shape[0] / float(iNobs))**2)
        dVGroup3 = (dfGroup3.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup3.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup3.shape[0] / float(iNobs))**2)
        dVGroup4 = (dfGroup4.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup4.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup4.shape[0] / float(iNobs))**2)
        dVGroup5 = (dfGroup5.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup5.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup5.shape[0] / float(iNobs))**2)
        dVGroup6 = (dfGroup6.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup6.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup6.shape[0] / float(iNobs))**2)
        dVGroup7 = (dfGroup7.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup7.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup7.shape[0] / float(iNobs))**2)
        dVGroup8 = (dfGroup8.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup8.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup8.shape[0] / float(iNobs))**2)
        dVGroup9 = (dfGroup9.groupby("vdD").sum().iloc[1, 5] +
                    dfGroup9.groupby("vdD").sum().iloc[0, 5]) * (
                        (dfGroup9.shape[0] / float(iNobs))**2)

        # Compute the variance
        dVar = dVGroup2 + dVGroup3 + dVGroup4 + dVGroup5 + dVGroup6 + dVGroup7 + dVGroup8 + dVGroup9

        # Output
        #print ("ATE= %g" % dATE)
        #print ("Estimated Variance = %g" % dVar)

        # Compute the test statistic
        dTTest = dATE / (math.sqrt(dVar / iNobs))

        # Store results
        dvATE[i] = dATE
        dvVar[i] = dVar
        dvTtest[i] = dTTest
        dvRsquared[i] = dRsquare

    # Report results: describe the simulated ATE distribution (outside the loop)
    print(stats.describe(dvATE[:-1]))
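The eight near-identical per-block computations above could be collapsed into a loop; a sketch that mirrors the same quantities (interior blocks 2-9 only, edge blocks dropped as in the original; the helper name is hypothetical):

def blocked_ate(dfFinalData, vdGroups, iNobs):
    dATE, dVar = 0.0, 0.0
    for lo, hi in zip(vdGroups[1:-2], vdGroups[2:-1]):
        block = dfFinalData[(dfFinalData['vdPS'] >= lo) & (dfFinalData['vdPS'] < hi)]
        g = block.groupby('vdD')['vdY']
        means, counts = g.mean(), g.count()
        ssd = g.apply(lambda s: ((s - s.mean()) ** 2).sum())  # within-group sum of squares
        w = block.shape[0] / float(iNobs)
        dATE += (means.iloc[1] - means.iloc[0]) * w           # treated minus control, weighted
        dVar += (ssd.iloc[1] / counts.iloc[1] ** 2
                 + ssd.iloc[0] / counts.iloc[0] ** 2) * w ** 2
    return dATE, dVar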
Example #16
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit


y = df["Outcome"]
x = df[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "Age", "DiabetesPedigreeFunction"]]

logit_model = sm.Logit(y, x)
result = logit_model.fit()
print(result.summary())


# Probit Regression

probitmodel = Probit(y, x)
probit_model = probitmodel.fit()

print(probit_model.summary())
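Logit and probit coefficients live on different scales; a common rule of thumb is that logit estimates run roughly 1.6-1.8 times the probit ones when both models fit similarly. A quick check with the two results above:

import numpy as np

print(np.asarray(result.params) / np.asarray(probit_model.params))  # typically ~1.6-1.8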