Example #1
def general_logistic_regression():
    '''Example of general logistic regression,
    Example 7.4.1, p. 135'''

    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)

    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage'] - 1
    df['x'] = np.log(df['centrifuge'])

    # Model 1
    model1 = smf.glm('n_y + y ~ newstor*x',
                     data=df,
                     family=sm_families.Binomial()).fit()
    print(model1.summary())

    # Model 2
    model2 = smf.glm('n_y + y ~ newstor+x',
                     data=df,
                     family=sm_families.Binomial()).fit()
    print(model2.summary())

    # Model 3
    model3 = smf.glm('n_y + y ~ x', data=df,
                     family=sm_families.Binomial()).fit()
    print(model3.summary())
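To compare these nested models as Dobson does, one can run a deviance-based likelihood-ratio test; a minimal sketch (not part of the original example), assuming model1 and model2 from above:

from scipy import stats

# deviance difference between the reduced (additive) and full (interaction) model
lr_stat = model2.deviance - model1.deviance
df_diff = model1.df_model - model2.df_model
print('LR stat: %.3f, p-value: %.4f' % (lr_stat, stats.chi2.sf(lr_stat, df_diff)))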
Example #2
def logistic_regression():
    '''Logistic regression example
    chapter 7.3, p 130
    [tbd]: the cloglog values are inconsistent with those mentioned in the book.
    This is probably due to the specific definitions of "loglog" and "cloglog"
    in the respective languages.
    '''

    inFile = r'GLM_data/Table 7.2 Beetle mortality.xls'
    df = get_data(inFile)

    # adjust the unusual column names in the Excel file
    colNames = [name.split(',')[1].lstrip() for name in df.columns.values]
    df.columns = colNames

    # fit the model
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']
    model = smf.glm('survived + killed ~ x',
                    data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    print('-' * 65)
    print('Equivalent solution:')

    model = smf.glm('I(n - y) + y ~ x', data=df,
                    family=sm_families.Binomial()).fit()
    print(model.summary())

    # The first response column ('survived') counts successes, so the
    # fitted numbers killed can be obtained by
    fits = df['n'] * (1 - model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = smf.glm('I(n - y) + y ~ x',
                           data=df,
                           family=sm_families.Binomial(
                               sm_families.links.probit())).fit()
    print(model_probit.summary())

    fits_probit = df['n'] * (1 - model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = smf.glm('I(n - y) + y ~ x',
                        data=df,
                        family=sm_families.Binomial(
                            sm_families.links.cloglog())).fit()
    print(model_cll.summary())
    fits_cll = df['n'] * (1 - model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)
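The two-column response on the formula's left-hand side is equivalent to passing a two-column endog (successes first, failures second) directly to GLM; a minimal sketch, assuming df from above:

import numpy as np
import statsmodels.api as sm

endog = np.column_stack((df['survived'], df['killed']))  # (successes, failures)
model_direct = sm.GLM(endog, sm.add_constant(df['x']),
                      family=sm.families.Binomial()).fit()
print(model_direct.params)  # same estimates as the formula fit above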
Example #3
    def __init__(self,
                 endog,
                 exog,
                 exog_vc,
                 ident,
                 vcp_p=1,
                 fe_p=2,
                 fep_names=None,
                 vcp_names=None,
                 vc_names=None):

        super(BinomialBayesMixedGLM, self).__init__(endog,
                                                    exog,
                                                    exog_vc=exog_vc,
                                                    ident=ident,
                                                    vcp_p=vcp_p,
                                                    fe_p=fe_p,
                                                    family=families.Binomial(),
                                                    fep_names=fep_names,
                                                    vcp_names=vcp_names,
                                                    vc_names=vc_names)

        if not np.all(np.unique(endog) == np.r_[0, 1]):
            msg = "endog values must be 0 and 1, and not all identical"
            raise ValueError(msg)
Example #4
    def from_formula(cls,
                     formula,
                     vc_formulas,
                     data,
                     vcp_p=1,
                     fe_p=2,
                     vc_names=None):

        fam = families.Binomial()
        x = _BayesMixedGLM.from_formula(formula,
                                        vc_formulas,
                                        data,
                                        family=fam,
                                        vcp_p=vcp_p,
                                        fe_p=fe_p,
                                        vc_names=vc_names)

        return BinomialBayesMixedGLM(endog=x.endog,
                                     exog_fe=x.exog_fe,
                                     exog_vc=x.exog_vc,
                                     ident=x.ident,
                                     vcp_p=x.vcp_p,
                                     fe_p=x.fe_p,
                                     fep_names=x.fep_names,
                                     vcp_names=x.vcp_names,
                                     vc_names=x.vc_names)
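A hypothetical usage sketch for this class method (the DataFrame df and its columns y, x, grp are made up; vc_formulas maps each variance-component name to a formula for its design matrix):

from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM

# random intercept for each level of 'grp'
vc_formulas = {'grp': '0 + C(grp)'}
model = BinomialBayesMixedGLM.from_formula('y ~ x', vc_formulas, df)
result = model.fit_vb()  # variational Bayes fit
print(result.summary())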
Example #5
    def setup_class(cls):
        df = data_bin
        res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

        cls.infl1 = res.get_influence()
        cls.infl0 = MLEInfluence(res)
Example #6
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        #res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        cls.res2 = reslogit.results_constraint2_robust

        mod1 = GLM(spector_data.endog,
                   spector_data.exog,
                   family=families.Binomial())

        # not used to match Stata for HC
        # nobs, k_params = mod1.exog.shape
        # k_params -= 1   # one constraint
        cov_type = 'HC0'
        cov_kwds = {'scaling_factor': 32 / 31}
        # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)
        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(constr,
                                         cov_type=cov_type,
                                         cov_kwds=cov_kwds,
                                         atol=1e-10)

        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1,
                                   R,
                                   q,
                                   fit_kwds={
                                       'atol': 1e-10,
                                       'cov_type': cov_type,
                                       'cov_kwds': cov_kwds
                                   })
        cls.constraints_rq = (R, q)
Example #7
def fit_logistic(X_hold,Y_hold,Firth=False,resBase=None,LRtest=True):
    """
    Fits a logistic regression model using standard (when Firth = False) or Firth's method (when Firth = True).
    resBase is the result of a previous call to a regression that is used to store data for Firth's method.
    LRtest indicates if the likelihood ratio test should be reported.
    """
    if not Firth:
        res = GLM(Y_hold, X_hold, family=families.Binomial()).fit()#XXX Confirm this with logistic using older XXXX
        # AICc adjustment
        res.aicc = statsmodels.tools.eval_measures.aicc(res.llf, nobs=res.nobs, df_modelwc=res.df_model+1)
        # Correct BIC
        res.bic = statsmodels.tools.eval_measures.bic(res.llf, nobs=res.nobs, df_modelwc=res.df_model+1)
    else:
        if resBase is None:
            sys.stderr.write('resBase must be provided to do Firth regression\n')
            sys.exit(1)
        elif type(resBase) is not statsmodels.genmod.generalized_linear_model.GLMResultsWrapper:
            sys.stderr.write('resBase must be type statsmodels.genmod.generalized_linear_model.GLMResultsWrapper\n')
            sys.exit(2)
        else:
            res = resBase
        #Do Firth's logistic regression
        (rint, rbeta, rbse, rfitll, pi) = fit_firth(Y_hold, X_hold, start_vec = None)
        
        if LRtest:    
            # LRT
            null_X = X_hold[:, :1]  # intercept-only design (keep column 0)
            (null_intercept, null_beta, null_bse, null_fitll, null_pi) = fit_firth(Y_hold, null_X, start_vec = None)
            lrstat = -2.*(null_fitll - rfitll)
            lrt_pvalue = 1.
            if lrstat > 0.:  # a non-positive LR stat indicates non-convergence
                lrt_pvalue = stats.chi2.sf(lrstat, 1)
            res.llnull = null_fitll
            res.lrstat = lrstat
            res.lrt_pval = lrt_pvalue
        
        # AICc adjustment for Firth model
        aicc = statsmodels.tools.eval_measures.aicc(rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])
        # AIC
        aic = statsmodels.tools.eval_measures.aic(rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])
        # BIC
        bic = statsmodels.tools.eval_measures.bic(rfitll, nobs=len(Y_hold), df_modelwc=np.shape(X_hold)[1])
        #Store parameters, standard errors, likelihoods, and statistics
        rint = np.array([rint])
        rbeta = np.array(rbeta)
        res.params = np.concatenate([rint,rbeta])
        res.bse = rbse
        res.llf = rfitll
        res.aicc = aicc
        res.aic = aic
        res.bic = bic
        
        #Get Wald p vals for parameters
        res.pvalues = 1. - chi2.cdf(x=(res.params/res.bse)**2, df=1)
        
        #Add predicted y
        res.predict = pi
        
    return res
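A hypothetical usage sketch (synthetic data; for the Firth branch, fit_firth must be defined and resBase must be a previously fitted GLMResultsWrapper, e.g. the result of a standard call):

import numpy as np

rng = np.random.default_rng(0)
X = np.column_stack((np.ones(200), rng.normal(size=200)))  # intercept + one covariate
Y = (rng.uniform(size=200) < 0.4).astype(float)

res = fit_logistic(X, Y)                                 # standard ML fit
print(res.params, res.aicc, res.bic)
res_firth = fit_logistic(X, Y, Firth=True, resBase=res)  # Firth-penalized fit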
Example #8
def iterate_logistic(X_hold,Y_hold, fixed_columns = [0], Firth=False):
    """
    Fits logistic regression to the provided data while using the fixed_columns in the regression.
    Firth specifies if Firth regression should be used.
    
    Returns matrices of fitted betas, pvalues, aic, aicc (second order aic), and bic
    """
    l = np.size(fixed_columns)+1
    k = np.shape(X_hold)[1]

    betas = np.zeros([k,l])
    pvalues = np.zeros([k,l])
    aic = np.zeros([k,1])
    aicc = np.zeros([k,1])
    bic = np.zeros([k,1])
    
    # Fit constant
    if Firth:
        null_X = X_hold[:, :1]  # intercept-only design (keep column 0)
        (null_intercept, null_beta, null_bse, null_fitll, null_pi) = fit_firth(Y_hold, null_X, start_vec = None)
        
        #Using this as a way to return a model in the same class as GLM.
        res = GLM(Y_hold, null_X, family=families.Binomial()).fit()
        # AICc adjustment for Firth model
        res.aicc = statsmodels.tools.eval_measures.aicc(null_fitll, nobs=res.nobs, df_modelwc=res.df_model+1)
        # AIC
        res.aic = statsmodels.tools.eval_measures.aic(null_fitll, nobs=res.nobs, df_modelwc=res.df_model+1)
        # BIC
        res.bic = statsmodels.tools.eval_measures.bic(null_fitll, nobs=res.nobs, df_modelwc=res.df_model+1)
        #Store parameters, standard errors, likelihoods, and statistics
        res.params = np.array([null_intercept])
        #Get Wald p vals for parameters
        res.pvalues = 1. - chi2.cdf(x=(res.params/null_bse)**2, df=1)
    else:
        res = fit_logistic(X_hold[:,0],Y_hold)
    
    betas[0,:] = res.params
    pvalues[0,:] = res.pvalues
    aic[0] = res.aic
    aicc[0] = res.aicc
    bic[0] = res.bic
    
    #Set variable for use later
    resBase = copy.deepcopy(res)
    
    NAN = ~np.isnan(X_hold).any(axis=0)
    for i in range(1,k):
        if NAN[i]:
            if i not in fixed_columns:
                columns = fixed_columns.copy()
                columns.append(i)
                res = fit_logistic(X_hold[:,columns],Y_hold, Firth=Firth, resBase=resBase,LRtest=False)
                betas[i,:] = res.params
                pvalues[i,:] = res.pvalues
                aic[i] = res.aic
                aicc[i] = res.aicc
                bic[i] = res.bic
    return betas, pvalues, aic, aicc, bic
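A hypothetical screening run (synthetic data): column 0 holds the intercept, and each remaining column is tested one at a time alongside the fixed columns:

import numpy as np

rng = np.random.default_rng(1)
X = np.column_stack((np.ones(200), rng.normal(size=(200, 3))))  # intercept + 3 candidates
Y = (rng.uniform(size=200) < 0.4).astype(float)

betas, pvalues, aic, aicc, bic = iterate_logistic(X, Y, fixed_columns=[0])
print(betas.shape)  # (k, len(fixed_columns) + 1) = (4, 2)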
Example #9
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog, family=families.Binomial())
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Logit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Example #10
    def __init__(self, endog, exog_fe, exog_vc, ident, vcp_p=1,
                 fe_p=2, fep_names=None, vcp_names=None,
                 vc_names=None):

        super(BinomialBayesMixedGLM, self).__init__(
            endog=endog, exog_fe=exog_fe, exog_vc=exog_vc,
            ident=ident, vcp_p=vcp_p, fe_p=fe_p,
            family=families.Binomial(),
            fep_names=fep_names, vcp_names=vcp_names,
            vc_names=vc_names)
Example #11
    def setup_class(cls):
        df = data_bin
        mod = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
                  family=families.Binomial())
        res = mod.fit(method="newton", tol=1e-10)
        from statsmodels.discrete.discrete_model import Logit
        mod2 = Logit(df['constrict'], df[['const', 'log_rate', 'log_volumne']])
        res2 = mod2.fit(method="newton", tol=1e-10)

        cls.infl1 = res.get_influence()
        cls.infl0 = res2.get_influence()
Example #12
    def setup_class(cls):
        endog_bin = (endog > endog.mean()).astype(int)
        cls.cov_type = 'cluster'

        mod1 = GLM(endog_bin, exog, family=families.Binomial(link=links.probit()))
        cls.res1 = mod1.fit(method='newton',
                            cov_type='cluster', cov_kwds=dict(groups=group))

        mod1 = smd.Probit(endog_bin, exog)
        cls.res2 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))
        cls.rtol = 1e-6
Example #13
def senility_and_WAIS():
    '''Another example of logistic regression.
    chapter 7.8, p 143
    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language; see the sketch below.'''

    inFile = r'GLM_data/Table 7.8 Senility and WAIS.xls'
    df = get_data(inFile)
    
    # ungrouped
    model = smf.glm('s ~ x', data=df, family=sm_families.Binomial()).fit()
    print(model.summary())
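For the grouped ("Binomial model") version that the docstring asks about, the same two-column response pattern used in the other Dobson examples should apply; a sketch under that assumption, for a grouped table df_grouped with columns s (number senile), n (group size), and x:

model_grouped = smf.glm('s + I(n - s) ~ x', data=df_grouped,
                        family=sm_families.Binomial()).fit()
print(model_grouped.summary())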
Example #14
    def setup_class(cls):
        yi = np.array([0, 2, 14, 19, 30])
        ni = 40 * np.ones(len(yi))
        xi = np.arange(1, len(yi) + 1)
        exog = np.column_stack((np.ones(len(yi)), xi))
        endog = np.column_stack((yi, ni - yi))

        res = GLM(endog, exog, family=families.Binomial()).fit()

        cls.infl1 = res.get_influence()
        cls.infl0 = MLEInfluence(res)
        cls.cd_rtol = 5e-5
Example #15
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        #res1ul = Logit(data.endog, data.exog).fit(method="newton", disp=0)
        cls.res2 = reslogit.results_constraint2

        mod1 = GLM(spector_data.endog, spector_data.exog,
                   family=families.Binomial())

        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
        cls.constraints_rq = (R, q)
Example #16
    def setup_class(cls):
        cls.idx = slice(None)  # params sequence same as Stata
        cls.res2 = reslogit.results_constraint2

        mod1 = GLM(spector_data.endog, spector_data.exog,
                   family=families.Binomial())

        constr = 'x1 - x3 = 0'
        cls.res1m = mod1.fit_constrained(constr, atol=1e-10)

        # patsy compatible constraints
        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
        cls.constraints_rq = (R, q)
Example #17
    def setup_class(cls):
        cls.idx = slice(None)
        # params sequence same as Stata, but Stata reports param = nan
        # and we have param = value = 0

        cls.res2 = reslogit.results_constraint1

        mod1 = GLM(spector_data.endog, spector_data.exog,
                   family=families.Binomial())

        constr = 'x1 = 2.8'
        cls.res1m = mod1.fit_constrained(constr)

        R, q = cls.res1m.constraints
        cls.res1 = fit_constrained(mod1, R, q)
Example #18
def reply_analysis_report(data_input_path, data_output_path):
    reply_analysis = prepare_data_reply_analysis(data_input_path,
                                                 data_output_path)

    score = reply_analysis.assign(
        negative_score_retweet=lambda df: df.apply(
            lambda x: x["tweet_score"]
            if x["tweet_label"] == "NEGATIVE" else 1 - x["tweet_score"],
            axis=1),
        negative_score_trump=lambda df: df.apply(
            lambda x: x["trump_score"]
            if x["trump_label"] == "NEGATIVE" else 1 - x["trump_score"],
            axis=1))
    logits = scipy.special.logit(
        score[["negative_score_retweet", "negative_score_trump"]])
    ax = logits.plot(kind="scatter",
                     x="negative_score_trump",
                     y="negative_score_retweet",
                     alpha=0.1)
    # DataFrame has no .save(); save the figure that holds the scatter plot
    ax.get_figure().savefig("plots/logit_sentiment_score.png")
    print(
        "Naive Sentiment Score Calculation",
        scipy.stats.pearsonr(logits["negative_score_trump"],
                             logits["negative_score_retweet"]))

    tmp_data = reply_analysis[~reply_analysis["trump_label"].isnull()]
    x = sm.add_constant(tmp_data[[
        # "created_at_trump_day", "created_at_trump_month", "created_at_trump_year",
        "followers_count_norm",
        "friends_count_norm",
        "listed_count_norm",
        "statuses_count_norm"
    ]])
    res = GLM(tmp_data['trump_label'].astype("category").cat.codes,
              x,
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)
    print(res.summary())

    print(
        "ATE",
        backdoor_binary_respose_ate(reply_analysis, "trump_label", "NEGATIVE",
                                    "tweet_label", "NEGATIVE",
                                    "trump_created_at", "tweet_created_at",
                                    datetime.timedelta(minutes=0), "1D"))
Example #19
def probit_fit(x, resp):
    '''
    Probit fit with 95% CIs
    '''

    # binomial GLM with probit link; the link is an argument of the
    # family, not of GLM itself
    model = GLM(resp,
                add_constant(x),
                family=families.Binomial(link=families.links.probit()))
    mod_result = model.fit(disp=0)
    xt = np.linspace(np.min(x), np.max(x), 100)

    r_hat = mod_result.predict(add_constant(xt))
    pred_summ = mod_result.get_prediction(
        add_constant(xt)).summary_frame(alpha=0.05)
    ci_5, ci_95 = pred_summ['mean_ci_lower'], pred_summ['mean_ci_upper']

    return mod_result.params, r_hat, (xt, ci_5, ci_95)
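A hypothetical usage sketch with synthetic dose-response data (names are made up):

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(2)
dose = np.repeat(np.linspace(0.5, 3.0, 6), 30)
resp = (rng.uniform(size=dose.size) < norm.cdf(-2 + 1.5 * dose)).astype(float)

params, r_hat, (xt, ci_5, ci_95) = probit_fit(dose, resp)
print(params)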
Example #20
def test_influence_glm_bernoulli():
    # example uses Finney's data, also analyzed in Pregibon (1981)

    df = data_bin
    results_sas = np.asarray(results_sas_df)

    res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']],
              family=families.Binomial()).fit(attach_wls=True, atol=1e-10)

    infl = res.get_influence(observed=False)

    k_vars = 3
    assert_allclose(infl.dfbetas, results_sas[:, 5:8], atol=1e-4)
    assert_allclose(infl.d_params, results_sas[:, 5:8] * res.bse.values, atol=1e-4)
    assert_allclose(infl.cooks_distance[0] * k_vars, results_sas[:, 8], atol=6e-5)
    assert_allclose(infl.hat_matrix_diag, results_sas[:, 4], atol=6e-5)

    c_bar = infl.cooks_distance[0] * 3 * (1 - infl.hat_matrix_diag)
    assert_allclose(c_bar, results_sas[:, 9], atol=6e-5)
Example #21
    def from_formula(cls, formula, vc_formulas, data, vcp_p=1, fe_p=2):

        fam = families.Binomial()
        x = _BayesMixedGLM.from_formula(
            formula, vc_formulas, data, family=fam, vcp_p=vcp_p, fe_p=fe_p)

        # Copy over to the intended class structure
        mod = BinomialBayesMixedGLM(
            x.endog,
            x.exog,
            exog_vc=x.exog_vc,
            ident=x.ident,
            vcp_p=x.vcp_p,
            fe_p=x.fe_p,
            fep_names=x.fep_names,
            vcp_names=x.vcp_names,
            vc_names=x.vc_names)
        mod.data = x.data

        return mod
Example #22
    def setup_class(cls):
        vs = Independence()
        family = families.Binomial()
        np.random.seed(987126)
        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = GEE.from_formula("Y ~ X1 + X2 + X3",
                                groups,
                                D,
                                family=family,
                                cov_struct=vs)
        cls.result1 = mod1.fit()

        mod2 = GLM.from_formula("Y ~ X1 + X2 + X3", data=D, family=family)
        cls.result2 = mod2.fit(disp=False)
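The class presumably goes on to check that the two fits agree; with an independence working correlation, the GEE estimating equations coincide with the GLM score equations, so the point estimates match. A hypothetical test method (name made up):

    def test_params_match(self):
        from numpy.testing import assert_allclose
        assert_allclose(self.result1.params, self.result2.params, atol=1e-6)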
Example #23
def test_score_binomial(miss_frac):

    np.random.seed(23424)
    n, p = 100, 5

    for d in range(1, 5):

        # Generate the data
        icept = np.linspace(3, 5, p)
        fac = np.random.normal(size=(p, d))
        fac, _, _ = np.linalg.svd(fac, 0)
        sc = np.random.normal(size=(n, d))
        lp = np.dot(sc, fac.T) + icept
        mu = 1 / (1 + np.exp(-lp))
        endog = (np.random.uniform(size=(n, p)) < mu).astype(np.float64)
        valid = (np.random.uniform(size=(n, p)) > miss_frac).astype(bool)  # np.bool alias is deprecated/removed

        pca = GPCA(endog, d, family=families.Binomial(), valid=valid)

        par = np.concatenate((icept, fac.ravel()))
        grad = pca.score(par)
        ngrad = nd.Gradient(pca.loglike)(par)
        assert_allclose(grad, ngrad, rtol=1e-4, atol=1e-4)
Example #24
    def __init__(self):
        self.setup_class()  # why does nose do it properly
        from statsmodels.genmod.generalized_linear_model import GLM
        from statsmodels.genmod import families
        self.mod = lambda y, x: GLM(y, x, family=families.Binomial())
        self.y = self.y_bin
Example #25
def exercise7d1():
    '''Logistic regression example: Hiroshima deaths data (Table 7.11).
    [tbd]: the cloglog values are inconsistent with those mentioned in the book.
    This is probably due to the specific definitions of "loglog" and "cloglog"
    in the respective languages.
    '''
    
    inFile = r'GLM_data/Table 7.11 Hiroshima deaths.xls'
    df = get_data(inFile)

    df['radBin'] = np.array([0,1,10,50,100,200])

    # adjust the unusual column names in the Excel file
    colNames = df.columns.values
    colNames[2] = 'other'
    colNames[3] = 'total'
    df.columns = colNames
    df['n'] = df['total']
    df['y'] = df['leukemia']

    model = smf.glm('other + leukemia ~ radBin', data=df, family=sm_families.Binomial()).fit()
    print(model.summary())

    print('-'*65)
    print('Equivalent solution:')
    
    model = smf.glm('I(n - y) + y ~ radBin', data=df, family=sm_families.Binomial()).fit()
    print(model.summary())
    
    # The first response column ('other') counts successes, so the fitted
    # numbers of leukemia deaths can be obtained by
    fits = df['n']*(1-model.fittedvalues)
    print('Fits Logit:')
    print(fits)
    
    # The fits for other link functions are:
    model_probit = smf.glm('I(n - y) + y ~ radBin', data=df,
                           family=sm_families.Binomial(sm_families.links.probit())).fit()
    print(model_probit.summary())
    
    fits_probit = df['n']*(1-model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)
    
    model_cll = smf.glm('I(n - y) + y ~ radBin', data=df,
                        family=sm_families.Binomial(sm_families.links.cloglog())).fit()
    print(model_cll.summary())
    fits_cll = df['n']*(1-model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)

    x = np.arange(201)
    b0, b1 = model.params
    se0, se1 = model.bse
    y = np.exp(b0 + b1*x) / (1 + np.exp(b0 + b1*x))
    yUp = np.exp(b0 + 2*se0 + (b1 + 2*se1)*x) / (1 + np.exp(b0 + 2*se0 + (b1 + 2*se1)*x))
    yLo = np.exp(b0 - 2*se0 + (b1 - 2*se1)*x) / (1 + np.exp(b0 - 2*se0 + (b1 - 2*se1)*x))

    plt.plot(df['radBin'],df['leukemia']/df['total'],'.')
    plt.plot(x,1-y,linewidth=0.5,color='black',label='Logit Fit')
    plt.plot(x,1-yUp,'--',linewidth=0.5,color='red',label='2 Sigma Bounds')
    plt.plot(x,1-yLo,'--',linewidth=0.5,color='red')
    plt.title('Logit Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()

    plt.figure()
    b0, b1 = model_probit.params
    se0, se1 = model_probit.bse
    y = norm.cdf(b0 + b1*x)
    yUp = norm.cdf(b0 + 2*se0 + (b1 + 2*se1)*x)
    yLo = norm.cdf(b0 - 2*se0 + (b1 - 2*se1)*x)
    plt.plot(df['radBin'],df['leukemia']/df['total'],'.')
    plt.plot(x,1-y,linewidth=0.5,color='black',label='Probit Fit')
    plt.plot(x,1-yUp,'--',linewidth=0.5,color='red',label='2 Sigma Bounds')
    plt.plot(x,1-yLo,'--',linewidth=0.5,color='red')
    plt.title('Probit Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()

    plt.figure()
    b0, b1 = model_cll.params
    se0, se1 = model_cll.bse
    y = 1 - np.exp(-np.exp(b0 + b1*x))
    yUp = 1 - np.exp(-np.exp(b0 + 2*se0 + (b1 + 2*se1)*x))
    yLo = 1 - np.exp(-np.exp(b0 - 2*se0 + (b1 - 2*se1)*x))
    plt.plot(df['radBin'],df['leukemia']/df['total'],'.')
    plt.plot(x,1-y,linewidth=0.5,color='black',label='CLL Fit')
    plt.plot(x,1-yUp,'--',linewidth=0.5,color='red',label='2 Sigma Bounds')
    plt.plot(x,1-yLo,'--',linewidth=0.5,color='red')
    plt.title('CLL Fit')
    plt.xlabel('Radiation Dose')
    plt.ylabel('Probability of death from Leukemia')
    plt.legend()
    plt.show()  # display the three figures (replaces a stray debugger breakpoint)
Example #26
    def __init__(self, fam, nb_theta=None, mult_n=None):
        if fam == "poi":
            self.family = smf.Poisson()
        elif fam == "nb":
            if nb_theta is None:
                raise GlmpcaError(
                    "Negative binomial dispersion parameter 'nb_theta' must be specified"
                )
            self.family = smf.NegativeBinomial(alpha=1 / nb_theta)
        elif fam in ("mult", "bern"):
            self.family = smf.Binomial()
            if fam == "mult" and mult_n is None:
                raise GlmpcaError(
                    "Multinomial sample size parameter vector 'mult_n' must be specified"
                )
        else:
            raise GlmpcaError("unrecognized family type")
        #variance function, determined by GLM family
        vfunc = self.family.variance
        #inverse link func, mu as a function of linear predictor R
        ilfunc = self.family.link.inverse
        #derivative of inverse link function, dmu/dR
        hfunc = self.family.link.inverse_deriv
        self.glmpca_fam = fam
        if fam == "poi":

            def infograd(Y, R):
                M = ilfunc(R)  #ilfunc=exp
                return {"grad": (Y - M), "info": M}
        elif fam == "nb":

            def infograd(Y, R):
                M = ilfunc(R)  #ilfunc=exp
                W = 1 / vfunc(M)
                return {"grad": (Y - M) * W * M, "info": W * (M**2)}

            self.nb_theta = nb_theta
        elif fam == "mult":

            def infograd(Y, R):
                P = ilfunc(R)  #ilfunc=expit, P very small probabilities
                return {"grad": Y - (mult_n * P), "info": mult_n * vfunc(P)}

            self.mult_n = mult_n
        elif fam == "bern":

            def infograd(Y, R):
                P = ilfunc(R)
                return {"grad": Y - P, "info": vfunc(P)}
        else:  #this is not actually used but keeping for future reference
            #this is most generic formula for GLM but computationally slow
            raise GlmpcaError("invalid fam")

            def infograd(Y, R):
                M = ilfunc(R)
                W = 1 / vfunc(M)
                H = hfunc(R)
                return {"grad": (Y - M) * W * H, "info": W * (H**2)}

        self.infograd = infograd
        #create deviance function
        if fam == "mult":

            def dev_func(Y, R):
                return mat_binom_dev(Y, ilfunc(R), mult_n)
        else:

            def dev_func(Y, R):
                return self.family.deviance(Y, ilfunc(R))

        self.dev_func = dev_func
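A small standalone check of the Bernoulli branch's gradient/information algebra, mirroring infograd above with the statsmodels family used directly (data values are made up):

import numpy as np
from statsmodels.genmod import families

fam = families.Binomial()
R = np.array([-1.0, 0.0, 1.0])   # linear predictor
Y = np.array([0.0, 1.0, 1.0])    # Bernoulli outcomes
P = fam.link.inverse(R)          # ilfunc = expit
print(Y - P)                     # "grad" term
print(fam.variance(P))           # "info" term, P * (1 - P)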
Example #27
    def mod(y, x):
        return GLM(y, x, family=families.Binomial())
Example #28
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families

import statsmodels.stats.tests.test_influence
test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = 'binary_constrict.csv'
file_path = os.path.join(cur_dir, 'results', file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df['constrict'],
    df[['const', 'log_rate', 'log_volumne']],
    family=families.Binomial()).fit(
        attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method, similar to OLSResults, that
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results of
# deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still clearly show the
# effect of influential observations.
#
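A short continuation sketch using attributes GLMInfluence exposes (as in the statsmodels influence example), assuming res from above:

infl = res.get_influence(observed=False)

summ_df = infl.summary_frame()  # per-observation influence measures
print(summ_df.sort_values('cooks_d', ascending=False)[:10])

fig = infl.plot_influence()     # studentized residuals vs hat values
fig.tight_layout(pad=1.0)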
Example #29
plt.rc("figure", figsize=(16, 8))
plt.rc("font", size=14)

import statsmodels.stats.tests.test_influence

test_module = statsmodels.stats.tests.test_influence.__file__
cur_dir = os.path.abspath(os.path.dirname(test_module))

file_name = "binary_constrict.csv"
file_path = os.path.join(cur_dir, "results", file_name)
df = pd.read_csv(file_path, index_col=0)

res = GLM(
    df["constrict"],
    df[["const", "log_rate", "log_volumne"]],
    family=families.Binomial(),
).fit(attach_wls=True, atol=1e-10)
print(res.summary())

# ## get the influence measures
#
# GLMResults has a `get_influence` method, similar to OLSResults, that
# returns an instance of the GLMInfluence class. This class has methods and
# (cached) attributes to inspect influence and outlier measures.
#
# These measures are based on a one-step approximation to the results of
# deleting one observation. One-step approximations are usually accurate
# for small changes but underestimate the magnitude of large changes. Even
# though large changes are underestimated, they still clearly show the
# effect of influential observations.
#
Example #30

if example == 1:
    print "normal"
    m = AdditiveModel(d)
    m.fit(y)
    x = np.linspace(-2,2,50)

    print(m)

import time
import scipy.stats

if example == 2:
    print "binomial"
    mod_name = 'Binomial'
    f = families.Binomial()
    #b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)])
    b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(z)])
    b.shape = y.shape
    m = GAM(b, d, family=f)
    t_start = time.time()
    m.fit(b)
    print(time.time() - t_start)  # elapsed fitting time
    #for plotting
    yp = f.link.inverse(y)
    p = b


if example == 3:
    print "Poisson"