Example #1
    def _initialize(cls):
        y, x = cls.y, cls.x
        offset = -0.25 * np.ones(len(y))  # also check offset
        cov_type = 'HC0'
        modp = GLM(y,
                   x[:, :cls.k_nonzero],
                   family=family.Binomial(),
                   offset=offset)
        cls.res2 = modp.fit(cov_type=cov_type,
                            method='newton',
                            maxiter=1000,
                            disp=0)

        mod = GLMPenalized(y,
                           x,
                           family=family.Binomial(),
                           offset=offset,
                           penal=cls.penalty)
        mod.pen_weight *= 1  # keep the default pen_weight in this case
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(cov_type=cov_type,
                           method='bfgs',
                           max_start_irls=0,
                           maxiter=100,
                           disp=0,
                           trim=0.001)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 1e-3
        cls.k_params = cls.k_nonzero
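Note: these snippets omit their imports and the GLMPenalized class. A minimal sketch of the setup they appear to assume is shown below; the import paths and the PenalizedMixin pattern are an assumption based on statsmodels conventions, not part of the original examples.

import numpy as np
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import family
from statsmodels.base._penalized import PenalizedMixin  # assumed import path


class GLMPenalized(PenalizedMixin, GLM):
    # GLM with a penalty mixed in; cls.penalty above would be a penalty object,
    # e.g. from statsmodels.base._penalties (assumption, not shown in the source).
    pass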
Example #2
    def _initialize(cls):
        y, x = cls.y, cls.x
        modp = GLM(y, x[:, :cls.k_nonzero], family=family.Binomial())
        cls.res2 = modp.fit(disp=0)

        mod = GLMPenalized(y, x, family=family.Binomial(), penal=cls.penalty)
        mod.pen_weight *= .5
        mod.penal.tau = 0.05
        cls.res1 = mod.fit(method='bfgs', maxiter=100, disp=0)

        cls.exog_index = slice(None, cls.k_nonzero, None)

        cls.atol = 5e-3
Example #3
    def setup_class(cls):
        from statsmodels.datasets.star98 import load
        #from statsmodels.genmod.tests.results.results_glm import Star98
        data = load()
        exog = add_constant(data.exog, prepend=True)
        offset = np.ones(len(data.endog))
        exog_keep = exog[:, :-5]
        cls.mod2 = GLM(data.endog, exog_keep, family=family.Binomial(),
                       offset=offset)

        cls.mod1 = GLM(data.endog, exog, family=family.Binomial(),
                       offset=offset)
        cls.init()
Example #4
    def _initialize(cls):
        y, x = cls.y, cls.x
        x = x[:, :4]
        offset = -0.25 * np.ones(len(y))  # also check offset
        modp = GLM(y, x, family=family.Binomial(), offset=offset)
        cls.res2 = modp.fit(method='bfgs', max_start_irls=100)

        mod = GLMPenalized(y, x, family=family.Binomial(), offset=offset,
                           penal=cls.penalty)
        mod.pen_weight = 0
        cls.res1 = mod.fit(method='bfgs', max_start_irls=3, maxiter=100, disp=0,
                           start_params=cls.res2.params*0.9)

        cls.atol = 1e-10
        cls.k_params = 4
Example #5
    def setup_class(cls):
        super(TestGAMBinomial, cls).setup_class()  # initialize DGP

        cls.family = family.Binomial()
        cls.rvs = stats.bernoulli.rvs

        cls.init()
Example #6
    def __init__(self):
        super().__init__()  # initialize DGP

        self.family = family.Binomial()
        self.rvs = stats.bernoulli.rvs

        self.init()
Example #7
def test_glmlogit_screening():

    y, x, idx_nonzero_true, beta = _get_logit_data()
    nobs = len(y)

    # screener options used in this test
    screener_kwds = dict(pen_weight=nobs * 0.75,
                         threshold_trim=1e-3,
                         ranking_attr='model.score_factor')

    xnames_true = ['var%4d' % ii for ii in idx_nonzero_true]
    xnames_true[0] = 'const'
    parameters = pd.DataFrame(beta[idx_nonzero_true],
                              index=xnames_true,
                              columns=['true'])

    xframe_true = pd.DataFrame(x[:, idx_nonzero_true], columns=xnames_true)
    res_oracle = GLMPenalized(y, xframe_true, family=family.Binomial()).fit()
    parameters['oracle'] = res_oracle.params

    #mod_initial = LogitPenalized(y, np.ones(nobs), pen_weight=nobs * 0.5)
    mod_initial = GLMPenalized(y, np.ones(nobs), family=family.Binomial())

    screener = VariableScreening(mod_initial, **screener_kwds)
    screener.k_max_add = 10
    exog_candidates = x[:, 1:]
    res_screen = screener.screen_exog(exog_candidates, maxiter=30)

    # smoke check: screening should attach the selected index and final results
    res_screen.idx_nonzero
    res_screen.results_final

    xnames = ['var%4d' % ii for ii in res_screen.idx_nonzero]
    xnames[0] = 'const'

    # smoke test
    res_screen.results_final.summary(xname=xnames)
    res_screen.results_pen.summary()
    assert_equal(res_screen.results_final.mle_retvals['converged'], True)

    ps = pd.Series(res_screen.results_final.params, index=xnames, name='final')
    # changed the following to allow for some extra params
    # parameters = parameters.join(ps, how='outer')
    parameters['final'] = ps

    assert_allclose(parameters['oracle'], parameters['final'], atol=0.005)
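The helper _get_logit_data is defined elsewhere in the test module and is not shown here. Purely to make the snippet runnable on synthetic data, a hypothetical stand-in with the same return signature could look like this (the real helper may differ):

import numpy as np

def _get_logit_data(nobs=500, k_vars=20, seed=987125):
    # Hypothetical stand-in: sparse-logit DGP returning (y, x, idx_nonzero_true, beta).
    rng = np.random.RandomState(seed)
    x = rng.standard_normal((nobs, k_vars))
    x[:, 0] = 1.0  # first column acts as the constant
    idx_nonzero_true = np.array([0, 1, 2, 3])
    beta = np.zeros(k_vars)
    beta[idx_nonzero_true] = [0.5, 1.0, -1.0, 0.75]
    prob = 1.0 / (1.0 + np.exp(-x.dot(beta)))
    y = rng.binomial(1, prob)
    return y, x, idx_nonzero_true, beta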
Example #8
    def __init__(self,
                 family_name='normal',
                 link_name='identity',
                 fam_params=None):
        """Constructor."""

        # Store link
        self.link_name = link_name
        if self.link_name.lower() == 'logit':
            self.link = L.logit
        elif self.link_name.lower() == 'log':
            self.link = L.log
        elif self.link_name.lower() == 'identity':
            self.link = L.identity
        elif self.link_name.lower() == 'sqrt':
            self.link = L.sqrt
        elif self.link_name.lower() == 'probit':
            self.link = L.probit
        elif self.link_name:
            raise ValueError('unsupported link_name: %r' % link_name)

        family_kwargs = {}
        if self.link_name:
            family_kwargs['link'] = self.link
        # Store family
        self.family_name = family_name
        if self.family_name.lower() == 'normal':
            self.family = F.Gaussian(**family_kwargs)

            def rand(x):
                return np.random.normal(x, fam_params)
        elif self.family_name.lower() == 'binomial':
            self.family = F.Binomial(**family_kwargs)

            def rand(x):
                return np.random.binomial(1, x)
        elif self.family_name.lower() == 'poisson':
            self.family = F.Poisson(**family_kwargs)

            def rand(x):
                return np.random.poisson(x)
        else:
            raise ValueError('unsupported family_name: %r' % family_name)

        self.rand = rand
        self.in_columns = None
        self.out_columns = None
    m.fit(y)
    x = np.linspace(-2, 2, 50)

    print(m)

    y_pred = m.results.predict(d)
    plt.figure()
    plt.plot(y, '.')
    plt.plot(z, 'b-', label='true')
    plt.plot(y_pred, 'r-', label='AdditiveModel')
    plt.legend()
    plt.title('gam.AdditiveModel')

if example == 2:
    print("binomial")
    f = family.Binomial()
    b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)])
    b.shape = y.shape
    m = GAM(b, d, family=f)
    tic = time.time()
    m.fit(b)
    toc = time.time()
    print(toc - tic)  # elapsed fit time in seconds

if example == 3:
    print("Poisson")
    f = family.Poisson()
    y = y / y.max() * 3
    yp = f.link.inverse(y)
    p = np.asarray([scipy.stats.poisson.rvs(val) for val in yp], float)
def test_umap_one():
    print('started')
    df = pd.read_csv(sys.argv[1], dtype={'location': str, 'Result': str})
    df = df.drop(df[df.BIRTH_DATETIME == '0'].index)
    phecodes = pd.read_csv(sys.argv[2], dtype=str)
    out = sys.argv[3]
    phe_list = [phe for phe in list(phecodes.PHECODE.unique()) if phe in df]
    phedf = df.loc[:, phe_list]
    phedf[phedf > 0] = 1
    df[phe_list] = phedf
    print('loaded')
    #Create embeddings
    pca = PCA(n_components=50, random_state=42)
    pc_emb = pca.fit_transform(phedf)
    ump = umap.UMAP(metric='euclidean', n_components=10, random_state=42)
    ump_emb = ump.fit_transform(pc_emb)
    print('embedded')
    #create df
    reduced_df = pd.DataFrame(ump_emb,
                              columns=['UMP-' + str(i + 1) for i in range(10)])
    reduced_df['CC_STATUS'] = df['CC_STATUS']
    #Create visualization
    sns.set()
    sns.pairplot(reduced_df, hue="CC_STATUS",
                 vars=['UMP-' + str(i + 1) for i in range(10)],
                 height=4, markers=['o', 's'], plot_kws=dict(alpha=0.1))
    plt.savefig(out)
    print('graphed')
    #test components
    reduced_df['newcc'] = 0
    reduced_df.loc[reduced_df['UMP-2'] < -12, 'newcc'] = 1
    df['newcc'] = reduced_df['newcc']
    print('opening file')
    out_file = open('files/umap_new_cases_chi_phecode_test_2.csv', 'w')
    out_file.write('phecode,chi2,p,dof,control_neg,case_neg,control_pos,case_pos\n')
    #Run univariate tests using this newcc col
    for phecode in phe_list:
        #Get count of people positive for this phecode in case
        case_pos = df.loc[(df.newcc == 1) & (df[phecode] == 1)].shape[0]
        #Get negative count in case
        case_neg = df.loc[(df.newcc == 1) & (df[phecode] == 0)].shape[0]
        #Get positive control
        control_pos = df.loc[(df.newcc == 0) & (df[phecode] == 1)].shape[0]
        #Get negative control
        control_neg = df.loc[(df.newcc == 0) & (df[phecode] == 0)].shape[0]
        #Run contingency test
        if case_pos > 0 and case_neg > 0 and control_pos > 0 and control_neg > 0:
            res = chi2_c([[control_neg, case_neg], [control_pos, case_pos]])
            #Write results
            out_file.write(','.join([phecode, str(res[0]), str(res[1]),
                                     str(res[2]), str(control_neg),
                                     str(case_neg), str(control_pos),
                                     str(case_pos)]))
            out_file.write('\n')
    out_file.close()
    print('ran phecode tests')
    #Get age
    df['AGE'] = pd.to_datetime(df['BIRTH_DATETIME'].str[:10], format='%Y-%m-%d')
    df['AGE'] = (datetime.datetime.now() - df['AGE']).astype('timedelta64[Y]')
    #Run same test procedure for covariates, but do regression (?)
    print('running regression')
    mod = smf.glm(formula='newcc ~ AGE + UNIQUE_PHECODES + RACE + GENDER'
                          ' + RECORD_LENGTH_DAYS',
                  data=df, family=fam.Binomial())
    res = mod.fit()
    print(res.summary())
def covariate_analysis():
    cc_df = pd.read_csv(sys.argv[1])
    cc_df = cc_df.drop(cc_df[cc_df.BIRTH_DATETIME == '0'].index)
    #Compare sex, age, ethnicity, record_length, and most recent event
    #Get age
    cc_df['age'] = (datetime.datetime.now()
                    - cc_df["BIRTH_DATETIME"].str[:10].apply(dconvert))
    cc_df['age'] = cc_df['age'].apply(ddays)
    #Between Case and Control status
    all_res = smf.glm(formula="CC_STATUS ~ weight_sum + RACE + GENDER + age"
                              " + RECORD_LEN + GENDER*age + age*RECORD_LEN",
                      data=cc_df, family=fam.Binomial()).fit()
    print("Results for Case/control data:")
    print(all_res.summary())
    # use .copy() so the new column does not trigger chained-assignment warnings
    norm_df = cc_df.loc[cc_df.CC_STATUS == 1].copy()
    print(cc_df.shape)
    print(norm_df.shape)
    norm_df['normality_status'] = norm_df["Result"].apply(binarize_normal)
    normality_res = smf.glm(formula="normality_status ~ weight_sum + RACE"
                                    " + GENDER + age + RECORD_LEN"
                                    " + GENDER*age + age*RECORD_LEN",
                            data=norm_df, family=fam.Binomial()).fit()
    print("Results for normal/abnormal data:")
    print(normality_res.summary())
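The helpers dconvert, ddays, and binarize_normal are defined elsewhere in this script and are not shown. As a purely illustrative guess at what they might do (an assumption, not the original definitions):

import datetime

def dconvert(date_str):
    # Hypothetical: parse a 'YYYY-MM-DD' birth-date string.
    return datetime.datetime.strptime(date_str, '%Y-%m-%d')

def ddays(delta):
    # Hypothetical: convert a timedelta to a number of days.
    return delta.days

def binarize_normal(result):
    # Hypothetical: map the free-text Result field to 1 for 'normal', else 0.
    return 1 if str(result).strip().lower() == 'normal' else 0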