def regression(self):

        print(self.people.head(n=1))
        # self.people holds the consolidated biography data; patsy formulas cannot
        # reference a column named "class" (a Python keyword), so rename it
        self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)


        self.logfile.write( "\n\n Sum Temp Interest NegBinom")
        m = glm("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        self.logfile.write( "\n\n Sum Temp Interest OLS")
        m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        self.logfile.write( "\n\n Pos Temp Interest NegBinom")
        m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        #lim_people = self.people[self.people.timePosInterest>0]
        self.logfile.write( "\n\n Pos Temp Interest OLS")
        m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())
    def regression(self):

        print(self.people.head(n=1))
        # self.people holds the consolidated biography data; patsy formulas cannot
        # reference a column named "class" (a Python keyword), so rename it
        self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

        self.logfile.write( "\n\n Num Regions NegativeBinomial")
        m = glm("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
                data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        #lim_people = self.people[self.people.numRegions>0]
        self.logfile.write( "\n\n Num Regions OLS")
        m = ols("numRegions ~ C(gender,Treatment(reference='male')) ", # + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
                data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())



        # we could use beta regression for normalized entropy
        #print "\n\n Region Entropy"
        #m = ols("entropy ~ C(gender,Treatment(reference='male')) ", #+ C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + birth_century
        #        data=self.people).fit()
        #print m.summary() # <-- this gives you the table of coefficients with p-values, confidence intervals, and so on
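        # A possible beta-regression sketch for the normalized entropy (an assumption,
        # not part of the original code): requires statsmodels >= 0.13, where BetaModel
        # lives in statsmodels.othermod.betareg, and entropy strictly inside (0, 1).
        #from statsmodels.othermod.betareg import BetaModel
        #m = BetaModel.from_formula("entropy ~ C(gender,Treatment(reference='male'))",
        #                           self.people).fit()
        #self.logfile.write(str(m.summary()))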



        self.logfile.write( "\n\n Sum Temp Interest")
        m = ols("timeInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write("\n AIC " + str(m.aic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())


        self.logfile.write( "\n\n Pos Temp Interest")
        m = glm("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people, family=families.NegativeBinomial()).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())

        #lim_people = self.people[self.people.timePosInterest>0]
        self.logfile.write( "\n\n Pos Temp Interest OLS")
        m = ols("timePosInterest ~ C(gender,Treatment(reference='male')) ", data=self.people).fit()
        self.logfile.write( "\n AIC "+str(m.aic))
        self.logfile.write( "\n BIC "+str(m.bic))
        for table in m.summary().tables:
            self.logfile.write(table.as_latex_tabular())
def generate_regression_models(df):
    # Use smf.glm from statsmodels.formula.api to fit the regression models;
    # with no family given, statsmodels defaults to Gaussian (i.e. ordinary linear regression)
    heart_deaths = smf.glm(formula="Heart_Disease_Deaths ~ Obesity + Binge_Drinking + Smoking + Primary_Care + No_Insurance + Median_Household_Income + College_Degrees + Long_Term_Care_Hospital_Admissions + Unemployed_Persons + Liquor_Stores", data=df).fit()
    cancer_deaths = smf.glm(formula="Cancer_Deaths ~ Obesity + Binge_Drinking + Smoking + Primary_Care + No_Insurance + Median_Household_Income + College_Degrees + Long_Term_Care_Hospital_Admissions + Unemployed_Persons + Liquor_Stores", data=df).fit()
    diabetes_deaths = smf.glm(formula="Diabetes_Deaths ~ Obesity + Smoking + Binge_Drinking + Primary_Care + No_Insurance + Median_Household_Income + College_Degrees + Long_Term_Care_Hospital_Admissions + Unemployed_Persons + Liquor_Stores", data=df).fit()
    resp_deaths = smf.glm(formula="Respiratory_Disease_Deaths ~ Obesity + Smoking + Binge_Drinking + Primary_Care + No_Insurance + Median_Household_Income + College_Degrees + Long_Term_Care_Hospital_Admissions + Unemployed_Persons + Liquor_Stores", data=df).fit()
   
    # Collect the fitted models in a list
    models = [heart_deaths, cancer_deaths, resp_deaths, diabetes_deaths]
    return models
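A minimal way to consume the returned list might look like this (the CSV file name is an assumption; its columns must match the formulas above):

import pandas as pd
import statsmodels.formula.api as smf

df = pd.read_csv("county_health_indicators.csv")  # hypothetical input file
for fitted in generate_regression_models(df):
    # each element is a fitted GLMResults object
    print(fitted.summary())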
Example 4
def estimate_latency(group):
    '''
    Use a linear regression to estimate time per unit of work from a pandas GroupBy group.
    '''
    model = smf.glm(
        formula='Time ~ Work',
        data=group,
    ).fit()

    # pack up information about parameter estimates
    # so they can be programmatically unpacked later
    decoder = {
        'Intercept': 'Overhead',
        'Work': 'Latency',
    }

    ci = model.conf_int()
    for parameter in model.params.keys():
        name = decoder[parameter]
        group['Estimated {0}'.format(name)] = model.params[parameter]
        group['Lower Bound {0}'.format(name)] = ci[0][parameter]
        group['Upper Bound {0}'.format(name)] = ci[1][parameter]

    return group
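estimate_latency is meant to be applied per group; a self-contained usage sketch under that assumption (toy data, with smf referring to statsmodels.formula.api as above):

import pandas as pd

# toy data: two workers with roughly linear Time ~ Work
timings = pd.DataFrame({
    'Worker': ['a'] * 4 + ['b'] * 4,
    'Work':   [1, 2, 3, 4] * 2,
    'Time':   [1.1, 2.0, 3.2, 3.9, 2.1, 4.2, 5.9, 8.1],
})
annotated = timings.groupby('Worker', group_keys=False).apply(estimate_latency)
print(annotated[['Estimated Overhead', 'Estimated Latency']].drop_duplicates())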
Example 5
def test_formula_missing_exposure():
    # see 2083
    import numpy as np
    import pandas as pd
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    from numpy.testing import assert_, assert_raises
    from statsmodels.genmod.generalized_linear_model import GLM

    d = {
        'Foo': [1, 2, 10, 149],
        'Bar': [1, 2, 3, np.nan],
        'constant': [1] * 4,
        'exposure': np.random.uniform(size=4),
        'x': [1, 3, 2, 1.5]
    }
    df = pd.DataFrame(d)

    family = sm.families.Gaussian(link=sm.families.links.log)

    mod = smf.glm("Foo ~ Bar", data=df, exposure=df.exposure, family=family)
    assert_(type(mod.exposure) is np.ndarray, msg='Exposure is not ndarray')

    exposure = pd.Series(np.random.uniform(size=5))
    assert_raises(ValueError,
                  smf.glm,
                  "Foo ~ Bar",
                  data=df,
                  exposure=exposure,
                  family=family)
    assert_raises(ValueError,
                  GLM,
                  df.Foo,
                  df[['constant', 'Bar']],
                  exposure=exposure,
                  family=family)
Example 6
def __stats_method(n1j, ni1, n11, family):
    '''
    If the expected counts are calculated via a statistical model,
    this function will do so. Expected counts are considered a
    function of n1j and ni1.

    Arguments:
        n1j (iterable): All adverse events for a single product
        ni1 (iterable): Total count of a particular AE across all products
        n11 (iterable): Total count of a particular AE for a particular product
        family (statsmodel family): The GLM family

    Returns:
        The expected counts for n11

    '''
    data = pd.DataFrame({'events': n11, 'prod_events': n1j, 'ae_events': ni1})
    model = smf.glm(formula='events ~ prod_events+ae_events',
                    data=data,
                    family=family)
    model = model.fit()

    if isinstance(family, sm.families.Poisson):
        dispersion = model.pearson_chi2 / model.df_resid
        if dispersion > 2:
            alpha, lb, ub = __test_dispersion(model, data)
            warnings.warn(
                """Variance does not equal the mean! Data likely overdispersed...\n
                          Consider utilizing the negative-binomial family instead of poisson.\n
                          Cameron-Trivedi alpha: {0:5.4f}, CI: ({1}, {2})""".
                format(alpha, lb, ub))

    return model.predict(data[['prod_events', 'ae_events']]).values
Example 7
def compute_pse(df, formula='resp ~ morph + location - 1'):
    """
    Compute Point of Subjective Equality based on responses in df. This is
    done by fitting a logit model on the response, and finding the point of
    inflection.

    Parameters
    ----------
    df : pd.DataFrame
        each row is a trial
    formula : str
        formula passed to the glm for fitting

    Returns
    -------
    pse : array (n_locations, )
        the pse estimates for each location
    """
    model = smf.glm(formula, df, family=sm.families.Binomial())
    try:
        res = model.fit()
        # now return the estimates
        p = res.params.values
        pse = -p[:-1] / p[-1]
    except PerfectSeparationError:
        print("WARNING: got PerfectSeparationError, filling pses with 0.5")
        pse = np.ones(len(df.location.unique())) * 0.5

    return pse
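A usage sketch with toy data (column names follow the default formula; the values are illustrative only, and very small datasets may trigger the PerfectSeparationError fallback):

import pandas as pd

trials = pd.DataFrame({
    'resp':     [0, 0, 1, 1, 1, 0, 0, 1],
    'morph':    [0.2, 0.4, 0.6, 0.8] * 2,
    'location': ['left'] * 4 + ['right'] * 4,
})
# one PSE per location, computed as -intercept / slope of the fitted logit model
pse = compute_pse(trials)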
Example 8
 def fit_with_logistic(self, threshold=0.5):
     formula = "%s~%s" % (self.y_col, "+".join(self.x_cols))
     model = smf.glm(formula, data=self.train_set, family=sm.families.Binomial())
     result = model.fit()
     predict_probs = result.predict(exog=self.test_set)
     real_values = self.test_set[self.y_col].map(lambda x: 1 if x == 'No' else 0)
     tp.output_table_with_prob(predict_probs, real_values, threshold=threshold, zero_one_col_texts=["Yes", "No"])
Example 9
    def outcome_model(self, model, print_model_results=True):
        """Used to specify the outcome model. Model used to predict the outcome via a logistic regression model

        model:
            -Independent variables to predict the outcome. Example) 'var1 + var2 + var3 + var4'
        print_model_results:
            -Whether to print the fitted model results. Default is True (prints results)
        """

        self._out_model = self._outcome + ' ~ ' + model
        f = sm.families.family.Binomial(sm.families.links.logit)
        log = smf.glm(self._out_model, self.df, family=f).fit()
        if print_model_results:
            print(
                '\n----------------------------------------------------------------'
            )
            print('MODEL: ' + self._out_model)
            print(
                '-----------------------------------------------------------------'
            )
            print(log.summary())

        dfx = self.df.copy()
        dfx[self._exposure] = 1
        self.df['pY1'] = log.predict(dfx)
        dfx = self.df.copy()
        dfx[self._exposure] = 0
        self.df['pY0'] = log.predict(dfx)
        self._fit_outcome_model = True
Example 10
    def test_all_methods(self):
        x_cols = ["Lag2"]
        formula = "Direction~Lag2"
        # print self.df.shape[0]
        train_data = self.df.loc[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
        # print train_data.shape[0]
        """ (d) logistic"""
        model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
        result = model.fit()
        test_data = self.df.loc[self.df["Year"] > 2008, :]
        probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
        pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
        tp.output_table(pred_values.values, test_data[self.y_col].values)

        train_X = train_data[x_cols].values
        train_y = train_data[self.y_col].values
        test_X = test_data[x_cols].values
        test_y = test_data[self.y_col].values
        """ (e) LDA """
        lda_res = LDA().fit(train_X, train_y)
        pred_y = lda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (f) QDA """
        qda_res = QDA().fit(train_X, train_y)
        pred_y = qda_res.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (g) KNN """
        clf = neighbors.KNeighborsClassifier(1, weights="uniform")
        clf.fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        tp.output_table(pred_y, test_y)
        """ (h) logistic and LDA """
        """ (i) Is the purpose of the last question going through all methods with no direction?"""
Example 11
    def fit_model(self):
        """
        Fits Poisson model

        Returns
        -------
        p_val
            p-values for differential abundance test of all cell types
        """

        p_val = []
        K = self.y.shape[1]

        if self.y.shape[0] == 2:
            p_val = [0 for _ in range(K)]
        else:
            for k in range(K):
                data_ct = pd.DataFrame({"x": self.x[:, 0],
                                        "y": self.y[:, k]})

                model_ct = glm('y ~ x', data=data_ct,
                               family=sm.genmod.families.Poisson(), offset=np.log(self.n_total)).fit()
                p_val.append(model_ct.pvalues[1])

        self.p_val = p_val
Example 12
 def logistic_regression(self, use_glm=True):
     """
     (b) It seems the only statistically significant predictor is Lag2. How disappointing...
     """
     formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
     model = (
         smf.glm(formula, data=self.df, family=sm.families.Binomial())
         if use_glm
         else smf.logit(formula, data=self.transformedDF)
     )
     result = model.fit()
     if use_glm:
         probs = result.fittedvalues
         """Beware the prob here is the index 0's prob, so we should use the lambda function below"""
         pred_values = probs.map(lambda x: 0 if x > 0.5 else 1)
     else:
         """The probability of being 1"""
         probs = Series(result.predict(sm.add_constant(self.df[["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]])))
         pred_values = probs.map(lambda x: 1 if x > 0.5 else 0)
     """
     (c) Percentage of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
         In weeks when the market goes up, the logistic regression is right most of the time, 557/(557+48) = 92.1%.
         In weeks when the market goes down, it is right only 54/(430+54) = 11.2% of the time.
     """
     tp.output_table(pred_values.values, self.transformedDF[self.y_col].values)
Example 13
def multiple_linear_regression():
    '''Multiple linear regression
    chapter 6.3, p. 98'''

    # get the data from the web
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    model_glm = glm('carbohydrate ~ age + weight + protein',
                    family=Gaussian(),
                    data=df).fit()
    print('Same model, calculated with GLM')
    ''' The confidence intervals are different than those from OLS.
    The reason (from Nathaniel Smith):
    OLS uses a method that gives exact results, but only works in the special
    case where all the usual OLS criteria apply - iid Gaussian noise etc. GLM
    instead uses an approximate method which is correct asymptotically but may
    be off for small samples; the tradeoff you get in return is that this method
    works the same way for all GLM models, including those with non-Gaussian
    error terms and non-trivial link functions. So that's why they're different.
    '''

    print(model_glm.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
Example 14
    def fit_model(self,df, filters, model_expression):
        """
        Use statsmodels GLM to construct a model relation.

        Parameters
        ----------
        df : pandas.DataFrame
            Data to use for fit. Should contain all the columns
            referenced in the `model_expression`.
        filters : list of str
            Any filters to apply before doing the model fit.
        model_expression : str
            A patsy model expression that can be used with statsmodels.
            Should contain both the left- and right-hand sides.

        Returns
        -------
        fit : statsmodels.genmod.generalized_linear_model.GLMResults
        """
        df = util.apply_filter_query(df, filters)

        model=smf.glm(formula=model_expression, data=df, family=sm.families.Poisson())


        if len(model.exog) != len(df):
            raise ModelEvaluationError(
                'Estimated data does not have the same length as input.  '
                'This suggests there are null values in one or more of '
                'the input columns.')

        with log_start_finish('statsmodels GLM fit', logger):
            return model.fit()
    def test3(self):

        results = pd.read_csv("../Data/results.csv")
        results = results[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']]
        results = results.rename(columns={
            'FTHG': 'HomeGoals',
            'FTAG': 'AwayGoals'
        })
        results_data = pd.concat([
            results[['HomeTeam', 'AwayTeam',
                     'HomeGoals']].assign(home=1).rename(columns={
                         'HomeTeam': 'team',
                         'AwayTeam': 'opponent',
                         'HomeGoals': 'goals'
                     }),
            results[['AwayTeam', 'HomeTeam',
                     'AwayGoals']].assign(home=0).rename(columns={
                         'AwayTeam': 'team',
                         'HomeTeam': 'opponent',
                         'AwayGoals': 'goals'
                     })
        ])

        poisson_model = smf.glm(formula="goals ~ home + team + opponent",
                                data=results_data,
                                family=sm.families.Poisson()).fit()

        lfc_lei = simulate_match(poisson_model, "Liverpool", "Leicester")

        homewin = np.sum(np.tril(lfc_lei, -1))
        homeloss = np.sum(np.triu(lfc_lei, 1))
        homedraw = np.sum(np.diag(lfc_lei))
        total = np.ceil(homeloss + homewin + homedraw)
        self.assertNotEqual(0.5, total)
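simulate_match is called above but not defined in this snippet. A minimal sketch of what such a helper presumably does (an assumption, not the original code): build the scoreline grid as the outer product of two independent Poisson pmfs evaluated at each side's predicted goal rate.

import numpy as np
import pandas as pd
from scipy.stats import poisson

def simulate_match(model, home_team, away_team, max_goals=10):
    home_rate = model.predict(pd.DataFrame({'team': home_team, 'opponent': away_team,
                                            'home': 1}, index=[0])).values[0]
    away_rate = model.predict(pd.DataFrame({'team': away_team, 'opponent': home_team,
                                            'home': 0}, index=[0])).values[0]
    # grid[i, j] = P(home scores i goals) * P(away scores j goals)
    grid = [[poisson.pmf(i, rate) for i in range(max_goals + 1)]
            for rate in (home_rate, away_rate)]
    return np.outer(np.array(grid[0]), np.array(grid[1]))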
Example 16
    def outcome_model(self, model, print_results=True):
        """Build the model for the outcome. This is also referred to at the Q-model. This must be specified
        before the fit function. If it is not, an error will be raised.

        model:
            -variables to include in the model for predicting the outcome. Must be contained within the input
             pandas dataframe when initialized. Model form should contain the exposure. Format is the same as
             the functional form, i.e. 'var1 + var2 + var3 + var4'
        print_results:
            -whether to print the logistic regression results to the terminal. Default is True
        """
        if self.outcome_type == 'binary':
            linkdist = sm.families.family.Binomial(sm.families.links.logit)
        else:
            linkdist = sm.families.family.Gaussian(sm.families.links.identity)

        # Modeling the outcome
        if self._weights is None:
            m = smf.glm(self.outcome + ' ~ ' + model, self.gf, family=linkdist)
            self.outcome_model = m.fit()
        else:
            m = smf.gee(self.outcome + ' ~ ' + model,
                        self.gf.index,
                        self.gf,
                        family=linkdist,
                        weights=self.gf[self._weights])
            self.outcome_model = m.fit()

        # Printing results of the model and if any observations were dropped
        if print_results is True:
            print(self.outcome_model.summary())
        self.model_fit = True
Example 17
    def fit_model(self):
        """
        Fits CLR model with linear model

        Returns
        -------
        p_val
            p-values for differential abundance test of all cell types
        """

        p_val = []
        K = self.y.shape[1]

        if self.y.shape[0] == 2:
            p_val = [0 for _ in range(K)]
        else:
            # computes clr-transformed data matrix as a pandas DataFrame
            geom_mean = np.prod(self.y, axis=1, keepdims=True) ** (1 / K)
            y_clr = np.log(self.y / geom_mean)

            for k in range(K):
                data_ct = pd.DataFrame({"x": self.x[:, 0],
                                        "y": y_clr[:, k]})

                model_ct = glm('y ~ x', data=data_ct).fit()
                p_val.append(model_ct.pvalues[1])

        self.p_val = p_val
Example 18
def mylogistic(_x, _y):
    x = _x.copy()
    y = _y.copy()
    r, c = x.shape

    beta = np.zeros((c, 1))
    epsilon = 1e-6

    while True:
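        # IRLS: build the working response z from the current linear predictor,
        # refit a weighted least-squares problem, and repeat until the
        # coefficients stop changing.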
        eta = np.dot(x, beta)
        pr = exp_it(eta)
        w = pr * (1 - pr)
        z = eta + (y - pr) / w
        sw = np.sqrt(w)
        mw = np.repeat(sw, c, axis=1)

        x_work = mw * x
        y_work = sw * z

        beta_new, _, _, _ = np.linalg.lstsq(x_work, y_work, rcond=None)
        err = np.sum(np.abs(beta_new - beta))
        beta = beta_new
        if err < epsilon:
            break

    # cross-check against statsmodels (df is assumed to be defined at module level)
    model = smf.glm('admit ~ gre + gpa + rank',
                    df,
                    family=sm.families.Binomial()).fit()
    print(model.summary())

    return model
Example 20
def get_best_model(train, test, city):

    if city == 'sj':
        # Step 1: specify the form of the model
        model_formula = "total_cases ~ 1 + " \
                        "reanalysis_specific_humidity_g_per_kg + " \
                        "reanalysis_dew_point_temp_k + " \
                        "station_min_temp_c + " \
                        "station_avg_temp_c + " \
                        "reanalysis_relative_humidity_percent "
        #"reanalysis_min_air_temp_k + " \

    elif city == 'iq':
        model_formula = "total_cases ~ 1 + " \
                           "reanalysis_specific_humidity_g_per_kg + " \
                           "reanalysis_dew_point_temp_k + " \
                           "station_min_temp_c + " \
                           "station_avg_temp_c + " \
            "reanalysis_min_air_temp_k "
    grid = 10**np.arange(-10, -3, dtype=np.float64)

    best_alpha = []
    best_score = 1000

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))
        results = model.fit()
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        if score < best_score:
            best_alpha = alpha
            best_score = score
    #print('best alpha = ', best_alpha)
    #print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    fitted_model = model.fit()
    return fitted_model
Example 21
def computeEstabilityTest(df, yv):
    g = smf.glm(formula=df.columns[yv] + "~1",
                data=df,
                family=sm.families.Poisson()).fit()

    process = numpy.asarray(g.resid_response)

    k = 1
    n = len(process)

    process = process / numpy.sqrt(n)

    meat = numpy.inner(process, process)

    J12 = numpy.sqrt(1 / meat)

    process = J12 * process

    #print(sum(abs(process)))

    from_ = numpy.ceil(n * 0.1)
    from_ = int(max(from_, 10))

    to = int(n - from_)

    lambda_ = ((n - from_) * to) / (from_ * (n - to))
    tt = (numpy.arange(from_, to + 1)) / n
    ttt = (tt * (1.0 - tt))

    pvals = numpy.zeros((df.shape[1]))
    pvals[yv] = numpy.NaN

    if from_ >= to:
        return pvals

    for zv in range(df.shape[1]):

        if zv == yv:
            continue

        zi = df[df.columns[zv]]

        oi = numpy.argsort(zi, kind="mergesort")

        proci = process[oi]

        proci = numpy.cumsum(proci)

        xx = proci**2
        xx = xx[from_ - 1:to]
        stati = numpy.max(xx / ttt)

        #print(stati, k, lambda_)
        pvals[zv] = supLM(stati, k, lambda_)
        #print(pvals[zv])

    #print(pvals)

    return numpy.exp(pvals)
 def RegressionModel(self):
     #poisson regression
     model = smf.glm(formula = "num_pickups ~ year + month + lat +long +dayofweek +day+quarter", data=self.dftaxi, family=sm.families.Poisson()).fit()
     print("Poisson Model Summary")
     print(model.summary())
     print("\n")
     #RMSE
     print("RMSE for Poisson Regression Model : ",sm.tools.eval_measures.rmse(self.dftaxi.num_pickups, model.fittedvalues, axis=0))
     print("-------------------------------------------------")
     #negative binomial regression
     model = smf.glm(formula = "num_pickups ~ year + month + lat +long +dayofweek +day+quarter", data=self.dftaxi, family=sm.families.NegativeBinomial()).fit()
     print("Negative Binomial  Model Summary")
     print(model.summary())
     print("\n")
     #RMSE
     print("RMSE for Negative Binomial Regression Model : ",sm.tools.eval_measures.rmse(self.dftaxi.num_pickups, model.fittedvalues, axis=0))
     print("-------------------------------------------------")
Example 23
def EstimacionMVPromGolesLV(df_cal, ids_torneo):
    df_reg = ReshapeDataFramePromGolesLV(df_cal)
    formula, constraints = FormulaPromGolesLV(df_reg.columns.tolist())
    model = glm(formula, data=df_reg,
                family=Poisson()).fit_constrained(constraints)
    dictparams = OutputPoissReg(model, ['pgfl', 'pgfv', 'pgal', 'pgav'],
                                ids_torneo)
    return dictparams
Example 24
def logistic_regression():
    '''Logistic regression example
    chapter 7.3, p 130
    [tbd]: the cloglog values are inconsistent with those mentioned in the book.
    This is probably due to the specific definitions of "loglog" and "cloglog"
    in the respective languages.
    '''
    
    inFile = r'GLM_data/Table 7.2 Beetle mortality.xls'
    df = get_data(inFile)
    
    # adjust the unusual column names in the Excel file
    colNames = [name.split(',')[1].lstrip() for name in df.columns.values]
    df.columns = colNames
    
    # fit the model
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']
    model = glm('survived + killed ~ x', data=df, family=Binomial()).fit()
    print(model.summary())

    print('-' * 65)
    print('Equivalent solution:')

    model = glm('I(n - y) + y ~ x', data=df, family=Binomial()).fit()
    print(model.summary())
    
    # The fitted number of survivors can be obtained by
    fits = df['n'] * (1 - model.fittedvalues)
    print('Fits Logit:')
    print(fits)

    # The fits for other link functions are:
    model_probit = glm('I(n - y) + y ~ x', data=df, family=Binomial(links.probit)).fit()
    print(model_probit.summary())

    fits_probit = df['n'] * (1 - model_probit.fittedvalues)
    print('Fits Probit:')
    print(fits_probit)

    model_cll = glm('I(n - y) + y ~ x', data=df, family=Binomial(links.cloglog)).fit()
    print(model_cll.summary())
    fits_cll = df['n'] * (1 - model_cll.fittedvalues)
    print('Fits Extreme Value:')
    print(fits_cll)
 def Backward(self, odject_inputBackward, data_inputBackward, metricBackward="aic"):
     import statsmodels.api as sm
     import statsmodels.formula.api as smf
     model_formula_full = odject_inputBackward.formula
     y_term = model_formula_full.split("~")[0]
     Xs_full = model_formula_full.split("~")[1].split("+")
     Xs_optimum = Xs_full

     def fit_metric(xs):
         # fit the logistic model on the candidate predictor set and return the chosen criterion
         fit = smf.glm(y_term + "~" + "+".join(xs), data=data_inputBackward,
                       family=sm.families.Binomial()).fit()
         return fit.aic if metricBackward == "aic" else fit.bic

     metric_optimum = fit_metric(Xs_optimum)
     for Xs_full_i in Xs_full:
         Xs_temp = Xs_optimum[:]
         Xs_temp.remove(Xs_full_i)
         metric_temp = fit_metric(Xs_temp)
         if metric_temp < metric_optimum:
             metric_optimum = metric_temp
             Xs_optimum = Xs_temp
     return y_term + "~" + "+".join(Xs_optimum)
def test_umap_one():
    print('started')
    df = pd.read_csv(sys.argv[1], dtype={'location':str, 'Result':str})
    df=df.drop(df[df.BIRTH_DATETIME=='0'].index)
    phecodes = pd.read_csv(sys.argv[2], dtype=str)
    out = sys.argv[3]
    phe_list=[phe for phe in list(phecodes.PHECODE.unique()) if phe in df]
    phedf = df.loc[:, phe_list]
    phedf[phedf>0] = 1
    df[phe_list] = phedf
    print('loaded')
    #Create embeddings
    pca = PCA(n_components=50, random_state=42)
    pc_emb = pca.fit_transform(phedf)
    ump = umap.UMAP(metric='euclidean', n_components=10, random_state=42)
    ump_emb = ump.fit_transform(pc_emb)
    print('embedded')
    #create df
    reduced_df = pd.DataFrame(ump_emb, columns = ['UMP-'+str(i+1) for i in range(10)])
    reduced_df['CC_STATUS']=df['CC_STATUS']
    #Create visualization
    sns.set()
    sns.pairplot(reduced_df, hue="CC_STATUS", vars=['UMP-'+str(i+1) for i in range(10)], height=4, markers=['o', 's'], plot_kws=dict(alpha=0.1))
    plt.savefig(out)
    print('graphed')
    #test components
    reduced_df['newcc']=0
    reduced_df.loc[reduced_df['UMP-2']<-12, 'newcc']=1
    df['newcc']=reduced_df['newcc']
    print('opening file')
    out_file = open('files/umap_new_cases_chi_phecode_test_2.csv', 'w')
    out_file.write('phecode,chi2,p,dof,control_neg,case_neg,control_pos,case_pos\n')
    #Run univariate tests using this newcc col
    for phecode in phe_list:
        #Get count of people positive for this phecode in case
        case_pos = df.loc[(df.newcc==1) & (df[phecode]==1)].shape[0]
        #Get negative count in case
        case_neg = df.loc[(df.newcc==1) & (df[phecode]==0)].shape[0]
        #Get positive control
        control_pos = df.loc[(df.newcc==0) & (df[phecode]==1)].shape[0]
        #Get negative control
        control_neg = df.loc[(df.newcc==0) & (df[phecode]==0)].shape[0]
        #Run contingency test
        if case_pos>0 and case_neg>0 and control_pos>0 and control_neg>0:
            res=chi2_c([[control_neg, case_neg],[control_pos, case_pos]])
            #Write results
            out_file.write(','.join([phecode,str(res[0]),str(res[1]),str(res[2]),str(control_neg),str(case_neg),str(control_pos),str(case_pos)]))
            out_file.write('\n')
    out_file.close()
    print('ran phecode tests')
    #Get age
    df['AGE']= pd.to_datetime(df['BIRTH_DATETIME'].str[:10], format='%Y-%m-%d')
    df['AGE']=(datetime.datetime.now()-df['AGE']).astype('timedelta64[Y]')
    #Run same test procedure for covariates, but do regression (?)
    print('running regression')
    mod = smf.glm(formula='newcc ~ AGE + UNIQUE_PHECODES + RACE + GENDER + RECORD_LENGTH_DAYS', data=df, family=fam.Binomial())
    res = mod.fit()
    print(res.summary())
Example 27
def pred(working, rating):
	data = working[working['prosper_rating']==rating]
	#https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
	#60%, 20% 20% for traing, test and validation
	train, validation, test = np.split(data.sample(frac=1), [int(.6*len(data)), int(.8*len(data))])
	print("total:{} train:{} test:{} validation:{}".format(len(data), len(train), len(validation), len(test)))
	mod = smf.glm('status ~ borrower_rate', data=train, family=sm.families.Binomial()).fit()

	print(test_model(mod, test))
Example 28
 def test_sse(self):
     """Check sum of squared error vs statsmodels (doesn't apply to logistic regression)"""
     g = glm('ycts~x1 + x2 + x3 + x4 + x5', self.df, 'normal')
     mod = smf.glm(formula='ycts~x1 + x2 + x3 + x4 + x5', data=self.df)
     modfitted = mod.fit()
     pred = modfitted.predict()
     diff = pred - np.asarray(self.df.ycts)
     chk1 = abs(np.multiply(diff, diff).sum() - g.sse)
     self.assertAlmostEqual(chk1, 0, 5, 'glm: sse calculation')
def covariate_analysis():
    cc_df = pd.read_csv(sys.argv[1])
    cc_df = cc_df.drop(cc_df[cc_df.BIRTH_DATETIME=='0'].index)
    #Compare sex, age, ethnicity, record_length, and most recent event
    #Get age
    cc_df['age'] = datetime.datetime.now() - cc_df["BIRTH_DATETIME"].str[:10].apply(dconvert)
    cc_df['age'] = cc_df['age'].apply(ddays)
    #Between Case and Control status
    all_res = smf.glm(formula="CC_STATUS ~ weight_sum + RACE + GENDER + age + RECORD_LEN + GENDER*age + age*RECORD_LEN", data=cc_df, family=fam.Binomial()).fit()
    print("Results for Case/control data:")
    print(all_res.summary())
    norm_df = cc_df.loc[cc_df.CC_STATUS==1].copy()
    print(cc_df.shape)
    print(norm_df.shape)
    norm_df['normality_status'] = norm_df["Result"].apply(binarize_normal)
    normality_res = smf.glm(formula="normality_status ~ weight_sum + RACE + GENDER + age + RECORD_LEN + GENDER*age + age*RECORD_LEN", data=norm_df, family=fam.Binomial()).fit()
    print("Results for normal/abnormal data:")
    print(normality_res.summary())
    def __init__(self, ind_data, index, use_rank=True, formula = None, method = 'pearson',
                 ind_rank_method = rankdata,
                 s_transform = None,t_transform = None,e_transform = None,
                 s_bounds = None, t_bounds = None,
                 verbose = False,return_term_array = 0,zdist = False,ndmetric = 'correlation'):
        """ for right now, it's up to you to make sure that the index matches your ind_data, 
        for a glm that means you need a separate index for each subject, for an LMER it'll be 
        1 big block of independent data and 1 big index. 
        
        ind_rank_method lets you define a function for ranking the independent data
        this lets us deal with ranking and sorting and interaction terms and all of that"""
        Measure.__init__(self)
        self._ind_data = ind_data[index]
        self._index = index
        self._use_rank = use_rank
        self._formula = formula
        self._method = method
        self._ndmetric = ndmetric
        self._subj = np.unique(self._ind_data['subject'])
        self._is1=True
        self._s_transform = s_transform
        self._t_transform = t_transform
        self._e_transform = e_transform
        self.verbose = verbose
        self._return_term_array = return_term_array
        self._zdist = zdist


        if s_bounds is not None:
            self._s_bounds = s_bounds
        if t_bounds is not None:
            self._t_bounds = t_bounds

        if s_transform is not None:
            self._ind_data['space'] = s_transform(self._ind_data['space'])

        if t_transform is not None:
            self._ind_data['time'] = t_transform(self._ind_data['time'])

        if e_transform is not None:
            self._ind_data['event'] = e_transform(self._ind_data['event'])
            
        if self._use_rank == True:
            # rank the ind data
            self._ind_data = ind_rank_method(self._ind_data)
        else:
            idat_df = pd.DataFrame(self._ind_data)
            idat_df['val'] = np.zeros(idat_df.shape[0])
            self._ind_data = idat_df.to_records()
        
        #figure out how long the results array will be by calling the glm on some dummy data
        if self._method=='glm':
            self._ind_data['val'] = np.random.randn(self._ind_data.shape[0])     
            self._res_len = (smf.glm(formula=self._formula, data=self._ind_data).fit().params.shape[0]*2)+4
            #set val back to zeros just in case
            self._ind_data['val'] = np.zeros(self._ind_data.shape[0])     
Example 31
    def scale(self,
              vars_to_regress=None,
              model_type="none",
              do_trim=False,
              do_scale=True,
              do_center=True,
              scale_max=10):
        """
        Regress out reads per cell and identity
        """
        scaled = np.zeros((self._ncell, self._ngene))
        reads_per_cell = self._meta["reads_per_cell"]
        genes_per_cell = self._meta["genes_per_cell"]
        ident = self._meta["orig_ident"]
        group = self._meta["group"]
        if model_type is "none":
            scaled = self._data.values.copy()
        else:
            for i in range(self._ngene):
                expr = self._data.iloc[:, i]
                d = pd.DataFrame(np.array(
                    (expr.astype(np.float), reads_per_cell, genes_per_cell,
                     ident, group)).T,
                                 columns=[
                                     "expr", "reads_per_cell",
                                     "genes_per_cell", "orig_ident", "group"
                                 ])
                if model_type is "linear":
                    results = smf.ols(
                        'expr ~ orig_ident + reads_per_cell + group',
                        data=d).fit()
                    scaled[:, i] = results.resid
                elif model_type is "poisson":
                    results = smf.glm(
                        'expr ~ reads_per_cell + orig_ident + group',
                        data=d,
                        family=sm.families.Poisson()).fit()
                    #results = smf.glm('expr ~ orig_ident', data=d,family=sm.families.NegativeBinomial()).fit()
                    scaled[:, i] = results.resid_pearson

        self._scaled = pd.DataFrame(scaled,
                                    columns=self._data.columns,
                                    index=self._data.index)
        if do_trim:
            x = self._scaled.mean()
            y = self._scaled.var() / x
            plt.plot(x, y, '.')
            good_genes = np.array(np.logical_and(y.values > 1, x.values > 0.1))
            self._scaled = self._scaled.iloc[:, good_genes]

        if do_center or do_scale:
            for i in range(self._scaled.shape[1]):
                temp = self._scaled.iloc[:, i].values
                temp = scale(temp, with_mean=do_center, with_std=do_scale)
                temp[temp > scale_max] = scale_max
                self._scaled.iloc[:, i] = temp
Example 32
def report_glm(formula, data, verbose=True, **kwargs):
    """Fit GLM, print a report, and return the fit object."""
    results = smf.glm(formula, data=data, **kwargs).fit(disp=False, **kwargs)
    summary = results.summary()

    if verbose:
        report = """\n{summary}\n""".format(summary=summary)
        print(report)

    return results
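Usage is a one-liner; the keyword arguments are forwarded to smf.glm and to fit, so the data frame and family in this sketch are assumptions:

fit = report_glm('y ~ x1 + x2', data=df, family=sm.families.Binomial())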
Example 33
 def fit(self, df):
     goal_model_data = pd.concat([
         df[['home_team_name', 'away_team_name', 'home_team_goal_count'
             ]].assign(home=1).rename(columns=self.home_dict),
         df[['home_team_name', 'away_team_name', 'away_team_goal_count'
             ]].assign(home=0).rename(columns=self.away_dict)
     ])
     self.model = smf.glm(formula="goals ~ home + team + opponent",
                          data=goal_model_data,
                          family=sm.families.Poisson()).fit()
Example 34
 def _update_model(self):
     """
     Creates/updates time independent Poisson regression model based on actual goal data.
     :return:
         Returns fitted time independent poisson regression model.
     """
     self.model = smf.glm(formula="goals ~ home + C(team) + C(opponent)",
                          data=self.goal_data,
                          family=sm.families.Poisson()).fit_regularized(
                              L1_wt=0, alpha=0.01)
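Design note: with L1_wt=0 the elastic-net penalty used by fit_regularized reduces to a pure ridge (L2) penalty of strength alpha, which shrinks the team and opponent coefficients toward zero without dropping any of them; L1_wt=1 would give the lasso instead.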
Example 35
    def TrainModel(self, data, args={}):

        self, options = UpdateOptions(self, args)

        self.NegativeBinomial = smf.glm(
            formula=self.extractFormula(data),
            data=data,
            family=sm.families.NegativeBinomial(alpha=self.alpha))
        self.Results = self.NegativeBinomial.fit()
        return -1
Example 36
 def test_logit(self):
     from statsmodels.formula.api import glm
     from statsmodels.genmod.families import Binomial
     
     inData = logit.getData()
     dfFit = logit.prepareForFit(inData)
     model = glm('ok + failed ~ temp', data=dfFit, family=Binomial()).fit()
     logit.showResults(inData, model)
     
     self.assertAlmostEqual(model.params.Intercept, -15.042902, places=5)
Example 37
    def outcome_model(self, model, print_results=True):
        """Build the model for the outcome. This is also referred to at the Q-model. This must be specified
        before the fit function. If it is not, an error will be raised.

        Parameters
        ----------
        model : str
            Variables to include in the model for predicting the outcome. Must be contained within the input
            pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
        print_results : bool, optional
            Whether to print the logistic regression results to the terminal. Default is True
        """
        if self.exposure not in model:
            warnings.warn("It looks like '" + self.exposure +
                          "' is not included in the outcome model.")

        if self.outcome_type == 'binary':
            linkdist = sm.families.family.Binomial()
        elif self.outcome_type == 'normal':
            linkdist = sm.families.family.Gaussian()
        elif self.outcome_type == 'poisson':
            linkdist = sm.families.family.Poisson()
        else:
            raise ValueError(
                "Only 'binary', 'normal', and 'poisson' distributed outcomes are available"
            )

        # Modeling the outcome
        if self.weight is None:
            m = smf.glm(self.outcome + ' ~ ' + model,
                        self.sample,
                        family=linkdist)
            self._outcome_model = m.fit()
        else:
            m = smf.glm(self.outcome + ' ~ ' + model,
                        self.sample,
                        family=linkdist,
                        freq_weights=self.sample[self.weight])
            self._outcome_model = m.fit()

        # Printing results of the model and if any observations were dropped
        if print_results:
            print(self._outcome_model.summary())
Example 38
    def _run_categorical(self, data, formula, formula_restricted) -> Dict:
        result = dict()
        # Regress both models
        est = smf.glm(formula, data=data, family=self.family).fit(use_t=self.use_t)
        est_restricted = smf.glm(formula_restricted, data=data, family=self.family).fit(
            use_t=self.use_t
        )
        # Check convergence
        if est.converged & est_restricted.converged:
            result["Converged"] = True
            # Calculate Results
            lrdf = est_restricted.df_resid - est.df_resid
            lrstat = -2 * (est_restricted.llf - est.llf)
            lr_pvalue = scipy.stats.chi2.sf(lrstat, lrdf)
            result["LRT_pvalue"] = lr_pvalue
            result["pvalue"] = result["LRT_pvalue"]
            result["Diff_AIC"] = est.aic - est_restricted.aic

        return result
Example 39
 def test_logit(self):
     from statsmodels.formula.api import glm
     from statsmodels.genmod.families import Binomial
     
     inData = C13_2_logit.getData()
     dfFit = C13_2_logit.prepareForFit(inData)
     model = glm('ok + failed ~ temp', data=dfFit, family=Binomial()).fit()
     C13_2_logit.showResults(inData, model)
     
     self.assertAlmostEqual(model.params.Intercept, -15.042902, places=5)
Example 40
def Poisson_model(dataset,home,away):
    goal_model_data = pd.concat([dataset[['HomeTeam', 'AwayTeam', 'HomeGoals']].assign(home=1).rename(
        columns={'HomeTeam': 'team', 'AwayTeam': 'opponent', 'HomeGoals': 'goals'}),
        dataset[['AwayTeam', 'HomeTeam', 'AwayGoals']].assign(home=0).rename(
            columns={'AwayTeam': 'team', 'HomeTeam': 'opponent', 'AwayGoals': 'goals'})])

    poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data,
                            family=sm.families.Poisson()).fit()
    poisson_model.summary()
    return poisson_model
Example 41
    def outcome_model(self, model, restriction=None, print_results=True):
        """Add a specified regression model for the outcome. Must be specified before the fit function.

        Parameters
        ----------
        model:
            Variables to include in the model for predicting the outcome. Must be contained within the input
            pandas dataframe when initialized. Format follows patsy standards
            For example) 'var1 + var2 + var3 + var4'
        restriction : str, optional
            Used to restrict the population that the regression model is fit to. Useful for Intent-to-Treat model
            fitting. The pandas dataframe must be referred to as 'g'. For example) "g['art']==1"
        print_results : bool, optional
            Whether to print the logistic regression model results to the terminal. Default is True
        """
        g = self.gf.copy()
        if restriction is not None:
            g = g.loc[eval(restriction)].copy()
        linkdist = sm.families.family.Binomial()

        if self._weights is None:  # Unweighted g-formula
            if self._competing_event:
                self.out_model = sm.MNLogit.from_formula(
                    self.outcome + ' ~ ' + model, g).fit()
            else:
                self.out_model = smf.glm(self.outcome + ' ~ ' + model,
                                         g,
                                         family=linkdist).fit()

        else:  # Weighted g-formula
            if self._competing_event:
                raise ValueError(
                    "The weighted MonteCarloGFormula is not supported for competing events"
                )
            self.out_model = smf.glm(self.outcome + ' ~ ' + model,
                                     g,
                                     freq_weights=g[self._weights],
                                     family=linkdist).fit()
        if print_results:
            print(self.out_model.summary())

        self._outcome_model_fit = True
Example 42
    def outcome_model(self, model, print_results=True):
        """Build the outcome regression model. This is also referred to at the Q-model in various parts of the
        literature. This must be specified before the fit function. It is encouraged to make this model as flexible as
        possible

        Parameters
        ----------
        model : str
            Variables to include in the model for predicting the outcome. Must be contained within the input
            pandas dataframe when initialized. Model form should contain the exposure, i.e. 'art + age + male'
        print_results : bool, optional
            Whether to print the logistic regression results to the terminal. Default is True
        """
        if type(self.exposure) is not list:
            if self.exposure not in model:
                warnings.warn("It looks like '" + self.exposure +
                              "' is not included in the outcome model.")

        if self.outcome_type == 'binary':
            linkdist = sm.families.family.Binomial()
        elif self.outcome_type == 'normal':
            linkdist = sm.families.family.Gaussian()
        else:
            linkdist = sm.families.family.Poisson()

        # Modeling the outcome
        if self._weights is None:
            m = smf.glm(self.outcome + ' ~ ' + model, self.gf, family=linkdist)
            self._outcome_model = m.fit()
        else:
            m = smf.glm(self.outcome + ' ~ ' + model,
                        self.gf,
                        family=linkdist,
                        freq_weights=self.gf[self._weights])
            self._outcome_model = m.fit()

        # Creating predicted Y variable
        self._predicted_y_ = self._outcome_model.predict(self.gf)

        # Printing results of the model and if any observations were dropped
        if print_results:
            print(self._outcome_model.summary())
Example 43
def regression():
    '''Poisson regression example
    chapter 4.4, p.69'''
    
    # get the data from the web
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)
    
    # do the fit
    p = glm('y~x', family=Poisson(links.identity), data=df)
    print(p.fit().summary())
Example 44
def senility_and_WAIS():
    '''Another example of logistic regression.
    chapter 7.8, p 143
    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language'''

    inFile = r'GLM_data/Table 7.8 Senility and WAIS.xls'
    df = get_data(inFile)
    
    # ungrouped
    model = glm('s ~ x', data=df, family=Binomial()).fit()
    print(model.summary())
Example 45
def calculate_odds_ratio(genotypes, phen_vector1,phen_vector2,reg_type,covariates,response='',phen_vector3=''): #diff - done
	"""
	Runs the regression for a specific phenotype vector relative to the genotype data and covariates.

	:param genotypes: a DataFrame containing the genotype information
	:param phen_vector: a array containing the phenotype vector
	:param covariates: a string containing all desired covariates
	:type genotypes: pandas DataFrame
	:type phen_vector: numpy array
	:type covariates: string

	.. note::
		The covariates must be a string that is delimited by '+', not a list.
		If you are using a list of covariates and would like to convert it to the pyPhewas format, use the following::

			l = ['genotype', 'age'] # a list of your covariates
			covariates = '+'.join(l) # pyPhewas format

		The covariates that are listed here *must* be headers to your genotype CSV file. 
	"""

	data = genotypes
	data['y']=phen_vector1
	data['MaxAgeAtICD'] = phen_vector2
	#f='y~'+covariates
	if response:
		f = response+'~ y + ' + covariates
		if phen_vector3.any():
			data['phe'] = phen_vector3
			f = response + '~ y + phe +' + covariates
	else:
		f = 'y ~' + covariates
		if phen_vector3.any():
			data['phe'] = phen_vector3
			f = 'y ~ phe +' + covariates
	try:
		if reg_type==0:
			logreg = smf.logit(f,data).fit(method='bfgs',disp=False)
			p=logreg.pvalues.genotype
			odds=logreg.deviance	
			conf = logreg.conf_int()
			od = [-math.log10(p), logreg.params.genotype, '[%s,%s]' % (conf[0]['genotype'],conf[1]['genotype'])]
		else:
			linreg = smf.glm(f,data).fit(method='bfgs',disp=False)
			p=linreg.pvalues.genotype
			odds=0
			conf = linreg.conf_int()
			od = [-math.log10(p), linreg.params.genotype, '[%s,%s]' % (conf[0]['genotype'],conf[1]['genotype'])]
	except Exception:
		odds=0
		p=np.nan
		od = [np.nan,np.nan,np.nan]
	return (odds,p,od)
Example 46
def log_linear_models():
    '''Log-linear models
    chapter 9.7, p 180 & 182 '''

    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)    

    # Minimal model
    model_min = glm('frequency~1', family = Poisson(), data=df).fit()
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = glm('frequency~site+type', family = Poisson(), data=df).fit()
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = glm('frequency~site*type', family = Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 ------------------------------------- 
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    model1 = glm('freq~GD+CC+GD*CC', family = Poisson(), data=df).fit()
    model2 = glm('freq~GD+CC+GD*CC + AP', family = Poisson(), data=df).fit()
    model3 = glm('freq~GD+CC+GD*CC + AP + AP*CC', family = Poisson(), data=df).fit()
    model4 = glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family = Poisson(), data=df).fit()
    
    print('Ulcer and aspirin')
    print(model4.fittedvalues)
Example 47
 def divide_train_set_and_fit(self, full_entities=True):
     train_data = self.df.loc[self.df['Year'] < 2005, :]
     test_data = self.df.loc[self.df.Year >= 2005, :]
     formula = "Direction~Lag1+Lag2"
     if full_entities is True:
         formula += "+Lag3+Lag4+Lag5+Volume"
     model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
     result = model.fit()
     print(result.summary())
     predict_result = result.predict(exog=test_data)
     real_val = test_data['Direction'].map(lambda x: 1 if x == 'Down' else 0)
     self.output_binary_table(result, predict_result, real_val)
     return result
Example 48
def general_logistic_regression():
    '''Example General Logistic Recression,
    Example 7.4.1, p. 135'''
    
    # Get the data
    inFile = r'GLM_data/Table 7.5 Embryogenic anthers.xls'
    df = get_data(inFile)
    
    # Define the variables so that they match Dobson
    df['n_y'] = df['n'] - df['y']
    df['newstor'] = df['storage']-1
    df['x'] = np.log(df['centrifuge'])
    
    # Model 1
    model1 = glm('n_y + y ~ newstor*x', data=df, family=Binomial()).fit()
    print(model1.summary())
    
    # Model 2
    model2 = glm('n_y + y ~ newstor+x', data=df, family=Binomial()).fit()
    print(model2.summary())
    
    # Model 3
    model3 = glm('n_y + y ~ x', data=df, family=Binomial()).fit()
    print(model3.summary())
Example 49
 def logistic_fit(self, glm_fit=True):
     '''
     The logit function reports an error when y (Direction) is not transformed to 0/1,
     so glm is easier to use here.
     '''
     formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"
     if glm_fit is True:
         model = smf.glm(formula, data=self.df, family=sm.families.Binomial())
     else:
         # In fact, this function has wrong fittedvalues, but its predicted values are still right.
         model = smf.logit(formula, data=self.df)
     result = model.fit()
     print(result.summary())
     # In logit fit there are errors here. Not sure why...
     if glm_fit:
         self.output_binary_table(result, result.fittedvalues, model.endog.astype(int), glm_fit)
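# A minimal sketch of the 0/1 transformation the docstring above refers to,
# assuming a 'Direction' column with 'Up'/'Down' levels: smf.logit needs a
# numeric response, while smf.glm accepts the factor directly.
import statsmodels.formula.api as smf

def logit_with_recode(df, rhs="Lag1+Lag2"):
    df = df.copy()
    df['DirUp'] = (df['Direction'] == 'Up').astype(int)  # 0/1 response
    return smf.logit('DirUp ~ ' + rhs, data=df).fit()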
Example n. 50
def predict_class_glm(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans predict_class_glm dans feature_selection")
    df = pd.read_csv(input_file)
    features = ['feature_%d' % i for i in range(1, 44)]  # feature_1 .. feature_43
    df = df[['Class'] + features].dropna()
    # No family is given, so glm() defaults to Gaussian (an ordinary
    # least-squares fit), despite the variable being named 'logit'.
    logit = glm(formula='Class ~ ' + '+'.join(features), data=df).fit()
    print logit.summary()
    save = Output + "glm.txt"
    old_stdout = sys.stdout
    log_file = open(save, "w")
    sys.stdout = log_file
    print logit.summary()
    sys.stdout = old_stdout
    log_file.close()
    lvltrace.lvltrace("LVLSortie dans predict_class_glm dans feature_selection")
Example n. 51
    def lognorm_glm(self):

        """ Fit the lognormal distribution to the observed vector of integer values
        using a generalized linear model.
        Note: This is a fitted curve; not an actual form of the lognormal distribution
        This method was inspired by the vegan package's open source code on vegan's public
        GitHub repository: https://github.com/vegandevs/vegan/R/rad.lognormal.R
        on Thursday, 5 April 2016
        """

        ranks = np.log(range(1, len(self.obs)+1))
        ranks = -norm.ppf(self.ppoints(len(ranks)))

        d = pd.DataFrame({'rnks': ranks, 'x': self.obs})
        lm = smf.glm(formula='x ~ rnks', data = d, family = sm.genmod.families.family.Poisson(link=sm.genmod.families.links.log)).fit()
        pred = lm.predict()

        return pred
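# self.ppoints is not shown above; a minimal stand-in following R's ppoints()
# convention, which the vegan-inspired code appears to assume:
# (i - a) / (n + 1 - 2a), with a = 3/8 for n <= 10 and a = 1/2 otherwise.
import numpy as np

def ppoints(n):
    a = 3.0 / 8 if n <= 10 else 0.5
    return (np.arange(1, n + 1) - a) / (n + 1 - 2 * a)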
Example n. 52
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.cpunish.load_pandas()
        endog = data.endog
        data = data.exog
        data['EXECUTIONS'] = endog
        data['INCOME'] /= 1000
        aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,
                             1])
        model = smf.glm(
                'EXECUTIONS ~ INCOME + SOUTH - 1',
                data=data,
                family=sm.families.Gaussian(link=sm.families.links.log()),
                var_weights=aweights
        )
        cls.res1 = model.fit(rtol=1e-25, atol=0)
        cls.res2 = res_r.results_gaussian_aweights_nonrobust
Example n. 53
    def from_glm(self):

        """ Fit the Zipf distribution to the observed vector of integer values
        using a generalized linear model.
        Note: This is a fitted curve; not an actual form of the Zipf distribution
        This method was inspired by the vegan
        package's open source code on vegan's public GitHub repository:
        https://github.com/vegandevs/vegan/blob/master/R/rad.zipf.R
        on Thursday, 19 Marth 2015 """

        ranks = np.log(range(1, len(self.obs)+1))
        off = [np.log(sum(self.obs))] * len(self.obs)

        d = pd.DataFrame({'ranks': ranks, 'off': off, 'x':self.obs})

        lm = smf.glm(formula='x ~ ranks', data = d, family = sm.families.Poisson()).fit()
        pred = lm.predict()

        return pred
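# Hedged usage sketch with a synthetic ~1/rank abundance vector: under this
# parameterization the coefficient on log-rank estimates the Zipf decay
# exponent, so it should come out close to -1 here.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

obs = np.array([120, 60, 40, 30, 24, 20])  # exactly 120/rank
d = pd.DataFrame({'ranks': np.log(np.arange(1, len(obs) + 1)),
                  'off': np.repeat(np.log(obs.sum()), len(obs)),
                  'x': obs})
fit = smf.glm('x ~ ranks', data=d, family=sm.families.Poisson(),
              offset=d['off']).fit()
print(fit.params['ranks'])  # expected to be near -1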
Example n. 54
    def regression(self):
        from statsmodels.formula.api import glm
        from statsmodels.api import families

        # self.people is the dataframe with the consolidated biography data.
        # patsy parses formula terms as Python code, so a column named
        # "class" (a reserved keyword) cannot appear in a formula; rename it.
        self.people.rename(columns={"class": "dbpedia_class"}, inplace=True)

        people = self.people[(self.people.birth_century >= 0) & (self.people.birth_century <= 2000)]

        m = glm(
            "edition_count ~ C(gender,Treatment(reference='male')) + C(available_english) + C(dbpedia_class,Treatment(reference='http://dbpedia.org/ontology/Person')) + C(birth_century)",
            data=people,
            family=families.NegativeBinomial(),
        ).fit()

        print(
            m.summary(), file=self.logfile
        )  # <-- this gives you the table of coefficients with p-values, confidence intervals, and so on
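# Hedged follow-up, assuming the fitted result m above: the negative binomial
# model uses a log link, so exponentiated coefficients read as multiplicative
# effects (rate ratios) on the expected edition count.
import numpy as np

def rate_ratios(fit):
    out = np.exp(fit.conf_int())  # columns 0/1: CI bounds on the ratio scale
    out['ratio'] = np.exp(fit.params)
    return out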
Example n. 55
def test_formula_missing_exposure():
    # see 2083
    import statsmodels.formula.api as smf
    import pandas as pd

    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan],
         'constant': [1] * 4, 'exposure' : np.random.uniform(size=4),
         'x': [1, 3, 2, 1.5]}
    df = pd.DataFrame(d)

    family = sm.families.Gaussian(link=sm.families.links.log())

    mod = smf.glm("Foo ~ Bar", data=df, exposure=df.exposure,
                  family=family)
    assert_(type(mod.exposure) is np.ndarray, msg='Exposure is not ndarray')

    exposure = pd.Series(np.random.uniform(size=5))
    assert_raises(ValueError, smf.glm, "Foo ~ Bar", data=df,
                  exposure=exposure, family=family)
    assert_raises(ValueError, GLM, df.Foo, df[['constant', 'Bar']],
                  exposure=exposure, family=family)
Example n. 56
    def setup_class(cls):
        import statsmodels.formula.api as smf

        data = sm.datasets.fair.load_pandas()
        endog = data.endog
        data = data.exog
        data['fair'] = endog
        aweights = np.repeat(1, len(data.index))
        aweights[::5] = 5
        aweights[::13] = 3
        model = smf.glm(
                'fair ~ age + yrs_married',
                data=data,
                family=sm.families.Tweedie(
                    var_power=1.55,
                    link=sm.families.links.log()
                    ),
                var_weights=aweights
        )
        cls.res1 = model.fit(rtol=1e-25, atol=0)
        cls.res2 = res_r.results_tweedie_aweights_nonrobust
Example n. 57
def poisson_regression():
    '''Poisson Regression
    chapter 9.2, pp. 170 & 171 '''
    
    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print df

    # Generate the required variables, using .loc to avoid pandas'
    # chained-assignment pitfall (writes through a chain may silently fail)
    df['smoke'] = np.zeros(len(df))
    df.loc[df['smoking'] == 'smoker', 'smoke'] = 1

    df['agecat'] = np.array([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    df['agesq'] = df['agecat']**2

    df['smkage'] = df['agecat']
    df.loc[df['smoking'] == 'non-smoker', 'smkage'] = 0

    model = glm('deaths~agecat+agesq+smoke+smkage',
            family=Poisson(), data=df,
            exposure=df["person-years"]).fit()
    print model.summary()
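    # Hedged follow-up, reusing df/glm/Poisson/np from this example: with a
    # log link, exposure is equivalent to a log-scale offset, so the same
    # fit can be obtained with
    model_offset = glm('deaths~agecat+agesq+smoke+smkage',
                       family=Poisson(), data=df,
                       offset=np.log(df['person-years'])).fit()
    # model_offset.params should match model.params above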
Example n. 58
def rr_cluster(cluster, covs, formula):
    """Set cluster values to reduced-residuals."""
    cluster = deepcopy(cluster)
    from statsmodels.formula.api import ols, glm

    if isinstance(cluster[0], CountFeature):
        for f in cluster:
            covs['methylation'] = f.methylated
            f.methylated[:] = np.round(glm(formula,
                                           covs,
                                           exposure=f.counts,
                                           family=Poisson()
                                          ).fit().resid
                                       ).astype(int)
            f.values[:] = f.methylated.astype(float) / f.counts
    else:
        for f in cluster:
            covs['methylation'] = f.values
            fit = ols(formula, covs).fit()
            f.values[:] = fit.resid
            f.ovalues = fit.fittedvalues
    return cluster
Example n. 59
def anova():
    '''ANOVA
    chapter 6.4, p. 108, and p. 113
    GLM does not work with anova_lm.
    '''
    
    # get the data from the web
    inFile = r'GLM_data/Table 6.6 Plant experiment.xls'
    df = get_data(inFile)
    
    # fit the model (p 109); don't assign the result to the name 'glm',
    # which would shadow the glm function itself
    model_glm = glm('weight~group', family=Gaussian(), data=df)
    print model_glm.fit().summary()
    
    print '-'*65
    print 'OLS'
    model = ols('weight~group', data=df)
    print model.fit().summary()
    print anova_lm(model.fit())            
    
    # The model corresponding to the null hypothesis of no treatment effect is
    model0 = ols('weight~1', data=df)
    
    # Get the data for the two-factor ANOVA (p 113)
    inFile = r'GLM_data/Table 6.9 Two-factor data.xls' 
    df = get_data(inFile)
    
    # adjust the header names from the Excel-file
    df.columns = ['A','B', 'data']
    
    # two-factor anova, with interactions
    ols_int = ols('data~A*B', data=df)
    print anova_lm(ols_int.fit())
    
    # The python commands for the other four models are
    ols_add = ols('data~A+B', data=df)
    ols_A = ols('data~A', data=df)    
    ols_B = ols('data~B', data=df)    
    ols_mean = ols('data~1', data=df)    
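    # Hedged sketch: anova_lm can also compare a nested sequence of fitted
    # models directly (mean within B within additive within interaction):
    print(anova_lm(ols_mean.fit(), ols_B.fit(), ols_add.fit(), ols_int.fit()))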
Example n. 60
    def setup_class(cls):
        self = cls  # alias so the instance-style code below works unchanged
        import statsmodels.formula.api as smf

        data = sm.datasets.cpunish.load_pandas()
        endog = data.endog
        data = data.exog
        data['EXECUTIONS'] = endog
        data['INCOME'] /= 1000
        aweights = np.array([1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2,
                             1])
        model = smf.glm(
                'EXECUTIONS ~ INCOME + SOUTH - 1',
                data=data,
                family=sm.families.Gaussian(link=sm.families.links.identity()),
                var_weights=aweights
        )
        wlsmodel = smf.wls(
                'EXECUTIONS ~ INCOME + SOUTH - 1',
                data=data,
                weights=aweights)
        self.res1 = model.fit(rtol=1e-25, atol=1e-25)
        self.res2 = wlsmodel.fit()
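        # Hedged sanity check one might add here: with a Gaussian family and
        # identity link, var_weights should reproduce the WLS point estimates.
        np.testing.assert_allclose(self.res1.params, self.res2.params,
                                   rtol=1e-6)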