コード例 #1
0
ファイル: test_gee.py プロジェクト: d8aninja/statsmodels
    def setup_class(cls):
        """Create the formula-based Poisson GEE model used by this class.

        Reads ``gee_poisson_1.csv`` through ``load_data``, packs the response,
        the group labels and all but the first ``exog`` column into a data
        frame, and stores the resulting model in ``cls.mod`` together with a
        fixed parameter vector in ``cls.start_params``.
        """
        endog, exog, group_n = load_data("gee_poisson_1.csv")

        # Column layout: Y, Id, X1..Xk.  The first exog column is dropped
        # (presumably the intercept, which the formula adds itself -- TODO
        # confirm against load_data).
        pieces = (endog[:, None], group_n[:, None], exog[:, 1:])
        names = ["Y", "Id"]
        names += ["X%d" % num for num in range(1, exog.shape[1])]

        frame = pd.DataFrame(np.concatenate(pieces, axis=1))
        frame.columns = names

        cls.mod = GEE.from_formula(
            "Y ~ X1 + X2 + X3 + X4 + X5",
            "Id",
            frame,
            family=Poisson(),
            cov_struct=Independence(),
        )

        cls.start_params = np.array([
            -0.03644504,
            -0.05432094,
            0.01566427,
            0.57628591,
            -0.0046566,
            -0.47709315,
        ])
コード例 #2
0
ファイル: crystal.py プロジェクト: LeonardJ09/crystal
def one_cluster(formula,
                feature,
                covs,
                coef,
                method=OLS,
                _pat=re.compile(r"\+\s*CpG")):
    """Fit a model for a "cluster" that contains a single probe.

    Parameters
    ----------
    formula : str
        Model formula. Any ``+ CpG`` term is removed before fitting.
    feature : object
        The single feature of the cluster. A ``CountFeature`` is fitted with
        a Poisson GLM that uses ``feature.counts`` as exposure; any other
        feature is fitted with `method` on ``feature.values``.
    covs : DataFrame-like
        Covariates. A copy is taken, so the caller's object is not modified.
    coef : object
        Passed through to ``get_ptc``; also reported as ``covar`` in the
        NaN result returned on perfect separation.
    method : class
        Model class providing ``from_formula`` (used for non-count features).
    _pat : compiled pattern
        Pre-compiled regex matching the ``+ CpG`` term; not meant to be
        supplied by callers.

    Returns
    -------
    Whatever ``get_ptc`` returns for the fitted model, or a dict with NaN
    ``p``, ``t`` and ``coef`` entries if the Poisson fit raises
    ``PerfectSeparationError``.
    """
    c = covs.copy()
    # remove the CpG in the formula
    formula = _pat.sub("", formula)

    if not isinstance(feature, CountFeature):
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)

    c['methylation'] = feature.methylated
    c['counts'] = feature.counts
    # rows without any counts cannot serve as exposure
    c = c[c['counts'] > 0]
    try:
        return get_ptc(
            GLM.from_formula(formula,
                             data=c,
                             exposure=c['counts'],
                             family=Poisson()).fit(), coef)
    except PerfectSeparationError:
        return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
コード例 #3
0
def ppglmfit(X,Y):
    '''
    Fit a Poisson GLM, adding the constant term automatically if needed.

    The GLM solver in statsmodels is very general. It accepts any link
    function and expects that, if you want a constant term in your model,
    you have already manually added a column of ones to your design
    matrix. This wrapper simplifies using GLM to fit the common case of a
    Poisson point-process model, where the constant term has not been
    explicitly added to the design matrix.

    Two sanity warnings are printed: one when the mean of Y exceeds 0.1 and
    one when Y sums to fewer than 100.

    Parameters
    ----------
    X: N_observations x N_features design matrix.
    Y: Binary point process observations

    Returns
    -------
    μ, B: the offset and parameter estimates for the GLM model, i.e. the
        first fitted parameter and the remaining ones. If the first column
        of X was already constant, μ is the coefficient of that column.
    '''
    if np.mean(Y)>0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if np.sum(Y)<100:
        print('Caution: fewer than 100 spikes to fit model')
    # add constant value to X, if the 1st column is not constant
    # (np.all keeps the comparison vectorised instead of looping in Python)
    if not np.all(X[:,0]==X[0,0]):
        X = np.hstack([np.ones((X.shape[0],1),dtype=X.dtype), X])
    poisson_model   = GLM(Y,X,family=Poisson())
    poisson_results = poisson_model.fit()
    M = poisson_results.params
    return M[0],M[1:]
コード例 #4
0
ファイル: test_gee.py プロジェクト: d8aninja/statsmodels
    def test_compare_poisson(self):
        """Poisson GEE with an independence structure must match sm.poisson.

        Both models are fitted to the same simulated data and their
        parameter estimates are compared to ten decimals.
        """
        n_obs = 100
        formula = "Y ~ X1 + X2 + X3"

        # The draws are made in this fixed order: response, three
        # covariates, then the group labels.
        response = np.ceil(-np.log(np.random.uniform(size=n_obs)))
        cov_1 = np.random.normal(size=n_obs)
        cov_2 = np.random.normal(size=n_obs)
        cov_3 = np.random.normal(size=n_obs)
        groups = np.random.randint(0, 4, size=n_obs)

        frame = pd.DataFrame({
            "Y": response,
            "X1": cov_1,
            "X2": cov_2,
            "X3": cov_3,
        })

        gee_fit = GEE.from_formula(
            formula,
            groups,
            frame,
            family=Poisson(),
            cov_struct=Independence(),
        ).fit()

        poisson_fit = sm.poisson(formula, data=frame).fit(disp=False)

        assert_almost_equal(gee_fit.params.values,
                            poisson_fit.params.values,
                            decimal=10)
コード例 #5
0
ファイル: test_gee.py プロジェクト: d8aninja/statsmodels
    def test_poisson_epil(self):
        """Poisson GEE with independence structure should agree with GLM.

        Fits both models to ``results/epil.csv`` (located next to this file)
        and compares parameters and scale to six decimals.
        """
        # GLM is only needed by this test, so it is imported locally.
        from statsmodels.genmod import families
        from statsmodels.genmod.generalized_linear_model import GLM

        csv_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "results", "epil.csv")
        data = pd.read_csv(csv_path)
        formula = "y ~ age + trt + base"

        gee_model = GEE.from_formula(formula,
                                     data["subject"],
                                     data,
                                     cov_struct=Independence(),
                                     family=Poisson())
        gee_fit = gee_model.fit()

        # Coefficients should agree with GLM
        glm_model = GLM.from_formula(formula, data, family=families.Poisson())
        glm_fit = glm_model.fit(scale="X2")

        # Compare the underlying result objects: the assert helpers do not
        # work on the wrapper classes.
        gee_raw = gee_fit._results
        glm_raw = glm_fit._results

        for attr in ("params", "scale"):
            assert_almost_equal(getattr(gee_raw, attr),
                                getattr(glm_raw, attr),
                                decimal=6)
コード例 #6
0
    def test_predict_exposure(self):
        """Predictions must agree however offset and exposure are supplied.

        The model is built with both an offset and an exposure column; the
        test then checks that ``predict`` gives the same values when these
        are omitted, passed individually, or passed together, and that
        predicting the last ten rows works with and without formula
        handling.
        """
        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        exposure = np.random.uniform(1, 2, size=n)
        rate = 0.1*(X1 + X2) + offset + np.log(exposure)
        Y = np.random.poisson(rate, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        result = GEE.from_formula("Y ~ X1 + X2", groups, data,
                                  family=Poisson(), offset="offset",
                                  exposure="exposure").fit()
        assert_equal(result.converged, True)

        all_offset = data["offset"]
        all_exposure = data["exposure"]
        last_ten = slice(-10, None)

        baseline = result.predict()
        full_sample_variants = [
            result.predict(offset=all_offset),
            result.predict(exposure=all_exposure),
            result.predict(offset=all_offset, exposure=all_exposure),
        ]

        tail_kwargs = dict(offset=all_offset[last_ten],
                           exposure=all_exposure[last_ten])
        with_formula = result.predict(exog=data[last_ten], **tail_kwargs)
        # without patsy
        without_formula = result.predict(exog=result.model.exog[last_ten],
                                         transform=False,
                                         **tail_kwargs)

        for variant in full_sample_variants:
            assert_allclose(baseline, variant)
        assert_allclose(baseline[last_ten], with_formula)
        assert_allclose(baseline[last_ten], without_formula)
コード例 #7
0
    def test_poisson_epil(self):
        """Poisson GEE with independence structure should agree with GLM.

        Both models are fitted to ``results/epil.csv`` (located next to this
        file); parameters and scale are compared to six decimals.
        """
        # GLM is only needed by this test, so it is imported locally.
        from statsmodels.genmod import families
        from statsmodels.genmod.generalized_linear_model import GLM

        csv_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "results", "epil.csv")
        data = pd.read_csv(csv_path)
        formula = "y ~ age + trt + base"

        gee_model = GEE.from_formula(formula,
                                     data,
                                     groups=data["subject"],
                                     cov_struct=Independence(),
                                     family=Poisson())
        gee_fit = gee_model.fit()

        # Coefficients should agree with GLM
        glm_model = GLM.from_formula(formula, data, family=families.Poisson())
        glm_fit = glm_model.fit(scale="X2")

        assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
        assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
コード例 #8
0
def EstimacionMVBase(df_cal, ids_torneo):
    """Fit the base constrained Poisson GLM and return its parameters.

    The input frame is reshaped with ``ReshapeDataFrameBase``, the formula
    and constraints come from ``FormulaBase`` (built from the reshaped
    frame's column names), and the constrained fit is handed to
    ``OutputPoissReg`` with an empty list of extra names and `ids_torneo`.
    """
    reg_frame = ReshapeDataFrameBase(df_cal)
    column_names = reg_frame.columns.tolist()
    formula, constraints = FormulaBase(column_names)

    poisson_glm = glm(formula, groups=None, data=reg_frame, family=Poisson())
    constrained_fit = poisson_glm.fit_constrained(constraints)

    return OutputPoissReg(constrained_fit, [], ids_torneo)
コード例 #9
0
def EstimacionMVPromGolesLV(df_cal, ids_torneo):
    """Fit the constrained Poisson GLM with the 'pg*' terms and return it.

    The input frame is reshaped with ``ReshapeDataFramePromGolesLV``, the
    formula and constraints come from ``FormulaPromGolesLV`` (built from the
    reshaped frame's column names), and the constrained fit is handed to
    ``OutputPoissReg`` together with the names ``pgfl``, ``pgfv``, ``pgal``
    and ``pgav`` and `ids_torneo`.
    """
    reg_frame = ReshapeDataFramePromGolesLV(df_cal)
    column_names = reg_frame.columns.tolist()
    formula, constraints = FormulaPromGolesLV(column_names)

    poisson_glm = glm(formula, groups=None, data=reg_frame, family=Poisson())
    constrained_fit = poisson_glm.fit_constrained(constraints)

    extra_terms = ['pgfl', 'pgfv', 'pgal', 'pgav']
    return OutputPoissReg(constrained_fit, extra_terms, ids_torneo)
コード例 #10
0
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    ''' Build score predictions via (linear) poisson regression.

    Two Poisson GEE models (independence working correlation) are fitted on
    `hist_data`, one per team score, using `feature_list` as regressors.
    The summary of the first fit is printed.

    Parameters
    ----------
    hist_data : DataFrame with ``team_1_score``, ``team_2_score`` and every
        column named in `feature_list`. Modified in place: the columns
        ``team_1_score_pred`` and ``team_2_score_pred`` are added.
    feature_list : list of column names used as regressors.
    comp_data : optional DataFrame with the columns in `feature_list`.
        Modified in place: prediction columns plus ``team_1_prob``,
        ``team_tie_prob`` and ``team_2_prob`` are added.

    Returns
    -------
    `hist_data` alone when `comp_data` is None, otherwise the tuple
    ``(hist_data, comp_data)``.
    '''
    regressors = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + regressors
    formula_2 = "team_2_score ~ " + regressors

    # Using the GEE package along with independence assumptions to fit the
    # poisson model. Am assuming this is using a maximum likelihood approach?
    # NOTE(review): the outcome column name is passed as the GEE `groups`
    # argument -- confirm that grouping by score is intended.
    fam = Poisson()
    ind = Independence()

    model_1 = GEE.from_formula(formula_1,
                               "team_1_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)
    model_2 = GEE.from_formula(formula_2,
                               "team_2_score",
                               hist_data,
                               cov_struct=ind,
                               family=fam)

    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # prepare comp data
    comp_data['team_1_score_pred'] = model_1_fit.predict(
        comp_data[feature_list])
    comp_data['team_2_score_pred'] = model_2_fit.predict(
        comp_data[feature_list])

    # The score difference of two Poisson variables follows a Skellam
    # distribution; the scipy methods accept whole arrays, so no row-wise
    # apply is needed.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)

    return hist_data, comp_data
コード例 #11
0
ファイル: dobson.py プロジェクト: EJHortala/books-2
def regression():
    '''Poisson regression example
    chapter 4.4, p.69

    Loads the example table through ``get_data``, fits a Poisson GLM of
    ``y`` on ``x`` with an identity link and prints the fit summary.
    '''

    # get the data
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)

    # do the fit
    p = glm('y~x', family=Poisson(links.identity), data=df)
    # print() call form works on both Python 2 and Python 3
    print(p.fit().summary())
コード例 #12
0
    def setup_class(cls):
        """Create the array-based Poisson GEE model used by this class.

        Loads ``gee_poisson_1.csv`` through ``load_data`` and stores the
        model in ``cls.mod`` and a fixed parameter vector in
        ``cls.start_params``.
        """
        endog, exog, group_n = load_data("gee_poisson_1.csv")

        # Positional arguments: endog, exog, groups, time (None), family,
        # covariance structure.
        cls.mod = GEE(endog, exog, group_n, None, Poisson(), Independence())

        cls.start_params = np.array([
            -0.03644504,
            -0.05432094,
            0.01566427,
            0.57628591,
            -0.0046566,
            -0.47709315,
        ])
コード例 #13
0
ファイル: test_gee.py プロジェクト: d8aninja/statsmodels
    def test_wrapper(self):
        """Fit a Poisson GEE on pandas inputs and run ``check_wrapper``."""
        raw_endog, raw_exog, raw_groups = load_data("gee_poisson_1.csv",
                                                    icept=False)

        # Convert the loaded data to pandas containers before fitting.
        endog = pd.Series(raw_endog)
        exog = pd.DataFrame(raw_exog)
        group_n = pd.Series(raw_groups)

        model = GEE(endog, exog, group_n, None, Poisson(), Independence())
        check_wrapper(model.fit())
コード例 #14
0
    def model_estimate_coeff(self, X, y):
        """Fit a Poisson GLM and return its estimated parameters.

        Parameters
        ----------
        X : array-like
            Input variables. A constant column is added with
            ``sm.add_constant`` before fitting.
        y : array-like
            Output variable.

        Returns
        -------
        The ``params`` attribute of the fitted GLM (constant term and
        coefficients).
        """
        # Add the constant term to the (bootstrapped) design matrix
        design = sm.add_constant(X)

        # Fit the Poisson model to the (bootstrapped) sample
        poisson_fit = fm.GLM(y, design, family=Poisson()).fit()
        return poisson_fit.params
コード例 #15
0
    def test_offset_formula(self):
        """Check the ways of passing offset/exposure to `from_formula`.

        Offsets and exposures are supplied either as column names or as
        arrays; equivalent specifications must give the same parameters.
        """
        n = 50
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(25), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        # exposure is exp(offset), so log(exposure) equals the offset
        exposure = np.exp(offset)
        Y = np.random.poisson(0.1*(X1 + X2) + 2*offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset, "exposure": exposure})

        fml = "Y ~ X1 + X2"

        def build(**extra):
            # Same formula, groups, data and family; only offset/exposure vary.
            return GEE.from_formula(fml, groups, data, family=Poisson(),
                                    **extra)

        # Reference fit: offset given as a column name.
        result1 = build(offset="offset").fit()
        assert_equal(result1.converged, True)

        # Offset as array, exposure as array, exposure as column name: each
        # must reproduce the reference parameters.
        single_specs = (
            {"offset": offset},
            {"exposure": exposure},
            {"exposure": "exposure"},
        )
        for spec in single_specs:
            alt = build(**spec).fit(start_params=result1.params)
            assert_allclose(result1.params, alt.params)
            assert_equal(alt.converged, True)

        # Offset and exposure together ...
        result5 = build(exposure="exposure", offset="offset").fit()
        assert_equal(result5.converged, True)

        # ... must match a doubled offset.
        result6 = build(offset=2*offset).fit(start_params=result5.params)
        assert_allclose(result5.params, result6.params)
        assert_equal(result6.converged, True)
コード例 #16
0
def rr_cluster(cluster, covs, formula):
    """Set cluster values to reduced-residuals.

    Works on a deep copy of `cluster` and on a copy of `covs`, so neither
    argument is modified.

    Parameters
    ----------
    cluster : sequence of features
        The type of the first element decides the model used for all of
        them. For ``CountFeature`` clusters each feature's ``methylated`` is
        replaced by the rounded integer residuals of a Poisson GLM with
        ``counts`` as exposure, and ``values`` is recomputed as
        ``methylated / counts``. Otherwise ``values`` is replaced by OLS
        residuals and the fitted values are stored in ``ovalues``.
    covs : DataFrame-like
        Covariates referenced by `formula`.
    formula : str
        Model formula; the response column ``methylation`` is filled in
        per feature.

    Returns
    -------
    The modified copy of `cluster`.
    """
    from statsmodels.formula.api import ols, glm

    cluster = deepcopy(cluster)
    # The loop below writes a 'methylation' column; do that on a private
    # copy so the caller's covariates stay untouched.
    covs = covs.copy()

    if isinstance(cluster[0], CountFeature):
        for f in cluster:
            covs['methylation'] = f.methylated
            fit = glm(formula,
                      covs,
                      exposure=f.counts,
                      family=Poisson()).fit()
            f.methylated[:] = np.round(fit.resid).astype(int)
            f.values[:] = f.methylated.astype(float) / f.counts
    else:
        for f in cluster:
            covs['methylation'] = f.values
            fit = ols(formula, covs).fit()
            f.values[:] = fit.resid
            f.ovalues = fit.fittedvalues
    return cluster
コード例 #17
0
ファイル: test_gee.py プロジェクト: lema655/statsmodels
    def test_compare_poisson(self):
        """Poisson GEE with an independence structure must match sm.poisson.

        Both models are fitted to the same simulated data and their
        parameter estimates are compared to ten decimals.
        """
        n_obs = 100
        formula = "Y ~ X1 + X2 + X3"

        # The draws are made in this fixed order: response, three
        # covariates, then the group labels.
        response = np.ceil(-np.log(np.random.uniform(size=n_obs)))
        cov_1 = np.random.normal(size=n_obs)
        cov_2 = np.random.normal(size=n_obs)
        cov_3 = np.random.normal(size=n_obs)
        groups = np.random.randint(0, 4, size=n_obs)

        frame = pd.DataFrame({
            "Y": response,
            "X1": cov_1,
            "X2": cov_2,
            "X3": cov_3,
        })

        gee_model = GEE.from_formula(formula,
                                     frame,
                                     None,
                                     groups=groups,
                                     family=Poisson(),
                                     covstruct=Independence())
        gee_fit = gee_model.fit()

        poisson_fit = sm.poisson(formula, data=frame).fit()

        assert_almost_equal(poisson_fit.params.values,
                            gee_fit.params,
                            decimal=10)
コード例 #18
0
ファイル: dobson.py プロジェクト: EJHortala/books-2
def poisson_regression():
    '''Poisson Regression
    chapter 9.2, p.170 & 171

    Loads the British doctors table through ``get_data``, derives the
    regressors ``smoke``, ``agecat``, ``agesq`` and ``smkage``, fits a
    Poisson GLM of ``deaths`` with ``person-years`` as exposure and prints
    the data and the fit summary.
    '''

    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables
    is_smoker = df['smoking'] == 'smoker'
    is_non_smoker = df['smoking'] == 'non-smoker'

    # Indicator: 1 for smokers, 0 otherwise. A single .loc assignment is
    # used instead of chained indexing, which may write to a temporary copy.
    df['smoke'] = np.zeros(len(df))
    df.loc[is_smoker, 'smoke'] = 1

    df['agecat'] = np.array([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    df['agesq'] = df['agecat']**2

    # Age category, set to zero for the non-smoker rows
    df['smkage'] = df['agecat']
    df.loc[is_non_smoker, 'smkage'] = 0

    model = glm('deaths~agecat+agesq+smoke+smkage',
                family=Poisson(),
                data=df,
                exposure=df["person-years"]).fit()
    print(model.summary())
コード例 #19
0
ファイル: dobson.py プロジェクト: EJHortala/books-2
def log_linear_models():
    '''Log-linear models
    chapter 9.7, p 180 & 182

    Fits Poisson GLMs to two tables loaded through ``get_data``. For the
    melanoma data it prints the first fitted value of the minimal and of
    the additive model; for the ulcer/aspirin data it prints all fitted
    values of the largest model.
    '''

    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)

    # Minimal model
    model_min = glm('frequency~1', family=Poisson(), data=df).fit()
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = glm('frequency~site+type', family=Poisson(), data=df).fit()
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = glm('frequency~site*type', family = Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 -------------------------------------
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    # Nested sequence of models; only the largest one is printed below.
    model1 = glm('freq~GD+CC+GD*CC', family=Poisson(), data=df).fit()
    model2 = glm('freq~GD+CC+GD*CC + AP', family=Poisson(), data=df).fit()
    model3 = glm('freq~GD+CC+GD*CC + AP + AP*CC', family=Poisson(),
                 data=df).fit()
    model4 = glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD',
                 family=Poisson(),
                 data=df).fit()

    print('Ulcer and aspirin')
    print(model4.fittedvalues)
コード例 #20
0
# Track the cost at every iteration (row 0 holds the cost at the starting
# Beta).
J = pd.DataFrame()
J['iterative_step'] = range(0, m + 1)
J['cost'] = np.full(m + 1, None)
# np.asscalar was removed from NumPy; .item() is the direct replacement.
J.loc[0, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta)) +
                    np.dot((A + factorial(Y)).T, ones_vector)).item()

# NOTE(review): for a Poisson log-likelihood the usual Hessian is
# X.T @ diag(A) @ X and the constant term is log(Y!); confirm that the
# expressions used below are intended.
for i in range(1, m + 1):
    J_partial_Beta = np.dot(X.T, (A - Y))
    J_2partial_Beta2 = np.dot(A.T, ones_vector) * np.dot(X.T, X)

    # Newton step, then refresh the linear predictor and its exponential
    Beta = Beta - np.dot(inv(J_2partial_Beta2), J_partial_Beta)
    Z = np.dot(X, Beta)
    A = np.exp(Z)
    J.loc[i, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta)) +
                        np.dot((A + factorial(Y)).T, ones_vector)).item()
    del J_partial_Beta

plt.plot(J['iterative_step'], J['cost'])
plt.title('Newton Rhapson')
plt.xlabel('Iterative Step')
plt.ylabel('Cost')
Beta

## built in package
results = sm.glm(
    formula=
    "S_Length ~ S_Width + P_Length + P_Width + Species_setosa + Species_versicolor",
    data=LR_df,
    family=Poisson()).fit()
print(results.params)
コード例 #21
0
        [0., 1, -1, 0, 0],
    ])
    rhs = np.r_[0.0, ]

    # Loop over data generating models
    for gendat in gendats:

        pvalues = []
        params = []
        std_errors = []
        dparams = []

        for j in range(nrep):

            da, va = gendat()
            ga = Poisson()

            # Poisson seems to be more sensitive to starting values,
            # so we run the independence model first.
            md = GEE(da.endog, da.exog, da.group, da.time, ga, Independence())
            mdf = md.fit()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit(start_params=mdf.params)
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            scale_inv = 1. / md.estimate_scale()
            dparams.append(np.r_[va.dparams, scale_inv])
            params.append(np.asarray(mdf.params))
コード例 #22
0
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 12 11:36:51 2016

@author: emg
"""

import numpy as np
import pandas as pd
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                           Autoregressive)
from statsmodels.genmod.families import Poisson

# Poisson GEE of author_count on top + mod with an independence working
# correlation; "author" is passed as the grouping argument.
ind = Independence()
fam = Poisson()
model1 = GEE.from_formula(
    "author_count ~ top + mod",
    "author",
    authors,
    family=fam,
    cov_struct=ind,
)
result1 = model1.fit()
print(result1.summary())
コード例 #23
0
ファイル: test_gee.py プロジェクト: lema655/statsmodels
    def test_poisson(self):
        """
        Compare Poisson GEE fits with reference values from R's gee package.

        Independence and exchangeable structures are checked, first with
        array inputs and then through the formula interface. The reference
        values were produced with the following R code:

        library(gee)
        Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]
        X4 = Z[,6]
        X5 = Z[,7]

        mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="independence", scale.fix=TRUE)
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
                corstr="exchangeable", scale.fix=TRUE)
        sme = summary(me)

        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s]]", cfi, cfe)
        sprintf("se = [[%s],[%s]]", sei, see)
        """
        family = Poisson()
        endog, exog, group_n = load_data("gee_poisson_1.csv")

        # The same structure instances are used for the array-based and the
        # formula-based fits.
        cov_structs = (Independence(), Exchangeable())

        # From R gee: first row independence, second row exchangeable
        cf = [
            [
                -0.0364450410793481, -0.0543209391301178,
                0.0156642711741052, 0.57628591338724,
                -0.00465659951186211, -0.477093153099256,
            ],
            [
                -0.0315615554826533, -0.0562589480840004,
                0.0178419412298561, 0.571512795340481,
                -0.00363255566297332, -0.475971696727736,
            ],
        ]
        se = [
            [
                0.0611309237214186, 0.0390680524493108,
                0.0334234174505518, 0.0366860768962715,
                0.0304758505008105, 0.0316348058881079,
            ],
            [
                0.0610840153582275, 0.0376887268649102,
                0.0325168379415177, 0.0369786751362213,
                0.0296141014225009, 0.0306115470200955,
            ],
        ]

        def check_fit(model, expected_params, expected_se):
            # Fit and compare against one row of the reference tables.
            fitted = model.fit()
            assert_almost_equal(fitted.params, expected_params, decimal=5)
            assert_almost_equal(fitted.standard_errors(), expected_se,
                                decimal=6)

        for v, cf_row, se_row in zip(cov_structs, cf, se):
            check_fit(GEE(endog, exog, group_n, None, family, v),
                      cf_row, se_row)

        # Test with formulas
        pieces = (endog[:, None], group_n[:, None], exog[:, 1:])
        names = ["Y", "Id"]
        names += ["X%d" % num for num in range(1, exog.shape[1])]
        D = pd.DataFrame(np.concatenate(pieces, axis=1))
        D.columns = names

        for v, cf_row, se_row in zip(cov_structs, cf, se):
            formula_model = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5",
                                             D,
                                             None,
                                             groups=D.loc[:, "Id"],
                                             family=family,
                                             covstruct=v)
            check_fit(formula_model, cf_row, se_row)
コード例 #24
0
ファイル: interactions.py プロジェクト: PanditPranav/taggit
def _plot_permutation_panel(ax, null_coefs, observed, number_of_permutations,
                            label):
    """Draw one histogram of permuted coefficients and label it with p.

    `null_coefs` are the coefficients from the permuted fits, `observed` is
    the coefficient from the unpermuted fit (drawn as a vertical line).
    p is the share of permuted coefficients above `observed`, replaced by
    ``1 - p`` when it exceeds 0.5, and is appended to `label` with two
    decimals.
    """
    ax.hist(null_coefs, bins=100)
    ax.axvline(x=observed, color='#fc9272')
    p = (null_coefs > observed).sum() / float(number_of_permutations)
    if p > 0.5:
        p = 1 - p
    ax.set_xlabel(label + '{0:.2f}'.format(p))


def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    """Permutation test of a Poisson GEE on node attributes of `network`.

    A data frame is built from the node attributes, degree and betweenness
    centrality of `network`. ``dependent ~ Age + Sex`` is fitted as a
    Poisson GEE grouped by ``Location`` (independence structure), then
    refitted `number_of_permutations` times with the dependent column
    randomly permuted. The permuted coefficients for ``Age[T.HY]``,
    ``Age[T.UNK]`` and ``Sex[T.M]`` are plotted as histograms against the
    observed coefficient.

    Parameters
    ----------
    dependent : str
        Name of the response column (one of the columns of the node frame).
    network : graph object
        Must provide ``nodes(data=True)`` and ``degree()`` and be accepted
        by ``nx.betweenness_centrality``.
    number_of_permutations : int
        Number of permuted refits.
    output_path : str
        Directory in which ``<dependent>_Permutation_test.png`` is saved.

    Returns
    -------
    None. The figure is saved and shown.
    """
    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import Independence
    from statsmodels.genmod.families import Poisson

    nodes = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                   orient='index')
    degree = pd.DataFrame.from_dict(dict(network.degree()), orient='index')
    centrality = pd.DataFrame.from_dict(dict(
        nx.betweenness_centrality(network)),
                                        orient='index')
    h1 = pd.concat([nodes, degree, centrality], axis=1).reset_index(0)
    h1.columns = [
        'ID', 'Age', 'Species', 'type', 'Location', 'Sex', 'degree',
        'centrality'
    ]
    h1['degree_dist'] = h1.degree / float(h1.degree.max())

    equation = dependent + "~ Age + Sex"

    def fit_params(frame):
        # A fresh family and covariance structure are used for every fit.
        model = GEE.from_formula(equation,
                                 "Location",
                                 frame,
                                 cov_struct=Independence(),
                                 family=Poisson())
        return model.fit().params

    main_result = pd.DataFrame(fit_params(h1)).T

    degree_random_coeff = []
    for _ in range(number_of_permutations):
        rand_h1 = h1.copy()
        rand_h1[dependent] = np.random.permutation(h1[dependent])
        degree_random_coeff.append(fit_params(rand_h1))

    d = pd.DataFrame.from_records(degree_random_coeff)

    # NOTE(review): sns is never referenced; the import is kept because
    # importing seaborn may affect plot styling in some versions -- confirm
    # before removing.
    import seaborn as sns

    # (model term, x-axis label prefix) for each of the three panels
    panels = (
        ('Age[T.HY]',
         'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= '),
        ('Age[T.UNK]',
         'Coefficient Age: Unknown\n(ref: After Hatch Year)\np= '),
        ('Sex[T.M]', 'Coefficient Sex: Male\n (ref: Female)\np= '),
    )
    f, axes = plt.subplots(1, 3, sharey=True)
    for ax, (term, label) in zip(axes, panels):
        _plot_permutation_panel(ax, d[term], main_result[term].values[0],
                                number_of_permutations, label)
    axes[0].set_ylabel('Frequency')

    title = 'permutation test for ' + dependent
    f.suptitle(title)
    plt.tight_layout()
    plt.savefig(output_path + '/' + dependent + '_Permutation_test.png',
                dpi=300)
    plt.show()
コード例 #25
0
# Likelihood-ratio test comparing the two panel models (2 degrees of freedom)
LR = 2 * (model_panel2_results.llf - model_panel1_results.llf)
p = chi2.sf(LR, 2)
print('p: %.30f' % p)

# Summarise the outcome and how many of its values are zero
cases_count_pos = US_cases_long_demogr_week['cases_count_pos']
cases_value_counts = cases_count_pos.value_counts()
print(cases_count_pos.describe())
print(cases_value_counts)
count_total = sum(cases_value_counts.to_dict().values())
count_zero = cases_value_counts[0.0]
print("Count of zero is {}, about {:.4f} of the data.".format(
    count_zero, count_zero / count_total))

# Approach one to generalized linear models for panel data:
# Generalized Estimating Equations.
gee_formula = ("cases_count_pos ~ week_of_year + percent_age65over + "
               "percent_female + percent_black")

# Poisson model with an autoregressive structure; the log of the 2019
# population count is used as offset.
poi = Poisson()
ar = Autoregressive()
gee_model0 = GEE.from_formula(
    gee_formula,
    groups="state",
    data=US_cases_long_demogr_week,
    time='week_of_year',
    cov_struct=ar,
    family=poi,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])))
gee_model0_results = gee_model0.fit(maxiter=200)
print(gee_model0_results.summary())
print(ar.summary())
print("scale=%.2f" % (gee_model0_results.scale))

# The fit above warns "IterationLimitWarning: Iteration limit reached prior
# to convergence", even with maxiter = 2000, so specific starting values are
# needed to get the estimating algorithm to converge.
# First run with an exchangeable dependence structure. We know from this
# model that the within-state correlation is roughly 0.077.
fam = Poisson()
ex = Exchangeable()
ex_model = GEE.from_formula(
    gee_formula,
    groups="state",
    data=US_cases_long_demogr_week,
    cov_struct=ex,
    family=fam,
    offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])))
コード例 #26
0
def scanline_harmonization(source_gdf,
                           target_gdf,
                           pop_string,
                           raster_path,
                           verbose=True,
                           auxiliary_type='nlcd',
                           regression_method='Poisson',
                           codes=[21, 22, 23, 24],
                           n_pixels_option_values=256,
                           ReLU=True,
                           **kwargs):
    """Function that generates an interpolated population using scanlines with the entire pipeline.
    
    Parameters
    ----------
    
    source_gdf             : geopandas GeoDataFrame with geometry column of polygon type for the source set of polygons desired.
    
    target_gdf             : geopandas GeoDataFrame with geometry column of polygon type for the target set of polygons desired.
    
    pop_string             : the name of the variable on geodataframe that the interpolation shall be conducted.
    
    raster_path            : the path to the associated raster image.
    
    verbose                : bool. Default is True.
                             Whether the function will print progress steps.
    
    auxiliary_type         : string. The type of the auxiliary variable for the desired method of interpolation. Default is 'nlcd' for the National Land Cover Dataset.
                             Accepted for interface compatibility; it is not read by this function.
    
    regression_method      : the method used to estimate the weights of each land type and population. Default is "Poisson".
                        
        "Poisson"  : performs Generalized Linear Model with a Poisson likelihood with log-link function.
        "Gaussian" : a Generalized Linear Model with a Gaussian family (the least-squares fit) will be fitted.
        "XGBoost"  : an Extreme Gradient Boosting regression will be fitted and the weights will be extracted from the Shapelys value from each land type.

    codes                  : an integer list of codes values that should be considered as 'populated' for the raster file. See (1) in notes.
                             The sequence is sorted internally on a copy; the argument itself is never modified.
    
    n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.
    
    ReLU                   : bool. Default is True.
                             Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.
                             
    **kwargs               : additional arguments that can be passed to internal functions.
                             Currently `tuned_xgb` or `gbm_hyperparam_grid` can be passed to internal XGBoost approach.

    Returns
    -------

    interpolate_df         : pandas DataFrame made of `target_gdf` (index reset) concatenated column-wise with the output of `scanlines_interpolate`.

    Raises
    ------

    ValueError             : if `regression_method` is not one of 'Poisson', 'Gaussian' or 'XGBoost'.

    Notes
    -----

    1) Since this was inspired using the National Land Cover Database (NLCD), it is established some default values for this argument.
       The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
       The description of each code for NLCD can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html    
    
    """

    # Fail fast: an unknown method would otherwise run the whole first scanline
    # pass and only then crash on the undefined `weights` variable.
    valid_methods = ('Poisson', 'Gaussian', 'XGBoost')
    if regression_method not in valid_methods:
        raise ValueError(
            'regression_method must be one of {}; got {!r}'.format(
                valid_methods, regression_method))

    # Work on a sorted copy so that neither the caller's list nor the shared
    # default list is mutated; the sorted order fixes the weight positions.
    codes = sorted(codes)

    if verbose: print('INITIALIZING FIRST SCANLINES')
    profiled_df_pre = scanlines_count_pixels(source_gdf,
                                             raster_path,
                                             verbose=verbose)

    profiled_df = pd.concat([source_gdf.reset_index(), profiled_df_pre],
                            axis=1)

    # Regression WITHOUT intercept on the per-code 'Type_<code>' columns.
    str_codes = [str(i) for i in codes]
    type_columns = ['Type_' + s for s in str_codes]
    formula_string = pop_string + ' ~ -1 + ' + " + ".join(type_columns)

    if regression_method == 'Poisson':
        results = smf.glm(formula_string, data=profiled_df,
                          family=Poisson()).fit()
        weights = np.array(results.params)
    elif regression_method == 'Gaussian':
        results = smf.glm(formula_string, data=profiled_df,
                          family=Gaussian()).fit()
        weights = np.array(results.params)
    else:  # 'XGBoost' (validated above)
        weights = _return_xgboost_weights(profiled_df, pop_string, str_codes,
                                          **kwargs)

    if ReLU:
        # Clip negative land-type weights to zero.
        weights = np.where(weights < 0, 0, weights)

    # Correction Term (CT): observed population divided by the weighted pixel
    # count of each source polygon; NaN results are replaced by nan_to_num.
    profiled_df['denominator'] = (
        np.array(profiled_df[type_columns]) * weights).sum(axis=1)
    profiled_df['CT'] = np.nan_to_num(profiled_df[pop_string] /
                                      profiled_df['denominator'])
    scan_line_input_CT = profiled_df[['geometry', 'CT']]

    # Expand the per-code weights into a vector indexed by raw pixel value.
    long_weights = np.zeros(n_pixels_option_values)
    long_weights[codes] = weights

    if verbose: print('\nINITIALIZING SECOND SCANLINES')
    interpolate = scanlines_interpolate(target_gdf=target_gdf,
                                        source_CTs=scan_line_input_CT,
                                        weights_long=long_weights,
                                        raster_path=raster_path,
                                        verbose=verbose)

    interpolate_df = pd.concat([target_gdf.reset_index(), interpolate], axis=1)

    return interpolate_df
コード例 #27
0
# Let's actually look at the distribution of monthly Enrollments by TP
# `run_line_magic` replaces the deprecated `get_ipython().magic(...)` call.
get_ipython().run_line_magic('matplotlib', 'inline')

y = train['Enrolls']

sns.distplot(y)
plt.show()


# #### First Attempt - Poisson Regression
# We utilize a Poisson regression here because our dependent variable, Enrolls, is a count with a relatively small range. Since the distribution of the error terms will therefore not be independent and identically distributed, we do not use OLS.

# In[17]:

poisson = sm.GLM(y_train, new_x, family = Poisson()).fit()
# poisson.summary()


# In[18]:

# Mean of the response; compared with the variance in the next cell.
y_train.mean()


# In[19]:

# Variance of the response.
y_train.var()


# #### Second Attempt - Negative Binomial
# Shouldn't use Poisson, because the variance does not equal the mean. Trying a Negative Binomial instead.
コード例 #28
0
def _return_weights_from_regression(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    likelihood="poisson",
    formula_string=None,
    n_pixels_option_values=256,
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):

    """Function that returns the weights of each land type according to NLCD
    types/codes.

    Parameters
    ----------
    geodataframe :  geopandas.GeoDataFrame 
        used to build regression
    raster_path : str
        the path to the associated raster image.
    formula_string : str
        patsy-style model formula. When omitted, an intercept-free formula
        regressing `pop_string` on the 'Type_<code>' columns is built.
        The fitted parameters are assigned positionally to the sorted `codes`,
        so a custom formula must yield exactly one parameter per code, in
        that order.
    pop_string : str
        the name of the variable on geodataframe that the regression shall be conducted
    codes : list
        an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
        The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
        The sequence is sorted internally on a copy; the argument itself is never modified.
    likelihood : str, {'poisson', 'gaussian'}
        the likelihood assumed for the dependent variable (population). Matched case-insensitively, so 'Poisson' and 'Gaussian' are accepted as well.
        With the 'poisson' a Generalized Linear Model with log as link function will be fitted and 'gaussian' an ordinary least squares will be fitted.
    n_pixels_option_values : int
        number of options of the pixel values of rasterior. Default is 256.
    force_crs_match   : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
        It is recommended to let this argument as True.
    na_value : int. Default is 255.
        The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
    ReLU : bool. Default is True.
         Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Returns
    -------
    weights : numpy.ndarray
        array of length `n_pixels_option_values`, zero everywhere except at
        the positions given by `codes`, which hold the fitted coefficients.

    Raises
    ------
    ValueError
        if `na_value` is one of `codes`, or `likelihood` is not recognised.

    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    """

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    # Accept both the documented capitalised names and the lowercase ones.
    if isinstance(likelihood, str):
        likelihood = likelihood.lower()
    if likelihood not in ("poisson", "gaussian"):
        raise ValueError("likelihood must one of 'poisson', 'gaussian'")

    profiled_df = _fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )  # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).

    # Sort a copy so the position of each weight matches its code without
    # mutating the caller's list (or the shared default list).
    codes = sorted(codes)

    if not formula_string:
        # Formula WITHOUT intercept
        formula_string = (
            pop_string
            + " ~ -1 + "
            + " + ".join("Type_" + str(code) for code in codes)
        )

    if likelihood == "poisson":
        results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()
    else:  # "gaussian" (validated above)
        results = smf.ols(formula_string, data=profiled_df).fit()

    # Scatter the fitted coefficients into a vector indexed by pixel value.
    weights = np.zeros(n_pixels_option_values)
    weights[codes] = results.params

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
コード例 #29
0
def return_weights_from_regression(geodataframe,
                                   raster,
                                   pop_string,
                                   codes=[21, 22, 23, 24],
                                   likelihood='Poisson',
                                   n_pixels_option_values=256,
                                   force_crs_match=True,
                                   na_value=255):
    """Function that returns the weights of each land type according to NLCD types/codes
    
    Parameters
    ----------
    
    geodataframe           : a geopandas geoDataFrame used to build regression
    
    raster                 : a raster (from rasterio.open) that has the types of each pixel in the geodataframe
    
    pop_string             : the name of the variable on geodataframe that the regression shall be conducted
    
    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
                             The sequence is sorted internally on a copy; the argument itself is never modified.
                             
    likelihood             : the likelihood assumed for the dependent variable (population). 
                             It can be 'Poisson' or 'Gaussian' (matched case-insensitively). 
                             With the 'Poisson' a Generalized Linear Model with log as link function will be fitted and 'Gaussian' an ordinary least squares will be fitted. 
                             
    n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.
    
    force_crs_match        : bool. Default is True.
                             Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file. 
                             It is recommended to let this argument as True.
    
    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
    
    Returns
    -------
    
    weights                : numpy array of length `n_pixels_option_values`, zero everywhere except at the positions
                             given by `codes`, which hold the fitted regression coefficients.
    
    Raises
    ------
    
    ValueError             : if `na_value` is one of `codes`, or `likelihood` is not recognised.
    
    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
    
    """

    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError('codes should not assume the na_value value.')

    # Compare case-insensitively so 'poisson'/'gaussian' work as well.
    method = likelihood.lower() if isinstance(likelihood, str) else likelihood
    if method not in ('poisson', 'gaussian'):
        raise ValueError('likelihood must one of \'Poisson\', \'Gaussian\'')

    print('Appending profile...')
    profiled_df = append_profile_in_gdf(
        geodataframe[['geometry', pop_string]], raster, force_crs_match
    )  # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).
    print('Append profile: Done.')

    # Sort a copy so the position of each weight matches its code without
    # mutating the caller's list (or the shared default list).
    codes = sorted(codes)

    # Formula WITHOUT intercept
    formula_string = pop_string + ' ~ -1 + ' + " + ".join(
        'Type_' + str(code) for code in codes)

    print('Starting to fit regression...')
    if method == 'poisson':
        results = smf.glm(formula_string, data=profiled_df,
                          family=Poisson()).fit()
    else:  # 'gaussian' (validated above)
        results = smf.ols(formula_string, data=profiled_df).fit()

    # Scatter the fitted coefficients into a vector indexed by pixel value.
    weights = np.zeros(n_pixels_option_values)
    weights[codes] = results.params

    return weights
コード例 #30
0
######################
## Gradient Descent ##
######################
# Batch gradient descent for a log-link count regression: A holds exp(X @ Beta)
# and the update direction is X.T @ (A - Y). Y, X, Beta, A and ones_vector are
# expected to be defined before this fragment.
m = 50000        # number of gradient steps
alpha = 0.0002   # step size
J = pd.DataFrame()
J['iterative_step'] = range(0,m+1)
J['cost'] = np.full(m+1, None)
# NOTE(review): the cost adds factorial(Y) rather than log(factorial(Y)); this
# term does not depend on Beta, so the updates are unaffected -- confirm intent.
# `.item()` replaces `np.asscalar`, which was deprecated in NumPy 1.16 and
# later removed; both extract the single element as a Python scalar.
J.loc[0, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta)) + np.dot((A+factorial(Y)).T, ones_vector)).item()

for i in range(1, m+1):
    J_partial_Beta = np.dot(X.T, (A-Y))
    Beta = Beta - (alpha*J_partial_Beta)
    # Refresh the linear predictor and the fitted values for the new Beta.
    Z = np.dot(X, Beta)
    A = np.exp(Z)
    J.loc[i, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta)) + np.dot((A+factorial(Y)).T, ones_vector)).item()
    del J_partial_Beta

plt.plot(J['iterative_step'], J['cost'])
plt.title('Gradient Descent') 
plt.xlabel('Iterative Step') 
plt.ylabel('Cost') 
Beta

## built in package
# NOTE(review): lowercase `glm` is the formula interface (statsmodels.formula.api);
# confirm that `sm` is bound to that module rather than statsmodels.api.
results = sm.glm(formula="S_Length ~ S_Width + P_Length + P_Width + Species_setosa + Species_versicolor", data=LR_df, family=Poisson()).fit()
print(results.params)