コード例 #1
0
def run_eqtl_one_test_nb_lm(expression, genotype, covariates, lib_size):
    """Run a single eQTL association test with a negative binomial model.

    Fits an alternate model (genotype + covariates) and a null model
    (covariates only), and compares them via McFadden's pseudo R^2.

    Args:
        expression: per-sample counts for one gene (endog).
        genotype: per-sample genotype vector for one variant.
        covariates: sample-by-covariate matrix.
        lib_size: per-sample library sizes, used as the model exposure.

    Returns:
        (beta, standard_error, pvalue, pseudo_r_squared) for the genotype
        term; each is the string "nan" if either fit fails.
    """
    # FIT ALTERNATE MODEL: genotype as the first column, then covariates.
    X = np.vstack((genotype, covariates.T)).T
    # Add intercept
    X2 = sm.add_constant(X)

    try:
        model = sm.NegativeBinomial(expression, X2, exposure=lib_size)
        fit = model.fit()
        # Index 1 is the genotype term (index 0 is the intercept).
        beta = fit.params[1]
        standard_error = fit.bse[1]
        pvalue = fit.pvalues[1]
        ll_full = fit.llf

        # FIT NULL MODEL: covariates only, no genotype term.
        X3 = sm.add_constant(covariates)
        model = sm.NegativeBinomial(expression, X3, exposure=lib_size)
        fit = model.fit()
        ll_null = fit.llf

        # McFadden's pseudo R^2.
        pseudo_r_squared = 1.0 - (ll_full / ll_null)
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed.  String "nan" kept for caller compat.
        beta = "nan"
        standard_error = "nan"
        pvalue = "nan"
        pseudo_r_squared = "nan"

    return beta, standard_error, pvalue, pseudo_r_squared
コード例 #2
0
 def setup(self):
     """Refit the model for each test; tests mutate the results object."""
     np.random.seed(987689)
     data = sm.datasets.randhie.load(as_pandas=False)
     # NOTE(review): the original computed sm.add_constant(data.exog) into
     # an unused local; the dead store was removed.  The model is (still)
     # fit without an intercept column.
     mod = sm.NegativeBinomial(data.endog, data.exog)
     self.results = mod.fit(disp=0)
コード例 #3
0
 def setup(self):
     """Refit the model for each test; tests mutate the results object."""
     np.random.seed(987689)
     data = sm.datasets.randhie.load()
     # NOTE(review): an unused sm.add_constant(data.exog) local was removed;
     # the fit uses data.exog directly (no intercept), which matches the
     # 10-element start_params (9 slopes + dispersion alpha).
     mod = sm.NegativeBinomial(data.endog, data.exog)
     start_params = np.array([
         -0.0565406, -0.21213599, 0.08783076, -0.02991835, 0.22901974,
         0.0621026, 0.06799283, 0.08406688, 0.18530969, 1.36645452
     ])
     self.results = mod.fit(start_params=start_params, disp=0)
コード例 #4
0
 def setup(self):
     """Refit before every test, because tests mutate ``self.results``."""
     np.random.seed(987689)
     data = sm.datasets.randhie.load(as_pandas=False)
     # Design matrix with an intercept appended as the last column.
     design = sm.add_constant(data.exog, prepend=False)
     model = sm.NegativeBinomial(data.endog, design)
     # Warm-start the optimizer near the known solution.
     start = np.array([
         -0.05783623, -0.26655806, 0.04109148, -0.03815837, 0.2685168,
         0.03811594, -0.04426238, 0.01614795, 0.17490962, 0.66461151,
         1.2925957,
     ])
     self.results = model.fit(start_params=start, disp=0, maxiter=500)
     # The dispersion parameter is the last entry of params.
     self.transform_index = -1
コード例 #5
0
def test_poi_nb_zip_zinb_tiny_subset(meta, m):
    """Compare Poisson / NB / ZIP / ZINB fits on a random subset of ``m``.

    Args:
        meta: unused here; kept for interface compatibility with callers.
        m: matrix whose first column is the count response and whose
           remaining columns are regressors.

    Prints the RMSE of each model's test-set predictions.
    """
    # Column names of the source data; [4:] skips rowid/lat/lon/target.
    # Only needed by the commented-out summary() call below.
    exog_names = r"rowid;latitude;longitude;target;dbuiltup;dforest;drecreation;dbrr;dwrl;dwrn;dwrr;dcamping;dcaravan;dcross;dgolf;dheem;dhaven;dsafari;dwater;attr;dbath;lu;lc;maxmeanhaz;maxstdhaz".split(";")[4:]

    np.random.seed(2)

    # Sample 800 rows (with replacement) for a tiny, fast subset.
    randint = np.random.randint(0, high=len(m)-1, size=800)

    msel = m[randint,:]

    Y = msel[:, 0]
    X = msel[:, 1:]

    # Ynz, Xnz = trim_value(Y, X, 0)

    print("Msel shape: ", msel.shape)

    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60, random_state=42)

    print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

    # The bare ``print`` statements below were Python-2 leftovers that are
    # silent no-op expressions in Python 3; restored as print() calls.
    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    # 95th-percentile prediction from the fitted Poisson mean.
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))
    # print(np.unique(poi_ppf_obs, return_counts=True))
    print("RMSE Poisson: ", poi_rmse)
    # print(poi_mod.summary(yname='tickbites', xname=exog_names))

    print()
    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params = None, method = 'newton', maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))
    # print(np.unique(nb_pred, return_counts=True))
    print("RMSE Negative Binomial: ", nb_rmse)

    print()
    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    # Constant-only inflation design for prediction.
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))
    print("RMSE Zero-Inflated Poisson", zip_rmse)

    print()
    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
コード例 #6
0
def regression(df, a, b, c, d, distribution):
    """Estimate vaccine efficacy (VE) and its confidence interval by regression.

    Calculates VE and CIs according to
    https://timeseriesreasoning.com/contents/estimation-of-vaccine-efficacy-using-logistic-regression/
    * We'll use Patsy to carve out the X and y matrices
    * Build and train a Logit / Poisson / NegativeBinomial model

    Args:
        df: frame with INFECTED and VACCINATED columns.
        a: sick vax (unused here).
        b: sick unvax.
        c: total vax (unused here).
        d: total unvax.
        distribution: one of "logit", "poisson", "neg_bin".

    Raises:
        ValueError: if ``distribution`` is not one of the supported names.

    Returns:
        None (writes the VE summary line via ``stl.write``).
    """

    p_sick_unvax = b / d
    #Form the regression equation
    expr = 'INFECTED ~  VACCINATED'

    #We'll use Patsy to carve out the X and y matrices
    y_train, X_train = dmatrices(expr, df, return_type='dataframe')

    # Build and train the requested model.  Fix: ``disp`` is a fit() option,
    # not a constructor option (statsmodels rejects it as an unknown kwarg),
    # so it is now passed only to fit() below.
    if distribution == "logit":
        model = sm.Logit(endog=y_train, exog=X_train)
    elif distribution == "poisson":
        model = sm.Poisson(endog=y_train, exog=X_train)
    elif distribution == "neg_bin":
        model = sm.NegativeBinomial(endog=y_train, exog=X_train)
    else:
        # Previously an unknown name fell through and raised NameError on
        # ``model``; fail explicitly instead.
        raise ValueError(f"unknown distribution: {distribution!r}")

    results = model.fit(disp=False)
    params = results.params

    #Print the model summary
    #stl.write(logit_results.summary2())

    VE = VE_(params[1], p_sick_unvax)

    # stl.write(f"\nConfidence intervals")
    # stl.write(logit_results.conf_int())  # confidence intervals

    # conf_int(): column 0 = lower bound, column 1 = upper; row 1 is the
    # VACCINATED coefficient.  NOTE(review): 'high' is bound to the LOWER
    # bound here — presumably VE_ is decreasing in beta; confirm mapping.
    conf = results.conf_int()
    high, low = conf[0][1], conf[1][1]
    prsquared = results.prsquared
    VE_low, VE_high = VE_(low, p_sick_unvax), VE_(high, p_sick_unvax)
    stl.write(
        f"VE Regression {distribution}                       : {VE} % [{VE_low} , {VE_high}] | pseudo-R2 = {prsquared}"
    )
コード例 #7
0
def SPNegativeBinomial(context):
    """Fit a negative binomial regression from the workflow context.

    Reads the upstream node's frame from ``context.args.inputData``,
    selects the configured feature/label columns, and returns the fitted
    statsmodels results object.
    """
    args = context.args
    # Data frame sent by the previous node.
    frame = args.inputData

    exog = frame[args.featureColumns].values
    endog = frame[args.labelColumn].values

    model = sm.NegativeBinomial(endog, exog, missing=args.missing)
    return model.fit(method=args.method)
コード例 #8
0
 def NBFit(cnts):
     '''
     Negative binomial fit (intercept-only, NB1 parameterisation).
     Parameters:
         cnts: list or pandas.Series
             list of counts
     Returns:
         size, prob: float
             negative binomial parameters.
     '''
     endog = list(cnts)
     # Intercept-only design matrix: one column of ones.
     exog = numpy.ones(len(endog))
     res = sm.NegativeBinomial(endog, exog, loglike_method='nb1').fit(
         start_params=[0.1, 0.1], disp=True)
     # params[0] is the log-mean intercept, params[1] the dispersion alpha.
     mu = numpy.exp(res.params[0])
     alpha = res.params[1]
     size = mu / alpha
     prob = size / (size + mu)
     return size, prob
コード例 #9
0
ファイル: rage_regmodels.py プロジェクト: tadesouaiaia/rage
    def regress_nb(self, Y, X, interest=None):
        # NOTE(review): legacy Python 2 code (bare `print` statements).
        # This method exits via sys.exit() partway through; everything after
        # the first sys.exit() below is unreachable scratch code referencing
        # names (res_nbin, my_vals, binInt, binRate, m, ...) that are never
        # defined in this scope.

        print 'uh'

        #sm.GLM(data.endog, data.exog, family=sm.families.Gamma())
        # The first GLM fit is immediately overwritten by the second;
        # presumably the `variance=10` experiment was abandoned.
        foo = sm.GLM(Y, X, family=sm.families.NegativeBinomial(),
                     variance=10).fit()
        foo = sm.GLM(Y, X, family=sm.families.NegativeBinomial()).fit()

        print foo.summary()

        # Discrete NB model on log1p-transformed responses.
        model = sm.NegativeBinomial([log(y + 1.0) for y in Y], X).fit()
        #		print model.summary()
        sys.exit()

        # --- unreachable from here on ---
        print len(model.pvalues)
        print len(model.params)

        print model.bic
        #		print model.rsquared
        #		print model.rsquared_adj

        for v in vars(model._results):
            print v
        #		print model.summary()

        for v in vars(model.model):
            print v

#		p_out = {'params': r_out, 'bic': model.bic, 'rs': model.rsquared, 'ars': model.rsquared_adj, 'resids': model.resid, 'pwr': pwr}

        mPV, aPV = res_nbin.pvalues
        nbM, nbA = exp(res_nbin.params[0]), res_nbin.params[1]
        estX, estP = convert_nb(nbM, nbA)
        my_comps = stats.nbinom.rvs(estX, estP, size=len(my_vals))
        chiT, chiP = self.bin_chi(my_vals, my_comps,
                                  min(binInt, int(len(my_vals) * binRate)))
        self.tests['nbin'] = (chiT, chiP)

        nbAIC, nbBIC = res_nbin.aic, res_nbin.bic

        print m.name, len(vals), len(
            dZ
        ), val_type, 'neg-binom', chiT, chiP, "|", mPV, aPV, '|', nbAIC, nbBIC

        sys.exit()
コード例 #10
0
def tiny_negbin(l):
    """Fit a negative binomial model on rows shaped [y, x1, ..., xk].

    Args:
        l: sequence of rows; column 0 is the count response, the remaining
           columns are regressors.

    Returns:
        [fitted model or None, predictions or None, rmse (0 on failure)].
    """
    print("\t\tRunning NegBin")
    nb_mod, nb_pred = [None for i in range(2)]
    nb_rmse = 0
    xtr = np.array([item[1:] for item in l])
    ytr = np.array([item[0] for item in l]).reshape(-1, 1)
    try:
        nb_mod = sm.NegativeBinomial(ytr, xtr).fit(start_params=None, method='newton', maxiter=50, disp=0)
        nb_pred = nb_mod.predict(xtr)
        nb_rmse = np.sqrt(mean_squared_error(ytr, nb_pred))
    except np.linalg.LinAlgError as e:
        if 'Singular matrix' in str(e):
            print("\t\t\tIgnored a singular matrix.")
        else:
            # Was silently swallowed; surface unexpected linear-algebra
            # failures instead of hiding them.
            raise
    except ValueError:
        print("\t\t\tIgnored output containing np.nan or np.inf")

    return [nb_mod, nb_pred, nb_rmse]
コード例 #11
0
ファイル: h3.py プロジェクト: afogarty85/latent_control
def negbin_cdf(series):
    '''
    This function takes a np.array and returns a negative
    binomial CDF for overdispersed count data.
    # prob. that x is less than or equal to val.
    '''
    # Flatten into a plain 1-d array of values.
    values = np.array([series.tolist()]).flatten()
    # Intercept-only design matrix for the fit.
    design = np.ones(len(values))
    # Fit an NB2 negative binomial and pull out mu and alpha.
    fit = sm.NegativeBinomial(values, design, loglike_method='nb2').fit()
    mu = np.exp(fit.params[0])
    alpha = fit.params[1]
    # Q is 0 for the nb2 method (it would be 1 for nb1).
    Q = 0
    # Convert (mu, alpha) to scipy's (n=size, p=prob) parameterisation.
    size = 1. / alpha * mu**Q
    prob = size / (size + mu)
    return nbinom.cdf(values, n=size, p=prob)
コード例 #12
0
ファイル: negBinRegEx.py プロジェクト: scc-usc/LAcrime
"""
Negative binomial regression example
"""

import statsmodels.api as sm

endog = [2, 2, 3]
exog = [[1, 2], [2, 4], [5, 6]]

#endog1 = [1,2,3]
exog1 = [[1, 2], [2, 4]]

nbm = sm.NegativeBinomial(endog, sm.add_constant(exog), loglike_method='nb2')

res = nbm.fit()

#print(res.summary())

pred = list(res.predict(exog1))
print pred
コード例 #13
0
xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                Y,
                                                train_size=0.60,
                                                random_state=42)

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

print("Model: Poisson")
poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
poi_mean_pred = poi_mod.predict(xtest)
poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))

print("Model: Neg. Binomial")
nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None,
                                                 method='newton',
                                                 maxiter=50)
nb_pred = nb_mod.predict(xtest)
nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))

print(np.ones(len(xtest)).shape)

print("Model: Zero Inflated Poisson")
zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton",
                                                     maxiter=50)
zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))

print("Model: Zero Inflated Neg. Binomial")
zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain,
コード例 #14
0
ファイル: models.py プロジェクト: m0rr15/workflow.py
# nominal data models (not tested)
# ``df``, ``X`` and ``confusion_matrix`` are defined earlier in the script.
y = df.y_nominal  # DV
mn_logit = sm.MNLogit(y, X).fit()
print(mn_logit.summary2())  # estimation summary
y_pred = mn_logit.predict(X)  # fitted/predicted values
# NOTE(review): thresholding multinomial class probabilities at .5 only
# makes sense for a two-class outcome — confirm df.y_nominal is binary.
print(confusion_matrix(y, (y_pred > .5).astype(int)))

# count data models (w/ exposure!)
y = df.y_count  # DV

# Poisson baseline with an exposure offset.
m_poiss = sm.Poisson(
    y, X, exposure=df['x_timespan'].values).fit()
print(m_poiss.summary2())

# NB2: variance quadratic in the mean.
m_NB2 = sm.NegativeBinomial(
    y, X, loglike_method='nb2', exposure=df['x_timespan'].values).fit()
print(m_NB2.summary2())

# NB1: variance linear in the mean.
m_NB1 = sm.NegativeBinomial(
    y, X, loglike_method='nb1', exposure=df['x_timespan'].values).fit()
print(m_NB1.summary2())

# NB-P: the variance power parameter is itself estimated.
m_NBP = sm.NegativeBinomialP(
    y, X, exposure=df['x_timespan'].values).fit()
print(m_NBP.summary2())

#endregion




#region REGRESSION MODELS
コード例 #15
0
    def pval_at_rna_by_nbinom(self,
                              pos_dict_of_counts: Mapping[str, List],
                              neg_vals_at_rna: np.ndarray,
                              gene_and_type,
                              log_if_values_above=1E9,
                              log_values=False,
                              which='per_read',
                              verbose=False):
        """For a given RNA, get the p values for all proteins by negative binomial.
        gene_and_type: "SMAD2::exon"
        dict_of_counts: dict of {protein: [replicate 1, replicate 2]}

        Falls back to a Poisson whenever the negatives are under-dispersed
        (variance/mean ratio < 2) or the NB fit fails; returns None when
        there are no negative values at all.
        """

        if len(neg_vals_at_rna) == 0:
            return None

        # Work in log10 space when requested or when values are very large,
        # to keep the NB fit numerically stable.
        log_scale_high_value = (np.mean(neg_vals_at_rna) > log_if_values_above)

        if log_values or log_scale_high_value:
            log_this_gene = True
            neg_vals_at_rna = np.log10(neg_vals_at_rna)
        else:
            log_this_gene = False

        #if not np.any(neg_vals_at_rna):
        #print("No positive values in negatives.")
        #    neg_vals_at_rna = np.array([
        #        self.negatives.lowest_positive_vals[which][x]/10 for x in \
        #            self.negatives.metadata.random_proteins])
        #print(f"negatives now {neg_vals_at_rna}")
        mean_negative = np.average(neg_vals_at_rna)
        std_negative = np.std(neg_vals_at_rna)

        # Variance-to-mean ratio of the negatives.
        vmr = (std_negative**2) / mean_negative

        verbose and print(f'vmr for negatives={vmr}')
        # Use a poisson if the var/mean is low enough:
        if vmr < 2:
            verbose and print("Using poisson.")
            self.stats_log['vmr<2'] += 1
            pois = stats.poisson(mean_negative)
            return self.use_dist(pos_dict_of_counts, log_this_gene, pois)

        verbose and print("Wil try to use NB.")
        self.stats_log['vmr>=2'] += 1

        # Try to fit a NB using statsmodels (intercept-only design).
        q = sm.NegativeBinomial(neg_vals_at_rna,
                                np.array([1] * len(neg_vals_at_rna)),
                                loglike_method='nb2')
        try:
            res = q.fit(disp=0)
        # Fix: was a bare ``except:`` — narrowed so KeyboardInterrupt and
        # SystemExit are no longer swallowed.  If a NB can't be fit, revert
        # to a poisson.
        except Exception:
            print(
                f"Could not run q.fit(disp=0) on neg_vals_at_rna= {neg_vals_at_rna}. Using poisson."
            )
            pois = stats.poisson(mean_negative)
            return self.use_dist(pos_dict_of_counts, log_this_gene, pois)

        # Create a scipy.stats.nbinom object to use its cdf, based on the statsmodels fit parameters.
        # There is no cdf function for the statsmodels object.
        mu = res.predict()[0]  # alpha = res.params[1]
        size = 1. / res.params[1]  # prob = size / (size + mu)

        verbose and print(f"Fit NB mu={mu}")

        pvals = self.use_dist(pos_dict_of_counts, log_this_gene,
                              stats.nbinom(size, size / (size + mu)))

        return pvals
コード例 #16
0
def get_stats(
    scores: np.ndarray,
    background: np.ndarray,
    total_bg: int,
    neg_binom: bool = False,
    adj_method: str = "fdr_bh",
    pval_adj_cutoff: float = 0.01,
    return_negbinom_params: bool = False,
):
    """Per-spot p-values for LR scores, via an NB fit or the empirical background.
    Parameters
    ----------
    scores: np.ndarray        Per spot scores for a particular LR pair.
    background: np.ndarray    Background distribution for non-zero scores.
    total_bg: int           Total number of background values calculated.
    neg_binom: bool         Whether to use neg-binomial distribution to estimate p-values, NOT appropriate with log1p data, alternative is to use background distribution itself (recommend higher number of n_pairs for this).
    adj_method: str         Parsed to statsmodels.stats.multitest.multipletests for multiple hypothesis testing correction.
    pval_adj_cutoff: float  Adjusted-p cutoff used when selecting significant spots.
    return_negbinom_params: bool  With neg_binom, return (size, prob) only.
    Returns
    -------
    stats: tuple          Per spot pvalues, pvals_adj, log10_pvals_adj, lr_sign (the LR scores for significant spots).
    """
    ##### Negative Binomial fit
    if neg_binom:
        # Need to make full background for fitting: pad with the zeros that
        # were dropped from the stored non-zero background.
        background = np.array(
            list(background) + [0] * (total_bg - len(background)))
        pmin, pmax = min(background), max(background)
        # Shift so the minimum is zero (NB support starts at 0).
        background2 = [item - pmin for item in background]
        x = np.linspace(pmin, pmax, 1000)
        res = sm.NegativeBinomial(background2,
                                  np.ones(len(background2)),
                                  loglike_method="nb2").fit(
                                      start_params=[0.1, 0.3], disp=0)
        mu = res.predict()  # use if not constant
        mu = np.exp(res.params[0])
        alpha = res.params[1]
        # Q = 0 corresponds to the nb2 parameterisation.
        Q = 0
        size = 1.0 / alpha * mu**Q
        prob = size / (size + mu)

        if return_negbinom_params:  # For testing purposes #
            return size, prob

        # Calculate probability for all spots
        pvals = 1 - scipy.stats.nbinom.cdf(scores - pmin, size, prob)

    else:  ###### Using the actual values to estimate p-values
        # Fix: ``np.float`` was removed in NumPy 1.24 (AttributeError at
        # runtime); use the builtin float dtype instead.
        pvals = np.zeros(len(scores), dtype=float)
        nonzero_score_bool = scores > 0
        nonzero_score_indices = np.where(nonzero_score_bool)[0]
        zero_score_indices = np.where(~nonzero_score_bool)[0]
        # Zero scores share the empirical probability mass of zeros.
        pvals[zero_score_indices] = (total_bg - len(background)) / total_bg
        # Empirical upper-tail probability for each non-zero score.
        pvals[nonzero_score_indices] = [
            len(np.where(background >= scores[i])[0]) / total_bg
            for i in nonzero_score_indices
        ]

    pvals_adj = multipletests(pvals, method=adj_method)[1]
    log10_pvals_adj = -np.log10(pvals_adj)
    lr_sign = scores * (pvals_adj < pval_adj_cutoff)
    return pvals, pvals_adj, log10_pvals_adj, lr_sign
コード例 #17
0
def main():
    """Fit a regression model (Logit / OLS / NegativeBinomial) from a JSON config.

    Reads a config file given on the command line, builds a design matrix
    from 'vector', 'int' and 'str' factors (with optional transforms and
    interactions), fits the requested model family, and writes parameter
    estimates, diagnostics, a residual plot and metadata to the output
    directory.
    """
    usage = "%prog path/to/config.json "
    parser = OptionParser(usage=usage)
    parser.add_option('--outdir', type=str, default=None,
                      help='Output dir [basedir of config if None]: default=%default')
    parser.add_option('--save', action="store_true", default=False,
                      help='Save data matrix: default=%default')

    (options, args) = parser.parse_args()
    config_file = args[0]
    print(config_file)
    with open(config_file) as f:
        config = json.load(f)
    for key, value in config.items():
        print(key, value)

    # Default the output directory to the config file's own directory.
    outdir = options.outdir
    if outdir is None:
        outdir = os.path.split(config_file)[0]
    if not os.path.exists(outdir):
        raise RuntimeError("Output directory does not exist")

    train_file = config['train_file']
    max_iter = config['max_iter']
    family = config.get('family', 'NegativeBinomial')
    subset_column = config.get('subset_column', None)
    subset_target = config.get('subset_target', None)

    df_train = pd.read_csv(train_file, header=0, index_col=0)

    # Optionally restrict the training rows to one value of a column.
    if subset_column is not None and subset_target is not None:
        print("Taking subset of df")
        print(df_train.shape)
        df_train = df_train[df_train[subset_column] == subset_target]
        print(df_train.shape)

    target = config['target']

    factors = config['factors']

    interactions = config['interactions']

    intercept = config.get('intercept', True)

    l1_alpha = config.get('l1_alpha', None)

    lists = {}
    columns = {}  # factor name -> list of design-matrix column names
    types = {}  # factor name -> declared factor type
    poly_matrices = {}  # polynomial bases for 'int' factors
    val_indices = {}

    y = df_train[target].values
    X = pd.DataFrame()

    # X_pred holds a single synthetic row used for a reference prediction.
    X_pred = pd.DataFrame()

    zscore_stds = {}
    zscore_means = {}

    # Build design-matrix columns for each configured factor.
    for factor in factors:
        name = factor['name']
        factor_type = factor['type']
        types[name] = factor_type
        transform = factor.get('transform', None)
        if factor_type == 'vector':
            if transform is not None and transform == 'log':
                X['log(' + name + ')'] = np.log(df_train[name].values)
                columns[name] = ['log(' + name + ')']
                X_pred['log(' + name + ')'] = [0]
            elif transform is not None and transform == 'zscore':
                values = df_train[name].values
                zscored_values, zmean, zstd = zscore_set(values)
                X['zscore(' + name + ')'] = zscored_values
                zscore_means[name] = zmean
                zscore_stds[name] = zstd
                X_pred['zscore(' + name + ')'] = [0]
            else:
                X[name] = df_train[name].values
                X_pred[name] = [0]

        elif factor_type == 'int':
            # Integer factors are expanded into polynomial contrast columns.
            linear = factor.get('linear', False)
            quadratic = factor.get('quadratic', False)
            cubic = factor.get('cubic', False)
            include = factor.get('include', [])
            first = factor.get('first', None)
            last = factor.get('last', None)
            pred_val = factor.get('pred_val', 0)
            print(name, 'pred_val', pred_val)
            components_excl_linear = factor.get('components_excl_linear', None)
            factor_df, poly_matrix, val_index = convert_int_list_to_matrix(name, df_train[name].values, linear=linear, components_excl_linear=components_excl_linear)
            poly_matrices[name] = poly_matrix
            val_indices[name] = val_index
            columns[name] = list(factor_df.columns)
            for col in factor_df.columns:
                X[col] = factor_df[col].values

            # Reuse the training polynomial basis for the prediction row.
            factor_df_pred, _, _ = convert_int_list_to_matrix(name, [pred_val], linear=linear, poly_matrix=poly_matrices[name], val_index=val_index)
            for col in factor_df_pred.columns:
                X_pred[col] = factor_df_pred[col].values

        elif factor_type == 'str':
            # String factors become indicator columns.
            exclude = factor.get('exclude', None)
            exclude_most_common = factor.get('exclude_most_common', False)
            min_count = factor.get('min_count', 0)
            factor_df = convert_string_list_to_matrix(name, df_train[name].values, exclude_most_common=exclude_most_common, exclude=exclude, min_count=min_count)
            lists[name] = list(factor_df.columns)
            columns[name] = list(factor_df.columns)
            for col in factor_df.columns:
                X[col] = factor_df[col].values
                X_pred[col] = [0]
        else:
            print(factor)
            raise RuntimeError("Factor type not recognized")

    if intercept:
        print("Adding intercept")
        X['const'] = 1.
        X_pred['const'] = 1.

    if options.save:
        X_copy = X.copy()
        X_copy[target] = y
        X_copy.to_csv(os.path.join(outdir, 'Xy.csv'))

    # Append configured interaction columns to both matrices.
    X, interaction_cols = add_interactions(X, interactions, columns)
    X_pred, _ = add_interactions(X_pred, interactions, columns)

    # Choose the model family.
    if family == 'Logistic':
        print("Using Logistic model")
        model = sm.Logit(y, X)
    elif family.lower() == 'linear':
        print("Using Linear model")
        model = sm.OLS(y, X)
    elif family == 'NegativeBinomial':
        print("Using negative Binomial model")
        model = sm.NegativeBinomial(y, X)
    else:
        raise ValueError("Model family not recognized", family)

    # Plain MLE fit, or L1-regularized fit when l1_alpha is configured.
    if l1_alpha is None:
        fit = model.fit(maxiter=max_iter)
    else:
        fit = model.fit_regularized(alpha=l1_alpha)

    params = fit.params
    intervals = fit.conf_int()
    stder = fit.bse

    pvalues = fit.pvalues
    aic = fit.aic
    bic = fit.bic
    llf = fit.llf
    print("AIC:", aic)
    print("BIC:", bic)

    # Persist estimates and diagnostics.
    params.to_csv(os.path.join(outdir, 'params.csv'))
    intervals.to_csv(os.path.join(outdir, 'intervals.csv'))
    stder.to_csv(os.path.join(outdir, 'stder.csv'))
    pvalues.to_csv(os.path.join(outdir, 'pvalues.csv'))
    for name, poly_matrix in poly_matrices.items():
        np.savez(os.path.join(outdir, name + '.npz'), matrix=poly_matrix)

    if options.save:
        fit.save(os.path.join(outdir, 'model.pkl'))

    report = {'aic': aic,
              'bic': bic,
              'llf': llf,
              'nans': int(np.isnan(pvalues.values).any())
              }
    with open(os.path.join(outdir, 'report.json'), 'w') as f:
        json.dump(report, f, indent=2)

    with open(os.path.join(outdir, 'columns.json'), 'w') as f:
        json.dump(columns, f, indent=2)

    with open(os.path.join(outdir, 'interactions.json'), 'w') as f:
       json.dump(interaction_cols, f, indent=2)

    # Residual scatter plot.  NOTE(review): ``order`` is shuffled but never
    # used afterwards — presumably meant to shuffle the x-axis; confirm.
    resids = np.array(y) - np.array(fit.fittedvalues)
    order = np.arange(len(y))
    np.random.shuffle(order)
    fig, ax = plt.subplots()
    ax.scatter(np.arange(len(resids)), resids, alpha=0.2)
    plt.savefig(os.path.join(outdir, 'resids.pdf'), bbox_inches='tight')

    with open(os.path.join(outdir, 'zscore_means.json'), 'w') as f:
        json.dump(zscore_means, f, indent=2)

    with open(os.path.join(outdir, 'zscore_stds.json'), 'w') as f:
        json.dump(zscore_stds, f, indent=2)

    # Reference prediction on the synthetic X_pred row.
    prediction = fit.predict(X_pred)
    print(prediction)
    X_pred['pred'] = prediction
    # NOTE(review): CSV content written to a .json filename — confirm intent.
    X_pred.to_csv(os.path.join(outdir, '2009_0_pred.json'))

    fit.save(os.path.join(outdir, 'model.pickle'))
コード例 #18
0
# Inspect the custom MLE fit (``res``, ``y`` and ``X`` are defined earlier
# in the full script).
print('P-values: ', res.pvalues)
print('AIC: ', res.aic)

# As usual, you can obtain a full list of available information by typing
# ``dir(res)``.
# We can also look at the summary of the estimation results.

print(res.summary())

# ### Testing

# We can check the results by using the statsmodels implementation of the
# Negative Binomial model, which uses the analytic score function and
# Hessian.

res_nbin = sm.NegativeBinomial(y, X).fit(disp=0)
print(res_nbin.summary())

print(res_nbin.params)

print(res_nbin.bse)

# Or we could compare them to results obtained using the MASS
# implementation for R:
#
#     url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdataset
# s/csv/COUNT/medpar.csv'
#     medpar = read.csv(url)
#     f = los~factor(type)+hmo+white
#
#     library(MASS)
コード例 #19
0
 def setup(self):
     """Refit before every test — tests mutate the results object."""
     np.random.seed(987689)
     data = sm.datasets.randhie.load()
     model = sm.NegativeBinomial(data.endog, data.exog)
     self.results = model.fit(disp=0)
コード例 #20
0
#
# Load the Rand data. Note that this example is similar to Cameron and
# Trivedi's `Microeconometrics` Table 20.5, but it is slightly different
# because of minor changes in the data.

rand_data = sm.datasets.randhie.load()
rand_exog = rand_data.exog
# Append a constant column (prepend=False keeps it as the last column).
rand_exog = sm.add_constant(rand_exog, prepend=False)

# Fit Poisson model:

poisson_mod = sm.Poisson(rand_data.endog, rand_exog)
poisson_res = poisson_mod.fit(method="newton")
print(poisson_res.summary())

# ## Negative Binomial
#
# The negative binomial model gives slightly different results.

mod_nbin = sm.NegativeBinomial(rand_data.endog, rand_exog)
res_nbin = mod_nbin.fit(disp=False)
print(res_nbin.summary())

# ## Alternative solvers
#
# The default method for fitting discrete data MLE models is Newton-
# Raphson. You can use other solvers by using the ``method`` argument:

# NOTE(review): ``mlogit_mod`` is defined in an earlier section of the
# full script, not shown in this excerpt.
mlogit_res = mlogit_mod.fit(method="bfgs", maxiter=250)
print(mlogit_res.summary())
コード例 #21
0
##############################################################################
##########################  SECTION 3: POISSON AND NEGATIVE BINOMIAL FOR DURATION
##########################  SECTION: 3.1.4 IN THE THESIS
##############################################################################

# Compute poisson regression
poisson = sm.Poisson(Z_1, X_1)
poisson = poisson.fit()
poisson.summary()

# Compute marginal effects (at the median of the regressors)
poisson_1_dydx = poisson.get_margeff(method='dydx', at='median')
poisson_1_dydx.summary()

# Compute negative binomial
negative_binomial = sm.NegativeBinomial(Z_1, X_1)
# Fix: the statsmodels fit() keyword is ``maxiter`` — the original
# ``max_iter=100`` was not a recognized option, so the cap never applied.
negative_binomial = negative_binomial.fit(method="newton", maxiter=100)
nbinomial_1_dydx = negative_binomial.get_margeff(method='dydx', at='median')

nbinomial_1_dydx.summary()

# Compute RMSE for negative binomial
pred_binom = negative_binomial.predict(X_1)
pred_binom = np.array(pred_binom).reshape(len(X_1))
RMSE_neg_binom = compute_RMSE(Z_1_arr, pred_binom)

##############################################################################
##########################  SECTION 4: LATEX OUTPUT
##############################################################################

def select_n_coeffs(results, nb_first):
コード例 #22
0
ファイル: crimePredictionNBM.py プロジェクト: scc-usc/LAcrime
            dep_var_mat_neighbors.append(
                np.append(cellList[k:(k + lookback)], neighbors))
            dep_var_mat_clustering.append(
                np.append(cellList[k:(k + lookback)], clustered_neighbors))

            indep_var_col.append(cellList[k + lookback])
            n_crimes[i][j] += cellList[k + lookback]
            k += 1

            #if i==7 and (j == 18):
            #	print(str(cellList[k:(k+ lookback)]) + ' ' + str(cellList[k+lookback]))
            #	raw_input('')

        bm = sm.NegativeBinomial(indep_var_col,
                                 dep_var_mat,
                                 loglike_method='nb2')
        reg_mat[i][j] = bm.fit()
        """
		reg_mat_neighbors[i][j]=sm.NegativeBinomial(indep_var_col,dep_var_mat_neighbors,loglike_method='nb2')
		reg_mat_neighbors[i][j].fit()
		"""
        reg_mat_neighbors[i][j] = LinearRegression()
        reg_mat_neighbors[i][j].fit(dep_var_mat_neighbors, indep_var_col)

        bm1 = sm.NegativeBinomial(indep_var_col,
                                  dep_var_mat_clustering,
                                  loglike_method='nb2')
        reg_mat_clustering[i][j] = bm1.fit()

        #if i==7 and (j == 18):
コード例 #23
0
    def fit_binary(self, minSize=20, binInt=5, binRate=0.2):
        # Goodness-of-fit testing of per-member count distributions against
        # Poisson, negative binomial and zero-inflated Poisson models.
        # NOTE(review): legacy Python 2 (bare print statements); the method
        # ends in sys.exit(), so it never returns normally.

        bin_dists = ['poisson', 'nbinom']
        # Second assignment overrides the first; only 'poisson' survives.
        bin_dists = ['poisson']

        for m in self.members:
            # vals: raw counts; logV: log1p counts; dZ: zero padding up to
            # the full space size.
            vals, logV, dZ = [int(x) for x in m.cnts.values()], [
                log(v + 1.0) for v in m.cnts.values()
            ], [0 for i in range(self.space - len(m.cnts.values()))]
            if len(vals) < minSize: continue

            # RAW/LOG x without-zeros (NZ) / with-zeros (WZ) data variants.
            val_key = {
                'RAW-NZ': vals,
                'RAW-WZ': vals + dZ,
                'LOG-NZ': logV,
                'LOG-WZ': logV + dZ
            }

            for val_type, my_vals in val_key.items():
                self.tests = {}
                vLen, bR, vMean = len(my_vals), int(len(my_vals) *
                                                    binRate), np.mean(my_vals)
                # LOG variants are skipped entirely.
                if val_type.split('-')[0] == 'LOG': continue

                ## FIRST POISSON ##

                # Intercept-only Poisson fit.
                poisson_mod = sm.Poisson(my_vals, [1 for v in my_vals])
                poisson_res = poisson_mod.fit(method="newton", disp=0)
                poisson_pv = poisson_res.pvalues[0]
                pAIC, pBIC = poisson_res.aic, poisson_res.bic

                # Chi-square comparison against a simulated Poisson sample.
                poisson_sample = stats.poisson.rvs(vMean, size=len(my_vals))
                chiT, chiP = self.bin_chi(
                    my_vals, poisson_sample,
                    min(binInt, int(len(vals) * binRate)))
                self.tests['poisson'] = (chiT, chiP)
                print m.name, len(vals), len(
                    dZ
                ), val_type, 'poisson', chiT, chiP, '|', poisson_pv, 'NA', '|', pAIC, pBIC

                ## NEGATIVE BINOMIAL ##

                mod_nbin = sm.NegativeBinomial(my_vals, [1 for v in my_vals])
                res_nbin = mod_nbin.fit(disp=0)

                # Convert the fitted (mu, alpha) to scipy nbinom parameters.
                mPV, aPV = res_nbin.pvalues
                nbM, nbA = exp(res_nbin.params[0]), res_nbin.params[1]
                estX, estP = convert_nb(nbM, nbA)
                my_comps = stats.nbinom.rvs(estX, estP, size=len(my_vals))
                chiT, chiP = self.bin_chi(
                    my_vals, my_comps, min(binInt,
                                           int(len(my_vals) * binRate)))
                self.tests['nbin'] = (chiT, chiP)

                nbAIC, nbBIC = res_nbin.aic, res_nbin.bic

                print m.name, len(vals), len(
                    dZ
                ), val_type, 'neg-binom', chiT, chiP, "|", mPV, aPV, '|', nbAIC, nbBIC

                ## NOW ZERO P ###
                # Zero-inflated Poisson, only for the with-zeros variants.
                if val_type.split('-')[-1] == 'NZ': continue
                zp_nbin = msc.PoissonZiGMLE(my_vals, [1 for v in my_vals])
                res_zp = zp_nbin.fit(disp=0)
                zpAIC, zpBIC = res_zp.aic, res_zp.bic
                zpM = exp(res_zp.params[0])
                # Implied zero-inflation probability.
                zpZ = 1 - (np.mean(my_vals) / zpM)

                try:
                    cPV, zPV = res_zp.pvalues
                except ValueError:
                    cPV, zPV = 'NA', 'NA'
                    print 'hmmm'
                # Simulate from the fitted zero-inflated Poisson.
                my_comps = [
                    x if random.random() > zpZ else 0
                    for x in stats.poisson.rvs(zpM, size=len(my_vals))
                ]
                chiT, chiP = self.bin_chi(
                    my_vals, my_comps, min(binInt,
                                           int(len(my_vals) * binRate)))
                self.tests['zp'] = (chiT, chiP)
                print m.name, len(vals), len(
                    dZ
                ), val_type, 'zip-po', chiT, chiP, "|", cPV, zPV, '|', zpAIC, zpBIC

        sys.exit()