def run_eqtl_one_test_nb_lm(expression, genotype, covariates, lib_size):
    """Run a single eQTL association test with a negative binomial model.

    Fits a full NB model (genotype + covariates) and a null NB model
    (covariates only), both using library size as the exposure, and
    reports the genotype effect plus a McFadden-style pseudo-R^2 built
    from the two log-likelihoods.

    Args:
        expression: per-sample expression counts (response).
        genotype: per-sample genotype vector (predictor of interest).
        covariates: 2-D array of covariates, samples x covariates.
        lib_size: per-sample library sizes, used as the NB exposure.

    Returns:
        (beta, standard_error, pvalue, pseudo_r_squared); each element is
        the string "nan" if the model fit fails.
    """
    # FIT ALTERNATE MODEL
    # Covariate matrix: genotype goes first so its coefficient sits at
    # index 1 once the intercept column is prepended below.
    X = np.vstack((genotype, covariates.T)).T
    # Add intercept
    X2 = sm.add_constant(X)
    try:
        model = sm.NegativeBinomial(expression, X2, exposure=lib_size)
        fit = model.fit()
        beta = fit.params[1]
        standard_error = fit.bse[1]
        pvalue = fit.pvalues[1]
        ll_full = fit.llf
        # FIT NULL MODEL (covariates only, no genotype term)
        X3 = sm.add_constant(covariates)
        model = sm.NegativeBinomial(expression, X3, exposure=lib_size)
        fit = model.fit()
        ll_null = fit.llf
        # McFadden pseudo-R^2 from full vs. null log-likelihood.
        pseudo_r_squared = 1.0 - (ll_full / ll_null)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. Fit failures still yield "nan" strings so the
        # downstream output format is unchanged.
        beta = "nan"
        standard_error = "nan"
        pvalue = "nan"
        pseudo_r_squared = "nan"
    return beta, standard_error, pvalue, pseudo_r_squared
def setup(self):
    #fit for each test, because results will be changed by test
    np.random.seed(987689)
    data = sm.datasets.randhie.load(as_pandas=False)
    exog = sm.add_constant(data.exog, prepend=False)
    # NOTE(review): `exog` (with the constant appended) is built but the
    # model below is fit on the raw `data.exog` — confirm this is
    # intentional and not a missing-constant bug.
    mod = sm.NegativeBinomial(data.endog, data.exog)
    # disp=0 silences the optimizer's convergence output.
    self.results = mod.fit(disp=0)
def setup(self):
    #fit for each test, because results will be changed by test
    np.random.seed(987689)
    data = sm.datasets.randhie.load()
    exog = sm.add_constant(data.exog, prepend=False)
    # NOTE(review): `exog` is unused; the model is fit on raw `data.exog`
    # — confirm this is intentional.
    mod = sm.NegativeBinomial(data.endog, data.exog)
    # Warm-start the optimizer near the known solution so the fit
    # converges quickly and deterministically for the tests.
    start_params = np.array([
        -0.0565406, -0.21213599, 0.08783076, -0.02991835, 0.22901974,
        0.0621026, 0.06799283, 0.08406688, 0.18530969, 1.36645452
    ])
    self.results = mod.fit(start_params=start_params, disp=0)
def setup(self):
    #fit for each test, because results will be changed by test
    np.random.seed(987689)
    data = sm.datasets.randhie.load(as_pandas=False)
    exog = sm.add_constant(data.exog, prepend=False)
    mod = sm.NegativeBinomial(data.endog, exog)
    # Warm start near the converged solution; the last entry is
    # presumably the NB dispersion parameter alpha — confirm.
    start_params = np.array([-0.05783623, -0.26655806, 0.04109148,
                             -0.03815837, 0.2685168, 0.03811594,
                             -0.04426238, 0.01614795, 0.17490962,
                             0.66461151, 1.2925957])
    self.results = mod.fit(start_params=start_params, disp=0, maxiter=500)
    # Index of the parameter the tests will transform (the last one).
    self.transform_index = -1
def test_poi_nb_zip_zinb_tiny_subset(meta, m):
    """Fit Poisson, NB, ZIP and ZINB models on a random 800-row subset of
    `m` and print the test-set RMSE for each.

    Args:
        meta: unused metadata (kept for interface compatibility).
        m: 2-D array; column 0 is the count target, remaining columns are
           the features.
    """
    # Column names of the source table; the first four (rowid/lat/lon/
    # target) are dropped. Kept as a reference even though unused below.
    exog_names = r"rowid;latitude;longitude;target;dbuiltup;dforest;drecreation;dbrr;dwrl;dwrn;dwrr;dcamping;dcaravan;dcross;dgolf;dheem;dhaven;dsafari;dwater;attr;dbath;lu;lc;maxmeanhaz;maxstdhaz".split(";")[4:]
    np.random.seed(2)
    # Sample 800 rows with replacement (high=len(m)-1 excludes the last
    # row; preserved as-is to keep the seeded selection unchanged).
    randint = np.random.randint(0, high=len(m) - 1, size=800)
    msel = m[randint, :]
    Y = msel[:, 0]
    X = msel[:, 1:]
    # Ynz, Xnz = trim_value(Y, X, 0)
    print("Msel shape: ", msel.shape)
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60,
                                                    random_state=42)
    print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)
    # BUG FIX: the bare `print` statements below were Python-2 leftovers;
    # in Python 3 they evaluate the print function and print nothing.
    # `print()` restores the intended blank separator line.
    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    # Score at the 95th percentile of the fitted Poisson, not the mean.
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))
    # print(np.unique(poi_ppf_obs, return_counts=True))
    print("RMSE Poisson: ", poi_rmse)
    # print(poi_mod.summary(yname='tickbites', xname=exog_names))
    print()
    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None,
                                                     method='newton',
                                                     maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))
    # print(np.unique(nb_pred, return_counts=True))
    print("RMSE Negative Binomial: ", nb_rmse)
    print()
    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton",
                                                         maxiter=50)
    # Constant inflation design: one column of ones per test row.
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))
    print("RMSE Zero-Inflated Poisson", zip_rmse)
    print()
    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1)))
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
def regression(df, a, b, c, d, distribution):
    """Estimate vaccine efficacy (VE) and its CI via regression.

    Follows
    https://timeseriesreasoning.com/contents/estimation-of-vaccine-efficacy-using-logistic-regression/
    * Patsy carves out the X and y matrices from `df`
    * a Logit / Poisson / NegativeBinomial model is built and trained

    Args:
        df: DataFrame with INFECTED and VACCINATED columns.
        a: sick vax (unused here)
        b: sick unvax
        c: total vax (unused here)
        d: total unvax
        distribution: one of "logit", "poisson", "neg_bin".

    Returns:
        None — results are written via ``stl.write``.

    Raises:
        ValueError: if `distribution` is not one of the supported names.
    """
    p_sick_unvax = b / d
    #Form the regression equation
    expr = 'INFECTED ~ VACCINATED'
    #We'll use Patsy to carve out the X and y matrices
    y_train, X_train = dmatrices(expr, df, return_type='dataframe')
    # Build the requested model. BUG FIX: `disp` is a fit() option, not a
    # model-constructor option; the original passed disp=False to the
    # constructors, where it is not an accepted argument.
    if distribution == "logit":
        model = sm.Logit(endog=y_train, exog=X_train)
    elif distribution == "poisson":
        model = sm.Poisson(endog=y_train, exog=X_train)
    elif distribution == "neg_bin":
        model = sm.NegativeBinomial(endog=y_train, exog=X_train)
    else:
        # Previously an unrecognized value fell through and crashed later
        # with a NameError on `model`; fail fast with a clear message.
        raise ValueError(f"Unknown distribution: {distribution!r}")
    results = model.fit(disp=False)
    params = results.params
    #Print the model summary
    #stl.write(logit_results.summary2())
    VE = VE_(params[1], p_sick_unvax)
    # stl.write(f"\nConfidence intervals")
    # stl.write(logit_results.conf_int())
    # confidence intervals: conf[0] is the lower-bound column, conf[1]
    # the upper. NOTE(review): the lower coefficient bound is assigned to
    # `high` — presumably because VE_ is decreasing in the coefficient;
    # confirm against VE_'s definition.
    conf = results.conf_int()
    high, low = conf[0][1], conf[1][1]
    prsquared = results.prsquared
    VE_low, VE_high = VE_(low, p_sick_unvax), VE_(high, p_sick_unvax)
    stl.write(
        f"VE Regression {distribution} : {VE} % [{VE_low} , {VE_high}] | pseudo-R2 = {prsquared}"
    )
def SPNegativeBinomial(context):
    """Fit a negative binomial model from pipeline context data.

    Reads the input DataFrame, feature columns and label column from
    ``context.args`` (as sent by the upstream node), fits
    ``sm.NegativeBinomial`` and returns the fitted results object.
    """
    # Pull the configuration forwarded by the previous node.
    args = context.args
    # args.inputData holds the DataFrame produced upstream.
    frame = args.inputData
    design = frame[args.featureColumns].values
    response = frame[args.labelColumn].values
    model = sm.NegativeBinomial(response, design, missing=args.missing)
    return model.fit(method=args.method)
def NBFit(cnts):
    """Fit a negative binomial (NB1) distribution to a list of counts.

    Parameters:
        cnts : list or pandas.Series
            List of counts.

    Returns:
        size, prob : float
            Negative binomial parameters (scipy ``nbinom`` convention).
    """
    # Intercept-only design: a single column of ones.
    y = list(cnts)
    x = numpy.ones(len(cnts))
    res = sm.NegativeBinomial(y, x, loglike_method='nb1').fit(
        start_params=[0.1, 0.1], disp=True)
    # exp(intercept) is the fitted mean; the second parameter is alpha.
    mu = numpy.exp(res.params[0])
    alpha = res.params[1]
    size = mu / alpha
    prob = size / (size + mu)
    return size, prob
def regress_nb(self, Y, X, interest=None):
    # Python 2 debug scaffolding: fits a NB GLM and a discrete NB model,
    # then exits. Everything after the first sys.exit() is unreachable,
    # and the tail references names (res_nbin, my_vals, binInt, binRate,
    # m, vals, dZ, val_type) that are not defined in this scope —
    # NOTE(review): looks like it was pasted from fit_binary; confirm
    # before re-enabling.
    print 'uh'
    #sm.GLM(data.endog, data.exog, family=sm.families.Gamma())
    # First GLM fit is immediately overwritten by the second.
    foo = sm.GLM(Y, X, family=sm.families.NegativeBinomial(), variance=10).fit()
    foo = sm.GLM(Y, X, family=sm.families.NegativeBinomial()).fit()
    print foo.summary()
    # Discrete NB on log1p-transformed response.
    model = sm.NegativeBinomial([log(y + 1.0) for y in Y], X).fit()
    # print model.summary()
    sys.exit()
    # --- unreachable debug dump below ---
    print len(model.pvalues)
    print len(model.params)
    print model.bic
    # print model.rsquared
    # print model.rsquared_adj
    for v in vars(model._results):
        print v
    # print model.summary()
    for v in vars(model.model):
        print v
    # p_out = {'params': r_out, 'bic': model.bic, 'rs': model.rsquared, 'ars': model.rsquared_adj, 'resids': model.resid, 'pwr': pwr}
    mPV, aPV = res_nbin.pvalues
    nbM, nbA = exp(res_nbin.params[0]), res_nbin.params[1]
    estX, estP = convert_nb(nbM, nbA)
    my_comps = stats.nbinom.rvs(estX, estP, size=len(my_vals))
    chiT, chiP = self.bin_chi(my_vals, my_comps, min(binInt, int(len(my_vals) * binRate)))
    self.tests['nbin'] = (chiT, chiP)
    nbAIC, nbBIC = res_nbin.aic, res_nbin.bic
    print m.name, len(vals), len(dZ), val_type, 'neg-binom', chiT, chiP, "|", mPV, aPV, '|', nbAIC, nbBIC
    sys.exit()
def tiny_negbin(l):
    """Fit a negative binomial model to rows of `l` (column 0 is the
    target, the remaining columns are features).

    Returns [model, predictions, RMSE]; on a singular design or invalid
    values the failure is logged and the defaults (None, None, 0) are
    returned instead.
    """
    print("\t\tRunning NegBin")
    nb_mod = None
    nb_pred = None
    nb_rmse = 0
    features = np.array([row[1:] for row in l])
    target = np.array([row[0] for row in l]).reshape(-1, 1)
    try:
        nb_mod = sm.NegativeBinomial(target, features).fit(
            start_params=None, method='newton', maxiter=50, disp=0)
        nb_pred = nb_mod.predict(features)
        nb_rmse = np.sqrt(mean_squared_error(target, nb_pred))
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            print("\t\t\tIgnored a singular matrix.")
    except ValueError:
        print("\t\t\tIgnored output containing np.nan or np.inf")
    return [nb_mod, nb_pred, nb_rmse]
def negbin_cdf(series):
    '''
    This function takes a np.array and returns a negative binomial CDF
    for overdispersed count data, i.e. the probability that x is less
    than or equal to each value.
    '''
    # Flatten to a plain 1-D array of counts.
    counts = np.array([series.tolist()]).flatten()
    # Intercept-only design so the model fits a single mean.
    ones = np.ones(len(counts))
    # fit negative binomial (NB2 parameterization)
    fit = sm.NegativeBinomial(counts, ones, loglike_method='nb2').fit()
    mu = np.exp(fit.params[0])   # fitted mean = exp(intercept)
    alpha = fit.params[1]        # dispersion
    Q = 0                        # Q=0 for nb2, Q=1 for nb1
    size = 1. / alpha * mu**Q
    prob = size / (size + mu)
    return nbinom.cdf(counts, n=size, p=prob)
""" Negative binomial regression example """ import statsmodels.api as sm endog = [2, 2, 3] exog = [[1, 2], [2, 4], [5, 6]] #endog1 = [1,2,3] exog1 = [[1, 2], [2, 4]] nbm = sm.NegativeBinomial(endog, sm.add_constant(exog), loglike_method='nb2') res = nbm.fit() #print(res.summary()) pred = list(res.predict(exog1)) print pred
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60, random_state=42) print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape) print("Model: Poisson") poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50) poi_mean_pred = poi_mod.predict(xtest) poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred) poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs)) print("Model: Neg. Binomial") nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(start_params=None, method='newton', maxiter=50) nb_pred = nb_mod.predict(xtest) nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred)) print(np.ones(len(xtest)).shape) print("Model: Zero Inflated Poisson") zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(method="newton", maxiter=50) zip_mean_pred = zip_mod.predict(xtest, exog_infl=np.ones((len(xtest), 1))) zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred) zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs)) print("Model: Zero Inflated Neg. Binomial") zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain,
# nominal data models (not tested) y = df.y_nominal # DV mn_logit = sm.MNLogit(y, X).fit() print(mn_logit.summary2()) # estimation summary y_pred = mn_logit.predict(X) # fitted/predicted values print(confusion_matrix(y, (y_pred > .5).astype(int))) # count data models (w/ exposure!) y = df.y_count # DV m_poiss = sm.Poisson( y, X, exposure=df['x_timespan'].values).fit() print(m_poiss.summary2()) m_NB2 = sm.NegativeBinomial( y, X, loglike_method='nb2', exposure=df['x_timespan'].values).fit() print(m_NB2.summary2()) m_NB1 = sm.NegativeBinomial( y, X, loglike_method='nb1', exposure=df['x_timespan'].values).fit() print(m_NB1.summary2()) m_NBP = sm.NegativeBinomialP( y, X, exposure=df['x_timespan'].values).fit() print(m_NBP.summary2()) #endregion #region REGRESSION MODELS
def pval_at_rna_by_nbinom(self, pos_dict_of_counts: Mapping[str, List],
                          neg_vals_at_rna: np.array, gene_and_type,
                          log_if_values_above=1E9, log_values=False,
                          which='per_read', verbose=False):
    """For a given RNA, get the p values for all proteins by negative binomial.

    gene_and_type: "SMAD2::exon"
    dict_of_counts: dict of {protein: [replicate 1, replicate 2]}

    Falls back to a Poisson when the negatives' variance/mean ratio is
    below 2 or when the NB fit fails. Returns None when there are no
    negative values for this RNA.
    """
    if len(neg_vals_at_rna) == 0:
        return None
    # Log-scale the negatives when requested, or when their mean is very
    # large, to keep the distribution fit numerically stable.
    log_scale_high_value = (np.mean(neg_vals_at_rna) > log_if_values_above)
    if log_values or log_scale_high_value:
        log_this_gene = True
        neg_vals_at_rna = np.log10(neg_vals_at_rna)
    else:
        log_this_gene = False
    #if not np.any(neg_vals_at_rna):
        #print("No positive values in negatives.")
        # neg_vals_at_rna = np.array([
        #     self.negatives.lowest_positive_vals[which][x]/10 for x in \
        #     self.negatives.metadata.random_proteins])
        #print(f"negatives now {neg_vals_at_rna}")
    mean_negative = np.average(neg_vals_at_rna)
    std_negative = np.std(neg_vals_at_rna)
    # Variance-to-mean ratio decides Poisson vs. negative binomial.
    vmr = (std_negative**2) / mean_negative
    verbose and print(f'vmr for negatives={vmr}')
    # Use a poisson if the var/mean is low enough:
    if vmr < 2:
        verbose and print("Using poisson.")
        self.stats_log['vmr<2'] += 1
        pois = stats.poisson(mean_negative)
        return self.use_dist(pos_dict_of_counts, log_this_gene, pois)
    verbose and print("Wil try to use NB.")
    self.stats_log['vmr>=2'] += 1
    # Try to fit a NB using statsmodels (intercept-only design).
    q = sm.NegativeBinomial(neg_vals_at_rna,
                            np.array([1] * len(neg_vals_at_rna)),
                            loglike_method='nb2')
    try:
        res = q.fit(disp=0)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. If a NB can't be fit, revert to
        # a poisson.
        print(
            f"Could not run q.fit(disp=0) on neg_vals_at_rna= {neg_vals_at_rna}. Using poisson."
        )
        pois = stats.poisson(mean_negative)
        return self.use_dist(pos_dict_of_counts, log_this_gene, pois)
    # Create a scipy.stats.nbinom object to use its cdf, based on the
    # statsmodels fit parameters. There is no cdf function for the
    # statsmodels object.
    mu = res.predict()[0]
    # alpha = res.params[1]
    size = 1. / res.params[1]
    # prob = size / (size + mu)
    verbose and print(f"Fit NB mu={mu}")
    pvals = self.use_dist(pos_dict_of_counts, log_this_gene,
                          stats.nbinom(size, size / (size + mu)))
    return pvals
def get_stats(
    scores: np.array,
    background: np.array,
    total_bg: int,
    neg_binom: bool = False,
    adj_method: str = "fdr_bh",
    pval_adj_cutoff: float = 0.01,
    return_negbinom_params: bool = False,
):
    """Retrieves valid candidate genes to be used for random gene pairs.

    Parameters
    ----------
    scores: np.array
        Per spot scores for a particular LR pair.
    background: np.array
        Background distribution for non-zero scores.
    total_bg: int
        Total number of background values calculated.
    neg_binom: bool
        Whether to use neg-binomial distribution to estimate p-values, NOT
        appropriate with log1p data, alternative is to use background
        distribution itself (recommend higher number of n_pairs for this).
    adj_method: str
        Parsed to statsmodels.stats.multitest.multipletests for multiple
        hypothesis testing correction.

    Returns
    -------
    stats: tuple
        Per spot pvalues, pvals_adj, log10_pvals_adj, lr_sign (the LR
        scores for significant spots).
    """
    ##### Negative Binomial fit
    if neg_binom:
        # Need to make full background for fitting !!!
        # (re-insert the zero scores that were dropped upstream)
        background = np.array(
            list(background) + [0] * (total_bg - len(background)))
        pmin, pmax = min(background), max(background)
        # Shift so the minimum is zero: NB support starts at 0.
        background2 = [item - pmin for item in background]
        # Intercept-only NB2 fit; mild start params keep it stable.
        res = sm.NegativeBinomial(
            background2, np.ones(len(background2)),
            loglike_method="nb2").fit(start_params=[0.1, 0.3], disp=0)
        mu = res.predict()  # use if not constant
        mu = np.exp(res.params[0])
        alpha = res.params[1]
        # Q=0 is the nb2 parameterization.
        Q = 0
        size = 1.0 / alpha * mu**Q
        prob = size / (size + mu)
        if return_negbinom_params:  # For testing purposes #
            return size, prob
        # Calculate probability for all spots
        pvals = 1 - scipy.stats.nbinom.cdf(scores - pmin, size, prob)
    else:
        ###### Using the actual values to estimate p-values
        # BUG FIX: was dtype=np.float — that alias was removed in
        # NumPy 1.24, crashing on modern NumPy; builtin float is the
        # documented equivalent.
        pvals = np.zeros((1, len(scores)), dtype=float)[0, :]
        nonzero_score_bool = scores > 0
        nonzero_score_indices = np.where(nonzero_score_bool)[0]
        zero_score_indices = np.where(nonzero_score_bool == False)[0]
        # Zero scores get the fraction of (implicit) zero background values.
        pvals[zero_score_indices] = (total_bg - len(background)) / total_bg
        # Non-zero scores: empirical upper-tail probability.
        pvals[nonzero_score_indices] = [
            len(np.where(background >= scores[i])[0]) / total_bg
            for i in nonzero_score_indices
        ]
    pvals_adj = multipletests(pvals, method=adj_method)[1]
    log10_pvals_adj = -np.log10(pvals_adj)
    # Keep the LR score only where adjusted p-value passes the cutoff.
    lr_sign = scores * (pvals_adj < pval_adj_cutoff)
    return pvals, pvals_adj, log10_pvals_adj, lr_sign
def main(): usage = "%prog path/to/config.json " parser = OptionParser(usage=usage) parser.add_option('--outdir', type=str, default=None, help='Output dir [basedir of config if None]: default=%default') parser.add_option('--save', action="store_true", default=False, help='Save data matrix: default=%default') (options, args) = parser.parse_args() config_file = args[0] print(config_file) with open(config_file) as f: config = json.load(f) for key, value in config.items(): print(key, value) outdir = options.outdir if outdir is None: outdir = os.path.split(config_file)[0] if not os.path.exists(outdir): raise RuntimeError("Output directory does not exist") train_file = config['train_file'] max_iter = config['max_iter'] family = config.get('family', 'NegativeBinomial') subset_column = config.get('subset_column', None) subset_target = config.get('subset_target', None) df_train = pd.read_csv(train_file, header=0, index_col=0) if subset_column is not None and subset_target is not None: print("Taking subset of df") print(df_train.shape) df_train = df_train[df_train[subset_column] == subset_target] print(df_train.shape) target = config['target'] factors = config['factors'] interactions = config['interactions'] intercept = config.get('intercept', True) l1_alpha = config.get('l1_alpha', None) lists = {} columns = {} types = {} poly_matrices = {} val_indices = {} y = df_train[target].values X = pd.DataFrame() X_pred = pd.DataFrame() zscore_stds = {} zscore_means = {} for factor in factors: name = factor['name'] factor_type = factor['type'] types[name] = factor_type transform = factor.get('transform', None) if factor_type == 'vector': if transform is not None and transform == 'log': X['log(' + name + ')'] = np.log(df_train[name].values) columns[name] = ['log(' + name + ')'] X_pred['log(' + name + ')'] = [0] elif transform is not None and transform == 'zscore': values = df_train[name].values zscored_values, zmean, zstd = zscore_set(values) X['zscore(' + name + ')'] = zscored_values 
zscore_means[name] = zmean zscore_stds[name] = zstd X_pred['zscore(' + name + ')'] = [0] else: X[name] = df_train[name].values X_pred[name] = [0] elif factor_type == 'int': linear = factor.get('linear', False) quadratic = factor.get('quadratic', False) cubic = factor.get('cubic', False) include = factor.get('include', []) first = factor.get('first', None) last = factor.get('last', None) pred_val = factor.get('pred_val', 0) print(name, 'pred_val', pred_val) components_excl_linear = factor.get('components_excl_linear', None) factor_df, poly_matrix, val_index = convert_int_list_to_matrix(name, df_train[name].values, linear=linear, components_excl_linear=components_excl_linear) poly_matrices[name] = poly_matrix val_indices[name] = val_index columns[name] = list(factor_df.columns) for col in factor_df.columns: X[col] = factor_df[col].values factor_df_pred, _, _ = convert_int_list_to_matrix(name, [pred_val], linear=linear, poly_matrix=poly_matrices[name], val_index=val_index) for col in factor_df_pred.columns: X_pred[col] = factor_df_pred[col].values elif factor_type == 'str': exclude = factor.get('exclude', None) exclude_most_common = factor.get('exclude_most_common', False) min_count = factor.get('min_count', 0) factor_df = convert_string_list_to_matrix(name, df_train[name].values, exclude_most_common=exclude_most_common, exclude=exclude, min_count=min_count) lists[name] = list(factor_df.columns) columns[name] = list(factor_df.columns) for col in factor_df.columns: X[col] = factor_df[col].values X_pred[col] = [0] else: print(factor) raise RuntimeError("Factor type not recognized") if intercept: print("Adding intercept") X['const'] = 1. X_pred['const'] = 1. 
if options.save: X_copy = X.copy() X_copy[target] = y X_copy.to_csv(os.path.join(outdir, 'Xy.csv')) X, interaction_cols = add_interactions(X, interactions, columns) X_pred, _ = add_interactions(X_pred, interactions, columns) if family == 'Logistic': print("Using Logistic model") model = sm.Logit(y, X) elif family.lower() == 'linear': print("Using Linear model") model = sm.OLS(y, X) elif family == 'NegativeBinomial': print("Using negative Binomial model") model = sm.NegativeBinomial(y, X) else: raise ValueError("Model family not recognized", family) if l1_alpha is None: fit = model.fit(maxiter=max_iter) else: fit = model.fit_regularized(alpha=l1_alpha) params = fit.params intervals = fit.conf_int() stder = fit.bse pvalues = fit.pvalues aic = fit.aic bic = fit.bic llf = fit.llf print("AIC:", aic) print("BIC:", bic) params.to_csv(os.path.join(outdir, 'params.csv')) intervals.to_csv(os.path.join(outdir, 'intervals.csv')) stder.to_csv(os.path.join(outdir, 'stder.csv')) pvalues.to_csv(os.path.join(outdir, 'pvalues.csv')) for name, poly_matrix in poly_matrices.items(): np.savez(os.path.join(outdir, name + '.npz'), matrix=poly_matrix) if options.save: fit.save(os.path.join(outdir, 'model.pkl')) report = {'aic': aic, 'bic': bic, 'llf': llf, 'nans': int(np.isnan(pvalues.values).any()) } with open(os.path.join(outdir, 'report.json'), 'w') as f: json.dump(report, f, indent=2) with open(os.path.join(outdir, 'columns.json'), 'w') as f: json.dump(columns, f, indent=2) with open(os.path.join(outdir, 'interactions.json'), 'w') as f: json.dump(interaction_cols, f, indent=2) resids = np.array(y) - np.array(fit.fittedvalues) order = np.arange(len(y)) np.random.shuffle(order) fig, ax = plt.subplots() ax.scatter(np.arange(len(resids)), resids, alpha=0.2) plt.savefig(os.path.join(outdir, 'resids.pdf'), bbox_inches='tight') with open(os.path.join(outdir, 'zscore_means.json'), 'w') as f: json.dump(zscore_means, f, indent=2) with open(os.path.join(outdir, 'zscore_stds.json'), 'w') as f: 
json.dump(zscore_stds, f, indent=2) prediction = fit.predict(X_pred) print(prediction) X_pred['pred'] = prediction X_pred.to_csv(os.path.join(outdir, '2009_0_pred.json')) fit.save(os.path.join(outdir, 'model.pickle'))
# Report the custom model's estimates (res is fit earlier in the script).
print('P-values: ', res.pvalues)
print('AIC: ', res.aic)

# As usual, you can obtain a full list of available information by typing
# ``dir(res)``.
# We can also look at the summary of the estimation results.

print(res.summary())

# ### Testing
# We can check the results by using the statsmodels implementation of the
# Negative Binomial model, which uses the analytic score function and
# Hessian.

res_nbin = sm.NegativeBinomial(y, X).fit(disp=0)
print(res_nbin.summary())
print(res_nbin.params)
print(res_nbin.bse)

# Or we could compare them to results obtained using the MASS
# implementation for R:
#
#    url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdataset
#    s/csv/COUNT/medpar.csv'
#    medpar = read.csv(url)
#    f = los~factor(type)+hmo+white
#
#    library(MASS)
def setup(self):
    # fit for each test, because results will be changed by test
    np.random.seed(987689)
    data = sm.datasets.randhie.load()
    # NOTE(review): unlike the sibling setups, no constant column is added
    # to data.exog here — confirm this is intentional.
    mod = sm.NegativeBinomial(data.endog, data.exog)
    # disp=0 silences the optimizer's convergence output.
    self.results = mod.fit(disp=0)
# # Load the Rand data. Note that this example is similar to Cameron and # Trivedi's `Microeconometrics` Table 20.5, but it is slightly different # because of minor changes in the data. rand_data = sm.datasets.randhie.load() rand_exog = rand_data.exog rand_exog = sm.add_constant(rand_exog, prepend=False) # Fit Poisson model: poisson_mod = sm.Poisson(rand_data.endog, rand_exog) poisson_res = poisson_mod.fit(method="newton") print(poisson_res.summary()) # ## Negative Binomial # # The negative binomial model gives slightly different results. mod_nbin = sm.NegativeBinomial(rand_data.endog, rand_exog) res_nbin = mod_nbin.fit(disp=False) print(res_nbin.summary()) # ## Alternative solvers # # The default method for fitting discrete data MLE models is Newton- # Raphson. You can use other solvers by using the ``method`` argument: mlogit_res = mlogit_mod.fit(method="bfgs", maxiter=250) print(mlogit_res.summary())
############################################################################## ########################## SECTION 3: POISSON AND NEGATIVE BINOMIAL FOR DURATION ########################## SECTION: 3.1.4 IN THE THESIS ############################################################################## # Compute poisson regression poisson = sm.Poisson(Z_1, X_1) poisson = poisson.fit() poisson.summary() # Compute marginal effects poisson_1_dydx = poisson.get_margeff(method='dydx', at='median') poisson_1_dydx.summary() # Compute negative binomial negative_binomial = sm.NegativeBinomial(Z_1, X_1) negative_binomial = negative_binomial.fit(method="newton", max_iter=100) nbinomial_1_dydx = negative_binomial.get_margeff(method='dydx', at='median') nbinomial_1_dydx.summary() # Compute RMSE for negative binomial pred_binom = negative_binomial.predict(X_1) pred_binom = np.array(pred_binom).reshape(len(X_1)) RMSE_neg_binom = compute_RMSE(Z_1_arr, pred_binom) ############################################################################## ########################## SECTION 4: LATEX OUTPUT ############################################################################## def select_n_coeffs(results, nb_first):
dep_var_mat_neighbors.append( np.append(cellList[k:(k + lookback)], neighbors)) dep_var_mat_clustering.append( np.append(cellList[k:(k + lookback)], clustered_neighbors)) indep_var_col.append(cellList[k + lookback]) n_crimes[i][j] += cellList[k + lookback] k += 1 #if i==7 and (j == 18): # print(str(cellList[k:(k+ lookback)]) + ' ' + str(cellList[k+lookback])) # raw_input('') bm = sm.NegativeBinomial(indep_var_col, dep_var_mat, loglike_method='nb2') reg_mat[i][j] = bm.fit() """ reg_mat_neighbors[i][j]=sm.NegativeBinomial(indep_var_col,dep_var_mat_neighbors,loglike_method='nb2') reg_mat_neighbors[i][j].fit() """ reg_mat_neighbors[i][j] = LinearRegression() reg_mat_neighbors[i][j].fit(dep_var_mat_neighbors, indep_var_col) bm1 = sm.NegativeBinomial(indep_var_col, dep_var_mat_clustering, loglike_method='nb2') reg_mat_clustering[i][j] = bm1.fit() #if i==7 and (j == 18):
def fit_binary(self, minSize=20, binInt=5, binRate=0.2):
    # Python 2 debug code: for each member, fits Poisson / negative
    # binomial / zero-inflated Poisson to its counts and chi-square-tests
    # simulated samples against the observed values. Exits via sys.exit()
    # after the first zero-inflated fit (debugging aid).
    # NOTE(review): `bin_dists` is immediately overwritten and then never
    # used; `vLen`/`bR` are also unused — confirm they are leftovers.
    bin_dists = ['poisson', 'nbinom']
    bin_dists = ['poisson']
    for m in self.members:
        # vals: raw counts; logV: log1p counts; dZ: implicit zeros padding
        # up to self.space entries.
        vals, logV, dZ = [int(x) for x in m.cnts.values()], [
            log(v + 1.0) for v in m.cnts.values()
        ], [0 for i in range(self.space - len(m.cnts.values()))]
        if len(vals) < minSize:
            continue
        # Four variants: raw/log values, without (NZ) / with (WZ) zeros.
        val_key = {
            'RAW-NZ': vals,
            'RAW-WZ': vals + dZ,
            'LOG-NZ': logV,
            'LOG-WZ': logV + dZ
        }
        for val_type, my_vals in val_key.items():
            self.tests = {}
            vLen, bR, vMean = len(my_vals), int(len(my_vals) * binRate), np.mean(my_vals)
            # Only the RAW variants are fit below.
            if val_type.split('-')[0] == 'LOG':
                continue
            ## FIRST POISSON ##
            # Intercept-only Poisson fit.
            poisson_mod = sm.Poisson(my_vals, [1 for v in my_vals])
            poisson_res = poisson_mod.fit(method="newton", disp=0)
            poisson_pv = poisson_res.pvalues[0]
            pAIC, pBIC = poisson_res.aic, poisson_res.bic
            # Chi-square test of observed counts vs. a simulated sample.
            poisson_sample = stats.poisson.rvs(vMean, size=len(my_vals))
            chiT, chiP = self.bin_chi(
                my_vals, poisson_sample, min(binInt, int(len(vals) * binRate)))
            self.tests['poisson'] = (chiT, chiP)
            print m.name, len(vals), len(dZ), val_type, 'poisson', chiT, chiP, '|', poisson_pv, 'NA', '|', pAIC, pBIC
            ## NEGATIVE BINOMIAL ##
            mod_nbin = sm.NegativeBinomial(my_vals, [1 for v in my_vals])
            res_nbin = mod_nbin.fit(disp=0)
            mPV, aPV = res_nbin.pvalues
            # exp(intercept) is the fitted mean; second param is alpha.
            nbM, nbA = exp(res_nbin.params[0]), res_nbin.params[1]
            # Convert to scipy's (size, prob) parameterization.
            estX, estP = convert_nb(nbM, nbA)
            my_comps = stats.nbinom.rvs(estX, estP, size=len(my_vals))
            chiT, chiP = self.bin_chi(
                my_vals, my_comps, min(binInt, int(len(my_vals) * binRate)))
            self.tests['nbin'] = (chiT, chiP)
            nbAIC, nbBIC = res_nbin.aic, res_nbin.bic
            print m.name, len(vals), len(dZ), val_type, 'neg-binom', chiT, chiP, "|", mPV, aPV, '|', nbAIC, nbBIC
            ## NOW ZERO P ###
            # Zero-inflated Poisson only makes sense with zeros present.
            if val_type.split('-')[-1] == 'NZ':
                continue
            zp_nbin = msc.PoissonZiGMLE(my_vals, [1 for v in my_vals])
            res_zp = zp_nbin.fit(disp=0)
            zpAIC, zpBIC = res_zp.aic, res_zp.bic
            zpM = exp(res_zp.params[0])
            # Implied zero-inflation weight from mean deflation.
            zpZ = 1 - (np.mean(my_vals) / zpM)
            try:
                cPV, zPV = res_zp.pvalues
            except ValueError:
                cPV, zPV = 'NA', 'NA'
                print 'hmmm'
            # Simulate: Poisson draws zeroed out with probability zpZ.
            my_comps = [
                x if random.random() > zpZ else 0
                for x in stats.poisson.rvs(zpM, size=len(my_vals))
            ]
            chiT, chiP = self.bin_chi(
                my_vals, my_comps, min(binInt, int(len(my_vals) * binRate)))
            self.tests['zp'] = (chiT, chiP)
            print m.name, len(vals), len(dZ), val_type, 'zip-po', chiT, chiP, "|", cPV, zPV, '|', zpAIC, zpBIC
            # Debug stop after the first zero-inflated fit.
            # NOTE(review): placement of this exit relative to the loops is
            # ambiguous in the collapsed source — confirm intended level.
            sys.exit()