def setup_class(cls):
    """Fit a logit zero-inflated Poisson model on RandHIE and store fixtures.

    Sets ``cls.endog``, ``cls.data``, ``cls.res1`` (the fitted results),
    ``cls.init_keys``, ``cls.init_kwds`` and ``cls.res2`` (a ``RandHIE``
    object after calling ``zero_inflated_poisson_logit()`` — presumably the
    stored reference values the fit is compared against).
    """
    randhie = sm.datasets.randhie.load_pandas()
    cls.endog = randhie.endog
    cls.data = randhie

    # Count part: exog columns 1-3; inflation part: exog column 0.
    # ``prepend=False`` places the constant after the regressors.
    count_design = sm.add_constant(randhie.exog.iloc[:, 1:4], prepend=False)
    inflation_design = sm.add_constant(randhie.exog.iloc[:, 0], prepend=False)

    # we don't need to verify convergence here
    initial_guess = np.asarray([
        0.10337834587498942,
        -1.0459825102508549,
        -0.08219794475894268,
        0.00856917434709146,
        -0.026795737379474334,
        1.4823632430107334,
    ])
    zip_model = sm.ZeroInflatedPoisson(
        randhie.endog,
        count_design,
        exog_infl=inflation_design,
        inflation='logit',
    )
    cls.res1 = zip_model.fit(
        start_params=initial_guess,
        method='newton',
        maxiter=500,
        disp=0,
    )
    # for llnull test
    cls.res1._results._attach_nullmodel = True

    cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset']
    cls.init_kwds = {'inflation': 'logit'}

    reference = RandHIE()
    reference.zero_inflated_poisson_logit()
    cls.res2 = reference
def tiny_zip(l):
    """Fit a regularized zero-inflated Poisson model on a small sample.

    Parameters
    ----------
    l : sequence of sequences
        Rows whose first item is the target value and whose remaining items
        are the predictors.

    Returns
    -------
    list or object
        ``[fitted_model, ppf_obs, rmse]`` on success, where ``ppf_obs`` is
        the 0.95 Poisson quantile of the predicted means and ``rmse`` is the
        root mean squared error of ``ppf_obs`` against the targets.
        The result of ``return_zeros(ytr, reason)`` when the targets are all
        zero (``"AllZeros"``), the fit hits a singular matrix
        (``"Singular"``) or an ``AssertionError`` is raised (``"Assert"``).
        An empty list when a ``ValueError`` or a non-singular
        ``LinAlgError`` occurs.
    """
    xtr = np.array([item[1:] for item in l])
    ytr = np.array([item[0] for item in l]).reshape(-1, 1)
    zip_res = []
    try:
        if np.count_nonzero(ytr) > 0:
            zip_mod = sm.ZeroInflatedPoisson(ytr, xtr).fit_regularized(
                maxiter=10000, disp=0, maxfun=10000)
            zip_mean_pred = zip_mod.predict(
                xtr, exog_infl=np.ones((len(xtr), 1)))
            zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
            zip_rmse_tr = np.sqrt(mean_squared_error(ytr, zip_ppf_obs))
            zip_res = [zip_mod, zip_ppf_obs, zip_rmse_tr]
        else:
            zip_res = return_zeros(ytr, "AllZeros")
    except np.linalg.LinAlgError as err:
        if 'Singular matrix' in str(err):
            # Regularization is expected to avoid singular matrices, so this
            # branch should be rare; record the share of zeros (in percent)
            # of the offending sample in the module-level list.
            nzeros = len(ytr) - np.count_nonzero(ytr)
            zip_res = return_zeros(ytr, "Singular")
            prop = round((100 * nzeros) / len(ytr), 2)
            zip_prop_err_singmat.append(prop)
        # Any other LinAlgError is deliberately ignored: zip_res stays [].
    except AssertionError:
        zip_res = return_zeros(ytr, "Assert")
    except ValueError:
        # Best effort: report and fall through with the empty result.
        print("\t\t\tIgnored output containing np.nan or np.inf")
    return zip_res
def fitZIP(preCellType, postCellType):
    """Fit an intercept-only zero-inflated Poisson model and export results.

    Reads ``data_dense_model/<pre>_<post>.csv`` (a single headerless column
    of counts), fits ``sm.ZeroInflatedPoisson`` with a constant regressor,
    and writes two files into ``fit_dense_model``:

    * ``<pre>_<post>_ZIP.csv`` with column ``x`` (empirical frequency of
      each count value) and ``xFit`` (predicted probabilities averaged over
      observations);
    * ``<pre>_<post>_ZIP_FitResults.csv`` with the fit summary as CSV.

    Parameters
    ----------
    preCellType, postCellType
        Values formatted with ``%s`` into the input and output file names.

    Returns
    -------
    None
    """
    # Local import keeps the block self-contained. os.path.join replaces the
    # former "dir\%s" literals, whose "\%" is an invalid escape sequence and
    # whose backslash is only a path separator on Windows.
    import os

    pair = "%s_%s" % (preCellType, postCellType)

    # Get data from file
    filename = os.path.join("data_dense_model", pair + ".csv")
    df = pd.read_csv(filename, header=None, names=["data"])

    # Prepare data for fitting
    X = df.data
    nobs = len(X)
    exog = np.ones(nobs)
    freq = np.bincount(X) / nobs

    # Fit data
    mod_ZIP = sm.ZeroInflatedPoisson(X, exog)
    res_ZIP = mod_ZIP.fit(disp=False)

    # Predicted probability of each count, averaged over observations
    probs_zip = res_ZIP.predict(which='prob')
    probsm_zip = probs_zip.mean(0)

    # Export observed frequencies next to the fitted probabilities
    values = {'x': freq, 'xFit': probsm_zip}
    outputDF = pd.DataFrame(values, columns=['x', 'xFit'])
    outputfilename = os.path.join("fit_dense_model", pair + "_ZIP.csv")
    outputDF.to_csv(outputfilename, index=None, header=True)

    # Export fit results; the context manager closes the file even on error
    summary_csv = res_ZIP.summary().as_csv()
    outputfilenameFit = os.path.join(
        "fit_dense_model", pair + "_ZIP_FitResults.csv")
    with open(outputfilenameFit, "w") as text_file:
        text_file.write(summary_csv)
def setup_class(cls):
    """Simulate zero-inflated Poisson counts and fit a ZIP model with BFGS.

    Stores the simulated response in ``cls.endog`` and the fitted results in
    ``cls.res``.
    """
    np.random.seed(123)

    n_obs = 200
    true_coefs = [1, 0.5]

    # Two-column design of ones; the second column is 2 for the first half.
    design = np.ones((n_obs, 2))
    design[:n_obs // 2, 1] = 2
    mean_true = np.dot(design, true_coefs)

    # Second positional argument (0.05) is presumably the zero-inflation
    # probability of the zipoisson distribution — confirm against its docs.
    cls.endog = sm.distributions.zipoisson.rvs(
        mean_true, 0.05, size=mean_true.shape)

    fit_options = dict(method='bfgs', maxiter=5000, maxfun=5000, disp=0)
    cls.res = sm.ZeroInflatedPoisson(cls.endog, design).fit(**fit_options)
def test_exposure(self):
    """Check that ``exposure=exp(offset)`` is equivalent to ``offset``.

    A second model is built from the data arrays of the class model, with
    the class model's offset passed as ``exposure=np.exp(offset)``.
    Parameters and predictions of both fits are compared to 1e-6 tolerance.
    """
    tol = dict(atol=1e-6, rtol=1e-6)
    res_offset = self.res1
    base_model = res_offset.model
    log_exposure = base_model.offset

    exposure_model = sm.ZeroInflatedPoisson(
        base_model.endog,
        base_model.exog,
        exog_infl=base_model.exog_infl,
        exposure=np.exp(log_exposure),
    )
    res_exposure = exposure_model.fit(
        start_params=res_offset.params,
        method='newton',
        maxiter=500,
        disp=False,
    )
    assert_allclose(res_exposure.params, res_offset.params, **tol)

    # In-sample predictions with the models' own offset / exposure.
    pred_offset = res_offset.predict()
    pred_exposure = res_exposure.predict()
    assert_allclose(pred_exposure, pred_offset, **tol)

    # Full sample passed explicitly; the offset is given as a plain list.
    full_offset = res_offset.predict(
        exog=base_model.exog,
        exog_infl=base_model.exog_infl,
        offset=log_exposure.tolist(),
    )
    full_exposure = res_exposure.predict(
        exog=base_model.exog,
        exog_infl=base_model.exog_infl,
        exposure=np.exp(log_exposure),
    )
    assert_allclose(full_exposure, full_offset, **tol)

    # Every second row of the first ten observations.
    rows = slice(None, 10, 2)
    sub_exog = base_model.exog[rows]
    sub_exog_infl = base_model.exog_infl[rows]
    sub_offset = log_exposure[rows]

    # TODO: predicting on the subset without offset/exposure was noted to
    # raise a shape mismatch, i.e. it uses offset or exposure from the
    # model -> fix it or not? (GLM.predict sets offset and exposure to zero.)

    sub_pred_offset = res_offset.predict(
        exog=sub_exog, exog_infl=sub_exog_infl, offset=sub_offset)
    sub_pred_exposure = res_exposure.predict(
        exog=sub_exog, exog_infl=sub_exog_infl,
        exposure=np.exp(sub_offset))
    assert_allclose(sub_pred_exposure, sub_pred_offset, **tol)
    assert_allclose(sub_pred_offset, pred_offset[rows], **tol)
    assert_allclose(sub_pred_exposure, pred_offset[rows], **tol)

    # without specifying offset and exposure
    bare_pred_offset = res_offset.predict(
        exog=sub_exog, exog_infl=sub_exog_infl)
    bare_pred_exposure = res_exposure.predict(
        exog=sub_exog, exog_infl=sub_exog_infl)
    assert_allclose(bare_pred_exposure, bare_pred_offset, **tol)
def test_pd_offset_exposure(self):
    """Smoke test: pandas ``offset``/``exposure`` inputs are accepted.

    Fits a Poisson model with a ``pd.Series`` offset, then a zero-inflated
    Poisson model with a ``pd.Series`` exposure for each inflation link.
    Nothing is asserted; the test passes if no fit raises.
    """
    response = pd.DataFrame({'F': [0.0, 0.0, 0.0, 0.0, 1.0]})
    regressors = pd.DataFrame({
        'I': [1.0, 1.0, 1.0, 1.0, 1.0],
        'C': [0.0, 1.0, 0.0, 1.0, 0.0],
    })
    exposure_series = pd.Series([1., 1, 1, 2, 1])
    offset_series = pd.Series([1, 1, 1, 2, 1])

    poisson_model = sm.Poisson(
        endog=response, exog=regressors, offset=offset_series)
    poisson_model.fit()

    for link in ('logit', 'probit'):
        zip_model = sm.ZeroInflatedPoisson(
            endog=response,
            exog=regressors["I"],
            exposure=exposure_series,
            inflation=link,
        )
        zip_model.fit()
def test_poi_nb_zip_zinb_tiny_subset(meta, m):
    """Compare four count models on a random 800-row subset of ``m``.

    Rows are drawn with replacement using a fixed seed, split 60/40 into
    train and test sets, and Poisson, negative binomial, zero-inflated
    Poisson and zero-inflated negative binomial models are fitted on the
    training part. The test-set RMSE of each model is printed.

    Parameters
    ----------
    meta
        Unused; kept so the signature stays unchanged.
    m : 2-D array
        Column 0 is used as the target, the remaining columns as
        predictors.

    Returns
    -------
    None
    """
    np.random.seed(2)
    # NOTE(review): ``high`` is exclusive in np.random.randint, so with
    # ``len(m)-1`` the last row of ``m`` is never drawn. Left unchanged so
    # the seeded sample stays the same — confirm whether this is intended.
    randint = np.random.randint(0, high=len(m)-1, size=800)
    msel = m[randint, :]
    Y = msel[:, 0]
    X = msel[:, 1:]
    print("Msel shape: ", msel.shape)

    xtrain, xtest, ytrain, ytest = train_test_split(
        X, Y, train_size=0.60, random_state=42)
    print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

    # Constant inflation regressor handed to the zero-inflated predictions.
    infl_test = np.ones((len(xtest), 1))

    # ``print()`` replaces the former bare ``print`` expressions, which do
    # nothing in Python 3 instead of emitting the intended blank line.
    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    # Poisson and ZIP are scored on the 0.95 quantile of the predicted mean.
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))
    print("RMSE Poisson: ", poi_rmse)

    print()
    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(
        start_params=None, method='newton', maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))
    print("RMSE Negative Binomial: ", nb_rmse)

    print()
    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=infl_test)
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))
    print("RMSE Zero-Inflated Poisson", zip_rmse)

    print()
    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=infl_test)
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
def test_names(self):
    """Check parameter naming of the fitted model and of a reordered model.

    Inflation parameters carry an ``inflate_`` prefix and come first. With
    ``prepend=True`` the constant is expected ahead of the other names in
    each part.
    """
    expected = ['inflate_lncoins', 'inflate_const',
                'idp', 'lpi', 'fmde', 'const']
    fitted = self.res1
    assert_array_equal(fitted.model.exog_names, expected)
    for series in (fitted.params, fitted.bse):
        assert_array_equal(series.index.tolist(), expected)

    # Rebuild the designs with the constant in front and check the names
    # of the unfitted model.
    count_design = sm.add_constant(self.data.exog.iloc[:, 1:4], prepend=True)
    inflation_design = sm.add_constant(
        self.data.exog.iloc[:, 0], prepend=True)
    expected = ['inflate_const', 'inflate_lncoins',
                'const', 'idp', 'lpi', 'fmde']
    reordered = sm.ZeroInflatedPoisson(
        self.data.endog,
        count_design,
        exog_infl=inflation_design,
        inflation='logit',
    )
    assert_array_equal(reordered.exog_names, expected)
def setup_class(cls):
    """Fit a probit zero-inflated Poisson model on RandHIE (array data).

    Sets ``cls.endog``, ``cls.res1`` (the fitted results), ``cls.init_keys``,
    ``cls.init_kwds`` and ``cls.res2`` (taken from
    ``RandHIE.zero_inflated_poisson_probit``).
    """
    randhie = sm.datasets.randhie.load(as_pandas=False)
    cls.endog = randhie.endog

    # Count part: exog columns 1-3; inflation part: exog column 0.
    # ``prepend=False`` places the constant after the regressors.
    count_design = sm.add_constant(randhie.exog[:, 1:4], prepend=False)
    inflation_design = sm.add_constant(randhie.exog[:, 0], prepend=False)

    probit_model = sm.ZeroInflatedPoisson(
        randhie.endog,
        count_design,
        exog_infl=inflation_design,
        inflation='probit',
    )
    cls.res1 = probit_model.fit(method='newton', maxiter=500, disp=0)
    # for llnull test
    cls.res1._results._attach_nullmodel = True

    cls.init_keys = ['exog_infl', 'exposure', 'inflation', 'offset']
    cls.init_kwds = {'inflation': 'probit'}
    cls.res2 = RandHIE.zero_inflated_poisson_probit
def tiny_zip(l):
    """Fit a zero-inflated Poisson model (Newton) on a small sample.

    Parameters
    ----------
    l : sequence of sequences
        Rows whose first item is the target value and whose remaining items
        are the predictors.

    Returns
    -------
    list
        ``[model, ppf_obs, rmse]``. Entries that were not reached because of
        a handled error keep their initial values (``None``, ``None``, ``0``).
    """
    print("\t\tRunning Zero-Inflated Poisson")
    fitted = None
    quantile_95 = None
    rmse = 0

    predictors = np.array([row[1:] for row in l])
    target = np.array([row[0] for row in l]).reshape(-1, 1)

    try:
        fitted = sm.ZeroInflatedPoisson(target, predictors).fit(
            method="newton", maxiter=50)
        constant_infl = np.ones((len(predictors), 1))
        mean_pred = fitted.predict(predictors, exog_infl=constant_infl)
        quantile_95 = stats.poisson.ppf(q=0.95, mu=mean_pred)
        squared_error = mean_squared_error(target, quantile_95)
        rmse = np.sqrt(squared_error)
    except np.linalg.LinAlgError as err:
        # Only singular-matrix failures are reported; others pass silently.
        if 'Singular matrix' in str(err):
            print("\t\t\tIgnored a singular matrix.")
    except ValueError:
        print("\t\t\tIgnored output containing np.nan or np.inf")

    return [fitted, quantile_95, rmse]
def test_poi_nb_zip_zinb_raw_data(meta, m):
    """Compare four count models on the raw data in ``m``.

    The data is split 60/40 into train and test sets; Poisson, negative
    binomial, zero-inflated Poisson and zero-inflated negative binomial
    models are fitted on the training part and their test-set RMSEs are
    printed.

    Fixes relative to the previous version:

    * the negative binomial model is now actually fitted, so ``nb_rmse``
      exists when it is printed (it used to raise ``NameError``);
    * the duplicated zero-inflated negative binomial section, which scored
      test-set predictions against ``ytrain``, is removed.

    Parameters
    ----------
    meta
        Unused; kept so the signature stays unchanged.
    m : 2-D array
        Column 0 is used as the target, the remaining columns as
        predictors.

    Returns
    -------
    None
    """
    Y = m[:, 0]
    X = m[:, 1:]
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, Y, train_size=0.60, random_state=77)
    print("Training with: ", xtrain.shape, ytrain.shape)
    print("Testing with: ", xtest.shape, ytest.shape)

    # Constant inflation regressor handed to the zero-inflated predictions.
    infl_test = np.ones((len(xtest), 1))

    print()
    print("Model: Poisson")
    poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
    poi_mean_pred = poi_mod.predict(xtest)
    # Poisson and ZIP are scored on the 0.95 quantile of the predicted mean.
    poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
    poi_rmse = np.sqrt(mean_squared_error(ytest, poi_ppf_obs))

    print("Model: Neg. Binomial")
    nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    nb_pred = nb_mod.predict(xtest)
    nb_rmse = np.sqrt(mean_squared_error(ytest, nb_pred))

    print("Model: Zero Inflated Poisson")
    zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    zip_mean_pred = zip_mod.predict(xtest, exog_infl=infl_test)
    zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
    zip_rmse = np.sqrt(mean_squared_error(ytest, zip_ppf_obs))

    print("Model: Zero Inflated Neg. Binomial")
    zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(
        method="newton", maxiter=50)
    zinb_pred = zinb_mod.predict(xtest, exog_infl=infl_test)
    zinb_rmse = np.sqrt(mean_squared_error(ytest, zinb_pred))

    print("RMSE Poisson: ", poi_rmse)
    print("RMSE Negative Binomial: ", nb_rmse)
    print("RMSE Zero-Inflated Poisson", zip_rmse)
    print("RMSE Zero-Inflated Negative Binomial: ", zinb_rmse)
# Fit four count models on the training split and score them on the test
# split. Every result name is kept because later code may read it.

# Poisson: scored on the 0.95 quantile of the predicted mean.
poi_mod = sm.Poisson(ytrain, xtrain).fit(method="newton", maxiter=50)
poi_mean_pred = poi_mod.predict(xtest)
poi_ppf_obs = stats.poisson.ppf(q=0.95, mu=poi_mean_pred)
poi_rmse = np.sqrt(
    mean_squared_error(ytest, poi_ppf_obs)
)

# Negative binomial: scored on the predicted mean itself.
print("Model: Neg. Binomial")
nb_mod = sm.NegativeBinomial(ytrain, xtrain).fit(
    start_params=None,
    method='newton',
    maxiter=50,
)
nb_pred = nb_mod.predict(xtest)
nb_rmse = np.sqrt(
    mean_squared_error(ytest, nb_pred)
)

print(np.ones(len(xtest)).shape)

# Zero-inflated Poisson with a constant inflation regressor.
print("Model: Zero Inflated Poisson")
zip_mod = sm.ZeroInflatedPoisson(ytrain, xtrain).fit(
    method="newton",
    maxiter=50,
)
zip_mean_pred = zip_mod.predict(
    xtest,
    exog_infl=np.ones((len(xtest), 1)),
)
zip_ppf_obs = stats.poisson.ppf(q=0.95, mu=zip_mean_pred)
zip_rmse = np.sqrt(
    mean_squared_error(ytest, zip_ppf_obs)
)

# Zero-inflated negative binomial with a constant inflation regressor.
print("Model: Zero Inflated Neg. Binomial")
zinb_mod = sm.ZeroInflatedNegativeBinomialP(ytrain, xtrain).fit(
    method="newton",
    maxiter=50,
)
zinb_pred = zinb_mod.predict(
    xtest,
    exog_infl=np.ones((len(xtest), 1)),
)
zinb_rmse = np.sqrt(
    mean_squared_error(ytest, zinb_pred)
)

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)
print("RMSE Poisson: ", poi_rmse)
print("RMSE Neg. Bin.: ", nb_rmse)
# Build response/design matrices for the train and test frames from the
# formula held in expr_disab (defined outside this fragment).
y_train_disab, X_train_disab = dmatrices(expr_disab, train_disab, return_type='dataframe')
y_test_disab, X_test_disab = dmatrices(expr_disab, test_disab, return_type='dataframe')
# Train a Poisson GLM on the training matrices and print its summary.
poisson_training_results_disab = sm.GLM(y_train_disab, X_train_disab, family=sm.families.Poisson()).fit()
print(poisson_training_results_disab.summary())
#%% ZIP Model FOR DISABLED
# Plot the distribution of the disability rate column to check for zeros.
ax = sns.distplot(finalhatedata_nonNAN['Disability_p1000'])
plt.title('Distribution of Disabled hate crime rate')
# Fit a zero-inflated Poisson model (logit inflation) on the training data,
# using the same design matrix for the count and the inflation part, and
# print its summary.
zip_training_results_disab = sm.ZeroInflatedPoisson(endog=y_train_disab, exog=X_train_disab, exog_infl=X_train_disab, inflation='logit').fit()
print(zip_training_results_disab.summary())
#%% poisson reg - trans PLACE VS trans RATE
# List the column names; the result is not stored, so this presumably only
# displays something when the cell is run interactively.
list(finalhatedata_nonNAN)
# Print the variance and mean of the transgender rate column.
print('variance='+str(finalhatedata_nonNAN['Transgender_p1000'].var()))
print('mean='+str(finalhatedata_nonNAN['Transgender_p1000'].mean()))
# Create the training frame: each row is kept with probability 0.8
# (unseeded random mask).
mask_trans = np.random.rand(len(finalhatedata_nonNAN)) < 0.8
train_trans = finalhatedata_nonNAN[mask_trans]