def setup_class(cls):
    """Create the formula-based Poisson GEE fixture for this test class.

    Reads ``gee_poisson_1.csv`` through ``load_data``, assembles a DataFrame
    with the columns ``Y``, ``Id`` and ``X1..Xk`` and stores on the class:

    * ``cls.mod`` -- model returned by ``GEE.from_formula`` with a Poisson
      family and an ``Independence`` covariance structure.
    * ``cls.start_params`` -- a fixed vector of six coefficients.
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    # Test with formulas.  The first column of exog is left out of the frame
    # (presumably the intercept, which the formula supplies -- TODO confirm).
    predictors = exog[:, 1:]
    names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    stacked = np.concatenate(
        (endog[:, None], group_n[:, None], predictors), axis=1)
    frame = pd.DataFrame(stacked, columns=names)

    cls.mod = GEE.from_formula(
        "Y ~ X1 + X2 + X3 + X4 + X5",
        "Id",
        frame,
        family=Poisson(),
        cov_struct=Independence(),
    )
    cls.start_params = np.array([
        -0.03644504,
        -0.05432094,
        0.01566427,
        0.57628591,
        -0.0046566,
        -0.47709315,
    ])
def one_cluster(formula, feature, covs, coef, method=OLS,
                _pat=re.compile(r"\+\s*CpG")):
    """used when we have a "cluster" with 1 probe.

    Parameters
    ----------
    formula : str
        Model formula; any ``+ CpG`` term is stripped before fitting.
    feature : object
        The single probe.  A ``CountFeature`` is modelled with a Poisson GLM
        using ``feature.counts`` as exposure; anything else is modelled with
        ``method`` on ``feature.values``.
    covs : DataFrame-like
        Covariates.  A copy is taken, so the caller's object is not modified.
    coef : object
        Passed through to ``get_ptc`` (and returned under the ``covar`` key
        when the Poisson fit fails).
    method : model class, optional
        Class exposing ``from_formula``; used for non-count features.
    _pat : compiled regex, private
        Pattern that removes the ``+ CpG`` term from ``formula``.

    Returns
    -------
    The value of ``get_ptc(fit, coef)``, or a dict with NaN ``p``, ``t`` and
    ``coef`` entries when the Poisson fit raises ``PerfectSeparationError``.
    """
    c = covs.copy()
    # remove the CpG in the formula
    formula = _pat.sub("", formula)

    if not isinstance(feature, CountFeature):
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)

    c['methylation'] = feature.methylated
    c['counts'] = feature.counts
    # Rows with zero counts are dropped before the counts are used as exposure.
    c = c[c['counts'] > 0]
    try:
        return get_ptc(
            GLM.from_formula(formula, data=c, exposure=c['counts'],
                             family=Poisson()).fit(),
            coef)
    except PerfectSeparationError:
        return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
def ppglmfit(X,Y):
    '''
    The GLM solver in statsmodels is very general. It accepts any link
    function and expects that, if you want a constant term in your model,
    that you have already manually added a column of ones to your
    design matrix. This wrapper simplifies using GLM to fit the common
    case of a Poisson point-process model, where the constant term has
    not been explicitly added to the design matrix

    A column of ones is prepended to X unless the first column of X is
    already constant. Two sanity checks on Y only print a caution; they
    never stop the fit.

    Parameters
    ----------
    X: N_observations x N_features design matrix.
    Y: Binary point process observations

    Returns
    -------
    μ, B: the offset and parameter estimates for the GLM model, i.e. the
        first fitted parameter and the remaining ones.
    '''
    if np.mean(Y)>0.1:
        print('Caution: spike rate very high, is Poisson assumption valid?')
    if np.sum(Y)<100:
        print('Caution: fewer than 100 spikes to fit model')
    # add constant value to X, if the 1st column is not constant
    # (np.all reduces the comparison array natively instead of iterating it
    # element by element through the builtin all)
    if not np.all(X[:,0]==X[0,0]):
        X = np.hstack([np.ones((X.shape[0],1),dtype=X.dtype), X])
    poisson_model = GLM(Y,X,family=Poisson())
    poisson_results = poisson_model.fit()
    M = poisson_results.params
    return M[0],M[1:]
def test_compare_poisson(self):
    """GEE with an independence structure must reproduce ``sm.poisson``.

    Random data are drawn, both models are fitted with the same formula and
    their coefficient vectors are compared to 10 decimals.
    """
    n_obs = 100
    formula = "Y ~ X1 + X2 + X3"

    # Draw order (Y, X1, X2, X3, groups) is kept so that a seeded run
    # produces the same data as before.
    columns = {"Y": np.ceil(-np.log(np.random.uniform(size=n_obs)))}
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=n_obs)
    cluster_ids = np.random.randint(0, 4, size=n_obs)
    frame = pd.DataFrame(columns)

    gee_fit = GEE.from_formula(
        formula,
        cluster_ids,
        frame,
        family=Poisson(),
        cov_struct=Independence(),
    ).fit()
    poisson_fit = sm.poisson(formula, data=frame).fit(disp=False)

    assert_almost_equal(gee_fit.params.values,
                        poisson_fit.params.values,
                        decimal=10)
def test_poisson_epil(self):
    """Compare GEE (independence) with GLM on the ``epil.csv`` data set.

    Both the parameter estimates and the scale must agree to 6 decimals.
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    formula = "y ~ age + trt + base"
    csv_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "results", "epil.csv")
    data = pd.read_csv(csv_path)

    gee_model = GEE.from_formula(formula, data["subject"], data,
                                 cov_struct=Independence(), family=Poisson())
    gee_fit = gee_model.fit()

    # Coefficients should agree with GLM
    glm_model = GLM.from_formula(formula, data, family=families.Poisson())
    glm_fit = glm_model.fit(scale="X2")

    # don't use wrapper, asserts_xxx don't work
    gee_raw = gee_fit._results
    glm_raw = glm_fit._results

    for attribute in ("params", "scale"):
        assert_almost_equal(getattr(gee_raw, attribute),
                            getattr(glm_raw, attribute),
                            decimal=6)
def test_predict_exposure(self):
    """``predict`` must give the same values however offset/exposure are passed.

    A Poisson GEE is fitted with both an offset and an exposure column.
    Predictions without arguments are then compared with predictions that
    pass the offset and/or exposure explicitly, for the full sample and for
    the last ten rows (with and without the formula transform).
    """
    n = 50
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    cluster = np.kron(np.arange(25), np.r_[1, 1])
    off = np.random.uniform(1, 2, size=n)
    expo = np.random.uniform(1, 2, size=n)
    y = np.random.poisson(0.1*(x1 + x2) + off + np.log(expo), size=n)

    frame = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "groups": cluster,
                          "offset": off, "exposure": expo})

    fit = GEE.from_formula("Y ~ X1 + X2", cluster, frame, family=Poisson(),
                           offset="offset", exposure="exposure").fit()
    assert_equal(fit.converged, True)

    baseline = fit.predict()
    full_sample = [
        fit.predict(offset=frame["offset"]),
        fit.predict(exposure=frame["exposure"]),
        fit.predict(offset=frame["offset"], exposure=frame["exposure"]),
    ]

    tail = slice(-10, None)
    tail_offset = frame["offset"][tail]
    tail_exposure = frame["exposure"][tail]
    tail_formula = fit.predict(exog=frame[tail], offset=tail_offset,
                               exposure=tail_exposure)
    # without patsy
    tail_raw = fit.predict(exog=fit.model.exog[tail], offset=tail_offset,
                           exposure=tail_exposure, transform=False)

    for prediction in full_sample:
        assert_allclose(baseline, prediction)
    for prediction in (tail_formula, tail_raw):
        assert_allclose(baseline[tail], prediction)
def test_poisson_epil(self):
    """Compare GEE (independence) with GLM on the ``epil.csv`` data set.

    Parameter estimates and scale must agree to 6 decimals.
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod import families

    formula = "y ~ age + trt + base"
    here = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(here, "results", "epil.csv"))

    gee_fit = GEE.from_formula(formula, data, groups=data["subject"],
                               cov_struct=Independence(),
                               family=Poisson()).fit()

    # Coefficients should agree with GLM
    glm_fit = GLM.from_formula(formula, data,
                               family=families.Poisson()).fit(scale="X2")

    assert_almost_equal(gee_fit.params, glm_fit.params, decimal=6)
    assert_almost_equal(gee_fit.scale, glm_fit.scale, decimal=6)
def EstimacionMVBase(df_cal, ids_torneo):
    """Fit the constrained Poisson GLM of the "Base" specification.

    ``df_cal`` is reshaped with ``ReshapeDataFrameBase``; the formula and the
    constraints come from ``FormulaBase`` applied to the reshaped column
    names.  The constrained fit is handed to ``OutputPoissReg`` together
    with an empty list and ``ids_torneo``, and that result is returned.
    """
    reshaped = ReshapeDataFrameBase(df_cal)
    column_names = reshaped.columns.tolist()
    formula, constraints = FormulaBase(column_names)

    poisson_glm = glm(formula, groups=None, data=reshaped, family=Poisson())
    constrained_fit = poisson_glm.fit_constrained(constraints)

    return OutputPoissReg(constrained_fit, [], ids_torneo)
def EstimacionMVPromGolesLV(df_cal, ids_torneo):
    """Fit the constrained Poisson GLM of the "PromGolesLV" specification.

    ``df_cal`` is reshaped with ``ReshapeDataFramePromGolesLV``; the formula
    and the constraints come from ``FormulaPromGolesLV`` applied to the
    reshaped column names.  The constrained fit is handed to
    ``OutputPoissReg`` together with the names ``pgfl``, ``pgfv``, ``pgal``,
    ``pgav`` and ``ids_torneo``, and that result is returned.
    """
    extra_terms = ['pgfl', 'pgfv', 'pgal', 'pgav']

    reshaped = ReshapeDataFramePromGolesLV(df_cal)
    column_names = reshaped.columns.tolist()
    formula, constraints = FormulaPromGolesLV(column_names)

    poisson_glm = glm(formula, groups=None, data=reshaped, family=Poisson())
    constrained_fit = poisson_glm.fit_constrained(constraints)

    return OutputPoissReg(constrained_fit, extra_terms, ids_torneo)
def BuildPoissonModels(hist_data, feature_list, comp_data=None):
    '''
    Build score predictions via (linear) poisson regression.

    Two Poisson models are fitted on ``hist_data``, one for ``team_1_score``
    and one for ``team_2_score``, each regressed on ``feature_list``.

    Parameters
    ----------
    hist_data : DataFrame
        Historical matches.  Modified in place: the columns
        ``team_1_score_pred`` and ``team_2_score_pred`` are added.
    feature_list : sequence of str
        Column names used as regressors.
    comp_data : DataFrame, optional
        Matches to predict.  Modified in place: prediction columns plus
        ``team_1_prob``, ``team_tie_prob`` and ``team_2_prob`` are added.

    Returns
    -------
    ``hist_data`` when ``comp_data`` is None, otherwise the tuple
    ``(hist_data, comp_data)``.
    '''
    regressors = " + ".join(feature_list)
    formula_1 = "team_1_score ~ " + regressors
    formula_2 = "team_2_score ~ " + regressors

    # Using the GEE package with an independence covariance structure to fit
    # the Poisson models.
    # NOTE(review): the response column name is passed as the grouping
    # variable, exactly as before -- confirm that this is intended.
    fam = Poisson()
    ind = Independence()
    model_1 = GEE.from_formula(formula_1, "team_1_score", hist_data,
                               cov_struct=ind, family=fam)
    model_2 = GEE.from_formula(formula_2, "team_2_score", hist_data,
                               cov_struct=ind, family=fam)
    model_1_fit = model_1.fit()
    model_2_fit = model_2.fit()
    print(model_1_fit.summary())

    hist_data['team_1_score_pred'] = model_1_fit.predict(hist_data)
    hist_data['team_2_score_pred'] = model_2_fit.predict(hist_data)

    # return historical data if comp_data wasn't passed.
    if comp_data is None:
        return hist_data

    # prepare comp data
    comp_features = comp_data[feature_list]
    comp_data['team_1_score_pred'] = model_1_fit.predict(comp_features)
    comp_data['team_2_score_pred'] = model_2_fit.predict(comp_features)

    # The score difference of two Poisson variables follows a Skellam
    # distribution; evaluate it for all rows at once instead of row by row.
    mu_1 = comp_data['team_1_score_pred'].values
    mu_2 = comp_data['team_2_score_pred'].values
    comp_data['team_1_prob'] = 1 - skellam.cdf(0, mu_1, mu_2)
    comp_data['team_tie_prob'] = skellam.pmf(0, mu_1, mu_2)
    comp_data['team_2_prob'] = skellam.cdf(-1, mu_1, mu_2)

    return hist_data, comp_data
def regression():
    '''Poisson regression example chapter 4.4, p.69

    Loads the example table with ``get_data``, fits ``y ~ x`` as a GLM with
    a Poisson family and identity link, and prints the fit summary.
    '''
    # get the data (the original note says "from the web"; how get_data
    # resolves this path is defined elsewhere)
    inFile = r'GLM_data/Table 4.3 Poisson regression.xls'
    df = get_data(inFile)

    # do the fit
    # NOTE(review): the link is passed as `links.identity` without calling
    # it -- confirm this matches the installed statsmodels API.
    p = glm('y~x', family=Poisson(links.identity), data=df)

    # print() with a single argument behaves the same on Python 2 and 3.
    print(p.fit().summary())
def setup_class(cls):
    """Create the array-based Poisson GEE fixture for this test class.

    Stores on the class ``cls.mod`` (a ``GEE`` built from the arrays of
    ``gee_poisson_1.csv`` with a Poisson family and an ``Independence``
    covariance structure) and ``cls.start_params`` (six fixed coefficients).
    """
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    # Positional arguments kept as in the original call; None is the fourth.
    cls.mod = GEE(endog, exog, group_n, None, Poisson(), Independence())
    cls.start_params = np.array([
        -0.03644504,
        -0.05432094,
        0.01566427,
        0.57628591,
        -0.0046566,
        -0.47709315,
    ])
def test_wrapper(self):
    """Fit a Poisson GEE on pandas inputs and run ``check_wrapper`` on it."""
    raw_endog, raw_exog, raw_groups = load_data("gee_poisson_1.csv",
                                                icept=False)

    # Wrap the arrays in pandas containers before building the model.
    model = GEE(
        pd.Series(raw_endog),
        pd.DataFrame(raw_exog),
        pd.Series(raw_groups),
        None,
        Poisson(),
        Independence(),
    )
    check_wrapper(model.fit())
def model_estimate_coeff(self, X, y):
    """
    Fit a Poisson regression model and return its estimated parameters.

    Parameters:
    ---------
    X: ND Array,
        Input variables; ``sm.add_constant`` is applied before fitting.
    y: 1D Array,
        Output variable.

    Output:
    -------
    The ``params`` of the fitted Poisson GLM (the term for the added
    constant together with the coefficients).
    """
    # Add the constant to the (bootstrapped) X
    design = sm.add_constant(X)

    # Fit the Poisson model to the (bootstrapped) sample
    poisson_glm = fm.GLM(y, design, family=Poisson())
    fitted = poisson_glm.fit()
    return fitted.params
def test_offset_formula(self):
    """Test various ways of passing offset and exposure to `from_formula`.

    Offset given by column name, offset given as array, exposure given as
    array and exposure given by column name must all lead to the same
    parameters.  Passing both exposure and offset by name must match a
    model whose offset is ``2*offset``.
    """
    n = 50
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    groups = np.kron(np.arange(25), np.r_[1, 1])
    offset = np.random.uniform(1, 2, size=n)
    exposure = np.exp(offset)
    y = np.random.poisson(0.1*(x1 + x2) + 2*offset, size=n)

    data = pd.DataFrame({"Y": y, "X1": x1, "X2": x2, "groups": groups,
                         "offset": offset, "exposure": exposure})
    fml = "Y ~ X1 + X2"

    def build(**extra):
        # Every model uses the same formula, groups, data and a new family.
        return GEE.from_formula(fml, groups, data, family=Poisson(), **extra)

    # Reference fit: offset passed as a column name.
    reference = build(offset="offset").fit()
    assert_equal(reference.converged, True)

    equivalent_specs = (
        {"offset": offset},
        {"exposure": exposure},
        {"exposure": "exposure"},
    )
    for spec in equivalent_specs:
        fitted = build(**spec).fit(start_params=reference.params)
        assert_allclose(reference.params, fitted.params)
        assert_equal(fitted.converged, True)

    # Exposure and offset together versus a doubled offset.
    combined = build(exposure="exposure", offset="offset").fit()
    assert_equal(combined.converged, True)

    doubled = build(offset=2*offset).fit(start_params=combined.params)
    assert_allclose(combined.params, doubled.params)
    assert_equal(doubled.converged, True)
def rr_cluster(cluster, covs, formula):
    """Set cluster values to reduced-residuals.

    Parameters
    ----------
    cluster : sequence of features
        Deep-copied before use; the caller's features are not modified.
        The type of the first element decides the model for the whole
        cluster: ``CountFeature`` -> Poisson GLM with ``f.counts`` as
        exposure, anything else -> OLS.
    covs : DataFrame-like
        Covariates.  Copied before the ``methylation`` column is written,
        so the caller's object is left untouched.
    formula : str
        Model formula evaluated against the covariates.

    Returns
    -------
    The copied cluster whose feature values were replaced by model
    residuals.  For non-count features the fitted values are stored in
    ``f.ovalues``.
    """
    cluster = deepcopy(cluster)
    # Work on a private copy: assigning the 'methylation' column below must
    # not leak into the caller's covariates.
    covs = covs.copy()
    from statsmodels.formula.api import ols, glm

    if isinstance(cluster[0], CountFeature):
        for f in cluster:
            covs['methylation'] = f.methylated
            fit = glm(formula, covs, exposure=f.counts,
                      family=Poisson()).fit()
            # Residuals are rounded to integers and written in place.
            f.methylated[:] = np.round(fit.resid).astype(int)
            f.values[:] = f.methylated.astype(float) / f.counts
    else:
        for f in cluster:
            covs['methylation'] = f.values
            fit = ols(formula, covs).fit()
            f.values[:] = fit.resid
            f.ovalues = fit.fittedvalues
    return cluster
def test_compare_poisson(self):
    """GEE with an independence structure must reproduce ``sm.poisson``.

    Random data are drawn, both models are fitted with the same formula and
    their coefficients are compared to 10 decimals.
    """
    n_obs = 100
    formula = "Y ~ X1 + X2 + X3"

    # Draw order (Y, X1, X2, X3, groups) is kept unchanged.
    columns = {"Y": np.ceil(-np.log(np.random.uniform(size=n_obs)))}
    for name in ("X1", "X2", "X3"):
        columns[name] = np.random.normal(size=n_obs)
    cluster_ids = np.random.randint(0, 4, size=n_obs)
    frame = pd.DataFrame(columns)

    gee_model = GEE.from_formula(formula, frame, None, groups=cluster_ids,
                                 family=Poisson(), covstruct=Independence())
    gee_fit = gee_model.fit()
    poisson_fit = sm.poisson(formula, data=frame).fit()

    assert_almost_equal(poisson_fit.params.values, gee_fit.params,
                        decimal=10)
def poisson_regression():
    '''Poisson Regression chapter 9.2, p.170 & 171

    Loads the British doctors table with ``get_data``, derives the model
    variables, fits a Poisson GLM with ``person-years`` as exposure and
    prints the data and the fit summary.
    '''
    inFile = r"GLM_data/Table 9.1 British doctors' smoking and coronary death.xls"
    df = get_data(inFile)
    print(df)

    # Generate the required variables
    # smoke: 1.0 for smokers, 0.0 otherwise.
    df['smoke'] = (df['smoking'] == 'smoker').astype(float)
    df['agecat'] = np.array([1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
    df['agesq'] = df['agecat']**2
    # smkage: age category for smokers, 0 for non-smokers.  Written through
    # .loc so the assignment always reaches df itself, not a temporary copy.
    df['smkage'] = df['agecat']
    df.loc[df['smoking'] == 'non-smoker', 'smkage'] = 0

    model = glm('deaths~agecat+agesq+smoke+smkage', family=Poisson(),
                data=df, exposure=df["person-years"]).fit()
    print(model.summary())
def log_linear_models():
    '''Log-linear models chapter 9.7, p 180 & 182

    Fits Poisson GLMs to two frequency tables loaded with ``get_data`` and
    prints selected fitted values.
    '''
    # Malignant melanoma, p 180 --------------------------------
    inFile = r'GLM_data/Table 9.4 Malignant melanoma.xls'
    df = get_data(inFile)

    # Minimal model
    model_min = glm('frequency~1', family=Poisson(), data=df).fit()
    print('Malignant melanoma')
    print(model_min.fittedvalues[0])

    # Additive model
    model_add = glm('frequency~site+type', family=Poisson(), data=df).fit()
    print(model_add.fittedvalues[0])

    # Saturated model
    # model_sat = glm('frequency~site*type', family = Poisson(), data=df).fit()
    #
    # The saturated model gives a perfect fit, and the fitted data are equal to
    # the original data. Statsmodels indicates a "PerfectSeparationError"

    # Ulcer and aspirin, p. 182 -------------------------------------
    inFile = r'GLM_data/Table 9.7 Ulcer and aspirin use.xls'
    df = get_data(inFile)
    df.columns = ['GD', 'CC', 'AP', 'freq']

    # A sequence of nested models; only the fitted values of the last one
    # are printed.
    model1 = glm('freq~GD+CC+GD*CC', family=Poisson(), data=df).fit()
    model2 = glm('freq~GD+CC+GD*CC + AP', family=Poisson(), data=df).fit()
    model3 = glm('freq~GD+CC+GD*CC + AP + AP*CC', family=Poisson(),
                 data=df).fit()
    model4 = glm('freq~GD+CC+GD*CC + AP + AP*CC + AP*GD', family=Poisson(),
                 data=df).fit()
    print('Ulcer and aspirin')
    print(model4.fittedvalues)
# Iterative update of Beta with a second-derivative term, recording the cost
# at every step.  X, Y, Beta, A, ones_vector and m are defined earlier.
J = pd.DataFrame()
J['iterative_step'] = range(0, m + 1)
J['cost'] = np.full(m + 1, None)

# Cost at the starting value of Beta.  `.item()` extracts the Python scalar;
# it replaces np.asscalar, which is no longer available in current NumPy.
J.loc[0, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta))
                    + np.dot((A + factorial(Y)).T, ones_vector)).item()

for i in range(1, m + 1):
    # First-derivative term of the cost with respect to Beta.
    J_partial_Beta = np.dot(X.T, (A - Y))
    # Second-derivative term.
    # NOTE(review): this is sum(A) * X'X rather than X' diag(A) X -- confirm
    # the approximation is intended.
    J_2partial_Beta2 = np.dot(A.T, ones_vector) * np.dot(X.T, X)
    Beta = Beta - np.dot(inv(J_2partial_Beta2), J_partial_Beta)
    Z = np.dot(X, Beta)
    A = np.exp(Z)
    J.loc[i, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta))
                        + np.dot((A + factorial(Y)).T, ones_vector)).item()
del J_partial_Beta

plt.plot(J['iterative_step'], J['cost'])
plt.title('Newton Rhapson')
plt.xlabel('Iterative Step')
plt.ylabel('Cost')

# Bare expression: displays Beta when run in an interactive session.
Beta

## built in package
results = sm.glm(
    formula="S_Length ~ S_Width + P_Length + P_Width + Species_setosa + Species_versicolor",
    data=LR_df,
    family=Poisson()).fit()
print(results.params)
[0., 1, -1, 0, 0], ]) rhs = np.r_[0.0, ] # Loop over data generating models for gendat in gendats: pvalues = [] params = [] std_errors = [] dparams = [] for j in range(nrep): da, va = gendat() ga = Poisson() # Poisson seems to be more sensitive to starting values, # so we run the independence model first. md = GEE(da.endog, da.exog, da.group, da.time, ga, Independence()) mdf = md.fit() md = GEE(da.endog, da.exog, da.group, da.time, ga, va) mdf = md.fit(start_params=mdf.params) if mdf is None or (not mdf.converged): print("Failed to converge") continue scale_inv = 1. / md.estimate_scale() dparams.append(np.r_[va.dparams, scale_inv]) params.append(np.asarray(mdf.params))
# -*- coding: utf-8 -*- """ Created on Fri Aug 12 11:36:51 2016 @author: emg """ import numpy as np import pandas as pd from statsmodels.genmod.generalized_estimating_equations import GEE from statsmodels.genmod.cov_struct import (Exchangeable, Independence, Autoregressive) from statsmodels.genmod.families import Poisson fam = Poisson() ind = Independence() model1 = GEE.from_formula("author_count ~ top + mod", "author", authors, cov_struct=ind, family=fam) result1 = model1.fit() print(result1.summary())
def test_poisson(self):
    """Check Poisson GEE estimates against reference values from R gee.

    Coefficients and standard errors are compared for the independence and
    the exchangeable structure, first with array inputs and then through
    the formula interface.  The reference values were produced with:

    library(gee)
    Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
    Y = Z[,2]
    Id = Z[,1]
    X1 = Z[,3]
    X2 = Z[,4]
    X3 = Z[,5]
    X4 = Z[,6]
    X5 = Z[,7]

    mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
             corstr="independence", scale.fix=TRUE)
    smi = summary(mi)
    u = coefficients(smi)
    cfi = paste(u[,1], collapse=",")
    sei = paste(u[,4], collapse=",")

    me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
             corstr="exchangeable", scale.fix=TRUE)
    sme = summary(me)
    u = coefficients(sme)
    cfe = paste(u[,1], collapse=",")
    see = paste(u[,4], collapse=",")

    sprintf("cf = [[%s],[%s]]", cfi, cfe)
    sprintf("se = [[%s],[%s]]", sei, see)
    """
    family = Poisson()
    endog, exog, group_n = load_data("gee_poisson_1.csv")

    # From R gee: coefficients, then standard errors, per structure.
    coef_independence = [-0.0364450410793481, -0.0543209391301178,
                         0.0156642711741052, 0.57628591338724,
                         -0.00465659951186211, -0.477093153099256]
    coef_exchangeable = [-0.0315615554826533, -0.0562589480840004,
                         0.0178419412298561, 0.571512795340481,
                         -0.00363255566297332, -0.475971696727736]
    se_independence = [0.0611309237214186, 0.0390680524493108,
                       0.0334234174505518, 0.0366860768962715,
                       0.0304758505008105, 0.0316348058881079]
    se_exchangeable = [0.0610840153582275, 0.0376887268649102,
                       0.0325168379415177, 0.0369786751362213,
                       0.0296141014225009, 0.0306115470200955]

    # The same two structure instances are used for both rounds of fits.
    cases = (
        (Independence(), coef_independence, se_independence),
        (Exchangeable(), coef_exchangeable, se_exchangeable),
    )

    for structure, expected_coef, expected_se in cases:
        fitted = GEE(endog, exog, group_n, None, family, structure).fit()
        assert_almost_equal(fitted.params, expected_coef, decimal=5)
        assert_almost_equal(fitted.standard_errors(), expected_se,
                            decimal=6)

    # Test with formulas
    names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
    stacked = np.concatenate(
        (endog[:, None], group_n[:, None], exog[:, 1:]), axis=1)
    frame = pd.DataFrame(stacked, columns=names)

    for structure, expected_coef, expected_se in cases:
        fitted = GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", frame, None,
                                  groups=frame.loc[:, "Id"], family=family,
                                  covstruct=structure).fit()
        assert_almost_equal(fitted.params, expected_coef, decimal=5)
        assert_almost_equal(fitted.standard_errors(), expected_se,
                            decimal=6)
def run_permutation_test(dependent, network, number_of_permutations,
                         output_path):
    """Permutation test of the Age and Sex coefficients of a Poisson GEE.

    A node table is built from ``network`` (node attributes, degree and
    betweenness centrality).  ``dependent ~ Age + Sex`` is fitted as a
    Poisson GEE grouped by ``Location``, then refitted
    ``number_of_permutations`` times with the dependent column shuffled.
    For the terms ``Age[T.HY]``, ``Age[T.UNK]`` and ``Sex[T.M]`` a histogram
    of the permuted coefficients is drawn with the observed coefficient
    marked.  The figure is saved to
    ``<output_path>/<dependent>_Permutation_test.png`` and then shown.
    Nothing is returned.
    """
    node_attrs = pd.DataFrame.from_dict(dict(network.nodes(data=True)),
                                        orient='index')
    node_degree = pd.DataFrame.from_dict(dict(network.degree()),
                                         orient='index')
    node_centrality = pd.DataFrame.from_dict(
        dict(nx.betweenness_centrality(network)), orient='index')

    h1 = pd.concat([node_attrs, node_degree, node_centrality],
                   axis=1).reset_index(0)
    h1.columns = ['ID', 'Age', 'Species', 'type', 'Location', 'Sex',
                  'degree', 'centrality']
    h1['degree_dist'] = h1['degree'] / float(h1['degree'].max())

    equation = dependent + "~ Age + Sex"

    from statsmodels.genmod.generalized_estimating_equations import GEE
    from statsmodels.genmod.cov_struct import (Exchangeable, Independence,
                                               Autoregressive)
    from statsmodels.genmod.families import Poisson

    # Fit on the observed data; keep the coefficients as a one-row frame.
    observed_fit = GEE.from_formula(equation, "Location", h1,
                                    cov_struct=Independence(),
                                    family=Poisson()).fit()
    main_result = pd.DataFrame(observed_fit.params).T

    # Refit on shuffled copies of the dependent variable.
    permuted_params = []
    for _ in range(number_of_permutations):
        shuffled = h1.copy()
        shuffled[dependent] = np.random.permutation(h1[dependent])
        permuted_fit = GEE.from_formula(equation, "Location", shuffled,
                                        cov_struct=Independence(),
                                        family=Poisson()).fit()
        permuted_params.append(permuted_fit.params)
    d = pd.DataFrame.from_records(permuted_params)

    import seaborn as sns

    # (model term, x-axis label prefix) for each of the three panels.
    panels = (
        ('Age[T.HY]',
         'Coefficient Age: Hatch Year\n(ref: After Hatch Year)\np= '),
        ('Age[T.UNK]',
         'Coefficient Age: Unknown\n(ref: After Hatch Year)\np= '),
        ('Sex[T.M]',
         'Coefficient Sex: Male\n (ref: Female)\np= '),
    )
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    for axis, (term, label) in zip((ax1, ax2, ax3), panels):
        axis.hist(d[term], bins=100)
        observed = main_result[term].values[0]
        axis.axvline(x=observed, color='#fc9272')
        # Share of permuted coefficients above the observed one, folded so
        # that the reported value never exceeds 0.5.
        p = (d[term] > observed).sum() / float(number_of_permutations)
        p = 1 - p if p > 0.5 else p
        axis.set_xlabel(label + '{0:.2f}'.format(p))
        if axis is ax1:
            axis.set_ylabel('Frequency')

    f.suptitle('permutation test for ' + dependent)
    plt.tight_layout()
    plt.savefig(output_path + '/' + dependent + '_Permutation_test.png',
                dpi=300)
    plt.show()
# model comparison with likelihood ratio test LR = 2 * (model_panel2_results.llf - model_panel1_results.llf) p = chi2.sf(LR, 2) print('p: %.30f' % p) # provides a summary of the number of zeros print(US_cases_long_demogr_week['cases_count_pos'].describe()) print(US_cases_long_demogr_week['cases_count_pos'].value_counts()) count_total = sum(US_cases_long_demogr_week['cases_count_pos'].value_counts().to_dict().values()) count_zero = US_cases_long_demogr_week['cases_count_pos'].value_counts()[0.0] print("Count of zero is {}, about {:.4f} of the data.".format(count_zero, count_zero / count_total )) # Approach one to generalized linear models for panel data: Generalized Estimating Equations # poisson model poi=Poisson() ar=Autoregressive() gee_model0 = GEE.from_formula("cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black", groups="state", \ data=US_cases_long_demogr_week, time='week_of_year', cov_struct=ar, family=poi, offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"]))) gee_model0_results = gee_model0.fit(maxiter=200) print(gee_model0_results.summary()) print(ar.summary()) print("scale=%.2f" % (gee_model0_results.scale)) # There is warning -- "IterationLimitWarning: Iteration limit reached prior to convergence" even if I specify maxiter = 2000. So, in this case, # specific starting values are needed to get the estimating algorithm to converge. # First run with exchangeable dependence structure. We know from this model that the within-state correlation is roughly 0.077. fam = Poisson() ex = Exchangeable() ex_model = GEE.from_formula("cases_count_pos ~ week_of_year + percent_age65over + percent_female + percent_black", groups="state", \ data=US_cases_long_demogr_week, cov_struct=ex, family=fam, offset=np.log(np.asarray(US_cases_long_demogr_week["pop_count_2019"])))
def scanline_harmonization(source_gdf, target_gdf, pop_string, raster_path, verbose=True, auxiliary_type='nlcd', regression_method='Poisson', codes=[21, 22, 23, 24], n_pixels_option_values=256, ReLU=True, **kwargs):
    """Function that generates an interpolated population using scanlines with the entire pipeline.

    Parameters
    ----------

    source_gdf             : geopandas GeoDataFrame with geometry column of polygon type for the source set of polygons desired.

    target_gdf             : geopandas GeoDataFrame with geometry column of polygon type for the target set of polygons desired.

    pop_string             : the name of the variable on geodataframe that the interpolation shall be conducted.

    raster_path            : the path to the associated raster image.

    verbose                : bool. Default is True.
                             Whether the function will print progress steps.

    auxiliary_type         : string. The type of the auxiliary variable for the desired method of interpolation. Default is 'nlcd' for the National Land Cover Dataset.
                             Not read by this function at present.

    regression_method      : the method used to estimate the weights of each land type and population. Default is "Poisson".

        "Poisson"  : performs Generalized Linear Model with a Poisson likelihood with log-link function.
        "Gaussian" : ordinary least squares will be fitted.
        "XGBoost"  : an Extreme Gradient Boosting regression will be fitted and the weights will be extracted from the Shapelys value from each land type.

    codes                  : an integer list of codes values that should be considered as 'populated' for the raster file. See (1) in notes.
                             The codes are used in sorted order; the list passed in is not modified.

    n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.

    ReLU                   : bool. Default is True.
                             Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    **kwargs               : additional arguments that can be passed to internal functions. Currently `tuned_xgb` or `gbm_hyperparam_grid` can be passed to internal XGBoost approach.

    Returns
    -------

    A DataFrame: ``target_gdf`` with its index reset, concatenated column-wise with the output of the second scanline pass.

    Raises
    ------

    ValueError : if ``regression_method`` is not 'Poisson', 'Gaussian' or 'XGBoost'.

    Notes
    -----

    1) Since this was inspired using the National Land Cover Database (NLCD), it is established some default values for this argument.
       The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
       The description of each code for NLCD can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html

    """
    # Reject unknown methods up front; otherwise no branch below would
    # assign `weights` and the failure would surface as an unrelated
    # unbound-variable error after the expensive first scanline pass.
    if regression_method not in ('Poisson', 'Gaussian', 'XGBoost'):
        raise ValueError(
            "regression_method must be one of 'Poisson', 'Gaussian', 'XGBoost'")

    if verbose:
        print('INITIALIZING FIRST SCANLINES')

    profiled_df_pre = scanlines_count_pixels(source_gdf, raster_path, verbose=verbose)
    profiled_df = pd.concat([source_gdf.reset_index(), profiled_df_pre], axis=1)

    # Sorted copy: keeps weight positions aligned with the codes without
    # mutating the caller's list or the shared default argument.
    codes = sorted(codes)
    str_codes = [str(i) for i in codes]
    type_columns = ['Type_' + s for s in str_codes]
    # Formula without intercept.
    formula_string = pop_string + ' ~ -1 + ' + " + ".join(type_columns)

    if regression_method == 'Poisson':
        results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()
        weights = np.array(results.params)
    elif regression_method == 'Gaussian':
        results = smf.glm(formula_string, data=profiled_df, family=Gaussian()).fit()
        weights = np.array(results.params)
    else:
        weights = _return_xgboost_weights(profiled_df, pop_string, str_codes, **kwargs)

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    # Correction Term (CT)
    profiled_df['denominator'] = (np.array(profiled_df[type_columns]) * weights).sum(axis=1)
    profiled_df['CT'] = np.nan_to_num(profiled_df[pop_string] / profiled_df['denominator'])
    scan_line_input_CT = profiled_df[['geometry', 'CT']]

    # One weight slot per possible pixel value; only the selected codes
    # receive a non-zero weight.
    long_weights = np.zeros(n_pixels_option_values)
    long_weights[codes] = weights

    if verbose:
        print('\nINITIALIZING SECOND SCANLINES')

    interpolate = scanlines_interpolate(target_gdf=target_gdf,
                                        source_CTs=scan_line_input_CT,
                                        weights_long=long_weights,
                                        raster_path=raster_path,
                                        verbose=verbose)
    interpolate_df = pd.concat([target_gdf.reset_index(), interpolate], axis=1)
    return interpolate_df
# Let's actually look at the distribution of monthly Enrollments by TP get_ipython().magic('matplotlib inline') y = train['Enrolls'] sns.distplot(y) plt.show() # #### First Attempt - Poisson Regression # We utilize a Poisson regression here because our independent variable, Enrolls, is a count with a relatively small range. Since the distribution of the error terms will therefore not be independent and identically distributed we do not use OLS. # In[17]: poisson = sm.GLM(y_train, new_x, family = Poisson()).fit() # poisson.summary() # In[18]: y_train.mean() # In[19]: y_train.var() # #### Second Attempt - Negative Binomial # Shouldn't use Poisson, because the variance does not equal the mean. Trying a Negative Binomial instead.
def _return_weights_from_regression(
    geodataframe,
    raster_path,
    pop_string,
    codes=[21, 22, 23, 24],
    likelihood="poisson",
    formula_string=None,
    n_pixels_option_values=256,
    force_crs_match=True,
    na_value=255,
    ReLU=True,
):
    """Function that returns the weights of each land type according to NLCD types/codes.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
        used to build regression
    raster_path : str
        the path to the associated raster image.
    formula_string : str
        patsy-style model formula. When omitted, a formula without
        intercept is built from ``pop_string`` and the ``Type_<code>``
        columns.
    pop_string : str
        the name of the variable on geodataframe that the regression shall be conducted
    codes : list
        an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
        The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
        The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
        The codes are used in sorted order; the list passed in is not modified.
    likelihood : str, {'poisson', 'gaussian'}
        the likelihood assumed for the dependent variable (population). It can be 'poisson' or 'gaussian'.
        With the 'poisson' a Generalized Linear Model with log as link function will be fitted and 'gaussian' an ordinary least squares will be fitted.
    n_pixels_option_values : int
        number of options of the pixel values of rasterior. Default is 256.
    force_crs_match : bool. Default is True.
        Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
        It is recommended to let this argument as True.
    na_value : int. Default is 255.
        The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
    ReLU : bool. Default is True.
         Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.

    Returns
    -------
    numpy.ndarray of length ``n_pixels_option_values`` holding the fitted
    weight at the position of each code and zero elsewhere.

    Raises
    ------
    ValueError
        if ``na_value`` is one of ``codes`` or ``likelihood`` is not supported.

    Notes
    -----
    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.

    """
    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError("codes should not assume the na_value value.")

    if likelihood not in ["poisson", "gaussian"]:
        raise ValueError("likelihood must one of 'poisson', 'gaussian'")

    # Use only two columns to build the weights (this avoids error, if the
    # original dataset has already types appended on it).
    profiled_df = _fast_append_profile_in_gdf(
        geodataframe[["geometry", pop_string]], raster_path, force_crs_match
    )

    # If the list is unsorted, the codes will be sorted to guarantee that the
    # position of the weights will match.  A sorted copy is used so that
    # neither the caller's list nor the shared default is mutated.
    codes = sorted(codes)

    if not formula_string:
        # Formula WITHOUT intercept
        str_codes = [str(i) for i in codes]
        formula_string = (
            pop_string + " ~ -1 + " + " + ".join(["Type_" + s for s in str_codes])
        )

    if likelihood == "poisson":
        results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()
    else:
        # "gaussian" -- the only other value accepted above.
        results = smf.ols(formula_string, data=profiled_df).fit()

    # NOTE(review): with a user-supplied formula_string the number of fitted
    # parameters must still equal len(codes) for this assignment to work.
    weights = np.zeros(n_pixels_option_values)
    weights[codes] = results.params

    if ReLU:
        weights = np.where(weights < 0, 0, weights)

    return weights
def return_weights_from_regression(geodataframe, raster, pop_string, codes=[21, 22, 23, 24], likelihood='Poisson', n_pixels_option_values=256, force_crs_match=True, na_value=255):
    """Function that returns the weights of each land type according to NLCD types/codes

    Parameters
    ----------

    geodataframe           : a geopandas geoDataFrame used to build regression

    raster                 : a raster (from rasterio.open) that has the types of each pixel in the geodataframe

    pop_string             : the name of the variable on geodataframe that the regression shall be conducted

    codes                  : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
                             The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
                             The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
                             The codes are used in sorted order; the list passed in is not modified.

    likelihood             : the likelihood assumed for the dependent variable (population). It can be 'Poisson' or 'Gaussian'.
                             With the 'Poisson' a Generalized Linear Model with log as link function will be fitted and 'Gaussian' an ordinary least squares will be fitted.

    n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.

    force_crs_match        : bool. Default is True.
                             Whether the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
                             It is recommended to let this argument as True.

    na_value               : int. Default is 255.
                             The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.

    Returns
    -------

    numpy array of length ``n_pixels_option_values`` holding the fitted weight at the position of each code and zero elsewhere.

    Raises
    ------

    ValueError : if ``na_value`` is one of ``codes`` or ``likelihood`` is not supported.

    Notes
    -----

    1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.

    2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.

    """
    _check_presence_of_crs(geodataframe)

    if na_value in codes:
        raise ValueError('codes should not assume the na_value value.')

    if likelihood not in ['Poisson', 'Gaussian']:
        raise ValueError("likelihood must one of 'Poisson', 'Gaussian'")

    print('Appending profile...')
    # Use only two columns to build the weights (this avoids error, if the
    # original dataset has already types appended on it).
    profiled_df = append_profile_in_gdf(geodataframe[['geometry', pop_string]], raster, force_crs_match)
    print('Append profile: Done.')

    # If the list is unsorted, the codes will be sorted to guarantee that the
    # position of the weights will match.  A sorted copy is used so that
    # neither the caller's list nor the shared default is mutated.
    codes = sorted(codes)

    # Formula WITHOUT intercept
    str_codes = [str(i) for i in codes]
    formula_string = pop_string + ' ~ -1 + ' + " + ".join(['Type_' + s for s in str_codes])

    print('Starting to fit regression...')
    if likelihood == 'Poisson':
        results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()
    else:
        # 'Gaussian' -- the only other value accepted above.
        results = smf.ols(formula_string, data=profiled_df).fit()

    # One slot per possible pixel value; only the selected codes get a weight.
    weights = np.zeros(n_pixels_option_values)
    weights[codes] = results.params

    return weights
######################
## Gradient Descent ##
######################
# Fixed-step gradient descent on Beta, recording the cost at every step.
# X, Y, Beta, A and ones_vector are defined earlier.
m = 50000          # number of iterations
alpha = 0.0002     # step size

J = pd.DataFrame()
J['iterative_step'] = range(0, m + 1)
J['cost'] = np.full(m + 1, None)

# Cost at the starting value of Beta.  `.item()` extracts the Python scalar;
# it replaces np.asscalar, which is no longer available in current NumPy.
J.loc[0, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta))
                    + np.dot((A + factorial(Y)).T, ones_vector)).item()

for i in range(1, m + 1):
    # Gradient term of the cost with respect to Beta.
    J_partial_Beta = np.dot(X.T, (A - Y))
    Beta = Beta - (alpha * J_partial_Beta)
    Z = np.dot(X, Beta)
    A = np.exp(Z)
    J.loc[i, 'cost'] = (-np.dot(Y.T, np.dot(X, Beta))
                        + np.dot((A + factorial(Y)).T, ones_vector)).item()
del J_partial_Beta

plt.plot(J['iterative_step'], J['cost'])
plt.title('Gradient Descent')
plt.xlabel('Iterative Step')
plt.ylabel('Cost')

# Bare expression: displays Beta when run in an interactive session.
Beta

## built in package
results = sm.glm(
    formula="S_Length ~ S_Width + P_Length + P_Width + Species_setosa + Species_versicolor",
    data=LR_df,
    family=Poisson()).fit()
print(results.params)