def test_framing_example_moderator():
    # moderation without formulas, generally not useful but test anyway
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income",
                                 data, return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog,
                           family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income",
                                  data, return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    ix = (outcome_exog.columns.tolist().index("age"),
          mediator_exog.columns.tolist().index("age"))
    moderators = {ix: 20}
    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    moderators=moderators)

    # Just a smoke test
    np.random.seed(4231)
    med_rslt = med.fit(method='parametric', n_rep=100)
def glm(data, xseq, **params):
    """
    Fit GLM
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    init_kwargs, fit_kwargs = separate_method_kwargs(
        params['method_args'], sm.GLM, sm.GLM.fit)
    model = sm.GLM(data['y'], X, **init_kwargs)
    results = model.fit(**fit_kwargs)

    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        prediction = results.get_prediction(Xseq)
        # pass alpha by keyword; the first positional argument of
        # conf_int differs between prediction-result classes
        ci = prediction.conf_int(alpha=1 - params['level'])
        data['ymin'] = ci[:, 0]
        data['ymax'] = ci[:, 1]

    return data
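# A hypothetical smoke test for the smoother above (not from the source); it
# assumes numpy/pandas are imported as np/pd and that separate_method_kwargs
# (a plotnine internal) is available in the surrounding module and returns
# two kwargs dicts.
def _demo_glm_smoother():
    rng = np.random.default_rng(0)
    data = pd.DataFrame({'x': np.linspace(0.0, 1.0, 50)})
    data['y'] = 2.0 + 3.0 * data['x'] + rng.normal(scale=0.1, size=50)
    xseq = np.linspace(0.0, 1.0, 25)
    params = {'method_args': {}, 'se': True, 'level': 0.95}
    smoothed = glm(data, xseq, **params)
    # one row per grid point, with the fit and its confidence band
    assert list(smoothed.columns) == ['x', 'y', 'ymin', 'ymax']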
def __init__(self):
    # generate artificial data
    np.random.seed(98765678)
    nobs = 200
    rvs = np.random.randn(nobs, 6)
    data_exog = rvs
    data_exog = sm.add_constant(data_exog)
    xbeta = 0.1 + 0.1 * rvs.sum(1)
    data_endog = np.random.poisson(np.exp(xbeta))

    # estimate discretemod.Poisson as benchmark
    self.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

    mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
    self.res_glm = mod_glm.fit()

    # estimate generic MLE
    self.mod = PoissonGMLE(data_endog, data_exog)
    self.res = self.mod.fit(start_params=0.9 * self.res_discrete.params,
                            method='nm', disp=0)
def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG):
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]
    index = sp.where(disp_conv)[0]

    lowerBound = sp.percentile(sp.unique(disp_raw[index]), 1)
    upperBound = sp.percentile(sp.unique(disp_raw[index]), 99)

    idx = sp.where((disp_raw > lowerBound) & (disp_raw < upperBound))[0]

    matrix = sp.ones((idx.shape[0], 2), dtype='float')
    matrix[:, 0] /= mean_count[idx].ravel()

    modGamma = sm.GLM(disp_raw[idx], matrix,
                      family=sm.families.Gamma(sm.families.links.identity()))
    res = modGamma.fit()
    Lambda = res.params

    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print("Found dispersion fit")

    if CFG['debug']:
        fig = plt.figure(figsize=(8, 6), dpi=100)
        ax = fig.add_subplot(111)
        idx = sp.where(~sp.isnan(disp_fitted))[0]
        ax.plot(sp.mean(sp.log10(counts + 1), axis=1)[idx],
                disp_fitted[idx], 'bo')
        ax.set_title('Fitted Dispersion Estimate')
        ax.set_xlabel('Mean expression count')
        ax.set_ylabel('Dispersion')
        plt.savefig('dispersion_fitted.pdf', format='pdf',
                    bbox_inches='tight')
        plt.close(fig)

    return (disp_fitted, Lambda, idx)
def evaluate_payoffs(self):
    self.intr = (np.maximum(self.model.spots - self.K, 0.0) if self.is_call
                 else np.maximum(self.K - self.model.spots, 0.0))
    for step in reversed(self.model.steps):
        curr_spots = self.model.spots[:, step]
        if step == self.model.steps[-1]:
            self.spv[:, step] = self.intr[:, step]
            self.epv[:, step] = self.intr[:, step]  # for plotting
            next_spv = self.spv[:, step]
        else:
            next_spv = self.spv[:, step + 1] * math.exp(
                -self.mkt_data.rr * self.model.t_diffs[step])
            # Longstaff-Schwartz-style step: regress the discounted
            # continuation values on a quartic polynomial basis in spot
            ols = sm.GLM(
                next_spv,
                sm.add_constant(
                    np.column_stack((curr_spots, np.square(curr_spots),
                                     np.power(curr_spots, 3),
                                     np.power(curr_spots, 4)))))
            res_ols = ols.fit()
            self.epv[:, step] = res_ols.fittedvalues
            # exercise where intrinsic value beats the continuation estimate
            exercise = self.is_amer and np.logical_and(
                self.intr[:, step] > self.epv[:, step],
                self.epv[:, step] > 0)
            self.spv[:, step] = np.where(exercise, self.intr[:, step],
                                         next_spv)
            self.exercise_times = np.where(exercise, step,
                                           self.exercise_times)
        if self.plot:
            self.plot_result(self.epv[:, step], curr_spots, next_spv,
                             "dummy title", self.path)
    self.payoff_evaluated = True
def link_functions_gaussian(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(
        zipfile.ZipFile(
            h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open(
                "prostate_complete.csv")).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian",
                        link="identity", alpha=[0.5], Lambda=[0], n_folds=0)
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(
                          sm.families.links.identity())).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = (h2o_model._model_json['output']['residual_deviance'] /
                    h2o_model._model_json['output']['null_deviance'])
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, \
        "expected h2o to have an equivalent or better deviance measure"
def run_GLM(X, y, family=None, link=None):
    '''Runs a Generalized linear model on the design matrix X given the
    target y. This function adds its own constant term to the design
    matrix.'''
    # assumes Binomial distribution with a logit link by default
    if link is None:
        link = sm.genmod.families.links.logit()
    if family is None:
        family = sm.families.Binomial(link=link)
    else:
        family = family(link=link)

    # make y a column vector
    if y.ndim == 1:
        y = y[:, np.newaxis]

    # make y a float
    if y.dtype == 'bool':
        y = y.astype('f8')

    # init yhat
    yhat = np.empty_like(y).ravel()
    yhat[:] = np.nan

    # get nans so we dont predict on nans
    idx = np.all(np.isfinite(X), axis=1)

    # add a constant term to the design matrix
    constant = np.ones([X.shape[0], 1])
    X = np.concatenate([constant, X], axis=1)

    # fit and predict
    glm_binom = sm.GLM(y, X, family=family, missing='drop')
    glm_result = glm_binom.fit()
    # history_names = glm_binom.exog_names[-2:]
    # res_c = glm_binom.fit_constrained(['{}<=0'.format(history_names[0]),
    #                                    '{}<=0'.format(history_names[1])])
    yhat[idx] = glm_result.predict(X[idx, :])
    return yhat, glm_result
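# Hypothetical usage of run_GLM on synthetic logistic data (a sketch, not
# from the source); assumes the module-level `import numpy as np` used above.
def _demo_run_GLM():
    rng = np.random.default_rng(1)
    X_demo = rng.normal(size=(500, 3))
    true_beta = np.array([1.0, -2.0, 0.5])
    p = 1.0 / (1.0 + np.exp(-X_demo @ true_beta))
    y_demo = rng.random(500) < p  # boolean responses; run_GLM casts to float
    yhat_demo, res_demo = run_GLM(X_demo, y_demo)
    print(res_demo.params)  # constant first, then the three slopes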
def test_logistic_model_without_regularization_no_rate(n=100):
    test_results = pd.DataFrame()
    weights, y, X = generate_data(n, loss='logistic', wt_param=1000,
                                  return_rate=False)

    # Statsmodels
    y = np.array(y).ravel()
    # weights = np.array(weights).ravel().astype('int')
    # y = np.multiply(y, weights).astype('int')
    # non_actions = np.subtract(weights, y).astype('int')
    # y_sm = np.array(zip(y, non_actions))
    glm = sm.GLM(y, X, family=sm.families.Binomial())
    res = glm.fit()
    test_results['sm_glm'] = res.params

    # Sklearn (Expects labels not probabilities)
    sk_glm = linear_model.LogisticRegression(
        fit_intercept=False)  # Since design matrix has added intercept
    sk_glm.fit(X, y)
    test_results['sklearn_glm'] = sk_glm.coef_.ravel().tolist()

    print(test_results)
def cces_glm(Y, X, W, summarize):
    logit = sm.GLM(Y, X, family=sm.families.Binomial(), freq_weights=W,
                   missing='drop')
    result = logit.fit()
    if summarize:
        print(result.summary())
    params = result.params
    conf_int = result.conf_int()
    conf_int['coef'] = params
    # report both the log-odds coefficients and the odds ratios
    result_df = pd.concat([conf_int, np.exp(conf_int)], axis=1)
    result_df.columns = ['coef_2.5%', 'coef_97.5%', 'coef',
                         'or_2.5%', 'or_97.5%', 'or']
    result_df['factor'] = result_df.index
    return result_df
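# Hypothetical call to cces_glm on simulated survey data (illustrative only);
# X must be a DataFrame so that result.conf_int() keeps named rows.
def _demo_cces_glm():
    rng = np.random.default_rng(2)
    X_demo = sm.add_constant(pd.DataFrame({'age': rng.normal(50, 10, 300)}))
    y_demo = (rng.random(300) < 0.5).astype(float)
    w_demo = rng.integers(1, 5, size=300).astype(float)
    table = cces_glm(y_demo, X_demo, w_demo, summarize=False)
    # one row per regressor: CI bounds and point estimates on both the
    # log-odds ('coef_*') and odds-ratio ('or_*') scales
    print(table)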
def setup_class(cls):
    # generate artificial data
    np.random.seed(98765678)
    nobs = 200
    rvs = np.random.randn(nobs, 6)
    data_exog = rvs
    data_exog = sm.add_constant(data_exog, prepend=False)
    xbeta = 0.1 + 0.1 * rvs.sum(1)
    data_endog = np.random.poisson(np.exp(xbeta))

    # estimate discretemod.Poisson as benchmark
    cls.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

    mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
    cls.res_glm = mod_glm.fit()

    # estimate generic MLE
    cls.mod = PoissonGMLE(data_endog, data_exog)
    cls.res = cls.mod.fit(start_params=0.9 * cls.res_discrete.params,
                          method='bfgs', disp=0)
def anova_ml(args):
    if args.acase == 1 and not args.rad:
        fname = "csv/rio_c1.csv"
    if args.acase == 2 and not args.rad:
        fname = "csv/rio_c2.csv"
    if args.acase == 1 and args.rad:
        fname = "csv/rad_c1.csv"
    if args.acase == 2 and args.rad:
        fname = "csv/rad_c2.csv"
    df = pd.read_csv(fname)
    df.dt = np.abs(df.dt)
    if args.acase == 1 and not args.rad:
        df = df[(df.dt > 20) & (df.sza < 140) & (df.sza > 60)]
    if args.acase == 2 and args.rad:
        df = df[(df.dt != 0) & (df.dt != 360)]
    df["cossza"] = df.sza.transform(lambda x: np.cos(np.deg2rad(x)))
    df["logfmax"] = df.fmax.transform(lambda x: np.log10(x))
    models = {}
    models["m1"] = sm.GLM(df.dt,
                          df[["cossza", "lat", "logfmax", "lt"]].values,
                          family=sm.families.NegativeBinomial())
    response = []
    for k in models.keys():
        res = models[k].fit()
        response.append(res)
        print(res.summary().as_text())
    return response[0]
def get_coefs(dim, dif_other, dur, ch, cond, t_RDK_dur, correct_only=True):
    """
    :param dim:
    :param dif_other:
    :param dur: [tr]
    :param ch: [tr, dim]
    :param cond: [tr, dim]
    :param t_RDK_dur:
    :param correct_only:
    :return: glmres.params, glmres.bse, glmres, glmmodel
    """
    id_dif = np.empty_like(cond)
    for dim1 in range(consts.N_DIM):
        out = np.unique(np.abs(cond[:, dim1]), return_inverse=True)
        _, id_dif[:, dim1] = out

    odim = consts.N_DIM - 1 - dim
    incl = ((t_RDK_dur == dur)
            & (np.isin(id_dif[:, odim], dif_other)))
    if correct_only:
        incl = (incl
                & (np.sign(ch[:, odim] - 0.5) == np.sign(cond[:, odim])))
    ch1 = ch[incl, dim]
    coh1 = cond[incl, dim]

    cohs, id_cohs = np.unique(coh1, return_inverse=True)
    if np.issubdtype(ch1.dtype, np.floating):
        # p_ch=1 is given
        ch11 = np.stack([npg.aggregate(id_cohs, ch1),
                         npg.aggregate(id_cohs, 1 - ch1)], -1)
    else:
        ch11 = npg.aggregate(np.vstack((id_cohs, 1 - ch1)), 1)

    glmmodel = sm.GLM(ch11, sm.add_constant(cohs),
                      family=sm.families.Binomial())
    glmres = glmmodel.fit()
    return glmres.params, glmres.bse, glmres, glmmodel
def test_on_clear_data(self):
    """
    When applied to non-noisy data, svinfer's LogisticRegression is
    expected to return the same results as a classic logistic regression.
    The benchmark results are from statsmodels.api.GLM.
    """
    # fit by svinfer
    svinfer_model = LogisticRegression(
        self.predictors_clear,
        self.response,
        [0] * len(self.predictors_clear)).fit(DataFrameProcessor(self.data))
    svinfer_beta = svinfer_model.beta
    svinfer_vcov = svinfer_model.beta_vcov

    # fit by statsmodels
    sm_model = sm.GLM(
        self.data[self.response].values,
        sm.add_constant(self.data[self.predictors_clear].values),
        family=sm.families.Binomial(),
    ).fit(cov_type="HC0")  # use basic sandwich
    sm_beta = sm_model.params
    sm_vcov = sm_model.cov_params()

    self.assertTrue(
        check_if_almost_equal(
            svinfer_beta,
            sm_beta,
            absolute_tolerance=1e-12,
            relative_tolerance=1e-12,
        ))
    self.assertTrue(
        check_if_almost_equal(
            svinfer_vcov,
            sm_vcov,
            absolute_tolerance=1e-12,
            relative_tolerance=1e-12,
        ))
def linear_model_with_interxns(trainDf, testDf, max_depth=2, printem=False):
    trainDf = trainDf.copy()
    testDf = testDf.copy()
    explanatories = [c for c in trainDf.columns if c[0] == 'x']
    trainDf = interact(trainDf, explanatories, max_depth)
    testDf = interact(testDf, explanatories, max_depth)
    trainDf["x0"] = 1
    testDf["x0"] = 1
    explanatories = [c for c in trainDf.columns if c[0] == 'x']
    if max_depth == 0:
        explanatories = ["x0"]
    linearModel = sm.GLM(trainDf["y"], trainDf[explanatories],
                         family=sm.families.Gaussian())
    linearModel = linearModel.fit()
    preds = np.array(linearModel.predict(testDf[explanatories]))
    acts = np.array(testDf["y"])
    if printem:
        display(preds)
        print("")
        display(acts)
        print("")
    errs = preds - acts
    MAE = sum(abs(errs)) / len(errs)
    RMSE = np.sqrt(sum(errs * errs) / len(errs))
    return MAE, RMSE
def make_model(draw=True):
    dict_data = get_data()
    y = dict_data['y_param']
    sparse_matrix, i = [], 1
    for name, value in dict_data.items():
        if name != 'y_param':
            print('x{} = {}'.format(i, name))
            sparse_matrix.append(value)
            i += 1
    ones = np.ones(len(sparse_matrix[0]))
    X = sm.add_constant(np.column_stack((sparse_matrix[0], ones)))
    for ele in sparse_matrix[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    glm_binom = sm.GLM(y, X)
    res = glm_binom.fit()
    # general info about Gaussian Model (Exp model)
    print(res.summary())
    # This is a general specification test
    # for additional non-linear effects in a model.
    # If the p-value of the f-test is below a threshold, e.g. 0.1, this
    # indicates that there might be additional non-linear effects in the
    # model and that the linear model is mis-specified.
    print(reset_ramsey(res, len(sparse_matrix)))
    check_sum_params(res.params)
    if draw:
        y_sum = sum(y)
        y = [float(item) / y_sum for item in y]
        yhat = res.mu
        make_observed_values(yhat, y)
        make_residual_dependence(yhat, res.resid_pearson)
        make_normalised_distribution(res.resid_deviance.copy())
def link_functions_binomial(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(
        path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(
        zipfile.ZipFile(
            h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open(
                "prostate_complete.csv")).values
    sm_data_response = sm_data[:, 2]
    sm_data_features = sm_data[:, [1, 3, 4, 5, 6, 7, 8, 9]]

    print("Testing for family: BINOMIAL")
    print("Set variables for h2o.")
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: LOGIT")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(),
                        family="binomial", link="logit", alpha=[0.5],
                        Lambda=[0])
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Binomial(
                          sm.families.links.logit())).fit()

    print("Compare model deviances for link function logit")
    h2o_deviance = (h2o_model._model_json['output']['residual_deviance'] /
                    h2o_model._model_json['output']['null_deviance'])
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, \
        "expected h2o to have an equivalent or better deviance measure"
def calculate_examplebehav(sigma=1):
    xc = np.random.uniform(0, 10, (n, 4))  # xc = x continuous
    beta0 = [1, 1, 1, 1]
    beta0 = mrc.normalize(beta0)
    I0 = np.dot(xc, beta0)
    y = 1.5 * (I0 - 8) + np.random.lognormal(0, sigma, n)
    y[y < 0] = 0
    y[y > 10] = 10
    # plt.plot(I0, y, 'o')
    tau_real = st.kendalltau(np.dot(xc, beta0), y)[0]

    x = xc.astype(int)
    # y = y.astype(int)
    # x[abs(x) < 2] = 0
    # x[abs(x) > 5] = 5
    tau0 = st.kendalltau(np.dot(x, beta0), y)[0]
    I00 = np.dot(x, beta0)
    # plt.plot(I00, y, 'o')

    beta_ini = np.random.uniform(-1, 1, dim)
    beta_ori, tauback, iterations = mrc.ori(y, x, 5, 100, beta_ini)
    Ix = np.dot(x, beta_ori)

    xx = sm.add_constant(x)
    Gaussian_model = sm.GLM(y, xx, family=sm.families.Gaussian())
    Gaussian_results = Gaussian_model.fit()
    beta_glm = mrc.normalize(Gaussian_results.params[1:])
    tauglm = st.kendalltau(np.dot(x, beta_glm), y)[0]

    ### STATISTICAL SIGNIFICANCE PART #####
    # for n=100, d=4, shape1=8.336884, shape2=51.11006.
    # Think if these estimates apply.
    shape1 = 8.336884
    shape2 = 51.11006
    pval = 1 - bt.cdf(tauback, shape1, shape2, loc=0, scale=1)

    cos_ori = np.dot(beta0, beta_ori)
    cos_glm = np.dot(beta0, beta_glm)
    deltacos = cos_ori - cos_glm
    # plt.plot(Ix, y, 'o')
    return (cos_ori, cos_glm, deltacos)
def fit(
    self,
    start_params=None,
    maxiter=100000,
    maxfun=5000,
    disp=False,
    method="bfgs",
    **kwds
):
    """
    Fit the model.

    Parameters
    ----------
    start_params : array-like
        A vector of starting values for the regression
        coefficients. If None, a default is chosen.
    maxiter : integer
        The maximum number of iterations.
    maxfun : integer
        The maximum number of function evaluations.
    disp : bool
        Show convergence stats.
    method : str
        The optimization method to use.
    """
    if start_params is None:
        start_params = (
            sm.GLM(self.endog, self.exog, family=Binomial())
            .fit(disp=False)
            .params
        )
        start_params = np.append(start_params, [0.5] * self.Z.shape[1])
    return super(Beta, self).fit(
        start_params=start_params,
        maxiter=maxiter,
        maxfun=maxfun,
        method=method,
        disp=disp,
        **kwds
    )
def logitrank(df):
    teams = list(sorted(df['H'].unique()))
    # +1 for the home team, -1 for the visiting team, 0 otherwise
    dummies = dict(
        (team, (df['H'] == team).astype(int) - (df['V'] == team).astype(int))
        for team in teams)
    df2 = pd.DataFrame(dummies)
    df2['pdiff'] = df['PTS/H'] - df['PTS/V']
    y, X = dmatrices('I(pdiff>0) ~ %s' % ' + '.join(teams[:-1]), df2)
    results = sm.GLM(y, X, family=sm.families.Binomial()).fit()
    strengths = {
        name: val
        for name, val in zip(teams, np.append(results.params[1:], 0))
    }
    return {
        key: i
        for i, (key, v) in enumerate(
            sorted(strengths.items(), key=lambda x: x[1]))
    }
def calculate_cauchy(sigma):
    x = np.random.uniform(0, 1, (n, dim))
    beta0 = [2, 1, -1, -1]
    beta0 = mrc.normalize(beta0)
    noise = cauchy.rvs(loc=0, scale=.1, size=n)
    I0 = np.dot(x, beta0)
    y = I0 + sigma * noise

    beta_ini = np.random.uniform(-1, 1, dim)
    beta_ori, tauori, iterations = mrc.ori(y, x, 2, 500, beta_ini)
    Ix = np.dot(x, beta_ori)
    beta_ori = mrc.normalize(beta_ori)

    xx = sm.add_constant(x)
    Gaussian_model = sm.GLM(y, xx, family=sm.families.Gaussian())
    Gaussian_results = Gaussian_model.fit()
    # print(Gaussian_results.summary())
    beta_glm = mrc.normalize(Gaussian_results.params[1:])

    cos_glm = np.abs(np.dot(beta0, beta_glm))
    cos_ori = np.abs(np.dot(beta0, beta_ori))
    deltacos = cos_ori - cos_glm
    return (cos_ori, cos_glm, deltacos)
def get_dataset(adjacency_pairs_list):
    normalise = False
    c_count, c_gender, c_plen, y = create_predictors(adjacency_pairs_list, 3,
                                                     normalise)
    # TODO compute these betas based on the training set only
    betas = {}
    for w in CATEGORIES:
        c_w = c_count[w]
        g_w = c_gender[w]
        y_w = y[w]
        if sum(y_w) > 0:
            c_w = np.array(c_w)
            g_w = np.array(g_w)
            X = np.array([np.ones(len(c_w)), c_w, g_w, c_w * g_w]).T
            y_w = np.array(y_w)
            res = sm.GLM(y_w, X, family=sm.families.Binomial()).fit()
            # print('params', w, res.params)
            betas[w] = res.params
    return c_count, c_gender, c_plen, y, betas
def hic_norm(mat, count_neig, fire):
    """Poisson normalization.

    Parameters
    ----------
    mat : DataFrame
        Pandas DataFrame consisting of genomic regions
    count_neig : str
        DataFrame column name where neighbor count is located
    fire : str
        DataFrame column name where fire score should be stored
    """
    y = mat[count_neig]
    x = mat[["F", "GC", "M"]]
    x = sm.add_constant(x)
    glm = sm.GLM(y, x, family=sm.families.Poisson())
    res = glm.fit()
    # observed count divided by the expected count under the fitted model
    mat[fire] = mat[count_neig] / np.exp(
        res.params[0] + mat["F"] * res.params[1] +
        mat["GC"] * res.params[2] + mat["M"] * res.params[3])
    logging.debug("Done calculating Poisson normalization.")
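# A toy sketch of hic_norm on fabricated data (column names F, GC, M follow
# the covariates hard-coded above; the values here are made up).
def _demo_hic_norm():
    rng = np.random.default_rng(3)
    mat = pd.DataFrame({
        'F': rng.random(200),
        'GC': rng.random(200),
        'M': rng.random(200),
        'neighbors': rng.poisson(10, size=200),
    })
    hic_norm(mat, 'neighbors', 'fire')
    print(mat['fire'].head())  # observed/expected ratio per region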
def fit(self, data):
    """
    fit log-linear model to predicted weights
    """
    if self.marginals.is_empty:
        self.calculate_marginals()
    self.actual_counts = group_counts(data, self.demos)
    expected_counts = _get_expected_values(self.marginals, self.demos,
                                           self.population)
    model_data = sm.add_constant(
        pd.get_dummies(pd.merge(self.actual_counts, expected_counts),
                       columns=self.demos),
        prepend=False)
    features = model_data.columns[2:]
    model = sm.GLM(model_data["N"], model_data[features],
                   family=sm.families.Poisson(),
                   offset=np.log(model_data["n"])).fit()
    self.model = model
    return self
def fit(self, X, y):
    """
    takes in training data and fits a model
    """
    self.classes_ = list(set(y))
    X = sm.add_constant(X)

    # returns statsmodel link and distribution functions based on user input
    link = self.get_link()
    family = self.get_family(link)

    # fits and stores statsmodel glm
    model = sm.GLM(y, X, family=family)
    self.fitted_model = model.fit()

    # adds attributes for explainability
    # intercept can't be a multidimensional np array like in classification,
    # as scoring_base.py func compute_lm_significant hstack method will fail
    self.coef_ = np.array(
        self.fitted_model.params[1:])  # removes first value, the intercept
    self.intercept_ = float(self.fitted_model.params[0])
def cal_cv2(data, filename):
    b = data.iloc[:, 1:].T.values
    c = b.copy()

    means = np.mean(c, axis=1)
    variance = np.var(c, axis=1)
    cv2 = variance / means**2

    minMeanForFit = np.quantile(means[np.where(cv2 > 0.5)], 0.95)
    useForFit = means >= minMeanForFit
    gamma_model = sm.GLM(
        cv2[useForFit],
        np.array([np.repeat(1, means[useForFit].shape[0]),
                  1 / means[useForFit]]).T,
        family=sm.families.Gamma(link=sm.genmod.families.links.identity()))
    gamma_results = gamma_model.fit()
    a0 = gamma_results.params[0]
    a1 = gamma_results.params[1]

    afit = a1 / means + a0
    varFitRatio = variance / (afit * (means**2))
    cv2_score = pd.DataFrame({'Feature': data.columns[1:],
                              'cv2_score': varFitRatio})
    cv2_score = cv2_score.sort_values('cv2_score', ascending=False)
    cv2_score.to_csv("{}_cv2.csv".format(filename), index=False)
    data_train = data.reindex(['Label'] + list(cv2_score['Feature']), axis=1)
    data_train.to_csv("{}_cv2_data.csv".format(filename), index=None)
    return data_train
def define_bin_params(Y, n, X=None):
    n_obs = len(Y)
    p = ncol(X)
    g = max(2, int(n_obs / 2))
    try:
        bin_mod = sm.GLM(endog=np.c_[Y, n], exog=X,
                         family=sm.families.Binomial()).fit()
        bin_params = bin_mod.params
        bin_cov = fill_diag((g / (1 + g)) * bin_mod.cov_params())
    except Exception:
        if np.sum(Y) > 0:
            binmean = np.sum(Y) / np.sum(n)
            binmean = np.log(binmean / (1 - binmean))  # logit of the mean
            bin_params = np.zeros(p)
            bin_params[0] = binmean
        else:
            bin_params = np.zeros(p)
            bin_params[0] = np.max([-3, -np.sum(n)])
        bin_cov = np.identity(p)
    return bin_params, bin_cov, p
def prostate():
    h2o_data = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    h2o_data.summary()

    sm_data = pd.read_csv(
        pyunit_utils.locate("smalldata/logreg/prostate.csv")).values
    sm_data_response = sm_data[:, 1]
    sm_data_features = sm_data[:, 2:]

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", nfolds=10,
                                            alpha=0.5)
    h2o_glm.train(x=list(range(2, h2o_data.ncol)), y=1,
                  training_frame=h2o_data)
    sm_glm = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                    family=sm.families.Binomial()).fit()

    print("statsmodels null deviance {0}".format(sm_glm.null_deviance))
    print("h2o null deviance {0}".format(h2o_glm.null_deviance()))
    assert abs(sm_glm.null_deviance - h2o_glm.null_deviance()) < 1e-5, \
        "Expected null deviances to be the same"
def _regress_out_chunk(data):
    # data is a tuple containing the selected columns from adata.X
    # and the regressors dataFrame
    data_chunk = data[0]
    regressors = data[1]
    variable_is_categorical = data[2]

    responses_chunk_list = []
    import statsmodels.api as sm
    from statsmodels.tools.sm_exceptions import PerfectSeparationError

    for col_index in range(data_chunk.shape[1]):
        # if all values are identical, statsmodels.api.GLM throws an error;
        # but then no regression is necessary anyways...
        if not (data_chunk[:, col_index] != data_chunk[0, col_index]).any():
            responses_chunk_list.append(data_chunk[:, col_index])
            continue

        if variable_is_categorical:
            regres = np.c_[np.ones(regressors.shape[0]),
                           regressors[:, col_index]]
        else:
            regres = regressors
        try:
            result = sm.GLM(data_chunk[:, col_index], regres,
                            family=sm.families.Gaussian()).fit()
            new_column = result.resid_response
        except PerfectSeparationError:
            # this emulates R's behavior
            logg.warning(
                'Encountered PerfectSeparationError, setting to 0 as in R.')
            new_column = np.zeros(data_chunk.shape[0])

        responses_chunk_list.append(new_column)

    return np.vstack(responses_chunk_list)
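# Minimal sketch exercising _regress_out_chunk outside scanpy (assumes
# numpy as np at module level; the categorical branch is not exercised).
def _demo_regress_out_chunk():
    rng = np.random.default_rng(4)
    covariates = np.c_[np.ones(100), rng.normal(size=100)]  # const + one covariate
    chunk = covariates @ rng.normal(size=(2, 3)) + rng.normal(size=(100, 3))
    residuals = _regress_out_chunk((chunk, covariates, False))
    print(residuals.shape)  # (3, 100): one residualized row per input column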
def looCV(clim, predic, fnc):
    """Calculate the leave-one-out cross-validation error.

    Parameters
    ----------
    clim : damage time series
        DESCRIPTION.
    predic : DataFrame
        DESCRIPTION.
    fnc : LinkFunctionObject
        link function

    Returns
    -------
    out-of-sample error
    """
    err = 0
    for lo_index in range(len(clim)):
        # mask out the left-out observation and refit on the rest
        clim_mask = np.ma.array(clim, mask=False)
        clim_mask.mask[lo_index] = True
        clim_lo = clim_mask.compressed()
        predic_lo = (predic.reset_index()
                     .drop(lo_index, axis=0)
                     .drop('index', axis=1))
        model_res = sm.GLM(clim_lo, predic_lo,
                           family=sm.families.Gamma(fnc)).fit(maxiter=5000,
                                                              scale=1.)
        value_pred = model_res.predict(predic).iloc[lo_index]
        err = err + (clim[lo_index] - value_pred)**2
    return err / len(clim)
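# Hypothetical use of looCV on simulated gamma-distributed damages (a sketch;
# the log link is an assumption, any statsmodels link instance would do).
def _demo_looCV():
    rng = np.random.default_rng(5)
    predic = pd.DataFrame({'const': np.ones(30), 't2m': rng.normal(size=30)})
    mu = np.exp(0.5 + 0.3 * predic['t2m'].values)
    clim = mu * rng.gamma(shape=5.0, scale=0.2, size=30)  # mean mu, positive
    print(looCV(clim, predic, sm.families.links.log()))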
def fit_beta_reg(y, X, df, group_title):
    curr_best_fit = 0
    curr_best_model = None
    for i in tqdm(range(10)):
        X_sample = df.groupby(group_title).apply(
            lambda temp: temp.sample(int(HELD_OUT_PROP * len(temp))))
        train_idx = pd.Series(X_sample.index.get_level_values(1))
        test_idx = df.index.difference(train_idx).tolist()
        np.random.shuffle(test_idx)
        train_idx = train_idx.tolist()
        # print(train_idx)
        # print(len(train_idx))
        # print(len(test_idx))
        # print(len(set(train_idx).union(set(test_idx))))
        # print(X.shape)
        # print(df.shape)
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]
        binom_glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())
        binom_fit_model = binom_glm.fit()
        fit_val = LexicalAnalysis.goodness_of_fit(binom_fit_model, y_test,
                                                  X_test)
        if fit_val > curr_best_fit:
            print("NEW BEST MODEL")
            print(f"R^2 score of: {fit_val}")
            curr_best_fit = fit_val
            curr_best_model = binom_fit_model
    return curr_best_model