def test_framing_example():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income",
                                 data, return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog,
                           family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income",
                                  data, return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    outcome_fit_kwargs={'atol': 1e-11})

    np.random.seed(4231)
    para_rslt = med.fit(method='parametric', n_rep=100)
    diff = np.asarray(para_rslt.summary() - framing_para_4231)
    assert_allclose(diff, 0, atol=1e-6)

    np.random.seed(4231)
    boot_rslt = med.fit(method='boot', n_rep=100)
    diff = np.asarray(boot_rslt.summary() - framing_boot_4231)
    assert_allclose(diff, 0, atol=1e-6)
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1e3,
             return_model=False):
    """LASSO polynomial fit with cross-validation.

    Regularized polynomial regression (by penalized least-squares) from a
    range of degrees up to n = max_deg. The LASSO regression minimises MSE
    and penalizes the size of the parameter vector using the L1-norm, which
    leads to fewer coefficients in the fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts the fitted model on given values 'x_pred' (default: use 'x').
    - Supports NaNs.
    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')       # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)')  # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:, 1:max_deg+1], y_)
    y_pred = lasso.predict(X[:, 1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
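# Hedged usage sketch for lasso_cv above (not from the source): synthetic
# data only, and it assumes numpy/patsy/sklearn are imported as the function
# expects, plus an older scikit-learn in which LassoCV still accepts
# normalize=True.
x = np.linspace(0., 10., 50)
y = 0.5 * x - 0.02 * x**2 + np.random.normal(scale=0.2, size=x.size)
y[5] = np.nan                       # NaNs are dropped inside lasso_cv
y_fit = lasso_cv(x, y, max_deg=3, cv=5)
y_new, model = lasso_cv(x, y, x_pred=np.linspace(0., 12., 25),
                        return_model=True)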
def plot_results(band, yatsm_config, yatsm_model, plot_type='TS'):
    step = -1 if yatsm_config['reverse'] else 1
    design = re.sub(r'[\+\-][\ ]+C\(.*\)', '', yatsm_config['design_matrix'])

    for i, r in enumerate(yatsm_model.record):
        label = 'Model {i}'.format(i=i)

        if plot_type == 'TS':
            mx = np.arange(r['start'], r['end'], step)
            mX = patsy.dmatrix(design, {'x': mx}).T
            my = np.dot(r['coef'][:, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(int(_x))
                                for _x in mx])
        elif plot_type == 'DOY':
            yr_end = dt.datetime.fromordinal(r['end']).year
            yr_start = dt.datetime.fromordinal(r['start']).year
            yr_mid = int(yr_end - (yr_end - yr_start) / 2)

            mx = np.arange(dt.date(yr_mid, 1, 1).toordinal(),
                           dt.date(yr_mid + 1, 1, 1).toordinal(), 1)
            mX = patsy.dmatrix(design, {'x': mx}).T
            my = np.dot(r['coef'][:, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(d).timetuple().tm_yday
                                for d in mx])
            label = 'Model {i} - {yr}'.format(i=i, yr=yr_mid)

        plt.plot(mx_date, my, lw=2, label=label)

    plt.legend()
def design_formula(train_metadata, test_metadata, formula):
    """ Generate and align two design matrices.

    Parameters
    ----------
    train_metadata : pd.DataFrame
        Training metadata
    test_metadata : pd.DataFrame
        Testing metadata
    formula : str
        Statistical formula specifying design matrix

    Returns
    -------
    train_design : pd.DataFrame
        Train design matrix
    test_design : pd.DataFrame
        Test design matrix
    """
    train_design = dmatrix(formula, train_metadata, return_type='dataframe')
    test_design = dmatrix(formula, test_metadata, return_type='dataframe')

    # pad extra columns with zeros, so that we can still make predictions
    extra_columns = list(set(train_design.columns) - set(test_design.columns))
    df = pd.DataFrame({C: np.zeros(test_design.shape[0])
                       for C in extra_columns},
                      index=test_design.index)
    test_design = pd.concat((test_design, df), axis=1)
    test_design = test_design.reindex(columns=train_design.columns)
    return train_design, test_design
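# Hedged usage sketch for design_formula (illustrative data): a categorical
# level seen only in training is zero-padded in the test design, and the
# test columns are reordered to match training.
train_md = pd.DataFrame({'group': ['a', 'b', 'c', 'a']})
test_md = pd.DataFrame({'group': ['a', 'b']})    # level 'c' is unseen
train_X, test_X = design_formula(train_md, test_md, 'C(group)')
assert list(train_X.columns) == list(test_X.columns)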
def test_framing_example_moderator():
    # moderation without formulas, generally not useful but test anyway
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join(cur_dir, 'results', "framing.csv"))

    outcome = np.asarray(data["cong_mesg"])
    outcome_exog = patsy.dmatrix("emo + treat + age + educ + gender + income",
                                 data, return_type='dataframe')
    probit = sm.families.links.probit
    outcome_model = sm.GLM(outcome, outcome_exog,
                           family=sm.families.Binomial(link=probit()))

    mediator = np.asarray(data["emo"])
    mediator_exog = patsy.dmatrix("treat + age + educ + gender + income",
                                  data, return_type='dataframe')
    mediator_model = sm.OLS(mediator, mediator_exog)

    tx_pos = [outcome_exog.columns.tolist().index("treat"),
              mediator_exog.columns.tolist().index("treat")]
    med_pos = outcome_exog.columns.tolist().index("emo")

    ix = (outcome_exog.columns.tolist().index("age"),
          mediator_exog.columns.tolist().index("age"))
    moderators = {ix: 20}
    med = Mediation(outcome_model, mediator_model, tx_pos, med_pos,
                    moderators=moderators)

    # Just a smoke test
    np.random.seed(4231)
    med_rslt = med.fit(method='parametric', n_rep=100)
def lr_tests(ds, alt_model, null_model='~ 1', gene_column='Gene',
             batch_size=2000, transformation=np.log1p, rcond=-1):
    '''
    Compare alt_model and null_model by a likelihood ratio test for
    every gene in ds.

    Args:
        ds (LoomConnection):       Dataset
        alt_model (str):           Formula describing alternative model
        null_model (str):          Formula describing null model
        gene_column (str):         Name of the gene labels to use in the ds (default "Gene")
        batch_size (int):          The number of genes to read from disk in each iteration (default 2000)
        transformation (function): Transformation to apply to expression values before fitting (default np.log1p)
        rcond (float):             Conditioning for the least square fitting (default -1, which has no effect)

    Returns:
        results (DataFrame):       Dataframe with model parameter estimates for each gene, with P values from LRT.
    '''
    sample_info = pd.DataFrame()
    for k in ds.ca.keys():
        sample_info[k] = ds.ca[k]

    alt_design = patsy.dmatrix(alt_model, sample_info, return_type='dataframe')
    null_design = patsy.dmatrix(null_model, sample_info, return_type='dataframe')

    n = ds.shape[1]
    genes = []
    betas = []
    pvals = []
    total_batches = np.ceil(ds.shape[0] / batch_size).astype(int)
    for (ix, selection, vals) in tqdm(ds.scan(axis=0, batch_size=batch_size),
                                      total=total_batches):
        expression_matrix = transformation(vals[:, :])

        beta_alt, res_alt, rank_alt, s_alt = \
            np.linalg.lstsq(alt_design, expression_matrix.T, rcond=rcond)
        beta_null, res_null, rank_null, s_null = \
            np.linalg.lstsq(null_design, expression_matrix.T, rcond=rcond)

        genes.append(vals.ra[gene_column])

        # Gaussian log-likelihoods from the residual sums of squares
        ll_alt = -n / 2. * np.log(2 * np.pi) - n / 2. * np.ma.log(res_alt / n) - n / 2.
        ll_null = -n / 2. * np.log(2 * np.pi) - n / 2. * np.ma.log(res_null / n) - n / 2.
        llr = ll_alt - ll_null
        pval = stats.chi2.sf(2 * llr, df=(beta_alt.shape[0] - beta_null.shape[0]))
        pval = np.ma.MaskedArray(pval, mask=llr.mask).filled(1.)

        betas.append(beta_alt)
        pvals.append(pval)

    results = pd.DataFrame({gene_column: np.hstack(genes)})
    for name, beta in zip(alt_design.columns, np.hstack(betas)):
        results[name] = beta

    results['pval'] = np.hstack(pvals)
    min_pval = results.pval[results.pval != 0].min()
    # Series.clip_lower was removed from pandas; clip(lower=...) is equivalent
    results['pval'] = results.pval.clip(lower=min_pval)
    return results
def test_patsy_577():
    X = np.random.random((10, 2))
    df = pandas.DataFrame(X, columns=["var1", "var2"])
    from patsy import dmatrix
    endog = dmatrix("var1 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, None))
    exog = dmatrix("var2 - 1", df)
    np.testing.assert_(data._is_using_patsy(endog, exog))
def plot_results(band, cfg, model, design_info, plot_type='TS'):
    """ Plot model results

    Args:
        band (int): plot results for this band
        cfg (dict): YATSM configuration dictionary
        model (YATSM model): fitted YATSM timeseries model
        design_info (patsy.DesignInfo): patsy design information
        plot_type (str): type of plot to add results to (TS, DOY, or VAL)
    """
    # Handle reverse
    step = -1 if cfg['YATSM']['reverse'] else 1

    # Remove categorical info from predictions
    design = re.sub(r'[\+\-][\ ]+C\(.*\)', '', cfg['YATSM']['design_matrix'])

    i_coef = []
    for k, v in design_info.column_name_indexes.items():
        if not re.match(r'C\(.*\)', k):
            i_coef.append(v)
    i_coef = np.asarray(i_coef)

    for i, r in enumerate(model.record):
        label = 'Model {i}'.format(i=i)
        if plot_type == 'TS':
            # Prediction
            mx = np.arange(r['start'], r['end'], step)
            mX = patsy.dmatrix(design, {'x': mx}).T
            my = np.dot(r['coef'][i_coef, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(int(_x))
                                for _x in mx])
            # Break
            if r['break']:
                bx = dt.datetime.fromordinal(r['break'])
                plt.axvline(bx, c='red', lw=2)
        elif plot_type in ('DOY', 'VAL'):
            yr_end = dt.datetime.fromordinal(r['end']).year
            yr_start = dt.datetime.fromordinal(r['start']).year
            yr_mid = int(yr_end - (yr_end - yr_start) / 2)

            mx = np.arange(dt.date(yr_mid, 1, 1).toordinal(),
                           dt.date(yr_mid + 1, 1, 1).toordinal(), 1)
            mX = patsy.dmatrix(design, {'x': mx}).T
            my = np.dot(r['coef'][i_coef, band], mX)
            mx_date = np.array([dt.datetime.fromordinal(d).timetuple().tm_yday
                                for d in mx])
            label = 'Model {i} - {yr}'.format(i=i, yr=yr_mid)

        plt.plot(mx_date, my, lw=2, label=label)

    leg = plt.legend()
    leg.draggable(state=True)
def covariance(self, time, scale_params, smooth_params, scale_data,
               smooth_data):
    """
    Returns a Gaussian process covariance matrix.

    Parameters
    ----------
    time : array-like
        The time points at which the fitted covariance matrix is
        calculated.
    scale_params : array-like
        The regression parameters for the scaling part of the covariance
        structure.
    smooth_params : array-like
        The regression parameters for the smoothing part of the covariance
        structure.
    scale_data : Dataframe
        The data used to determine the scale parameter, must have
        len(time) rows.
    smooth_data : Dataframe
        The data used to determine the smoothness parameter, must have
        len(time) rows.

    Returns
    -------
    A covariance matrix.

    Notes
    -----
    If the model was fit using formulas, `scale_data` and `smooth_data`
    should be Dataframes containing all variables that were present in the
    respective scaling and smoothing formulas used to fit the model.
    Otherwise, `scale_data` and `smooth_data` should contain data arrays
    whose columns align with the fitted scaling and smoothing parameters.

    The covariance is only for the Gaussian process and does not include
    the white noise variance.
    """
    if not hasattr(self.data, "scale_design_info"):
        sca = np.dot(scale_data, scale_params)
        smo = np.dot(smooth_data, smooth_params)
    else:
        sc = patsy.dmatrix(self.data.scale_design_info, scale_data)
        sm = patsy.dmatrix(self.data.smooth_design_info, smooth_data)
        sca = np.exp(np.dot(sc, scale_params))
        smo = np.exp(np.dot(sm, smooth_params))

    return self.cov.get_cov(time, sca, smo)
def transform(self, data):
    """Transform the data using the stored formula design.

    Builds the design matrix for `data` from the design information
    captured at fit time.

    Parameters
    ----------
    data : dict-like (pandas dataframe)
        Input data. Column names need to match variables in formula.
    """
    if self.return_type == 'dataframe':
        return dmatrix(self.design_, data, return_type='dataframe')
    else:
        return np.array(dmatrix(self.design_, data))
def regress_out(sample_info, expression_matrix, covariate_formula,
                design_formula='1'):
    ''' Implementation of limma's removeBatchEffect function '''
    # Ensure intercept is not part of covariates
    covariate_formula += ' - 1'

    covariate_matrix = patsy.dmatrix(covariate_formula, sample_info)
    design_matrix = patsy.dmatrix(design_formula, sample_info)

    design_batch = np.hstack((design_matrix, covariate_matrix))

    coefficients, res, rank, s = np.linalg.lstsq(design_batch,
                                                 expression_matrix.T,
                                                 rcond=None)
    # Keep the block of coefficients belonging to the covariates. The
    # original `coefficients[-design_matrix.shape[1]][:, None]` picked a
    # single row and only worked with one covariate column.
    beta = coefficients[design_matrix.shape[1]:]
    regressed = expression_matrix - covariate_matrix.dot(beta).T
    return regressed
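# Hedged usage sketch for regress_out (made-up data): remove a batch effect
# from a genes-by-samples matrix while protecting the condition term.
sample_info = pd.DataFrame({'batch': ['b1', 'b1', 'b2', 'b2'],
                            'condition': ['ctl', 'trt', 'ctl', 'trt']})
expression = np.random.rand(100, 4)   # 100 genes x 4 samples
corrected = regress_out(sample_info, expression, 'C(batch)',
                        design_formula='C(condition)')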
def fitMinSpline(self, Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
    ''' Fit/interpolate a minimal spline through the data '''
    # use patsy to define a natural cubic spline basis matrix
    X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
    # redefine dataframe
    modDat = pd.DataFrame(X, index=Yvar.index)
    # redefine our data into X1-X7
    modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
    # use the passed smoothingWindow (the original mixed it with
    # self._smoothingWindow); integer division keeps slice indices ints
    half = smoothingWindow // 2
    modDatTrunc = modDat.iloc[half:-half].copy()
    window = np.ones(smoothingWindow) / float(smoothingWindow)
    modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
    mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
    res = mod.fit(q=0.01)
    preds = pd.Series(res.predict(modDat), index=Xvar.index)
    if plot:
        plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
        print(plotDF.columns)
        plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
        p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() + \
            geom_line(aes(y='fitted'), color='red') + \
            ylim(0, 5) + \
            xlab('') + ylab('Sensor (V)')
        print(p)
    # return regression predictors
    return preds
def test_predict_formula(self):
    n = 100
    np.random.seed(34234)
    time = 50 * np.random.uniform(size=n)
    status = np.random.randint(0, 2, n).astype(np.float64)
    exog = np.random.uniform(1, 2, size=(n, 2))

    df = pd.DataFrame({"time": time, "status": status,
                       "exog1": exog[:, 0], "exog2": exog[:, 1]})

    fml = "time ~ 0 + exog1 + np.log(exog2) + exog1*exog2"
    model1 = PHReg.from_formula(fml, df, status=status)
    result1 = model1.fit()

    from patsy import dmatrix
    # design_info can be passed to dmatrix directly since patsy 0.5;
    # the old `.design_info.builder` attribute is gone
    dfp = dmatrix(model1.data.design_info, df)

    pr1 = result1.predict()
    pr2 = result1.predict(exog=df)
    pr3 = model1.predict(result1.params, exog=dfp)  # no standard errors
    pr4 = model1.predict(result1.params, cov_params=result1.cov_params(),
                         exog=dfp)

    prl = (pr1, pr2, pr3, pr4)
    for i in range(4):
        for j in range(i):
            assert_allclose(prl[i].predicted_values, prl[j].predicted_values)

    prl = (pr1, pr2, pr4)
    for i in range(3):
        for j in range(i):
            assert_allclose(prl[i].standard_errors, prl[j].standard_errors)
def xtab(formula, covariate_df):
    y, X = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j)
                       for j in range(2)]
    except Exception:
        return None  # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)

    if len(alts) < 2 or '0_alts' not in alts:
        tbl_dom = None
    else:
        # .loc replaces the removed DataFrame.ix indexer
        tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                                'n_alts': tbl.loc[list(alts - set(['0_alts'])), :].sum()}).T

    # can't test recessive without any homoz alts.
    if '2_alts' not in alts or len(alts) < 2:
        tbl_rec = None
    else:
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.loc[['0_alts', '1_alts'], :].sum(),
                                '2_alts': tbl.loc['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom),
                       ('recessive', tbl_rec)):
        if xtbl is None:
            d['p.chi.%s' % name] = 'nan'
            continue
        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
def test_mnl_estimation(obs, alts):
    """
    Confirm that estimated params from the new interface match
    urbansim.urbanchoice. Only runs if the urbansim package has been
    installed.
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_estimate
    except ImportError:
        print("Comparison of MNL estimation results skipped because "
              "urbansim is not installed")
        return

    model_expression = 'obsval + altval - 1'
    mct = MergedChoiceTable(obs, alts, 'choice')

    # new interface
    m = MultinomialLogit(mct, model_expression)
    r = m.fit().get_raw_results()

    # old interface
    dm = dmatrix(model_expression, mct.to_frame())
    chosen = np.reshape(mct.to_frame()[mct.choice_col].values, (100, 5))
    log_lik, fit = mnl_estimate(np.array(dm), chosen, numalts=5)

    for k, v in log_lik.items():
        assert(v == pytest.approx(r['log_likelihood'][k], 0.00001))

    assert_frame_equal(fit, r['fit_parameters'][['Coefficient', 'Std. Error',
                                                 'T-Score']])
def main():
    train_df_filled = fill_null_vals(train_df, 'Fare')
    train_df_filled = fill_null_vals(train_df_filled, 'Age')
    assert len(train_df_filled) == len(train_df)

    test_df_filled = fill_null_vals(test_df, 'Fare')
    test_df_filled = fill_null_vals(test_df_filled, 'Age')
    assert len(test_df_filled) == len(test_df)

    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled,
                                     return_type='dataframe')
        print("Running DecisionTreeClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = tree.DecisionTreeClassifier(criterion='entropy',
                                            max_depth=3, min_samples_leaf=5)
        print("About to fit...")
        dt_model = model.fit(X_train, y_train)
        print("Training score:%s" % dt_model.score(X_train, y_train))

        X_test = dmatrix(formula, test_df_filled)
        predicted = dt_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)

        pred_results = pd.Series(predicted, name='Survived')
        dt_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        dt_results.Survived = dt_results.Survived.astype(int)
        results_file = 'csv/dt_%s.csv' % formula_name
        print("output file: %s\n" % results_file)
        #results_file = re.sub('[+ ()C]','',results_file)
        dt_results.to_csv(results_file, index=False)
def _temp_plot(dates, Y_seg_mean, Y_seg_std, Y_seg_stderr, Y_seg_mask,
               seg_id, plot_idx, results=None):
    import matplotlib.pyplot as plt

    seg_id -= 1
    plot_idx = 5

    plt.subplot(3, 1, 1)
    plt.plot(dates[Y_seg_mask[seg_id, :]],
             Y_seg_mean[seg_id, plot_idx, Y_seg_mask[seg_id, :]], 'ro')
    plt.ylabel('Mean idx {i}'.format(i=plot_idx))

    plt.subplot(3, 1, 2)
    plt.plot(dates[Y_seg_mask[seg_id, :]],
             Y_seg_std[seg_id, plot_idx, Y_seg_mask[seg_id, :]], 'ro')
    plt.ylabel('Std idx {i}'.format(i=plot_idx))

    plt.subplot(3, 1, 3)
    plt.errorbar(dates[Y_seg_mask[seg_id, :]],
                 Y_seg_mean[seg_id, plot_idx, Y_seg_mask[seg_id, :]],
                 yerr=Y_seg_stderr[seg_id, plot_idx, Y_seg_mask[seg_id, :]],
                 fmt='o')
    plt.ylabel('Mean/stderr idx {i}'.format(i=plot_idx))

    if results is not None:
        for i, r in enumerate(results.record):
            mx = np.arange(r['start'], r['end'], 1)
            # transpose so the coefficients align with the design columns
            # (a leftover Pdb().set_trace() debugging stop was removed here)
            mX = patsy.dmatrix(results.design_info, {'x': mx}).T
            my = np.dot(r['coef'][:, plot_idx], mX)

    plt.show()
def fitMinSpline(Yvar, Xvar, smoothingWindow, plot=False, plotVar=None):
    '''
    Function returns a minimal interpolation spline

    Inputs:
        Yvar : dependent variable to be fit
        Xvar : independent variable to be fit
        smoothingWindow : the smoothing time average
        plot : boolean value to plot or not, default is not to plot
        plotVar : plot a specific variable, default None
    '''
    X = np.asarray(patsy.dmatrix("cr(x, df=7)-1", {"x": Xvar}))
    modDat = pd.DataFrame(X, index=Yvar.index)
    modDat.columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7']
    # integer division keeps the slice indices ints under Python 3
    half = smoothingWindow // 2
    modDatTrunc = modDat.iloc[half:-half].copy()
    window = np.ones(smoothingWindow) / float(smoothingWindow)
    modDatTrunc['Y'] = np.convolve(Yvar, window, 'same')[half:-half]
    mod = smf.quantreg('Y~X1+X2+X3+X4+X5', modDatTrunc)
    res = mod.fit(q=0.01)
    preds = pd.Series(res.predict(modDat), index=Xvar.index)
    if plot:
        plotDF = pd.concat([plotVar, Yvar, preds], axis=1)
        print(plotDF.columns)
        plotDF.columns = [plotVar.name, Yvar.name, 'fitted']
        p = ggplot(aes(x=plotVar.name, y=Yvar.name), data=plotDF) + geom_line() + \
            geom_line(aes(y='fitted'), color='red') + \
            ylim(0, 5) + \
            xlab('') + ylab('Sensor (V)')
        print(p)
    return preds
def setup_class(cls):
    sp = np.array([40491.3940640059, 232455.530262537])
    # s_scale is same as before
    cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

    cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)

    x_spline = df_autos[['weight', 'hp']].values
    bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                  variable_names=['weight', 'hp'],
                  constraints='center',
                  include_intercept=True)
    alpha0 = 1 / s_scale * sp / 2
    gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', df_autos,
                                 smoother=bs, family=family.Poisson(),
                                 alpha=alpha0)

    cls.res1a = gam_bs.fit(use_t=False)
    cls.res1b = gam_bs.fit(method='newton', use_t=True)

    cls.res1 = cls.res1a._results
    cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

    cls.rtol_fitted = 1e-8
    cls.covp_corrfact = 1  # not needed
def hsalinarum_replicate_data():
    import patsy

    # sheetname= was removed from pandas; sheet_name= is the current keyword
    data = pd.read_excel("data/hsalinarum/Raw_growth_data2.xlsx",
                         sheet_name='Raw data (OD600)SLIM')

    # time = np.arange(4,48,.5)
    time = np.arange(4, 48, 4)

    # temp = data[(data.Condition.isnull()) & ((data.Strain == 'ura3') | (data.Strain=='trmB') | (data.Strain == 'rosR'))]
    # temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR'))]
    # temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR') | (data.Strain == 'trh2'))]
    temp = data[(data.Condition.isnull()) & (data.Strain == 'ura3')]
    # temp = data[(data.Condition.isnull()) & (data.Strain!='ura3')]
    temp.Condition[temp.Condition.isnull()] = ''

    y = temp[time].T.values
    y = np.log2(y)
    y = y - y[0, :]

    x = time
    x = (x - x.mean()) / x.std()
    x = x[:, None]

    effect = patsy.dmatrix('C(Experiment):C(Well)+0', temp)
    effect = np.where(effect != 0)[1][:, None]

    return x, y, effect
def test_mnl_prediction(obs, alts):
    """
    Confirm that fitted probabilities in the new codebase match
    urbansim.urbanchoice. Only runs if the urbansim package has been
    installed.
    """
    try:
        from urbansim.urbanchoice.mnl import mnl_simulate
    except ImportError:
        print("Comparison of MNL simulation results skipped because "
              "urbansim is not installed")
        return

    # produce a fitted model
    mct = MergedChoiceTable(obs, alts, 'choice', 5)
    m = MultinomialLogit(mct, model_expression='obsval + altval - 1')
    results = m.fit()

    # get predicted probabilities using choicemodels
    probs1 = results.probabilities(mct)

    # compare to probabilities from urbansim.urbanchoice
    dm = dmatrix(results.model_expression, data=mct.to_frame(),
                 return_type='dataframe')

    probs = mnl_simulate(data=dm, coeff=results.fitted_parameters,
                         numalts=mct.sample_size, returnprobs=True)

    df = mct.to_frame()
    df['prob'] = probs.flatten()
    probs2 = df.prob

    pd.testing.assert_series_equal(probs1, probs2)
def setup_class(cls):
    sp = np.array([0.830689464223685, 425.361212061649])
    cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

    x_spline = df_autos[['weight', 'hp']].values
    # We need asarray to remove the design_info.
    # If design_info is attached, then exog_linear will also be
    # transformed in predict.
    cls.exog = np.asarray(patsy.dmatrix('fuel + drive', data=df_autos))
    bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                  variable_names=['weight', 'hp'],
                  constraints='center',
                  include_intercept=True)
    # TODO: alpha needs to be a list
    alpha0 = 1 / s_scale * sp / 2
    gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                    alpha=(alpha0).tolist())

    cls.res1a = gam_bs.fit(use_t=True)
    cls.res1b = gam_bs.fit(method='newton', use_t=True)

    cls.res1 = cls.res1a._results
    cls.res2 = results_mpg_bs.mpg_bs

    cls.rtol_fitted = 1e-8
    cls.covp_corrfact = 1  # not needed

    # for checking that alpha model attribute is unchanged, same as alpha0
    cls.alpha = [169947.78222669504, 26767.58046340008]
def main():
    train_df_filled = fill_null_vals(train_df, 'Fare')
    train_df_filled = fill_null_vals(train_df_filled, 'Age')
    assert len(train_df_filled) == len(train_df)

    test_df_filled = fill_null_vals(test_df, 'Fare')
    test_df_filled = fill_null_vals(test_df_filled, 'Age')
    assert len(test_df_filled) == len(test_df)

    num_estimators = 10000

    for formula_name, formula in formula_map.items():
        print("name=%s formula=%s" % (formula_name, formula))

        y_train, X_train = dmatrices('Survived ~ ' + formula,
                                     train_df_filled,
                                     return_type='dataframe')
        print("Running RandomForestClassifier with formula : %s" % formula)
        print("X_train cols=%s " % X_train.columns)
        y_train = np.ravel(y_train)
        model = RandomForestClassifier(n_estimators=num_estimators,
                                       random_state=0)
        print("About to fit...")
        rf_model = model.fit(X_train, y_train)
        print("Training score:%s" % rf_model.score(X_train, y_train))

        X_test = dmatrix(formula, test_df_filled)
        predicted = rf_model.predict(X_test)
        print("predicted:%s" % predicted[:5])
        assert len(predicted) == len(test_df)

        pred_results = pd.Series(predicted, name='Survived')
        rf_results = pd.concat([test_df['PassengerId'], pred_results], axis=1)
        rf_results.Survived = rf_results.Survived.astype(int)
        results_file = 'csv/rf_%s_n_est_%s.csv' % (formula_name,
                                                   num_estimators)
        print("output file: %s\n" % results_file)
        #results_file = re.sub('[+ ()C]','',results_file)
        rf_results.to_csv(results_file, index=False)
def hsalinarum_data():
    import patsy

    # sheetname= was removed from pandas; sheet_name= is the current keyword
    data = pd.read_excel("data/hsalinarum/Raw_growth_data2.xlsx",
                         sheet_name='Raw data (OD600)SLIM')

    # time = np.arange(4,48,.5)
    time = np.arange(4, 48, 2)

    # temp = data[(data.Condition.isnull()) & ((data.Strain == 'ura3') | (data.Strain=='trmB') | (data.Strain == 'rosR'))]
    # temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR'))]
    # temp = data[(data.Condition.isnull()) & ((data.Strain=='trmB') | (data.Strain == 'rosR') | (data.Strain == 'trh2'))]
    temp = data[(data.Condition.isnull()) & ((data.Strain == 'trmB') |
                                             (data.Strain == 'rosR') |
                                             (data.Strain == 'trh2') |
                                             (data.Strain == 'idr1'))]
    # temp = data[(data.Condition.isnull()) & (data.Strain!='ura3')]

    y = temp[time].T.values
    y = np.log2(y)
    y = y - y[0, :]

    x = time
    x = (x - x.mean()) / x.std()
    x = x[:, None]

    effect = patsy.dmatrix('C(Strain)+0', temp)
    effect = np.where(effect != 0)[1][:, None]
    # effect = (temp.Strain != "ura3").astype(int).values[:,None]

    return x, y, effect
def test_harmonic_transform():
    x = np.arange(735688, 735688 + 100, 1)
    design = patsy.dmatrix("0 + harm(x, 1)")

    truth = np.vstack((np.cos(2 * np.pi / 365.25 * x),
                       np.sin(2 * np.pi / 365.25 * x))).T
    np.testing.assert_equal(np.asarray(design), truth)
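# The test above relies on a custom 'harm' transform that patsy resolves
# from the calling environment. A minimal stand-in consistent with the
# asserted truth array might look like this (a sketch, not the project's
# actual implementation):
def harm(x, n):
    # n-th order harmonic pair on an annual (365.25-day) cycle
    w = 2 * np.pi * n / 365.25
    return np.column_stack((np.cos(w * np.asarray(x)),
                            np.sin(w * np.asarray(x))))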
def create_node(self, name, kwargs, data):
    reg = kwargs['regressor']
    # order parents according to user-supplied args
    args = []
    for arg in reg['params']:
        for parent_name, parent in kwargs['parents'].items():
            if parent_name == arg:
                args.append(parent)

    parents = {'args': args}

    # Make sure design matrix is kosher
    dm = dmatrix(reg['model'], data=data)
    if math.isnan(dm.sum()):
        raise NotImplementedError('DesignMatrix contains NaNs.')

    def func(args,
             design_matrix=dmatrix(reg['model'], data=data),
             link_func=reg['link_func']):
        # convert parents to matrix
        params = np.matrix(args)
        # Apply design matrix to input data
        if design_matrix.shape[1] != params.shape[1]:
            raise NotImplementedError('Missing columns in design matrix. '
                                      'You need data for all conditions '
                                      'for all subjects.')
        predictor = link_func(
            pd.DataFrame((design_matrix * params).sum(axis=1),
                         index=data.index))
        return pd.DataFrame(predictor, index=data.index)

    return self.pymc_node(func, kwargs['doc'], name, parents=parents,
                          trace=self.keep_regressor_trace)
def from_formula(cls, formula, vc_formulas, data, family=None, vcp_p=1,
                 fe_p=2):
    """
    Fit a BayesMixedGLM using a formula.

    Parameters
    ----------
    formula : string
        Formula for the endog and fixed effects terms (use ~ to separate
        dependent and independent expressions).
    vc_formulas : dictionary
        vc_formulas[name] is a one-sided formula that creates one
        collection of random effects with a common variance parameter.
        If using a categorical expression to produce variance components,
        note that generally `0 + ...` should be used so that an intercept
        is not included.
    data : data frame
        The data to which the formulas are applied.
    family : genmod.families instance
        A GLM family.
    vcp_p : float
        The prior standard deviation for the logarithms of the standard
        deviations of the random effects.
    fe_p : float
        The prior standard deviation for the fixed effects parameters.
    """

    ident = []
    exog_vc = []
    vcp_names = []
    j = 0
    for na, fml in vc_formulas.items():
        mat = patsy.dmatrix(fml, data, return_type='dataframe')
        exog_vc.append(mat)
        vcp_names.append(na)
        ident.append(j * np.ones(mat.shape[1]))
        j += 1
    exog_vc = pd.concat(exog_vc, axis=1)
    vc_names = exog_vc.columns.tolist()
    ident = np.concatenate(ident)

    model = super(_BayesMixedGLM, cls).from_formula(
        formula, data=data, family=family, subset=None,
        exog_vc=exog_vc, ident=ident, vc_names=vc_names,
        vcp_names=vcp_names, fe_p=fe_p, vcp_p=vcp_p)

    return model
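# Hedged usage sketch via the public statsmodels wrapper (column names are
# illustrative, data is synthetic): each entry of the dictionary becomes one
# variance component, built with '0 + ...' so no intercept is added.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
data = pd.DataFrame({'y': rng.integers(0, 2, 40),
                     'treatment': rng.integers(0, 2, 40),
                     'village': rng.integers(0, 5, 40)})
vc = {'village': '0 + C(village)'}
model = sm.BinomialBayesMixedGLM.from_formula('y ~ treatment', vc, data,
                                              vcp_p=0.5)
result = model.fit_vb()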
def logisticpatsy():
    df = pd.read_csv("train.csv")
    cleanpatsy(df)

    y, X = dmatrices('Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                     df, return_type="dataframe")
    y = np.ravel(y)

    model = LogisticRegression()
    model = model.fit(X, y)
    # check the accuracy on the training set
    print(model.score(X, y))

    # evaluate the model by splitting into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    model2 = LogisticRegression()
    model2.fit(X_train, y_train)
    predicted = model2.predict(X_test)
    print(metrics.accuracy_score(y_test, predicted))

    dftest = pd.read_csv("test.csv")
    cleanpatsy(dftest)
    X = dmatrix('Pclass + Sex + Age + SibSp + Parch + Fare + Cabin + Embarked',
                dftest, return_type="dataframe")

    predict_survive = model.predict(X)
    result = {'PassengerId': dftest.PassengerId, 'Survived': predict_survive}
    dfresult = pd.DataFrame(result)
    dfresult.to_csv("result.csv", index=False)

    print(pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_)))))
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8
    (not in the original GitHub code) """
    print("Running Combat...", end=" ")
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)

    moved_exp_dir = export.findParentDir(expr_input_dir) + "Non-Combat/" + \
        export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print("Moved original expression file to:")
        print("\t" + moved_exp_dir)
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra="#")  ### remove comments from the original file
    except Exception:
        pass

    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)

    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    # print dat, pheno.batch, mod; sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print("...Combat completed in %.2f seconds" % (time.time() - t))

    print("Original expression file over-written with batch effect removal results...")
    ebat.to_csv(expr_input_dir, sep="\t")
def prediction_given_R(all_samples, formula_variables, entries, prediction,
                       outcomes):
    from patsy import dmatrix

    entries_new_R = entries.copy()
    formula = " + ".join(formula_variables)

    Rs_to_test = arange(0, 1.1, .1)
    prediction_given_R = pd.Panel(items=outcomes,
                                  major_axis=arange(all_samples.shape[1]),
                                  minor_axis=Rs_to_test)
    for R in Rs_to_test:
        print(R)
        entries_new_R['Relatedness'] = R
        predictors = array(dmatrix(formula, entries_new_R))
        for outcome in outcomes:
            print(outcome)
            betas = all_samples[outcome][['Intercept'] + formula_variables].values
            prediction_given_R.ix[outcome, :, R] = \
                prediction(predictors, betas.T).mean(axis=0)

    quantiles_for_prediction = [.025, .5, .975]
    prediction_quantiles_given_R = pd.Panel(
        items=prediction_given_R.items,
        major_axis=quantiles_for_prediction,
        minor_axis=prediction_given_R.minor_axis)

    for item in prediction_quantiles_given_R.items:
        prediction_quantiles_given_R[item] = \
            prediction_given_R[item].quantile(quantiles_for_prediction)

    return prediction_quantiles_given_R
def generate_sample_description(
        self,
        num_conditions=2,
        num_batches=4,
        intercept_scale: bool = False,
        **kwargs
):
    self.sim_design_loc, self.sample_description = generate_sample_description(
        self.nobs,
        num_conditions=num_conditions,
        num_batches=num_batches,
        **kwargs
    )
    if intercept_scale:
        self.sim_design_scale = patsy.dmatrix("~1", self.sample_description)
    else:
        self.sim_design_scale = self.sim_design_loc
def func(
    args,
    design_matrix=dmatrix(reg["model"], data=data),
    link_func=reg["link_func"],
):
    # convert parents to matrix
    params = np.matrix(args)

    # Apply design matrix to input data
    if design_matrix.shape[1] != params.shape[1]:
        raise NotImplementedError(
            "Missing columns in design matrix. You need data for all "
            "conditions for all subjects."
        )
    predictor = link_func(
        pd.DataFrame((design_matrix * params).sum(axis=1), index=data.index)
    )

    return pd.DataFrame(predictor, index=data.index)
def __init__(self, sample, formula=None, design=None):
    assert type(sample) is Sample, 'sample must be of type Sample'
    self.sample = sample
    self.covariates = sample.covariates
    self.statistics = sample.statistics

    if (formula is not None) and (design is None):
        dmat = dmatrix(formula, self.covariates, eval_env=-1)
        parameter_names = dmat.design_info.column_names
        design = np.asarray(dmat)
    else:
        parameter_names = ['Intercept']

    self.formula = formula
    self.parameter_names = parameter_names
    self.design = design
def _transform_predict_exog(model, exog, design_info=None):
    """transform exog for predict using design_info

    Note: this is copied from base.model.Results.predict and converted to
    a standalone function with additional options.
    """

    is_pandas = _is_using_pandas(exog, None)
    exog_index = exog.index if is_pandas else None

    if design_info is None:
        design_info = getattr(model.data, 'design_info', None)

    if design_info is not None and (exog is not None):
        from patsy import dmatrix
        if isinstance(exog, pd.Series):
            # we are guessing whether it should be column or row
            if (hasattr(exog, 'name') and isinstance(exog.name, str) and
                    exog.name in design_info.describe()):
                # assume we need one column
                exog = pd.DataFrame(exog)
            else:
                # assume we need a row
                exog = pd.DataFrame(exog).T

        orig_exog_len = len(exog)
        is_dict = isinstance(exog, dict)
        exog = dmatrix(design_info, exog, return_type="dataframe")
        if orig_exog_len > len(exog) and not is_dict:
            import warnings
            if exog_index is None:
                warnings.warn('nan values have been dropped', ValueWarning)
            else:
                exog = exog.reindex(exog_index)
        exog_index = exog.index

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (model.exog.ndim == 1 or
                               model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]

    return exog, exog_index
def match_and_filter(table, metadata, formula,
                     min_sample_count, min_feature_count):
    """ Matches and aligns biom and metadata tables.

    This will also return the patsy representation.

    Parameters
    ----------
    table : biom.Table
        Table of abundances
    metadata : pd.DataFrame
        Sample metadata
    formula : str
        Statistical formula specifying the design matrix
    min_sample_count : int
        Samples whose total count is not above this value are dropped
    min_feature_count : int
        Features present in no more than this many samples are dropped

    Returns
    -------
    table : biom.Table
        Filtered biom table
    metadata : pd.DataFrame
        Sample metadata
    design : pd.DataFrame
        Patsy design matrix
    """
    # match them
    def sample_filter(val, id_, md):
        return id_ in metadata.index and np.sum(val) > min_sample_count

    def read_filter(val, id_, md):
        return np.sum(val > 0) > min_feature_count

    table = table.filter(sample_filter, axis='sample', inplace=False)
    table = table.filter(read_filter, axis='observation', inplace=False)

    metadata = metadata.loc[table.ids(axis='sample')]
    metadata = metadata.loc[~metadata.index.duplicated(keep='first')]

    def sort_f(xs):
        return [xs[metadata.index.get_loc(x)] for x in xs]

    table = table.sort(sort_f=sort_f, axis='sample')
    design = dmatrix(formula, metadata, return_type='dataframe')
    design = design.dropna()

    def design_filter(val, id_, md):
        return id_ in design.index

    table = table.filter(design_filter, axis='sample')
    return table, metadata, design
def load_census_data(fname_census, fname_census_tst):
    '''
    load UCI Adult Census dataset
    keep it as training + test to match the available classif. accuracy results
    '''
    # header=False is not valid pandas; header=None means "no header row"
    census = pd.read_table(fname_census, sep=',', header=None,
                           names=['age', 'workclass', 'fnlwgt', 'education',
                                  'education_num', 'marital_status',
                                  'occupation', 'relationship', 'race',
                                  'sex', 'capital_gain', 'capital_loss',
                                  'hours_per_week', 'native_country',
                                  'label'])
    census_tst = pd.read_table(fname_census_tst, sep=',', header=None,
                               names=census.columns)

    ### removing NaNs
    print("Removing rows with missing data")
    census = census.dropna()
    census_tst = census_tst.dropna()
    #census_tst.index = census_tst.index + len(census)

    ### change the index to enable concatenation
    inds_tr = np.arange(len(census))
    inds_tst = np.arange(len(inds_tr), len(inds_tr) + len(census_tst))
    census = pd.concat([census, census_tst], ignore_index=True)

    ### find out what kind of features we're dealing with
    col_names = [x.replace(' ', '_') for x in census.columns]
    print("WARNING: patsy is dropping the _reference_ label, need to disable this.. ")
    patsy_formula = '+'.join(col_names) + '-1'  ### -1 to remove intercept
    X = patsy.dmatrix(patsy_formula, census, return_type='dataframe')
    #del X['Intercept'] ### is there a way to do it in dmatrix directly
    #new_X_col_names = [x.replace(']','').replace('[T. ', ':') for x in X.columns]
    new_X_col_names = [x.replace(']', '').replace('[T.', ':')
                        .replace('[ ', ':').replace(' ', '')
                       for x in X.columns]
    X.columns = new_X_col_names

    label_col = 'label:>50K'
    ind_label = np.where(X.columns == label_col)[0][0]
    cols_reorder = X.columns.tolist()
    cols_reorder[ind_label] = cols_reorder[0]
    cols_reorder[0] = label_col
    X = X[cols_reorder]

    X_tr = X.iloc[inds_tr, :]
    X_tst = X.iloc[inds_tst, :]

    return census, X_tr, X_tst
def fit(self):
    """
    Fit the model using maximum likelihood estimation. Uses either the
    ChoiceModels or PyLogit estimation engine as appropriate.

    Returns
    -------
    MultinomialLogitResults() object.
    """
    if self._estimation_engine == 'PyLogit':
        m = pylogit.create_choice_model(
            data=self._df,
            obs_id_col=self._observation_id_col,
            alt_id_col=self._alternative_id_col,
            choice_col=self._choice_col,
            specification=self._model_expression,
            names=self._model_labels,
            model_type='MNL')
        m.fit_mle(init_vals=self._initial_coefs)
        results = MultinomialLogitResults(
            estimation_engine=self._estimation_engine,
            model_expression=self._model_expression,
            results=m)

    elif self._estimation_engine == 'ChoiceModels':
        dm = dmatrix(self._model_expression, data=self._df)
        chosen = np.reshape(self._df[[self._choice_col]].values,
                            (self._numobs, self._numalts))
        log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts)
        result_params = dict(log_likelihood=log_lik,
                             fit_parameters=fit,
                             x_names=dm.design_info.column_names)
        results = MultinomialLogitResults(
            estimation_engine=self._estimation_engine,
            model_expression=self._model_expression,
            results=result_params)

    return results
def predict(self, data):
    """
    Predict new values by running data through the fit model.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with columns corresponding to the RHS of `model_expression`.

    Returns
    -------
    predicted : ndarray
        Array of predicted values.
    """
    model_design = dmatrix(self._rhs, data=data, return_type='dataframe')
    return model_design.dot(self.params).values
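# Hedged usage sketch for predict (fragment, not from the source): `fitted`
# stands in for a results object carrying _rhs and params as above, and the
# column names are purely illustrative — new data only needs the RHS columns.
new_data = pd.DataFrame({'obsval': [1.2, 0.7], 'altval': [0.3, 0.9]})
y_hat = fitted.predict(new_data)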
def glm_wrap_continuous(conn, pheno, contrast, regressors, report=False,
                        fast=False):
    # Make sure pheno and conn have the same number of cases
    if not conn.shape[0] == pheno.shape[0]:
        print(f'Conn ({conn.shape[0]}) and pheno ({pheno.shape[0]}) '
              f'must be same number of cases')

    # Define the subset of the sample
    sub_mask = find_subset(pheno, contrast)
    sub_conn = conn[sub_mask, :]
    sub_pheno = pheno.loc[sub_mask]
    n_sub = sub_pheno.shape[0]
    n_data = sub_conn.shape[1]
    sub_conn_stand = standardize(sub_conn, np.ones(n_sub).astype(bool))
    if report:
        print(f'Selected sample based on contrast variable {contrast}.\n'
              f'Found {n_sub} subjects with no missing data for {contrast}\n'
              f'original sample: n={pheno.shape[0]}; new sample: n={n_sub}\n'
              f'{n_data} data points available\n'
              f'standardized estimators are based on all subjects with no '
              f'missing data for {contrast}')

    formula = ' + '.join((regressors, contrast))
    design_matrix = pat.dmatrix(formula, sub_pheno, return_type='dataframe')
    if fast:
        betas = fast_glm(sub_conn, design_matrix, contrast)
        table = pd.DataFrame(data={'betas': betas})
    else:
        betas, pvals = glm(sub_conn, design_matrix, contrast)
        stand_betas, _ = glm(sub_conn_stand, design_matrix, contrast)
        table = pd.DataFrame(data={'betas': betas,
                                   'stand_betas': stand_betas,
                                   'pvals': pvals})
    return table
def exposure_model(self, model, custom_model=None, bound=False,
                   print_results=True):
    """Estimation of Pr(A=1|L), which is termed g(A=1|L) in the literature

    Parameters
    ----------
    model : str
        Independent variables to predict the exposure.
        Example) 'var1 + var2 + var3'
    custom_model : optional
        Input for a custom model that is used in place of the logit model
        (default). The model must have the "fit()" and "predict()"
        attributes. Both sklearn and supylearner are supported as custom
        models. In the background, TMLE will fit the custom model and
        generate the predicted probabilities
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to
        avoid near positivity violations. Specifying this argument can
        improve finite sample performance for random positivity
        violations. However, truncating weights leads to additional
        confounding. Default is False, meaning no truncation of predicted
        probabilities occurs. Providing a single float assumes symmetric
        truncation, where values below or above the threshold are set to
        the threshold value. Alternatively a list of floats can be
        provided for asymmetric truncation, with the first value being
        the lower bound and the second being the upper bound
    print_results : bool, optional
        Whether to print the fitted model results. Default is True
        (prints results)
    """
    self._exp_model = self.exposure + ' ~ ' + model
    self.__mweight = model

    # Step 3) Estimation of g-model (exposure model)
    if custom_model is None:
        fitmodel = propensity_score(self.df, self._exp_model,
                                    print_results=print_results)
        self.g1W = fitmodel.predict(self.df)
    # User-specified prediction model
    else:
        warnings.warn("TMLE can result in confidence intervals below "
                      "nominal coverage when used with machine learning "
                      "algorithms. TMLE will no longer support custom "
                      "machine learning models in v0.9.0")
        self._exp_model_custom = True
        data = patsy.dmatrix(model + ' - 1', self.df)
        self.g1W = exposure_machine_learner(
            xdata=np.asarray(data),
            ydata=np.asarray(self.df[self.exposure]),
            ml_model=custom_model,
            print_results=print_results)

    self.g0W = 1 - self.g1W
    if bound:  # Bounding predicted probabilities if requested
        self.g1W = _bounding_(self.g1W, bounds=bound)
        self.g0W = _bounding_(self.g0W, bounds=bound)

    self._fit_exposure_model = True
def _fit_transform(self, data, y=None):
    eval_env = EvalEnvironment.capture(self.eval_env, reference=2)
    formula = _drop_intercept(self.formula, self.add_intercept)

    design = dmatrix(formula, data, eval_env=eval_env,
                     NA_action=self.NA_action, return_type='dataframe')
    self.design_ = design.design_info
    # this assignment originally sat after the return and was unreachable
    self.feature_names_ = design.design_info.column_names

    if self.return_type == 'dataframe':
        return design
    else:
        return np.array(design)
def wrap_subtype_stability(arg):
    data_stack = arg['data_stack']
    sbt_idx = arg['sbt_idx']
    dist_thr = arg['dist_thr']
    part_thr = arg['part_thr']
    regressors = arg['regressors']
    pheno = arg['pheno']

    # Regress nuisance for these individuals first
    design_matrix = pat.dmatrix(regressors, data=pheno.iloc[sbt_idx])
    residuals = asdfc.stats.nuisance_correction(data_stack[sbt_idx, ...],
                                                design_matrix, n_jobs=-1)
    # Then extract the subtype
    part, _, _ = asdfc.stats.subtype_partition(residuals, mode='core',
                                               dist_thr=dist_thr,
                                               part_thr=part_thr)
    return part
def fit(self, X, y):
    # Build the design matrix via a tensor basis expansion of natural
    # spline bases
    data = {'x{}'.format(i + 1): x for i, x in enumerate(X.T)}
    design_matrix = dmatrix(
        "te(" + ",".join(['cr(x{}, df={})'.format(i + 1, self.df)
                          for i in range(X.shape[1])]) +
        ", constraints='center')", data)

    # Save the design information for future predictions
    self.design_info = design_matrix.design_info

    # Fit the model using the basis
    mod = smf.quantreg('y ~ x - 1', {'y': y, 'x': design_matrix})
    if np.isscalar(self.quantiles):
        self.model = mod.fit(q=self.quantiles)
    else:
        self.model = [mod.fit(q=q) for q in self.quantiles]
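# A hypothetical predict counterpart (not in the source): rebuilding the
# basis from the saved design_info keeps the spline knots identical to the
# ones chosen during fit. Untested sketch under that assumption.
def predict(self, X):
    data = {'x{}'.format(i + 1): x for i, x in enumerate(X.T)}
    design_matrix = dmatrix(self.design_info, data)
    if np.isscalar(self.quantiles):
        return self.model.predict({'x': design_matrix})
    return np.column_stack([m.predict({'x': design_matrix})
                            for m in self.model])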
def __init__(self, t, y, _F, _gradF, dof=5, deg=3, ndiff=1):
    assert t.size == y.size, 't and y must have the same size'
    self.t = t
    # self.y = y
    self.y = (y - y.min()) / (y.max() - y.min())
    self.m = t.size
    self.k = dof - ndiff
    self._F = _F
    self._gradF = _gradF
    # self.theta0 = theta0

    str_input = f"bs(x, df={dof}, degree={deg}, include_intercept=True) - 1"
    self.A = dmatrix(str_input, {"x": t})
    # self.D = L(dof, ndiff)
    self.D = L(self.m, ndiff) @ self.A
def model_experimental(data, formula, baseline_index=None):
    cell_types = data.var.index.to_list()

    # Get count data
    data_matrix = data.X.astype("float32")

    # Build covariate matrix from R-like formula
    covariate_matrix = pt.dmatrix(formula, data.obs)
    covariate_names = covariate_matrix.design_info.column_names[1:]
    covariate_matrix = covariate_matrix[:, 1:]

    return NoBaselineModelExperimental(
        covariate_matrix=np.array(covariate_matrix),
        data_matrix=data_matrix,
        cell_types=cell_types,
        covariate_names=covariate_names,
        formula=formula)
def prep_model(csv_name):
    """
    Loads a CSV file of a merged pandas DataFrame, cleans it, converts the
    categorical variable to a one-hot configuration, and separates it into
    dependent and independent features for regression.

    :param csv_name: Name of CSV file to import, assumed to be in the
        ../data directory
    :return: x, y pandas DataFrames representing the features and dependent
        variable to perform regression upon
    """
    df = pd.read_csv('../data/' + csv_name)
    df = df.drop(columns=['Unnamed: 0', 'index', 'Unnamed: 0_y'],
                 errors='ignore')
    housing_categorical = patsy.dmatrix('type', data=df,
                                        return_type='dataframe')
    df = df.join(housing_categorical)
    df = df.dropna()
    y = df['price']
    x = df.drop(columns=['price', 'type', 'hood', 'title', 'link'])
    return x, y
def transform_exog_to_model(fit, exog):
    transform = True
    self = fit

    # The following is lifted straight from
    # statsmodels.base.model.Results.predict()
    if transform and hasattr(self.model, 'formula') and exog is not None:
        from patsy import dmatrix
        # design_info can be passed to dmatrix directly; the old
        # `.design_info.builder` attribute was removed in patsy 0.5
        exog = dmatrix(self.model.data.orig_exog.design_info, exog)

    if exog is not None:
        exog = np.asarray(exog)
        if exog.ndim == 1 and (self.model.exog.ndim == 1 or
                               self.model.exog.shape[1] == 1):
            exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]
    # end lifted code

    return exog
def construct_random_effects(groups, data, n_vars):
    re_vars, re_groupings = list(zip(*groups))
    re_vars, re_groupings = set(re_vars), set(re_groupings)

    Zdict = dict(zip(re_vars,
                     [_check_np(patsy.dmatrix(x, data=data,
                                              return_type='dataframe'))
                      for x in re_vars]))
    Jdict = dict(zip(re_groupings, [dummy(data[x]) for x in re_groupings]))
    dim_dict = {}
    Z = []
    for x, y in groups:
        Ji, Xi = Jdict[y], Zdict[x]
        dim_dict[y] = {'n_groups': Ji.shape[1], 'n_vars': Xi.shape[1]}
        Zi = khatri_rao(Ji.T, Xi.T).T
        Z.append(Zi)
    Z = np.concatenate(Z, axis=1)
    return Z, dim_dict
def survival(row, phenotype_df, duration_col='T', event_col='E',
             other_cols=[]):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
               0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """
    phenotype_df = phenotype_df.T
    phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ', '_').replace('.', '_').replace('-', '_')
    event_col = event_col.replace(' ', '_').replace('.', '_').replace('-', '_')
    other_cols = [x.replace(' ', '_').replace('.', '_').replace('-', '_')
                  for x in other_cols]
    row.name = row.name.replace(' ', '_').replace('.', '_').replace('-', '_')
    phenotype_df.columns = [x.replace(' ', '_').replace('.', '_').replace('-', '_')
                            for x in phenotype_df.columns]

    formula = row.name + ' + ' + duration_col + ' + ' + event_col
    if other_cols:
        other_cols = [x.replace(' ', '_').replace('.', '_')
                      for x in other_cols]
        formula = formula + ' + ' + ' + '.join(other_cols)

    X = patsy.dmatrix(formula_like=formula, data=phenotype_df,
                      return_type='dataframe')
    X = X.drop(['Intercept'], axis=1)

    cph = CoxPHFitter()
    cph.fit(X, duration_col=duration_col, event_col=event_col)
    result = cph.summary.loc[row.name]
    return result
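# Hedged usage sketch for survival (fabricated frame; assumes the same
# imports as the function above, including lifelines' CoxPHFitter): `row` is
# a named Series of one feature across samples, and phenotype_df is passed
# features-by-samples because the function transposes it first.
samples = ['s%i' % i for i in range(20)]
row = pd.Series(np.random.rand(20), index=samples, name='gene_x')
pheno = pd.DataFrame({'T': np.random.exponential(5., 20),
                      'E': np.random.randint(0, 2, 20)}, index=samples).T
res = survival(row, pheno, duration_col='T', event_col='E')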
def set_changepoints(self, changepoints, validate=True):
    # nodes are static unless logic below determines otherwise
    self.nodes_parametric = False

    # trivial case
    if changepoints is None:
        self.changepoint_coefs = None
        return

    # if we have received a list
    if isinstance(changepoints, list):
        # if we have received parametric node placement specifications
        if isinstance(changepoints[0], str):
            self.nodes_parametric = True
            if len(changepoints) > 1:
                raise ValueError(
                    "Only a single changepoint may be specified currently.")
            if self.data is None:
                raise ValueError(
                    "Cannot specify changepoints without valid data.")
            if "~" in changepoints[0]:
                raise ValueError(
                    "Received an invalid changepoint specification. "
                    "Changepoints may not specify an outcome variable.")

            # insert dummy intercept (left, terminal node) into list of
            # changepoints; we should probably make users do this at some
            # point
            self.changepoint_specifications = ['1'] + changepoints

            # extract dmatrices, hacking the first dummy
            self.changepoint_dmatrices = [
                pd.DataFrame({'Intercept': np.ones(self.data.shape[0])}),
                patsy.dmatrix(changepoints[0], self.data,
                              return_type='dataframe')
            ]
    else:
        raise ValueError("Changepoints must be patsy strings.")

    if validate:
        # validate
        self.validate_parameters()
def survival_npcs(row, phenotype_df, duration_col='T', event_col='E',
                  other_cols=[]):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has occurred or not.
               0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """
    row.name = row.name.replace(' ', '_').replace('.', '_').replace('-', '_')
    row_npcs = row
    columns_names = []
    formula = ''
    for n in range(len(row_npcs[0])):
        pc_name = row.name + '_pc' + str(n + 1)
        columns_names.append(pc_name)
        formula = formula + pc_name + ' + '

    row_npcs = pd.DataFrame(row_npcs.tolist(), index=row_npcs.index)
    row_npcs.columns = columns_names

    # phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df = phenotype_df.join(row_npcs.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ', '_').replace('.', '_').replace('-', '_')
    event_col = event_col.replace(' ', '_').replace('.', '_').replace('-', '_')
    other_cols = [x.replace(' ', '_').replace('.', '_').replace('-', '_')
                  for x in other_cols]
    # row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    phenotype_df.columns = [x.replace(' ', '_').replace('.', '_').replace('-', '_')
                            for x in phenotype_df.columns]

    # formula = row.name + ' + ' + duration_col + ' + ' + event_col
    formula = formula + duration_col + ' + ' + event_col
    if other_cols:
        other_cols = [x.replace(' ', '_').replace('.', '_')
                      for x in other_cols]
        formula = formula + ' + ' + ' + '.join(other_cols)

    X = patsy.dmatrix(formula_like=formula, data=phenotype_df,
                      return_type='dataframe')
    X = X.drop(['Intercept'], axis=1)

    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col=duration_col, event_col=event_col)
    result = cph.summary.loc[columns_names]
    return result
def estimate_sorted_spike_encoding_model(train_position_info,
                                         train_spikes_data,
                                         place_bin_centers):
    '''The conditional intensities for each state (Outbound-Forward,
    Outbound-Reverse, Inbound-Forward, Inbound-Reverse)

    Parameters
    ----------
    train_position_info : pandas dataframe
    train_spikes_data : array_like
    place_bin_centers : array_like, shape=(n_parameters,)

    Returns
    -------
    combined_likelihood_kwargs : dict

    '''
    formula = ('1 + trajectory_direction * '
               'bs(linear_distance, df=10, degree=3)')
    design_matrix = dmatrix(formula, train_position_info,
                            return_type='dataframe')
    fit = [glm_fit(spikes, design_matrix, ind)
           for ind, spikes in enumerate(train_spikes_data)]

    inbound_predict_design_matrix = _predictors_by_trajectory_direction(
        'Inbound', place_bin_centers, design_matrix)
    outbound_predict_design_matrix = _predictors_by_trajectory_direction(
        'Outbound', place_bin_centers, design_matrix)

    inbound_conditional_intensity = _get_conditional_intensity(
        fit, inbound_predict_design_matrix)
    outbound_conditional_intensity = _get_conditional_intensity(
        fit, outbound_predict_design_matrix)

    conditional_intensity = np.vstack([
        outbound_conditional_intensity,
        outbound_conditional_intensity,
        inbound_conditional_intensity,
        inbound_conditional_intensity
    ]).T

    return dict(
        likelihood_function=poisson_likelihood,
        likelihood_kwargs=dict(conditional_intensity=conditional_intensity))
def get_formula_cols(formula, df, target_val=False, feature_vals=False):
    if target_val:
        formula = formula.split("~")[0]
    if feature_vals:
        formula = formula.split("~")[1]

    # test just the first 2 datapoints so it runs quicker
    df = df.sample(2)
    cols = []
    for col in df.columns:
        try:
            if target_val | feature_vals:
                tmp_mod = patsy.dmatrix(formula, df.drop(col, axis=1))
            else:
                tmp_mod = patsy.dmatrices(formula, df.drop(col, axis=1))
        except Exception:
            cols.append(col)
    return cols
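# Hedged usage sketch for get_formula_cols (toy frame): a column counts as
# "needed" when dropping it makes the patsy build fail.
df = pd.DataFrame({'y': [1, 2, 3], 'a': [4, 5, 6], 'b': [7, 8, 9]})
needed = get_formula_cols('y ~ a', df)   # -> ['y', 'a']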
def create_regression(self, formula: str, metadata: pd.DataFrame):
    """Generate design matrix for count regression modeling.

    :param formula: Design formula to use in model
    :type formula: str

    :param metadata: Metadata for design matrix
    :type metadata: pd.DataFrame
    """
    self.dmat = dmatrix(formula, metadata.loc[self.sample_names],
                        return_type="dataframe")
    self.colnames = self.dmat.columns

    param_dict = {
        "p": self.dmat.shape[1],
        "x": self.dmat.values,
    }
    self.add_parameters(param_dict)
def transform_with_patsy(formula, data, *args, **kwargs):
    try:
        # needs patsy v0.5.1 to support formula in Python 3.7
        # https://github.com/pydata/patsy/pull/131
        import patsy
    except ImportError:
        raise ImportError("'patsy' is required to transform with "
                          "string formula")

    if '~' in formula:
        y, X = patsy.dmatrices(formula, data=data,
                               return_type='dataframe', *args, **kwargs)
        if len(y.shape) > 1 and y.shape[1] != 1:
            raise ValueError('target must be 1 dimensional')
        y = y.iloc[:, 0]
        return data._constructor(X, target=y)
    else:
        X = patsy.dmatrix(formula, data=data,
                          return_type='dataframe', *args, **kwargs)
        return data._constructor(X)
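# Hedged usage sketch: data._constructor(X, target=y) implies `data` is a
# pandas_ml-style ModelFrame whose constructor accepts target=, not a plain
# DataFrame; the import below is an assumption about the surrounding package.
from pandas_ml import ModelFrame
mf = ModelFrame({'y': [0, 1, 0], 'x1': [1., 2., 3.], 'x2': [3., 2., 1.]})
with_target = transform_with_patsy('y ~ x1 + x2', mf)
design_only = transform_with_patsy('x1 + np.log(x2)', mf)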
def exposure_model(self, model, custom_model=None, bound=None):
    """Estimation of the exposure model, Pr(A=1|W). This value is used as
    the denominator for the inverse probability weights.

    Parameters
    ----------
    model : str
        Independent variables to predict the exposure.
        Example) 'var1 + var2 + var3'
    custom_model : optional
        Input for a custom model that is used in place of the logit model
        (default). The model must have the "fit()" and "predict()"
        attributes. Both sklearn and supylearner are supported as custom
        models. In the background, TMLE will fit the custom model and
        generate the predicted probabilities
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to
        avoid near positivity violations. Specifying this argument can
        improve finite sample performance for random positivity
        violations. However, truncating weights leads to additional
        confounding. Default is None, meaning no truncation of predicted
        probabilities occurs. Providing a single float assumes symmetric
        truncation, where values below or above the threshold are set to
        the threshold value. Alternatively a list of floats can be
        provided for asymmetric truncation, with the first value being
        the lower bound and the second being the upper bound
    """
    self._g_model = self.exposure + ' ~ ' + model

    if custom_model is None:  # Standard parametric regression model
        fitmodel = propensity_score(self.df, self._g_model,
                                    print_results=self._verbose_)
        pred = fitmodel.predict(self.df)
    else:  # User-specified prediction model
        self._exp_model_custom = True
        data = patsy.dmatrix(model + ' - 1', self.df)
        pred = exposure_machine_learner(
            xdata=np.asarray(data),
            ydata=np.asarray(self.df[self.exposure]),
            ml_model=custom_model,
            pdata=np.asarray(data))

    if bound is not None:
        pred2 = bounding(ipw=pred, bound=bound)
        self._specified_bound_ = np.sum(np.where(pred2 == pred, 0, 1))
        pred = pred2

    self._denominator_ = np.where(self.df[self.exposure] == 1,
                                  pred, 1 - pred)
def design_mat(mod, numCovs, batch_levels):
    # require levels to make sure they are in the same order as we use in
    # the rest of the script.
    design = patsy.dmatrix("~ 0 + C(batch, levels=%s)" % str(batch_levels),
                           mod, return_type="dataframe")

    mod = mod.drop(["batch"], axis=1)
    print("found %i batches" % design.shape[1], file=sys.stderr)
    other_cols = [c for i, c in enumerate(mod.columns) if i not in numCovs]
    factor_matrix = mod[other_cols]
    design = pa.concat((design, factor_matrix), axis=1)
    if numCovs is not None:
        print("found %i numerical covariates..." % len(numCovs),
              file=sys.stderr)
        for i, nC in enumerate(numCovs):
            cname = mod.columns[nC]
            print("\t", cname, file=sys.stderr)
            design[cname] = mod[mod.columns[nC]]
    print("found %i categorical variables:" % len(other_cols),
          file=sys.stderr)
    print("\t" + ", ".join(other_cols), file=sys.stderr)
    return design
def variable_effect(pheno, var, regressors, conn):
    """
    Test the effect of a continuous variable.

    pheno = dataframe, filtered to contain only relevant subjects
        (use mask_var)
    var = column from pheno
    regressors = list of strings, formatted for patsy
    conn = n_subjects x n_edges array

    Returns:
        table = DataFrame with one row per edge:
            betas_std = betas including standardization on controls
            pvalues = p-values
            qvalues = FDR-corrected p-values (alpha = 0.05)
            reject = whether the null is rejected after FDR correction
    """
    n_edges = conn.shape[1]
    contrast = np.zeros(1 + len(regressors))
    contrast[0] = 1

    betas_std = np.zeros(n_edges)
    pvalues = np.zeros(n_edges)

    formula = ' + '.join(regressors + [var])
    dmat = pat.dmatrix(formula, pheno, return_type='dataframe',
                       NA_action='raise')

    mask_std = np.ones(pheno.shape[0]).astype(bool)
    conn_std = standardize(mask_std, conn)

    for edge in range(n_edges):
        model_std = sm.OLS(conn_std[:, edge], dmat)
        results_std = model_std.fit()
        betas_std[edge] = results_std.params[var]
        pvalues[edge] = results_std.pvalues[var]

    mt = multipletests(pvalues, method='fdr_bh')
    reject = mt[0]
    qvalues = mt[1]

    table = pd.DataFrame(np.array([betas_std, pvalues, qvalues,
                                   reject]).transpose(),
                         columns=['betas_std', 'pvalues', 'qvalues',
                                  'reject'])
    return table
def filter_data_and_create_design_matrices(self):
    data_for_training = self.base_predictor.input_data.copy(deep=True)
    data_for_prediction = self.base_predictor.input_data.copy(deep=True)

    if self.channel == 'all':
        data_for_training = data_for_training.loc[
            data_for_training['days_since_first_order'] >=
            self.config['goal_horizon']]
        data_for_prediction = data_for_prediction.loc[
            data_for_prediction['days_since_first_order'] >=
            self.config['day_horizon']]
    else:
        data_for_training = data_for_training.loc[
            (data_for_training['days_since_first_order'] >=
             self.config['goal_horizon']) &
            (data_for_training['attribution_level_1'] == self.channel)]
        data_for_prediction = data_for_prediction.loc[
            (data_for_prediction['days_since_first_order'] >=
             self.config['day_horizon']) &
            (data_for_prediction['attribution_level_1'] == self.channel)]

    shuffled_training_data = data_for_training.sample(frac=1)

    training_columns = [i for i in
                        iter_flatten(self.config['training_columns'])]

    self.full_training_labels, filtered_training_data = patsy.dmatrices(
        self.config['goal_column'] + ' ~ 0 + ' + ' + '.join(training_columns),
        data=shuffled_training_data, return_type="dataframe")
    # fix enum column headers for xgb input requirements
    self.filtered_training_data = filtered_training_data.rename(
        columns=lambda x: x.replace("[", "(").replace("]", ")"))

    filtered_prediction_data = patsy.dmatrix(
        '0 + ' + ' + '.join(training_columns),
        data=data_for_prediction, return_type="dataframe")
    # fix enum column headers for xgb input requirements
    self.filtered_prediction_data = filtered_prediction_data.rename(
        columns=lambda x: x.replace("[", "(").replace("]", ")"))
def from_formula(cls, formula, data, priors=None, vars=None,
                 family='weibull', name='', model=None):
    import patsy

    ##### Here's how we parse the formula ######
    # Parse the formula and split into essential components
    #### TODO: Automatic selection of multivariate family based on
    #### dimension of inputs
    outcomes = formula.split("~")[0]

    # get time variables
    time_vars = [v.strip() for v in
                 outcomes[outcomes.find("([") + 2:outcomes.find("]")].split(",")]

    # get event times
    event_raw = outcomes[outcomes.find("],") + 2:]
    event_vars = [v.strip() for v in
                  event_raw[event_raw.find("[") + 1:event_raw.find("])")].split(",")]

    # Now get x, times, and events
    x = patsy.dmatrix(formula.split("~")[1].strip(), data)
    y = data[time_vars].values  # .as_matrix() was removed from pandas
    e = data[event_vars].values
    labels = x.design_info.column_names

    return cls(x=np.asarray(x), y=np.asarray(y)[:, 0],
               e=np.asarray(e)[:, 0], intercept=False, labels=labels,
               priors=priors, vars=vars, family=family, name=name,
               model=model)