def _package_attrs(self, attrs):
    # Sometimes features are retrieved from the wrapper (stargazer does
    # this), other times from the actual result (statsmodels' summary_col
    # does this), so we'll have both.
    rres = RRegressionResults()

    # Use patsy to extract the target variable:
    fobj = ModelDesc.from_formula(self.formula)
    rres.target = fobj.lhs_termlist[0].name()
    rres.model = self

    # We need to hijack this rather than subclassing because stargazer does
    # not use "isinstance()" but "type()":
    wrap = RegressionResultsWrapper(rres)

    # All items except "params" are @cache_readonly and need first to be
    # deleted, and then redefined:
    for attr in attrs:
        if attr not in ('params',):
            if hasattr(rres, attr):
                delattr(rres, attr)
        setattr(rres, attr, attrs[attr])
        setattr(wrap, attr, attrs[attr])
        self._debug("Set {} to {}".format(attr, attrs[attr]))

    rres.__class__ = RegressionResults
    return wrap
def _prune(self, fit, p_max):
    """
    If the fit contains statistically insignificant parameters, remove them.
    Returns a pruned fit where all parameters have p-values of the
    t-statistic below p_max.

    Parameters
    ----------
    fit : fm.ols fit object
        Can contain insignificant parameters
    p_max : float
        Maximum allowed probability of the t-statistic

    Returns
    -------
    fit : fm.ols fit object
        Won't contain any insignificant parameters
    """
    model_desc = ModelDesc(
        fit.model.formula.lhs_termlist[:], fit.model.formula.rhs_termlist[:])
    to_prune = fit.pvalues.where(
        fit.pvalues > p_max).dropna().index.tolist()
    # The intercept is never pruned; guard the remove() call, which would
    # raise ValueError if 'Intercept' is not among the weak parameters.
    if 'Intercept' in to_prune:
        to_prune.remove('Intercept')

    while to_prune:
        model_desc.rhs_termlist.remove(Term([LookupFactor(to_prune[0])]))
        fit = fm.ols(model_desc, data=self.data_frame).fit()
        to_prune = fit.pvalues.where(
            fit.pvalues > p_max).dropna().index.tolist()
        if 'Intercept' in to_prune:
            to_prune.remove('Intercept')

    return fit
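# Hedged usage sketch (not from the original source): the p-value filtering
# idiom used by _prune, shown standalone with statsmodels' formula API.
# All names below (df, x1, x2) are illustrative.
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
df = pd.DataFrame({"x1": rng.normal(size=50), "x2": rng.normal(size=50)})
df["y"] = 2 * df["x1"] + rng.normal(size=50)  # x2 is pure noise
fit = smf.ols("y ~ x1 + x2", data=df).fit()
to_prune = fit.pvalues.where(fit.pvalues > 0.05).dropna().index.tolist()
# 'x2' is expected to appear here; 'Intercept' is filtered out before pruning.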
def bootstrap(factor_names, model, run_count):
    """Create a minimal starting design that is non-singular."""
    md = ModelDesc.from_formula(model)
    model_size = len(md.rhs_termlist)
    if run_count == 0:
        run_count = model_size
    if model_size > run_count:
        raise ValueError("Can't build a design of size {} "
                         "for a model of rank {}. "
                         "Model: '{}'".format(run_count, model_size, model))

    factor_count = len(factor_names)
    x0 = np.zeros(factor_count)
    # add high/low bounds to the constraint matrix
    constraint_matrix = np.zeros((factor_count * 2, factor_count))
    bounds = np.zeros(factor_count * 2)
    c = 0
    for f in range(factor_count):
        constraint_matrix[c][f] = -1
        bounds[c] = 1
        c += 1
        constraint_matrix[c][f] = 1
        bounds[c] = 1
        c += 1

    start_points = hit_and_run(x0, constraint_matrix, bounds, run_count)

    d = pd.DataFrame(start_points, columns=factor_names)
    X = dmatrix(model, d)

    return (d, X)
def _build_targets(formula, data):
    # Build a response-only design: an empty rhs_termlist leaves only the
    # left-hand side of the formula.
    y, _ = dmatrices(ModelDesc(formula.lhs_termlist, list()), data)
    y = np.ravel(y)
    y = np.array(y)
    return y
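# Hedged usage sketch (not from the original source): extracting the response
# vector with _build_targets, assuming a small illustrative DataFrame.
import pandas as pd
from patsy import ModelDesc

df = pd.DataFrame({"y": [1.0, 2.0, 3.0], "x": [0.1, 0.2, 0.3]})
fobj = ModelDesc.from_formula("y ~ x")
targets = _build_targets(fobj, df)  # expected: array([1., 2., 3.])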
def add_predictors(base_formula, extra_predictors):
    desc = ModelDesc.from_formula(base_formula)
    # Using LookupFactor here ensures that everything will work correctly even
    # if one of the column names in extra_columns is named like "weight.in.kg"
    # or "sys.exit()" or "LittleBobbyTables()".
    desc.rhs_termlist += [Term([LookupFactor(p)]) for p in extra_predictors]
    return desc
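# Hedged usage sketch (not from the original source): extending "y ~ x" with
# extra column names, including one that is not a valid Python identifier.
desc = add_predictors("y ~ x", ["weight.in.kg", "age"])
print(desc.describe())  # y ~ x + weight.in.kg + age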
def _parse_formula(formula_str, include_intercept=False):
    """
    Wrap some extra functionality into the patsy formula parse
    """
    _form = ModelDesc.from_formula(formula_str)
    # patsy (by default) includes an intercept. Discard it on the RHS if not
    # requested; the intercept is the term with zero factors.
    if not include_intercept:
        _form.rhs_termlist = [
            t for t in _form.rhs_termlist if len(t.factors) > 0
        ]

    #print(_form.lhs_termlist)
    #_categoricals = [t for t in _form.lhs_termlist
    #                 if t.startswith("C(") and t.endswith(")")]
    #_task = 'classify' if len(categoricals) > 0 else 'regress'
    #if len(_categoricals) != len(_form.lhs_termlist):
    #    raise ValueError(f"Mixed targets detected in {formula_str}. "
    #                     "Specify all categoricals "
    #                     "using the C(...) syntax or all continuous.")
    #_num_classes = None if _task == 'classify' else len(_form.lhs_termlist)
    _num_classes = 2
    _task = 'classify'
    return _form, _task, _num_classes
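# Hedged sketch (not from the original source): the intercept produced by
# patsy is the term with an empty factor list, so it can be filtered out
# exactly as _parse_formula does above.
from patsy import ModelDesc

d = ModelDesc.from_formula("y ~ a + b")
print([t.name() for t in d.rhs_termlist])  # ['Intercept', 'a', 'b']
print([t.name() for t in d.rhs_termlist if t.factors])  # ['a', 'b']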
def _do_analysis_no_cross_validation(self):
    """
    Find the best model (fit) and create self.list_of_fits and self.fit
    """
    # first model is just the mean
    response_term = [Term([LookupFactor(self.y)])]
    model_terms = [Term([])]  # empty term is the intercept
    all_model_terms_dict = {
        x: Term([LookupFactor(x)]) for x in self.list_of_x
    }
    # ...then add another term for each candidate
    #model_terms += [Term([LookupFactor(c)]) for c in candidates]
    model_desc = ModelDesc(response_term, model_terms)
    self._list_of_fits.append(fm.ols(model_desc, data=self.df).fit())
    # try to improve the model until no improvements can be found
    while all_model_terms_dict:
        # try each x and overwrite best_fit if we find a better one;
        # the first best_fit is the one from the previous round
        ref_fit = self._list_of_fits[-1]
        best_fit = self._list_of_fits[-1]
        best_bic = best_fit.bic
        for x, term in all_model_terms_dict.items():
            # make new_fit, compare with the best found so far
            model_desc = ModelDesc(
                response_term,
                ref_fit.model.formula.rhs_termlist + [term])
            fit = fm.ols(model_desc, data=self.df).fit()
            if fit.bic < best_bic:
                best_bic = fit.bic
                best_fit = fit
                best_x = x
        # Sometimes the obtained fit may be better but contain insignificant
        # parameters. Correct the fit by removing them and estimating again.
        best_fit = self._prune(best_fit, p_max=self.p_max)
        # if best_fit does not contain more variables than ref_fit, exit
        if len(best_fit.model.formula.rhs_termlist) == len(
                ref_fit.model.formula.rhs_termlist):
            break
        else:
            self._list_of_fits.append(best_fit)
            all_model_terms_dict.pop(best_x)

    self._fit = self._list_of_fits[-1]
def _drop_intercept(formula, add_intercept):
    """Drop the intercept from formula if not add_intercept"""
    if not add_intercept:
        if not isinstance(formula, ModelDesc):
            formula = ModelDesc.from_formula(formula)
        if INTERCEPT in formula.rhs_termlist:
            formula.rhs_termlist.remove(INTERCEPT)
        return formula
    return formula
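# Hedged usage sketch (not from the original source): INTERCEPT here is
# patsy's canonical empty term (importable from patsy itself).
from patsy import INTERCEPT

desc = _drop_intercept("y ~ x", add_intercept=False)
assert INTERCEPT not in desc.rhs_termlist
print(desc.describe())  # y ~ 0 + x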
def create_patsy_model(dependent_variable, independent_variables,
                       transformations={}, interactions=[]):
    '''
    Construct and return a patsy formula (object representation)
    '''
    # 1) Handle passing in [{'name': X}] vs [X]
    lhs_var = dependent_variable
    rhs_vars = independent_variables
    if 'name' in dependent_variable:
        lhs_var = dependent_variable['name']
    if 'name' in independent_variables[0]:
        new_rhs_vars = []
        for iv in independent_variables:
            if type(iv) is list:
                new_rhs_vars.append([x['name'] for x in iv])
            else:
                if 'name' in iv:
                    new_rhs_vars.append(iv['name'])
                else:
                    new_rhs_vars.append(iv)
        rhs_vars = new_rhs_vars

    if interactions:
        first_interaction = interactions[0]
        # each interaction is a list of terms, so check its first element
        if 'name' in first_interaction[0]:
            new_interactions = []
            for interaction in interactions:
                new_interactions.append([term['name'] for term in interaction])
            rhs_interactions = new_interactions
        else:
            rhs_interactions = interactions

    # 2) Construct the model
    lhs = [Term([LookupFactor(lhs_var)])]
    rhs = [Term([])]
    for rhs_var in rhs_vars:
        if type(rhs_var) is list:
            rhs += [Term([LookupFactor(term) for term in rhs_var])]
        else:
            if rhs_var in transformations:
                transformation = transformations[rhs_var]
                if transformation == 'square':
                    # keep the linear term alongside the squared one
                    rhs += [Term([LookupFactor(rhs_var)])]
                format_string = transformation_to_format_string[transformation]
                rhs += [Term([EvalFactor(format_string.format(rhs_var))])]
            else:
                rhs += [Term([LookupFactor(rhs_var)])]
    if interactions:
        rhs += [Term([LookupFactor(term) for term in interaction])
                for interaction in rhs_interactions]

    model = ModelDesc(lhs, rhs)
    return model
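# Hedged usage sketch (not from the original source): plain string variables,
# no transformations, one interaction. All column names are illustrative.
model = create_patsy_model('price', ['sqft', 'age'],
                           interactions=[['sqft', 'age']])
print(model.describe())  # price ~ sqft + age + sqft:age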
def from_r_object(cls, rsum, ci=None, debug=False):
    """
    Reconstruct a model from an rpy2 summary object, and optionally its
    confidence intervals.
    These can be easily saved in R with
        save(objname, file=file_name)
    and loaded in Python via rpy2 with
        r['load'](file_name)['objname']

    Parameters
    ----------
    rsum : R object
        R summary of a fitted model. Typically produced with
        "summary(fitted)" (in R).
    ci : R object
        Confidence intervals of the fitted model. Typically produced with
        "confint(fitted)" (in R).
    debug : bool, default False
        If True, print debug messages.
    """
    d_res = cls._r_as_dict(None, rsum)
    if 'terms' not in d_res:
        msg = ("Interpreting R objects inside Python is only supported "
               "for a few estimators. More will work using "
               "RModel.from_rdata() directly.")
        raise NotImplementedError(msg)
    formula = str(d_res['terms']).splitlines()[0]
    # We want to create a fake dataset, and we use patsy to get the list of
    # variables. We are actually creating columns for interactions and
    # functions too... but who cares; identifying them would be overkill at
    # the moment.
    fobj = ModelDesc.from_formula(formula)
    varnames = [t.name() for t in fobj.rhs_termlist + fobj.lhs_termlist][1:]
    # We need to pass some pd.DataFrame to from_formula() below - but it
    # doesn't seem to be actually used.
    data = pd.DataFrame(-1, index=[0], columns=[0])
    # Creating the OLS object and only then hijacking it allows us to best
    # profit from statsmodels' machinery:
    mod = OLS.from_formula(formula, data)
    mod.__class__ = RModel
    # This is now an RModel:
    mod._initialize(debug=debug)
    attrs = mod._inspect_R(rsum, ci=ci)
    wrap = mod._package_attrs(attrs)
    return wrap
def get_factor_names(self, level=1):
    """
    Gets the factors in a model which correspond to a certain level:
    1 : pure factors
    2 : 2-factor interactions and quadratic terms
    3 : 3-factor interactions and cubic terms
    4 : etc
    """
    spec = ModelDesc.from_formula(self._model_spec)
    return [term.name() for term in spec.rhs_termlist
            if len(term.factors) == level]
def _prune(self, fit, p_max):
    """
    If the fit contains statistically insignificant parameters, remove them.
    Returns a pruned fit where all parameters have p-values of the
    t-statistic below p_max.

    Parameters
    ----------
    fit : fm.ols fit object
        Can contain insignificant parameters
    p_max : float
        Maximum allowed probability of the t-statistic

    Returns
    -------
    fit : fm.ols fit object
        Won't contain any insignificant parameters
    """

    def remove_from_model_desc(x, model_desc):
        """
        Return a model_desc without x
        """
        rhs_termlist = []
        for t in model_desc.rhs_termlist:
            if not t.factors:
                # intercept, add anyway
                rhs_termlist.append(t)
            elif not x == t.factors[0]._varname:
                # this is not the term with x
                rhs_termlist.append(t)

        md = ModelDesc(model_desc.lhs_termlist, rhs_termlist)
        return md

    corrected_model_desc = ModelDesc(fit.model.formula.lhs_termlist[:],
                                     fit.model.formula.rhs_termlist[:])
    pars_to_prune = fit.pvalues.where(
        fit.pvalues > p_max).dropna().index.tolist()
    try:
        pars_to_prune.remove('Intercept')
    except ValueError:
        pass
    while pars_to_prune:
        corrected_model_desc = remove_from_model_desc(
            pars_to_prune[0], corrected_model_desc)
        fit = fm.ols(corrected_model_desc, data=self.df).fit()
        pars_to_prune = fit.pvalues.where(
            fit.pvalues > p_max).dropna().index.tolist()
        try:
            pars_to_prune.remove('Intercept')
        except ValueError:
            pass
    return fit
def dict_to_model_desc(dictionary):
    """Build a patsy ModelDesc object from its dictionary representation."""
    lhs_termlist = [Term([LookupFactor(dictionary['lhs_termlist'][0])])]
    rhs_termlist = []
    for name in dictionary['rhs_termlist']:
        if name == '':
            # an empty string stands for the intercept term
            rhs_termlist.append(Term([]))
        else:
            rhs_termlist.append(Term([LookupFactor(name)]))
    return ModelDesc(lhs_termlist, rhs_termlist)
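# Hedged usage sketch (not from the original source): an empty string in
# 'rhs_termlist' stands for the intercept.
d = {'lhs_termlist': ['y'], 'rhs_termlist': ['', 'a', 'b']}
print(dict_to_model_desc(d).describe())  # y ~ a + b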
def _modeldesc_from_dict(self, d):
    """Build a patsy ModelDesc object from its dictionary representation."""
    lhs_termlist = [Term([LookupFactor(d['lhs_termlist'][0])])]
    rhs_termlist = []
    for name in d['rhs_termlist']:
        if name == '':
            # an empty string stands for the intercept term
            rhs_termlist.append(Term([]))
        else:
            rhs_termlist.append(Term([LookupFactor(name)]))
    md = ModelDesc(lhs_termlist, rhs_termlist)
    return md
def _parse_formula(formula):
    # head off patsy errors
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse the formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
def remove_from_model_desc(x, model_desc):
    """
    Return a model_desc without x
    """
    rhs_termlist = []
    for t in model_desc.rhs_termlist:
        if not t.factors:
            # intercept, add anyway
            rhs_termlist.append(t)
        elif not x == t.factors[0]._varname:
            # this is not the term with x
            rhs_termlist.append(t)

    md = ModelDesc(model_desc.lhs_termlist, rhs_termlist)
    return md
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: int = 1) -> None:
    # Validate that the sample metadata is a superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in the same order as the dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate the formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            column = metadata.get_column(i.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`).'
                    % (column.name,))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize the results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
def _parse_formula(formula):
    # head off patsy errors
    if '~' not in formula:
        raise ValueError('Formula not valid: missing tilde.\n'
                         'Enter a valid formula in format "y ~ model".')
    if ';' in formula or formula.strip()[0].isdigit():
        metric = formula.split('~')[0].strip()
    else:
        metric = None

    # use patsy to parse the formula
    model_desc = ModelDesc.from_formula(formula)
    group_columns = set()
    for t in model_desc.rhs_termlist:
        for i in t.factors:
            group_columns.add(i.name())
    if metric is None:
        metric = model_desc.lhs_termlist[0].name()
    return metric, group_columns
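# Hedged usage sketch (not from the original source), applying to both
# _parse_formula variants above; column names are illustrative.
metric, groups = _parse_formula("shannon ~ depth + site*season")
print(metric)          # 'shannon'
print(sorted(groups))  # ['depth', 'season', 'site']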
def build_model_desc(snps, no_interactions):
    """
    Creates the model description (formula)
    :param snps: The selected snp labels
    :param no_interactions: If True, interaction terms are excluded from
        the model
    :return: The model description
    """
    x_terms = []
    for i in range(len(snps)):
        # Main effects
        snp_i = EvalFactor(snps[i])
        x_terms.append(Term([snp_i]))
        if not no_interactions:
            for j in range(i + 1, len(snps)):
                # Interaction effects
                snp_j = EvalFactor(snps[j])
                x_terms.append(Term([snp_i, snp_j]))

    return ModelDesc([], x_terms)
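# Hedged usage sketch (not from the original source); note the RHS-only
# ModelDesc describes itself with a leading "~ 0 +" since it carries no
# intercept term.
desc = build_model_desc(['snp1', 'snp2', 'snp3'], no_interactions=False)
print(desc.describe())
# ~ 0 + snp1 + snp1:snp2 + snp1:snp3 + snp2 + snp2:snp3 + snp3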
def parse_formula(f_str):
    patsy_formula = ModelDesc.from_formula(f_str)
    tokenize = patsy_formula.lhs_termlist
    valid_tokenizers = list()
    for term in tokenize:
        for e in term.factors:
            code = e.code
            if code in _VALID_TOKENIZERS:
                valid_tokenizers.append(code)
    if len(valid_tokenizers) == 0:
        tokenize.insert(0, Term([EvalFactor(_DEFAULT_TOKENIZER)]))
    if len(valid_tokenizers) > 1:
        raise RuntimeError("Multiple tokenizers found in formula\n"
                           f"Specify one from {' '.join(_VALID_TOKENIZERS)}")
    preprocess = [t for t in patsy_formula.rhs_termlist if len(t.factors) > 0]
    return tokenize, preprocess
def adonis(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata, formula: str,
           permutations: int = 999, n_jobs: int = 1) -> None:
    # Validate that the sample metadata is a superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in the same order as the dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate the formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize the results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
def _do_analysis_cross_validation(self):
    """
    Find the best model (fit) based on cross-validation (leave-one-out)
    """
    # leave-one-out is only tractable for small datasets
    assert len(self.data_frame) < 15, "Leave-one-out cross-validation is only used for datasets with fewer than 15 datapoints"

    # initialization: first model is the mean, but compute the
    # cross-validation error correctly
    errors = []
    response_term = [Term([LookupFactor(self.dependent_var)])]
    model_desc = ModelDesc(response_term, [Term([])])
    for i in self.data_frame.index:
        # make new_fit, compute cross-validation and store the error
        data_frame_ = self.data_frame.drop(i, axis=0)
        fit = fm.ols(model_desc, data=data_frame_).fit()
        cross_prediction = self._predict(
            fit=fit, data_frame=self.data_frame.loc[[i], :])
        errors.append(cross_prediction['predicted'] -
                      cross_prediction[self.dependent_var])

    self._list_of_fits = [fm.ols(model_desc, data=self.data_frame).fit()]
    self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]

    # try to improve the model until no improvements can be found
    all_model_terms_dict = {x: Term([LookupFactor(x)])
                            for x in self.list_of_x}
    while all_model_terms_dict:
        # try each x in all_exog and overwrite if we find a better one;
        # only at the end of the iteration (and not earlier) do we save
        # the best model of the iteration
        better_model_found = False
        best = {
            "fit": self._list_of_fits[-1],
            "cverror": self.list_of_cverrors[-1]
        }
        for value, term in all_model_terms_dict.items():
            model_desc = ModelDesc(
                response_term,
                self._list_of_fits[-1].model.formula.rhs_termlist + [term])
            # cross-validation, currently only implemented for monthly data:
            # compute the mean error for a given formula based on
            # leave-one-out
            errors = []
            for i in self.data_frame.index:
                # make new_fit, compute cross-validation and store the error
                data_frame_ = self.data_frame.drop(i, axis=0)
                fit = fm.ols(model_desc, data=data_frame_).fit()
                cross_prediction = self._predict(
                    fit=fit, data_frame=self.data_frame.loc[[i], :])
                errors.append(cross_prediction['predicted'] -
                              cross_prediction[self.dependent_var])
            cverror = np.mean(np.abs(np.array(errors)))

            # compare the model with the current best fit
            if cverror < best['cverror']:
                # better model, keep it;
                # first, re-identify it using all the datapoints
                best['fit'] = fm.ols(model_desc, data=self.data_frame).fit()
                best['cverror'] = cverror
                better_model_found = True
                best_val = value

        if better_model_found:
            self._list_of_fits.append(best['fit'])
            self.list_of_cverrors.append(best['cverror'])
        else:
            # if we did not find a better model, exit
            break

        # next iteration with the found exog removed
        all_model_terms_dict.pop(best_val)

    self._fit = self._list_of_fits[-1]
execfile("code/03-DataPrep.py") from patsy import dmatrices, ModelDesc, Term, LookupFactor from sklearn.metrics import roc_curve from sklearn.model_selection import cross_val_score from sklearn import linear_model from sklearn.feature_selection import SelectFromModel import numpy as np ''' model 1 - logistic regression with L1 regularization ''' formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in orgfeatures]) y, x = dmatrices(formula, rawdf, return_type="dataframe") y = y.values.flatten() logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', tol=0.01) logreg.fit(x, y) scores = cross_val_score(logreg, x, y, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()}) nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist() print(len(nflist)) # feature selection using best model from cross validation and get the best features fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear') fslogreg.fit(x, y)
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the rerp_spec specifies a RHS-only formula, but really
        # we have an implicit LHS (determined by the event_query). This makes
        # things complicated when it comes to e.g. keeping track of which
        # items survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        design_row_idxes = np.empty(len(event_set))
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in xrange(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start,
                           design_row, design_offset,
                           expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(DataSpan((recspan_intern, epoch_start),
                                  (recspan_intern, epoch_stop),
                                  epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses"
                             % (rerp_spec.name,))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]

    return rerp_infos, spans, design_offset, expanded_design_offset
def __str__(self):
    spec = ModelDesc.from_formula(self._model_spec)
    return spec.describe()
    # (tail of the enclosing function; its def line is not part of this excerpt)
    chist = chist[Yhist_gpvars + ['HISTBIN', wgtvar]].groupby(
        Yhist_gpvars + ['HISTBIN'], as_index=False).aggregate(np.sum)
    return chist


if __name__ == "__main__":
    ## Build the regression formula
    catvars = list(cfg.flevels.keys())
    with open("fpaths.json") as fpj:
        FPATHS = json.load(fpj)
    numvar_evals = ["I(YEAR - 2000)", "INCTOT99"]
    catvar_evals = [
        "C(" + cv + ", Treatment, levels=cfg.flevels['" + cv + "'])"
        for cv in catvars
    ]
    desc = ModelDesc([], [Term([EvalFactor(v)]) for v in numvar_evals])
    desc.rhs_termlist += [Term([EvalFactor(v)]) for v in catvar_evals]

    # Interactions
    interact_order = 2
    catvar_interact = ['SEX', 'AGECAT', 'RACE']
    print("Including all order-" + str(interact_order) +
          " interactions of the following variables:\n\t" +
          ", ".join(catvar_interact + numvar_evals))
    interact_evals = numvar_evals + [
        catvar_evals[i] for i in [catvars.index(v) for v in catvar_interact]
    ]
    desc.rhs_termlist += [
        Term([EvalFactor(v) for v in list(comb)])
        for comb in combinations(interact_evals, interact_order)
    ]

    # 'implied decimals'
def scatterfit(x, y, method='pearson', adjustVars=[], labelLookup={},
               plotLine=True, annotateFit=True, annotatePoints=False,
               returnModel=False, lc='gray', **kwargs):
    """Scatter plot of x vs. y with a fitted line overlaid.

    Expects x and y as pd.Series but will accept arrays.

    Prints covariate unadjusted AND adjusted rho/pvalues on the figure.
    Plots covariate unadjusted data.

    Parameters
    ----------
    x, y : ndarrays or pd.Series
    method : string
        'pearson'
    adjustVars : list
    labelLookup : dict
    plotLine : bool
    annotateFit : bool
    annotatePoints : bool
    returnModel : bool
    kwargs : additional keyword arguments
        Passed to the plot function for the data points.

    Returns
    -------
    model : statsmodels GLM object
        Optionally the fitted model, depending on returnModel."""

    k = kwargs.keys()
    if 'mec' not in k:
        kwargs.update({'mec': 'k'})
    if 'mfc' not in k:
        kwargs.update({'mfc': 'k'})
    if 'ms' not in k:
        kwargs.update({'ms': 5})

    """Try to force X and Y into pandas.Series objects"""
    if not isinstance(x, pd.core.series.Series):
        x = pd.Series(x, name='X')
    if not isinstance(y, pd.core.series.Series):
        y = pd.Series(y, name='Y')

    xlab = x.name
    ylab = y.name
    if xlab == ylab:
        ylab = 'y_' + ylab
        xlab = 'x_' + xlab
        x.name = xlab
        y.name = ylab

    tmpDf = pd.concat((x, y), axis=1, join='inner')
    for av in adjustVars:
        tmpDf = pd.concat((tmpDf, pd.DataFrame(av)), axis=1)

    """Drop any row with a nan in either column"""
    tmpDf = tmpDf.dropna(axis=0, how='any')

    plt.gca().set_xmargin(0.2)
    plt.gca().set_ymargin(0.2)

    unrho, unp = partialcorr(tmpDf[xlab], tmpDf[ylab], method=method)

    """Print unadjusted AND adjusted rho/pvalues
    Plot unadjusted data with fit though..."""
    if method == 'spearman' and plotLine:
        #unrho,unp = stats.spearmanr(tmpDf[xlab], tmpDf[ylab])
        if unrho > 0:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab]), '-', color=lc)
        else:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab], reverse=True),
                     '-', color=lc)
    elif method == 'pearson' and plotLine:
        #unrho,unp = stats.pearsonr(tmpDf[xlab], tmpDf[ylab])
        formula_like = ModelDesc([Term([LookupFactor(ylab)])],
                                 [Term([]), Term([LookupFactor(xlab)])])
        Y, X = dmatrices(formula_like, data=tmpDf, return_type='dataframe')
        model = sm.GLM(Y, X, family=sm.families.Gaussian())
        results = model.fit()
        mnmxi = np.array([tmpDf[xlab].idxmin(), tmpDf[xlab].idxmax()])
        plt.plot(tmpDf[xlab][mnmxi], results.fittedvalues[mnmxi], '-',
                 color=lc)

    plt.plot(tmpDf[xlab], tmpDf[ylab], 'o', **kwargs)

    if annotatePoints:
        annotationParams = dict(xytext=(0, 5), textcoords='offset points',
                                size='medium')
        for x, y, lab in zip(tmpDf[xlab], tmpDf[ylab], tmpDf.index):
            plt.annotate(lab, xy=(x, y), **annotationParams)

    if annotateFit:
        if unp > 0.001:
            s = 'p = %1.3f\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        else:
            s = 'p = %1.1e\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        textTL(plt.gca(), s, color='black')

        if len(adjustVars) > 0:
            rho, p = partialcorr(tmpDf[xlab], tmpDf[ylab],
                                 adjust=adjustVars, method=method)
            if p > 0.001:
                s = 'adj-p = %1.3f\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])
            else:
                s = 'adj-p = %1.1e\nadj-rho = %1.2f\nn = %d' % (p, rho,
                                                                tmpDf.shape[0])
            textTR(plt.gca(), s, color='red')

    plt.xlabel(labelLookup.get(xlab, xlab))
    plt.ylabel(labelLookup.get(ylab, ylab))

    if returnModel:
        return model
def get_response_name(self):
    spec = ModelDesc.from_formula(self._model_spec)
    return spec.lhs_termlist[0].name()
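# Hedged sketch (not from the original source): the accessors above
# (get_factor_names, __str__, get_response_name) all re-parse the stored
# model spec; the same idioms work standalone.
from patsy import ModelDesc

spec = ModelDesc.from_formula("y ~ a + b + a:b")
print(spec.lhs_termlist[0].name())  # y
print([t.name() for t in spec.rhs_termlist if len(t.factors) == 1])  # ['a', 'b']
print([t.name() for t in spec.rhs_termlist if len(t.factors) == 2])  # ['a:b']
print(spec.describe())  # y ~ a + b + a:b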
def _group_model(spreadsheet=None, contrastdicts=None, variabledicts=None,
                 subjects=None):
    rawdataframe = loadspreadsheet(spreadsheet)

    id_column = None
    for variabledict in variabledicts:
        if variabledict["type"] == "id":
            id_column = variabledict["name"]
            break
    assert id_column is not None, "Missing id column, cannot specify model"

    rawdataframe[id_column] = pd.Series(rawdataframe[id_column], dtype=str)
    if all(str(id).startswith("sub-") for id in rawdataframe[id_column]):
        # for bids
        rawdataframe[id_column] = [
            str(id).replace("sub-", "") for id in rawdataframe[id_column]
        ]
    rawdataframe = rawdataframe.set_index(id_column)

    continuous_columns = []
    categorical_columns = []
    columns_in_order = []
    for variabledict in variabledicts:
        if variabledict["type"] == "continuous":
            continuous_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])
        elif variabledict["type"] == "categorical":
            categorical_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])

    # separate
    continuous = rawdataframe[continuous_columns]
    categorical = rawdataframe[categorical_columns]

    # only keep subjects that are in this analysis
    # also sets the order
    continuous = continuous.loc[subjects, :]
    categorical = categorical.loc[subjects, :]

    # Demean continuous variables for flameo
    continuous -= continuous.mean()

    # replace np.nan by 0 for the demeaned_continuous file and regression
    # models
    continuous = continuous.replace({np.nan: 0})

    # change type first to string then to category
    categorical = categorical.astype(str)
    categorical = categorical.astype("category")

    # merge
    dataframe = categorical.join(continuous, how="outer").loc[subjects, :]

    # maintain order
    dataframe = dataframe[columns_in_order]

    # remove zero-variance columns
    columns_var_gt_0 = dataframe.apply(pd.Series.nunique) > 1
    dataframe = dataframe.loc[:, columns_var_gt_0]

    # don't need to specify the lhs
    lhs = []

    # generate the rhs
    rhs = [Term([])]  # force an intercept
    for contrastdict in contrastdicts:
        if contrastdict["type"] == "infer":
            # for every term in the model, a contrast of type infer needs
            # to be specified
            rhs.append(
                Term([LookupFactor(name)
                      for name in contrastdict["variable"]]))

    # specify the patsy design matrix
    modelDesc = ModelDesc(lhs, rhs)
    dmat = dmatrix(modelDesc, dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # prepare lsmeans
    uniqueValuesForCategorical = [
        (0.0,) if pd.api.types.is_numeric_dtype(dataframe[f].dtype)
        else dataframe[f].unique()
        for f in dataframe.columns
    ]
    grid = pd.DataFrame(list(product(*uniqueValuesForCategorical)),
                        columns=dataframe.columns)
    refDmat = dmatrix(dmat.design_info, grid, return_type="dataframe")

    # data frame to store contrasts
    contrastMats = []

    for field, columnslice in dmat.design_info.term_name_slices.items():
        constraint = {
            column: 0
            for column in dmat.design_info.column_names[columnslice]
        }
        contrast = dmat.design_info.linear_constraint(constraint)
        assert np.all(contrast.variable_names == dmat.columns)
        contrastMat = pd.DataFrame(contrast.coefs, columns=dmat.columns)
        contrastMats.append((field, contrastMat))

    for contrastdict in contrastdicts:
        if contrastdict["type"] == "t":
            (variable,) = contrastdict["variable"]
            variableLevels = dataframe[variable].unique()
            # Generate the lsmeans matrix where there is one row for each
            # factor level. Each row is a contrast vector.
            # This contrast vector corresponds to the mean of the dependent
            # variable at the factor level.
            # For example, we would have one row that calculates the mean
            # for patients, and one for controls.
            lsmeans = pd.DataFrame(index=variableLevels,
                                   columns=dmat.columns)
            for level in variableLevels:
                lsmeans.loc[level, :] = refDmat.loc[
                    grid[variable] == level, :].mean()
            valueDict = contrastdict["values"]
            names = [name for name in valueDict.keys()
                     if name in variableLevels]
            values = [valueDict[name] for name in names]
            # If we wish to test the mean of each group against zero,
            # we can simply use these contrasts and be done.
            # To test a linear hypothesis such as patient-control=0,
            # which is expressed here as {"patient":1, "control":-1},
            # we translate it to a contrast vector by taking the linear
            # combination of the lsmeans contrasts.
            contrastVector = lsmeans.loc[names, :].mul(values, axis=0).sum()
            contrastMat = pd.DataFrame([contrastVector],
                                       columns=dmat.columns)
            contrastMats.append((contrastdict["name"], contrastMat))

    npts, nevs = dmat.shape
    if nevs >= npts:
        logger.warning("Reverting to simple intercept-only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return (
            {"intercept": [1.0] * len(subjects)},
            [["mean", "T", ["intercept"], [1]]],
            ["mean"],
        )

    regressors = {d: dmat[d].tolist() for d in dmat.columns}

    contrasts = []
    contrast_names = []

    for contrastName, contrastMat in contrastMats:  # t contrasts
        if contrastMat.shape[0] == 1:
            contrastVec = contrastMat.squeeze()
            contrasts.append((contrastName, "T", list(contrastVec.keys()),
                              list(contrastVec)))
            contrast_names.append(contrastName)

    for contrastName, contrastMat in contrastMats:  # f contrasts
        if contrastMat.shape[0] > 1:
            tcontrasts = []  # an f contrast consists of multiple t contrasts
            for i, contrastVec in contrastMat.iterrows():
                tname = f"{contrastName}_{i:d}"
                tcontrasts.append((tname, "T", list(contrastVec.keys()),
                                   list(contrastVec)))
            contrasts.extend(tcontrasts)  # add the t contrasts to the model
            contrasts.append((contrastName, "F", tcontrasts))  # then the f contrast
            contrast_names.append(contrastName)  # we only care about the f contrast

    return regressors, contrasts, contrast_names
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from patsy import ModelDesc
from patsy import demo_data
from patsy import dmatrix, dmatrices

print(ModelDesc.from_formula("y ~ x").describe())
print(ModelDesc.from_formula("y ~ x + x + x").describe())
print(ModelDesc.from_formula("y ~ -1 + x").describe())
print(ModelDesc.from_formula("~ -1").describe())
print(ModelDesc.from_formula("y ~ a:b").describe())
print(ModelDesc.from_formula("y ~ a*b").describe())
print(ModelDesc.from_formula("y ~ (a + b + c + d) ** 2").describe())
print(ModelDesc.from_formula("y ~ (a + b)/(c + d)").describe())
print(ModelDesc.from_formula("np.log(x1 + x2) "
                             "+ (x + {6: x3, 8 + 1: x4}[3 * i])").describe())

# Sometimes it might be easier to read if you put the processed formula back
# into formula notation using ModelDesc.describe():
desc = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2")
print(desc.describe())

data = demo_data("a", "b", "x1", "x2")
mat = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", data)
print(mat.design_info.term_names)

data = demo_data("a", "b", "y")
mat1 = dmatrices("y ~ 0 + a:b", data)[1]
# Now to try this in `patsy`.
#
# Steps:
# 1. See how the model description is derived from the formula
# 2. Build the design matrix that the formula specifies
# 3. Use the design matrix in order to create the model in `scikit-learn`

# In[87]:

from patsy import ModelDesc, EvalEnvironment

# In[88]:

env = EvalEnvironment.capture()
predicted_lat_age_mtx = ModelDesc.from_formula('Predicted ~ Age_Calc * Case',
                                               env)

# In[89]:

predicted_lat_age_mtx

# In[90]:

from patsy import dmatrix

# In[91]:

design_mtx = dmatrix('Case * Age_Calc', subj_case_data)
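# Hedged sketch of step 3 (not in the original notebook): feeding the patsy
# design matrix into scikit-learn. Assumes subj_case_data carries the
# 'Predicted' response named in the formula above.
import numpy as np
from sklearn.linear_model import LinearRegression

lr = LinearRegression(fit_intercept=False)  # design already has an Intercept column
lr.fit(np.asarray(design_mtx), subj_case_data['Predicted'])
print(dict(zip(design_mtx.design_info.column_names, lr.coef_)))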
def lm(
    model_spec: str,
    data: pd.DataFrame,
    name: Optional[str] = None,
    alias_threshold: Optional[float] = 0.995,
) -> Model:
    """
    Create a linear model.
    """

    def find_aliases(model, model_desc, threshold_correlation=0.995):
        """
        Finds columns which are exactly correlated, or up to at least a
        level of `threshold_correlation`. Returns a dictionary of aliasing
        and a list of columns to keep.

        The columns to keep will be in the order checked. Perhaps this can
        be improved. For example if AB = CD, then return AB to keep.
        For example if A = BCD, then return A, and not the BCD column,
        to keep.
        """
        has_variation = model.exog.std(axis=0) > np.sqrt(np.finfo(float).eps)

        # np.dot(model.exog.T, model.exog) / model.exog.shape[0]
        # Drop columns which do not have any variation
        corrcoef = np.corrcoef(model.exog[:, has_variation].T)  # , ddof=0)

        # Snippet of code here is from the NumPy "corrcoef" function. Adapted.
        c = np.cov(model.exog.T, None, rowvar=True)
        dot_product = model.exog.T @ model.exog
        try:
            d = np.diag(c)
        except ValueError:
            # scalar covariance
            # nan if incorrect value (nan, inf, 0), 1 otherwise
            return c / c
        stddev = np.sqrt(d.real)

        aliasing = defaultdict(list)
        terms = model_desc.rhs_termlist
        drop_columns = []
        counter = -1
        corrcoef = c.copy()
        for idx, check in enumerate(has_variation):
            if check:
                counter += 1
                for j, stddev_value in enumerate(stddev):
                    if stddev_value == 0:
                        pass
                    else:
                        corrcoef[idx, j] = c[idx, j] / stddev[idx] / stddev[j]
                # corrcoef = c / stddev[idx, None]
                # corrcoef = corrcoef / stddev[None, idx]
                candidates = [
                    i for i, val in enumerate(np.abs(corrcoef[idx, :]))
                    if (val > threshold_correlation)
                ]
                signs = [np.sign(j) for j in corrcoef[idx, :]]
            else:
                # Columns with no variation
                candidates = [
                    i for i, j in enumerate(has_variation)
                    if (j <= threshold_correlation)
                ]
                # Track the correlation signs
                signs = [np.sign(j) for j in dot_product[idx, :]]

            # Now drop out the candidates with the longest word lengths
            alias_len = [(len(terms[i].factors), i) for i in candidates]
            alias_len.sort(reverse=True)
            for entry in alias_len[0:-1]:
                drop_columns.append(entry[1])

            for col in candidates:
                if col == idx:
                    # It is of course perfectly correlated with itself
                    pass
                else:
                    aliases = [t.name() for t in terms[col].factors]
                    if len(aliases) == 0:
                        aliases = ["Intercept"]

                    key = tuple([t.name() for t in terms[idx].factors])
                    if len(key) == 0:
                        key = ("Intercept",)

                    if signs[col] > 0:
                        aliases.insert(0, "+")
                    if signs[col] < 0:
                        aliases.insert(0, "-")
                    aliasing[key].append(aliases)

        # Sort the aliases by length:
        for key, val in aliasing.items():
            alias_len = [(len(i), i) if i[1] != "Intercept" else (1e5, i)
                         for i in val]
            alias_len.sort()
            aliasing[key] = [i[1] for i in alias_len]

        return aliasing, list(set(drop_columns))

    pre_model = smf.ols(model_spec, data=data)
    model_description = ModelDesc.from_formula(model_spec)
    aliasing, drop_columns = find_aliases(
        pre_model, model_description, threshold_correlation=alias_threshold)
    drop_column_names = [pre_model.data.xnames[i] for i in drop_columns]
    post_model = smf.ols(model_spec, data=data, drop_cols=drop_column_names)
    name = name or data.pi_title
    out = Model(
        OLS_instance=post_model.fit(),
        model_spec=model_spec,
        aliasing=aliasing,
        name=name,
    )
    out.data = data
    return out