def bootstrap(factor_names, model, run_count):
    """Create a minimal starting design that is non-singular.

    Parameters
    ----------
    factor_names : sequence of str
        Column names for the generated design.
    model : str
        Patsy formula describing the model.
    run_count : int
        Requested number of runs. ``0`` means "use the number of
        right-hand-side terms in the model".

    Returns
    -------
    tuple
        ``(design, X)`` where ``design`` is the DataFrame of start points
        and ``X`` is the patsy design matrix built from it.

    Raises
    ------
    ValueError
        If the model has more right-hand-side terms than requested runs.
    """
    term_count = len(ModelDesc.from_formula(model).rhs_termlist)
    if run_count == 0:
        run_count = term_count
    if term_count > run_count:
        message = ("Can't build a design of size {} "
                   "for a model of rank {}. "
                   "Model: '{}'").format(run_count, term_count, model)
        raise ValueError(message)

    n_factors = len(factor_names)
    origin = np.zeros(n_factors)

    # Box constraints: for every factor one row with -1 (even rows) and one
    # row with +1 (odd rows), each paired with a bound of 1.
    constraint_matrix = np.zeros((2 * n_factors, n_factors))
    factor_idx = np.arange(n_factors)
    constraint_matrix[2 * factor_idx, factor_idx] = -1
    constraint_matrix[2 * factor_idx + 1, factor_idx] = 1
    bounds = np.ones(2 * n_factors)

    start_points = hit_and_run(origin, constraint_matrix, bounds, run_count)
    design = pd.DataFrame(start_points, columns=factor_names)
    return (design, dmatrix(model, design))
def _package_attrs(self, attrs):
    """Build a results wrapper carrying the attributes in ``attrs``.

    Every attribute is set both on the inner results object and on the
    wrapper, because consumers read from either one (stargazer reads the
    wrapper, statsmodels' ``summary_col`` reads the wrapped result).

    Parameters
    ----------
    attrs : mapping
        Attribute name -> value to install.

    Returns
    -------
    RegressionResultsWrapper
        Wrapper around the populated results object.
    """
    results = RRegressionResults()

    # patsy gives us the name of the target (left-hand side) variable.
    results.target = ModelDesc.from_formula(self.formula).lhs_termlist[0].name()
    results.model = self

    # The class is swapped at the end instead of subclassing, because
    # stargazer checks "type()" rather than "isinstance()".
    wrap = RegressionResultsWrapper(results)

    for attr in attrs:
        value = attrs[attr]
        # Everything but "params" is @cache_readonly: it has to be deleted
        # before it can be assigned again.
        if attr != 'params' and hasattr(results, attr):
            delattr(results, attr)
        setattr(results, attr, value)
        setattr(wrap, attr, value)
        self._debug("Set {} to {}".format(attr, value))

    results.__class__ = RegressionResults
    return wrap
def add_predictors(base_formula, extra_predictors):
    """Parse ``base_formula`` and append one RHS term per extra predictor.

    Parameters
    ----------
    base_formula : str
        Patsy formula to start from.
    extra_predictors : iterable of str
        Names to add to the right-hand side, one single-factor term each.

    Returns
    -------
    ModelDesc
        The parsed description with the extra terms appended.
    """
    desc = ModelDesc.from_formula(base_formula)
    # LookupFactor is used so that a name in extra_predictors such as
    # "weight.in.kg", "sys.exit()" or "LittleBobbyTables()" still works
    # correctly.
    extra_terms = [Term([LookupFactor(name)]) for name in extra_predictors]
    desc.rhs_termlist.extend(extra_terms)
    return desc
def _parse_formula(formula_str, include_intercept=False):
    """Parse a Patsy formula and attach task information.

    Parameters
    ----------
    formula_str : str
        Formula handed to ``ModelDesc.from_formula``.
    include_intercept : bool, default False
        When false, right-hand-side terms whose factor count equals
        ``INTERCEPT`` are discarded (patsy adds an intercept by default).

    Returns
    -------
    tuple
        ``(model_desc, task, num_classes)``; ``task`` and ``num_classes``
        are currently fixed to ``'classify'`` and ``2``.
    """
    parsed = ModelDesc.from_formula(formula_str)

    if not include_intercept:
        kept_terms = []
        for term in parsed.rhs_termlist:
            if len(term.factors) != INTERCEPT:
                kept_terms.append(term)
        parsed.rhs_termlist = kept_terms

    # TODO: derive the task from the left-hand side instead of hard-coding
    # it. The disabled draft treated LHS terms written as "C(...)" as
    # categorical ('classify', otherwise 'regress'), raised ValueError
    # "Mixed targets detected in <formula>" when only some targets were
    # categorical, and used the number of LHS terms as the class count
    # for regression.
    task = 'classify'
    num_classes = 2
    return parsed, task, num_classes
def _drop_intercept(formula, add_intercept): """Drop the intercept from formula if not add_intercept""" if not add_intercept: if not isinstance(formula, ModelDesc): formula = ModelDesc.from_formula(formula) if INTERCEPT in formula.rhs_termlist: formula.rhs_termlist.remove(INTERCEPT) return formula return formula
def get_factor_names(self, level=1):
    """Return the names of the model terms made of ``level`` factors.

    Levels:
        1 : pure factors
        2 : 2-factor interactions and quadratic terms
        3 : 3-factor interactions and cubic terms
        4 : etc
    """
    model_desc = ModelDesc.from_formula(self._model_spec)
    names = []
    for term in model_desc.rhs_termlist:
        if len(term.factors) == level:
            names.append(term.name())
    return names
def from_r_object(cls, rsum, ci=None, debug=False):
    """
    Reconstruct a model from an rpy2 summary object, and optionally its
    confidence intervals.

    Such objects can be saved in R with
    save(objname, file=file_name)
    and loaded in Python via rpy2 with
    r['load'](file_name)['objname']

    Parameters
    ----------
    rsum : R object
        R summary of a fitted model.
        Typically produced with "summary(fitted)" (in R).
    ci : R object
        Confidence intervals of the fitted model.
        Typically produced with "confint(fitted)" (in R).
    debug : bool, default False
        If True, print debug messages.

    Raises
    ------
    NotImplementedError
        If the summary, converted to a dict, has no 'terms' entry.
    """
    summary_items = cls._r_as_dict(None, rsum)
    if 'terms' not in summary_items:
        raise NotImplementedError(
            "Interpreting r objects inside Python is only supported "
            "for few estimators. More will work using "
            "RModel.from_rdata() directly.")

    # The formula is the first line of the textual form of 'terms'.
    formula = str(summary_items['terms']).splitlines()[0]

    # patsy lists the variables for a fake dataset. Interactions and
    # functions end up in the list too; telling them apart would be
    # overkill for now.
    # NOTE: "varnames" is not used further down in this method.
    parsed = ModelDesc.from_formula(formula)
    all_terms = parsed.rhs_termlist + parsed.lhs_termlist
    varnames = [term.name() for term in all_terms][1:]

    # from_formula() below needs some pd.DataFrame, although it does not
    # seem to actually be used.
    placeholder = pd.DataFrame(-1, index=[0], columns=[0])

    # Build a genuine OLS object first and hijack it afterwards, to profit
    # from statsmodels' machinery as much as possible.
    model = OLS.from_formula(formula, placeholder)
    model.__class__ = RModel

    # From here on "model" is an RModel.
    model._initialize(debug=debug)
    attrs = model._inspect_R(rsum, ci=ci)
    return model._package_attrs(attrs)
def _parse_formula(formula): # head off patsy errors if ';' in formula or formula.strip()[0].isdigit(): metric = formula.split('~')[0].strip() else: metric = None # use patsy to parse formula model_desc = ModelDesc.from_formula(formula) group_columns = set() for t in model_desc.rhs_termlist: for i in t.factors: group_columns.add(i.name()) if metric is None: metric = model_desc.lhs_termlist[0].name() return metric, group_columns
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run ``run_adonis.R`` on a distance matrix and render the results.

    The metadata is reindexed to the distance-matrix id order, every
    factor named on the right-hand side of ``formula`` is looked up in the
    metadata and checked for missing values, the R script is invoked on
    temporary copies of the inputs, and the resulting table is rendered
    into ``output_dir`` as HTML.

    Raises
    ------
    ValueError
        If a metadata column used by the formula has missing values.
    """
    # Validate sample metadata is superset et cetera
    sample_ids = distance_matrix.ids
    _validate_metadata_is_superset(set(metadata.ids), set(sample_ids))

    # Keep only the distance-matrix ids, in the distance-matrix order.
    ordered_md = metadata.to_dataframe().reindex(sample_ids)
    ordered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(ordered_md)

    # Validate formula: every factor must be a column without gaps.
    model_desc = ModelDesc.from_formula(formula)
    factor_names = (factor.name()
                    for term in model_desc.rhs_termlist
                    for factor in term.factors)
    for factor_name in factor_names:
        column = metadata.get_column(factor_name)
        if column.has_missing_values():
            raise ValueError(
                'adonis requires metadata columns with no '
                'NaN values (missing values in column `%s`.)' %
                (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as scratch_dir:
        dm_fp = os.path.join(scratch_dir, 'dm.tsv')
        md_fp = os.path.join(scratch_dir, 'md.tsv')
        distance_matrix.write(dm_fp)
        metadata.save(md_fp)
        _run_command([
            'run_adonis.R', dm_fp, md_fp, formula,
            str(permutations), str(n_jobs), results_fp,
        ])

    # Visualize results
    results_html = q2templates.df_to_html(pd.read_csv(results_fp, sep='\t'))
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results_html})
def _parse_formula(formula): # head off patsy errors if '~' not in formula: raise ValueError('Formula not valid: missing tilde.\n' 'Enter a valid formula in format "y ~ model".') if ';' in formula or formula.strip()[0].isdigit(): metric = formula.split('~')[0].strip() else: metric = None # use patsy to parse formula model_desc = ModelDesc.from_formula(formula) group_columns = set() for t in model_desc.rhs_termlist: for i in t.factors: group_columns.add(i.name()) if metric is None: metric = model_desc.lhs_termlist[0].name() return metric, group_columns
def parse_formula(f_str):
    """Split a patsy formula into tokenizer terms and preprocessing terms.

    The left-hand side carries the tokenizer; when none of its factor
    codes is listed in ``_VALID_TOKENIZERS``, a term for
    ``_DEFAULT_TOKENIZER`` is inserted at the front. The right-hand side
    carries the preprocessing terms, with factor-less terms dropped.

    Returns
    -------
    tuple
        ``(tokenize_terms, preprocess_terms)``.

    Raises
    ------
    RuntimeError
        If more than one valid tokenizer appears on the left-hand side.
    """
    model_desc = ModelDesc.from_formula(f_str)
    lhs_terms = model_desc.lhs_termlist

    lhs_codes = (factor.code for term in lhs_terms for factor in term.factors)
    found = [code for code in lhs_codes if code in _VALID_TOKENIZERS]

    if not found:
        lhs_terms.insert(0, Term([EvalFactor(_DEFAULT_TOKENIZER)]))
    if len(found) > 1:
        raise RuntimeError("Multiple tokenizers found in formula\n"
                           f"Specify one from {' '.join(_VALID_TOKENIZERS)}")

    rhs_terms = [term for term in model_desc.rhs_termlist if term.factors]
    return lhs_terms, rhs_terms
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    """Run ``run_adonis.R`` on a distance matrix and render the results.

    Parameters
    ----------
    output_dir : str
        Directory receiving ``adonis.tsv`` and the rendered template.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples; its id order drives the metadata order.
    metadata : qiime2.Metadata
        Sample metadata; must cover every id of the distance matrix.
    formula : str
        Model formula; each factor on its right-hand side is looked up as
        a metadata column.
    permutations : int, default 999
        Passed to the R script as a string.
    n_jobs : int, default 1
        Passed to the R script as a string.

    Raises
    ------
    ValueError
        If a metadata column used by the formula has missing values.
    """
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))

    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula: get_column() rejects names that are not metadata
    # columns, and missing values are reported here, before the R script
    # is started.
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            column = metadata.get_column(i.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)' %
                    (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
# Now to try this in `patsy`. # # Steps: # 1. See how the model description is derived from the formula # 2. Build the design matrix that the formula specifies # 3. Use the design matrix in order to create the model in `scikit-learn` # In[87]: from patsy import ModelDesc, EvalEnvironment # In[88]: env = EvalEnvironment.capture() predicted_lat_age_mtx = ModelDesc.from_formula('Predicted ~ Age_Calc * Case', env) # In[89]: predicted_lat_age_mtx # In[90]: from patsy import dmatrix # In[91]: design_mtx = dmatrix('Case * Age_Calc', subj_case_data)
def mean_from_patsy_formula(formula, inputdict={}):
    """
    Create a mean function from a patsy formula

    This is the functional interface to creating a mean function from a
    patsy/R formula. The higher level function ``MeanFunction`` in the
    ``MeanFunction`` module is preferred over this one, but this potentially
    gives the user slightly more control as a patsy ``ModelDesc`` object can
    be passed directly, rather than giving a string.

    This method takes a string or a patsy ``ModelDesc`` object as an input
    and an optional dictionary that maps strings to integer indices in the
    input data. The formula is then parsed with patsy, and the individual
    terms resulting from that are converted to mean functions and composed
    using the provided operations.

    The string formulas can be specified in several ways. The formula LHS is
    implicitly always ``"y = "`` or ``"y ~ "``, though these can be
    explicitly provided as well (though it is ignored in the conversion).
    The RHS may contain a set of terms containing the add, multiply, power,
    and call operations much in the same way that the operations would be
    entered as regular python code. Parentheses are used to indicate
    precedence as well as the call operation, and square brackets indicate
    an indexing operation on the inputs. Inputs may be specified as either a
    string such as ``"x[0]"``, ``"inputs[0]"``, or a string that can be
    mapped to an integer index with the optional dictionary passed to the
    function. Any strings not representing operations or inputs as described
    above are interpreted as follows: if the string can be converted into a
    number, then it is interpreted as a ``ConstantMean`` fixed mean function
    object; otherwise it is assumed to represent a fitting coefficient. Note
    that this means many characters that do not represent operations within
    this mean function language but would not normally be considered as
    python variables will nonetheless be converted into fitting coefficients
    -- it is up to the user to get this right.

    Expressions that are repeated or redundant will not be simplified beyond
    the parsing done by patsy, so the user should take care that the
    provided expression is sensible as a mean function and will not cause
    problems when fitting.

    Examples: ::

        >>> from mogp_emulator.formula import mean_from_patsy_formula
        >>> mf1 = mean_from_patsy_formula("x[0]")
        >>> print(mf1)
        c + c*x[0]
        >>> mf2 = mean_from_patsy_formula("a*b", {"a": 0, "b": 1})
        >>> print(mf2)
        c + c*x[0] + c*x[1] + c*x[0]*x[1]

    :param formula: string representing the desired mean function formula
                    or a patsy ``ModelDesc`` object
    :type formula: str or ModelDesc
    :param inputdict: dictionary used to map variables to input indices.
                      Maps strings to integer indices (must be
                      non-negative). Optional, default is ``{}``. It is
                      only passed on to the term conversion, not modified
                      by this function.
    :type inputdict: dict
    :returns: New subclass of ``MeanBase`` implementing the given formula
    :rtype: subclass of MeanBase (exact type will depend on the formula
            that is provided)
    :raises TypeError: if ``formula`` is neither a string nor a
                       ``ModelDesc``
    """

    assert not no_patsy, "patsy must be installed to parse formulas using patsy"

    if isinstance(formula, str):
        model = ModelDesc.from_formula(formula)
    elif isinstance(formula, ModelDesc):
        model = formula
    else:
        # Without this branch ``model`` stays unbound and the loop below
        # fails with an unrelated-looking UnboundLocalError.
        raise TypeError("formula must be a string or a patsy ModelDesc, "
                        "got {}".format(type(formula).__name__))

    model_terms = [_term_to_mean(term, inputdict)
                   for term in model.rhs_termlist]

    # The first term seeds the result; the remaining ones are added to it.
    mf = model_terms.pop(0)

    assert isinstance(mf, MeanFunction.MeanBase)

    for term in model_terms:
        mf += term

    assert isinstance(mf, MeanFunction.MeanBase)

    return mf
#!/usr/bin/env python # -*- coding: utf-8 -*- import numpy as np from patsy import ModelDesc from patsy import demo_data from patsy import dmatrix, dmatrices print(ModelDesc.from_formula("y ~ x").describe()) print(ModelDesc.from_formula("y ~ x + x + x").describe()) print(ModelDesc.from_formula("y ~ -1 + x").describe()) print(ModelDesc.from_formula("~ -1").describe()) print(ModelDesc.from_formula("y ~ a:b").describe()) print(ModelDesc.from_formula("y ~ a*b").describe()) print(ModelDesc.from_formula("y ~ (a + b + c + d) ** 2").describe()) print(ModelDesc.from_formula("y ~ (a + b)/(c + d)").describe()) print( ModelDesc.from_formula("np.log(x1 + x2) " "+ (x + {6: x3, 8 + 1: x4}[3 * i])").describe()) #Sometimes it might be easier to read if you put the processed formula back into formula notation using ModelDesc.describe(): desc = ModelDesc.from_formula("y ~ (a + b + c + d) ** 2") print(desc.describe()) data = demo_data("a", "b", "x1", "x2") mat = dmatrix("x1:x2 + a:b + b + x1:a:b + a + x2:a:x1", data) print(mat.design_info.term_names) data = demo_data("a", "b", "y") mat1 = dmatrices("y ~ 0 + a:b", data)[1]
def __str__(self):
    """Return patsy's description (``describe()``) of the model spec."""
    return ModelDesc.from_formula(self._model_spec).describe()
def lm(
    model_spec: str,
    data: pd.DataFrame,
    name: Optional[str] = None,
    alias_threshold: Optional[float] = 0.995,
) -> Model:
    """
    Create a linear model.

    An OLS model is first built from `model_spec` and `data` only to detect
    aliased (highly correlated) columns; a second OLS model is then built
    with those columns dropped, fitted, and wrapped in a `Model`.

    Parameters
    ----------
    model_spec : str
        Formula passed to `smf.ols` and to `ModelDesc.from_formula`.
    data : pd.DataFrame
        Data for the model. It is also stored on the result as `.data`, and
        its `pi_title` attribute is the fallback when `name` is falsy.
    name : str, optional
        Name given to the returned model.
    alias_threshold : float, optional
        Absolute correlation above which two columns count as aliased.

    Returns
    -------
    Model
        Wrapper holding the fitted OLS results, the model spec, the
        aliasing dictionary and the name.
    """
    def find_aliases(model, model_desc, threshold_correlation=0.995):
        """
        Finds columns which are exactly correlated, or up to at least a level
        of `threshold_correlation`.

        Returns a dictionary of aliasing and a list of columns to keep.

        The columns to keep will be in the order checked. Perhaps this can be
        improved.
        For example if AB = CD, then return AB to keep.
        For example if A = BCD, then return A, and not the BCD column to keep.

        NOTE(review): the second returned value is built from `drop_columns`
        and the caller uses it as the set of column indexes to drop, which
        does not match "columns to keep" above -- confirm which is intended.
        """
        # True for each column of `model.exog` whose standard deviation is
        # above sqrt(machine epsilon).
        has_variation = model.exog.std(axis=0) > np.sqrt(np.finfo(float).eps)
        # np.dot(model.exog.T, model.exog)/model.exog.shape[0]
        # Drop columns which do not have any variation
        # NOTE: this value is overwritten by `c.copy()` further down before
        # it is ever read.
        corrcoef = np.corrcoef(model.exog[:, has_variation].T)  # , ddof=0)
        # Snippet of code here is from the NumPy "corrcoef" function. Adapted.
        c = np.cov(model.exog.T, None, rowvar=True)
        dot_product = model.exog.T @ model.exog
        try:
            d = np.diag(c)
        except ValueError:
            # scalar covariance
            # nan if incorrect value (nan, inf, 0), 1 otherwise
            # NOTE(review): this returns a single value, while the caller
            # unpacks two -- confirm this path is never reached.
            return c / c
        stddev = np.sqrt(d.real)
        aliasing = defaultdict(list)
        terms = model_desc.rhs_termlist
        drop_columns = []
        # NOTE: `counter` is incremented below but never read.
        counter = -1
        corrcoef = c.copy()
        for idx, check in enumerate(has_variation):
            if check:
                counter += 1
                # Turn row `idx` of the covariance into correlations, leaving
                # entries for zero-stddev columns as raw covariances.
                for j, stddev_value in enumerate(stddev):
                    if stddev_value == 0:
                        pass
                    else:
                        corrcoef[idx, j] = c[idx, j] / stddev[idx] / stddev[j]
                # corrcoef = c / stddev[idx, None]
                # corrcoef = corrcoef / stddev[None, idx]
                candidates = [
                    i for i, val in enumerate(np.abs(corrcoef[idx, :]))
                    if (val > threshold_correlation)
                ]
                signs = [np.sign(j) for j in corrcoef[idx, :]]
            else:
                # Columns with no variation
                candidates = [
                    i for i, j in enumerate(has_variation)
                    if (j <= threshold_correlation)
                ]
                # Track the correlation signs
                signs = [np.sign(j) for j in dot_product[idx, :]]
            # Now drop out the candidates with the longest word lengths
            # (every candidate except the last one after the descending sort
            # is marked for dropping).
            # NOTE(review): `terms` is indexed with column indexes, which
            # presumably assumes one design-matrix column per term -- confirm.
            alias_len = [(len(terms[i].factors), i) for i in candidates]
            alias_len.sort(reverse=True)
            for entry in alias_len[0:-1]:
                drop_columns.append(entry[1])
            for col in candidates:
                if col == idx:
                    # It is of course perfectly correlated with itself
                    pass
                else:
                    aliases = [t.name() for t in terms[col].factors]
                    if len(aliases) == 0:
                        aliases = ["Intercept"]
                    key = tuple([t.name() for t in terms[idx].factors])
                    if len(key) == 0:
                        key = ("Intercept", )
                    # Prefix the alias with its sign; nothing is inserted
                    # when the sign is exactly zero.
                    if signs[col] > 0:
                        aliases.insert(0, "+")
                    if signs[col] < 0:
                        aliases.insert(0, "-")
                    aliasing[key].append(aliases)
        # Sort the aliases in length:
        # (aliases whose second element is "Intercept" get a large sort key
        # and therefore go last)
        for key, val in aliasing.items():
            alias_len = [(len(i), i) if i[1] != "Intercept" else (1e5, i)
                         for i in val]
            alias_len.sort()
            aliasing[key] = [i[1] for i in alias_len]
        return aliasing, list(set(drop_columns))

    # First pass: unfitted model used only for alias detection.
    pre_model = smf.ols(model_spec, data=data)
    model_description = ModelDesc.from_formula(model_spec)
    aliasing, drop_columns = find_aliases(
        pre_model, model_description, threshold_correlation=alias_threshold)
    # Translate column indexes into the model's column names.
    drop_column_names = [pre_model.data.xnames[i] for i in drop_columns]
    # Second pass: same spec, with the aliased columns removed.
    post_model = smf.ols(model_spec, data=data, drop_cols=drop_column_names)
    name = name or data.pi_title
    out = Model(
        OLS_instance=post_model.fit(),
        model_spec=model_spec,
        aliasing=aliasing,
        name=name,
    )
    out.data = data
    return out
def get_response_name(self):
    """Return the name of the first left-hand-side term of the model spec."""
    response_terms = ModelDesc.from_formula(self._model_spec).lhs_termlist
    return response_terms[0].name()
def guess_and_check_eq(sims, info):
    """Fit a linear regression of per-row errors on derived features.

    For every row of `sims` the mean squared difference between the
    "noisy" entry and `std**2 / T - eps**2 / T` is stored in
    `sims['errors']`. A number of derived columns are then added to `sims`
    in place, a design matrix is built from a hard-coded patsy formula,
    and an ordinary `LinearRegression` is fitted. Coefficients, shapes, a
    grouping of terms by coefficient magnitude and the MSE are printed;
    nothing is returned.

    `info` is not used in the body.
    """
    errors, obs = [], "noisy"
    for index, row in sims.iterrows():
        T = row['T']
        gt = row['std']**2 / T - row['eps']**2 / T
        error = np.mean((row[obs] - gt)**2)
        errors.append(error)
    sims['errors'] = errors
    # Derived feature columns (written into the caller's DataFrame).
    sims['inv_T'] = 1 / sims['T']
    sims['inv_D'] = 1 / sims['D']
    sims['inv_T2'] = 1 / sims['T']**2
    sims['inv_D2'] = 1 / sims['D']**2
    # Despite the names, these two hold the reciprocal square roots.
    sims['sqrt_T'] = 1 / np.sqrt(sims['T'])
    sims['sqrt_D'] = 1 / np.sqrt(sims['D'])
    sims['std2'] = sims['std']**2
    sims['eps2'] = sims['eps']**2
    # Despite the "_div_T" suffix, the divisor here is (T * D)**2.
    sims['eps2_div_T'] = sims['eps2'] / (sims['T'] * sims['D'])**2
    sims['std2_div_T'] = sims['std2'] / (sims['T'] * sims['D'])**2
    sims['eps2_std2'] = sims['std2'] * sims['eps2'] / sims['T']**2
    # Operator precedence: only std2 is divided by T**2 here.
    sims['sum_eps2_std2_div_T'] = sims['eps2'] + sims['std2'] / sims['T']**2
    sims['exp_sum_eps2_std2_div_T'] = np.exp(
        (sims['eps2_div_T'] + sims['std2_div_T']))
    sims['sum_std2_prod_eps2_std2_div_T'] = sims['std2'] * (
        sims['eps2_div_T'] + sims['std2_div_T'])
    sims['div_D_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                          sims['std2_div_T']) / sims['D']
    sims['eps2_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                         sims['std2_div_T']) * sims['eps2']
    sims['std2_prod_eps2_std2_div_T'] = (sims['eps2_div_T'] +
                                         sims['std2_div_T']) * sims['std2']
    # sims['exp_eps2_std2_div_T'] = np.exp( ( sims['eps2'] + sims['std2'] ) / sims['T']**2)
    # sims = sims[sims['std'] > 0]
    # sims = sims[sims['T'] > 3]
    # sims = sims[sims['eps'] > 0]
    # model_desc = "errors ~ std2_div_T * eps2_div_T * sum_eps2_std2_div_T * T * D - D - T"
    model_desc = "errors ~ eps2_div_T * std2_div_T * inv_T2 * inv_D2 * std2_prod_eps2_std2_div_T * eps2_prod_eps2_std2_div_T"
    # model_desc = "errors ~ eps2_div_T * std2_div_T * inv_T * inv_D * exp_sum_eps2_std2_div_T"
    y, X = dmatrices(model_desc, sims)
    desc = ModelDesc.from_formula(model_desc)
    print(desc.describe())
    reg = linear_model.LinearRegression()
    #reg = linear_model.Ridge(alpha=1000)
    # reg = linear_model.Lasso(alpha=.1,max_iter=10000)
    reg.fit(X, y)
    print(reg.coef_)
    print(reg.intercept_)
    # `b` is only referenced by the commented-out check below.
    b = reg.intercept_
    A = reg.coef_
    print(X.shape, A.shape)
    preds = reg.predict(X)
    # preds_2 = X @ A.T + b
    # print(np.mean((preds - preds_2)**2))
    mse = np.mean((y - preds)**2)
    pp = pprint.PrettyPrinter(indent=4)
    # Group the pieces of the description by the integer part of
    # log10(|coefficient|).
    # NOTE(review): the coefficients come from the columns of X while the
    # labels come from splitting the description on " + "; confirm that the
    # two sequences line up one-to-one (e.g. if X has an intercept column).
    results = {}
    for v, k in zip(reg.coef_[0], str(desc.describe()).split(" + ")):
        name = str(int(np_log(abs(v)) / np_log(10)))
        if not (name in results.keys()):
            results[name] = []
        results[name].append(k)
    pp.pprint(results)
    # Restrict to rows with T == 10 and D == 100; the means computed from
    # that subset are only referenced by the commented-out print.
    fixed = {"T": 10, "D": 100}
    fsims = sims
    for name, value in fixed.items():
        fsims = fsims[fsims[name] == value]
    noisy = fsims['noisy'].mean()
    std = fsims['std'].mean()
    eps = fsims['eps'].mean()
    # print(noisy,std,eps)
    print("MSE ~= %2.3e" % mse)
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    """Build the epoch spans and per-analysis bookkeeping for all rERP specs.

    For each spec, the matching events are looked up, a design matrix is
    built from the spec's formula, and one `DataSpan` wrapping an `_Epoch`
    is created per event. Events dropped while building the design matrix
    still get an epoch, flagged with the "_MISSING_PREDICTOR" artifact.

    Returns a tuple `(rerp_infos, spans, design_offset,
    expanded_design_offset)`; the two offsets are the running totals after
    the last spec.

    Raises ValueError if an epoch would be empty, if a formula has a
    left-hand side, or if two specs share a name.
    """
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the user specifies a RHS-only formula, but really we
        # have an implicit LHS (determined by the event_query). This makes
        # things complicated when it comes to e.g. keeping track of which
        # items survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat all
        # this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        # NOTE(review): confirm that the installed patsy version still
        # accepts a second (environment) argument to from_formula.
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        # The fake LHS values are read back as the indexes of the events
        # that made it into the design matrix.
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        # NOTE(review): np.empty() is called without a dtype, so the entries
        # read from this array are not Python ints when they are used as a
        # row index below -- confirm the numpy version in use accepts that.
        design_row_idxes = np.empty(len(event_set))
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        # NOTE(review): `xrange` is a Python 2 builtin; on Python 3 it must
        # be provided elsewhere in this file.
        for i in xrange(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start,
                           design_row, design_offset, expanded_design_offset,
                           rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(
                DataSpan((recspan_intern, epoch_start),
                         (recspan_intern, epoch_stop), epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses" %
                             (rerp_spec.name, ))
        rerp_names.add(rerp_spec.name)
        # The two "epochs_with_*" counters start at zero here and are not
        # updated anywhere in this function.
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        # Advance the running offsets: one slot per design column, and one
        # slot per (design column, epoch sample) pair in the expanded design.
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]
    return rerp_infos, spans, design_offset, expanded_design_offset