def _preprocess(self, stims):
    ''' Prepares a stim's elements for the model.

    Reads text, onset, and duration from every element of the stim, runs
    the words through self._mask_words, tokenizes each resulting word, and
    repeats each word/onset/duration once per token produced for that word
    so that all returned sequences are aligned at the token level.

    If the stim's name is an empty string, it is set to the
    space-joined words.

    Args:
        stims: Object exposing an 'elements' iterable (whose items have
            'text', 'onset', and 'duration' attributes) and a 'name'
            attribute.

    Returns:
        A tuple (words, onsets, durations, tokens, encoded), where the
        first three are the np.repeat outputs, tokens is the flattened
        token list, and encoded is the result of self.tokenizer.encode.
    '''
    triples = [(el.text, el.onset, el.duration) for el in stims.elements]
    words, onsets, durations = (list(column) for column in zip(*triples))

    tokens_per_word = [
        self.tokenizer.tokenize(word) for word in self._mask_words(words)
    ]
    counts = [len(pieces) for pieces in tokens_per_word]

    # Only an empty name is replaced; any other name is kept as is.
    stims.name = stims.name if stims.name != '' else ' '.join(words)

    # Expand word-level values so there is one entry per token.
    words, onsets, durations = (
        np.repeat(seq, counts) for seq in (words, onsets, durations)
    )

    tokens = list(flatten(tokens_per_word))
    encoded = self.tokenizer.encode(tokens, return_tensors=self.framework)
    return words, onsets, durations, tokens, encoded
def run(self, stim, merge=True, **merge_kwargs):
    ''' Runs every root node of the graph on the given input.

    Args:
        stim (str, Stim, list): One or more valid inputs to any
            Transformer's 'transform' call.
        merge (bool): If True, the collected results are passed through
            merge_results() and its return value is returned. If False,
            the flat list of collected results is returned instead.
        merge_kwargs: Optional keyword arguments forwarded to the
            merge_results() call. Ignored when merge is False.

    Returns:
        The output of merge_results() when merge is True, otherwise the
        flattened list of results.
    '''
    # Run all roots first, then splice their outputs together.
    node_outputs = [self.run_node(root, stim) for root in self.roots]
    collected = list(chain.from_iterable(node_outputs))
    collected = list(flatten(collected))

    self._results = collected  # For use in plotting

    if not merge:
        return collected
    return merge_results(collected, **merge_kwargs)
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True,
                  extractor_params=False, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are requested and
            then dropped again if every onset value is null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether
            format is 'long' or 'wide'. Valid values include:

            - 'prepend' or True: In both 'long' and 'wide' formats, feature
              names will be prepended with the Extractor name (e.g.,
              "FaceExtractor#face_likelihood").
            - 'drop' or False: In both 'long' and 'wide' formats, extractor
              names will be omitted entirely from the result. Note that this
              can create feature name conflicts when merging results from
              multiple Extractors, so is generally discouraged.
            - 'column': In 'long' format, extractor name will be included as
              a separate column. Not supported for 'wide' format: the pivot
              discards the extractor column, so the output matches 'drop'.
            - 'multi': In 'wide' format, a MultiIndex will be used for the
              columns, with the first level of the index containing the
              Extractor name and the second level containing the feature
              name. This value is invalid if format='long' (and will raise
              an error).
        object_id (bool, str): If True (the default), attempts to
            intelligently add an 'object_id' column that differentiates
            between multiple objects in the results that may share
            onsets/orders/durations (and would otherwise be impossible to
            distinguish). This frequently occurs for ImageExtractors that
            identify multiple target objects (e.g., faces) within a single
            ImageStim. If 'auto', the 'object_id' column is included if and
            only if it has a non-constant value.
        extractor_params (bool): If True, returns serialized
            extractor_params of the extractor, i.e. log_attributes at time
            of extraction. If format='wide', one column is added per
            Extractor/feature combination, each named
            ExtractorName#FeatureName#extractor_params. If format='long',
            returns only one column named extractor_params.
        aggfunc (str, Callable): If format='wide' and
            extractor_names='drop', it's possible for name clashes between
            features to occur. In such cases, the aggfunc argument is passed
            onto pandas' pivot_table function, and specifies how to
            aggregate multiple values for the same index. Can be a callable
            or any string value recognized by pandas. By default (None),
            'mean' will be used for numeric columns and 'first' will be used
            for object/categorical columns.
        invalid_results (str): Specifies desired action for treating
            elements of the passed in results argument that are not
            ExtractorResult objects. Valid values include:

            - 'ignore' will ignore them and merge the valid
              ExtractorResults.
            - 'fail' will raise an exception on any invalid input
        to_df_kwargs: Additional keyword arguments passed through to each
            ExtractorResult's to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.
        An empty DataFrame is returned if no valid results were provided.

    Raises:
        ValueError: If invalid_results='fail' and a non-ExtractorResult
            element is encountered, or if extractor_names='multi' is
            combined with format='long'.
    '''
    results = flatten(results)

    # 'auto' means: request the column from to_df() now and decide after
    # merging whether it is worth keeping.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(
                r.to_df(timing=_timing, metadata=metadata, format='long',
                        extractor_name=True, object_id=_object_id,
                        extractor_params=extractor_params, **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError(
                "At least one of the provided results was not an "
                "ExtractorResult. Set the invalid_results "
                "parameter to 'ignore' if you wish to ignore "
                "this.")

    if not dfs:
        return pd.DataFrame()

    # Reject the unsupported combination before doing any merging work.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    # Fully-qualified feature name; also used to label extractor_params
    # columns in wide format.
    unique_ext = data['extractor'] + '#' + data['feature'].astype(str)
    if extractor_names in ['prepend', 'multi']:
        data['feature'] = unique_ext

    if format == 'wide':
        ind_cols = {
            'stim_name', 'onset', 'order', 'duration', 'object_id', 'class',
            'filename', 'history', 'source_file'
        }
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things
        # happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        # NOTE: extractor_params columns always use the full
        # Extractor#feature name, regardless of extractor_names.
        if extractor_params:
            data['unique_extractor'] = unique_ext.astype(
                str) + '#extractor_params'
            attrs = data.pivot_table(index=ind_cols,
                                     columns='unique_extractor',
                                     values='extractor_params',
                                     aggfunc='first')

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc)

        if extractor_params:
            data = pd.concat([data, attrs], axis=1)

        data = data.reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if extractor_names != 'column' and 'extractor' in data.columns:
        data = data.drop('extractor', axis=1)

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Split "Extractor#feature" names into the levels of a MultiIndex.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])

    return data
def run(self, stim, merge=True, **merge_kwargs):
    ''' Runs every root node of the graph on the given input.

    Args:
        stim: Input passed to self.run_node() for each node in self.roots.
        merge (bool): If True, the collected results are passed through
            merge_results() and its return value is returned. If False,
            the flat list of collected results is returned instead.
        merge_kwargs: Optional keyword arguments forwarded to the
            merge_results() call. Ignored when merge is False.

    Returns:
        The output of merge_results() when merge is True, otherwise the
        flattened list of results.
    '''
    results = list(chain(*[self.run_node(n, stim) for n in self.roots]))
    results = list(flatten(results))
    self._results = results  # For use in plotting
    return merge_results(results, **merge_kwargs) if merge else results
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are requested and
            then dropped again if every onset value is null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether
            format is 'long' or 'wide'. Valid values include:

            - 'prepend' or True: In both 'long' and 'wide' formats, feature
              names will be prepended with the Extractor name (e.g.,
              "FaceExtractor#face_likelihood").
            - 'drop' or False: In both 'long' and 'wide' formats, extractor
              names will be omitted entirely from the result. Note that this
              can create feature name conflicts when merging results from
              multiple Extractors, so is generally discouraged.
            - 'column': In 'long' format, extractor name will be included as
              a separate column. Not supported for 'wide' format: the pivot
              discards the extractor column, so the output matches 'drop'.
            - 'multi': In 'wide' format, a MultiIndex will be used for the
              columns, with the first level of the index containing the
              Extractor name and the second level containing the feature
              name. This value is invalid if format='long' (and will raise
              an error).
        object_id (bool, str): If True (the default), attempts to
            intelligently add an 'object_id' column that differentiates
            between multiple objects in the results that may share
            onsets/orders/durations (and would otherwise be impossible to
            distinguish). This frequently occurs for ImageExtractors that
            identify multiple target objects (e.g., faces) within a single
            ImageStim. If 'auto', the 'object_id' column is included if and
            only if it has a non-constant value.
        aggfunc (str, Callable): If format='wide' and
            extractor_names='drop', it's possible for name clashes between
            features to occur. In such cases, the aggfunc argument is passed
            onto pandas' pivot_table function, and specifies how to
            aggregate multiple values for the same index. Can be a callable
            or any string value recognized by pandas. By default (None),
            'mean' will be used for numeric columns and 'first' will be used
            for object/categorical columns.
        invalid_results (str): Specifies desired action for treating
            elements of the passed in results argument that are not
            ExtractorResult objects. Valid values include:

            - 'ignore' will ignore them and merge the valid
              ExtractorResults.
            - 'fail' will raise an exception on any invalid input
        to_df_kwargs: Additional keyword arguments passed through to each
            ExtractorResult's to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.
        An empty DataFrame is returned if no valid results were provided.

    Raises:
        ValueError: If invalid_results='fail' and a non-ExtractorResult
            element is encountered, or if extractor_names='multi' is
            combined with format='long'.
    '''
    results = flatten(results)

    # 'auto' means: request the column from to_df() now and decide after
    # merging whether it is worth keeping.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id, **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError(
                "At least one of the provided results was not an "
                "ExtractorResult. Set the invalid_results "
                "parameter to 'ignore' if you wish to ignore "
                "this.")

    if not dfs:
        return pd.DataFrame()

    # Reject the unsupported combination before doing any merging work.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    if extractor_names in ['prepend', 'multi']:
        data['feature'] = data['extractor'] + '#' + data['feature'].astype(str)

    if extractor_names != 'column':
        data = data.drop('extractor', axis=1)

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things
        # happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc).reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Split "Extractor#feature" names into the levels of a MultiIndex.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])

    return data