Exemple #1
0
 def _preprocess(self, stims):
     ''' Turns a ComplexTextStim into token-aligned model inputs.

         Collects (text, onset, duration) from every element of the stim,
         tokenizes the words returned by self._mask_words, and repeats each
         word, onset and duration once per token generated for that word, so
         that all returned sequences line up at the token level. If the
         stim's name is an empty string, it is replaced by the space-joined
         words.

         Args:
             stims: stim object exposing `elements` (each with text, onset
                 and duration attributes) and a `name` attribute.

         Returns:
             A tuple (wds, ons, dur, tok, idx): token-level words, onsets and
             durations (outputs of np.repeat), the flat list of tokens, and
             the result of self.tokenizer.encode for those tokens. '''
     triples = [(el.text, el.onset, el.duration) for el in stims.elements]
     wds, ons, dur = (list(column) for column in zip(*triples))
     tok = []
     for word in self._mask_words(wds):
         tok.append(self.tokenizer.tokenize(word))
     # Number of tokens each original word was split into.
     n_tok = list(map(len, tok))
     current_name = stims.name
     stims.name = ' '.join(wds) if current_name == '' else current_name
     # Expand word-level values so there is one entry per token.
     wds = np.repeat(wds, n_tok)
     ons = np.repeat(ons, n_tok)
     dur = np.repeat(dur, n_tok)
     tok = list(flatten(tok))
     idx = self.tokenizer.encode(tok, return_tensors=self.framework)
     return wds, ons, dur, tok, idx
Exemple #2
0
    def run(self, stim, merge=True, **merge_kwargs):
        ''' Runs the graph on the given input, starting from every root node.

        Args:
            stim (str, Stim, list): One or more valid inputs to any
                Transformer's 'transform' call.
            merge (bool): If True, the collected results are passed to
                merge_results() and its return value is returned. If False,
                the flat list of results is returned as-is.
            merge_kwargs: Optional keyword arguments forwarded to the
                merge_results() call.

        Returns:
            The output of merge_results() when merge is truthy, otherwise
            the flattened list of results gathered from all root nodes.
        '''
        per_root = [self.run_node(root, stim) for root in self.roots]
        collected = list(flatten(list(chain.from_iterable(per_root))))
        # Kept on the instance for use in plotting.
        self._results = collected
        if merge:
            return merge_results(collected, **merge_kwargs)
        return collected
Exemple #3
0
    def run(self, stim, merge=True, **merge_kwargs):
        ''' Executes every root node of the graph on the input and gathers
        the results.

        Args:
            stim (str, Stim, list): One or more valid inputs to any
                Transformer's 'transform' call.
            merge (bool): When True, the gathered results are handed to
                merge_results() and whatever it returns is returned. When
                False, the flat list of results is returned unmerged.
            merge_kwargs: Optional keyword arguments passed through to
                merge_results().

        Returns:
            The merge_results() output if merge is truthy; otherwise the
            flattened list of results from all root nodes.
        '''
        node_outputs = [self.run_node(node, stim) for node in self.roots]
        chained = list(chain.from_iterable(node_outputs))
        flat_results = list(flatten(chained))
        # Stored on the instance for use in plotting.
        self._results = flat_results
        if not merge:
            return flat_results
        return merge_results(flat_results, **merge_kwargs)
Exemple #4
0
def merge_results(results,
                  format='wide',
                  timing=True,
                  metadata=True,
                  extractor_names=True,
                  object_id=True,
                  extractor_params=False,
                  aggfunc=None,
                  invalid_results='ignore',
                  **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are requested from
            every result and then dropped from the merged DataFrame when all
            onset values are null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether format
            is 'long' or 'wide'. Valid values include:

                - 'prepend' or True: In both 'long' and 'wide' formats,
                  feature names will be prepended with the Extractor name
                  (e.g., "FaceExtractor#face_likelihood").
                - 'drop' or False: In both 'long' and 'wide' formats, extractor
                  names will be omitted entirely from the result. Note that
                  this can create feature name conflicts when merging results
                  from multiple Extractors, so is generally discouraged.
                - 'column': In 'long' format, extractor name will be included
                  as a separate column. Only meaningful for 'long' format; in
                  'wide' format the column is not carried through the pivot.
                - 'multi': In 'wide' format, a MultiIndex will be used for the
                  columns, with the first level of the index containing the
                  Extractor name and the second level containing the feature
                  name. This value is invalid if format='long' (and will raise
                  an error).
        object_id (bool, str): If True, attempts to intelligently add an
            'object_id' column that differentiates between multiple objects in
            the results that may share onsets/orders/durations (and would
            otherwise be impossible to distinguish). This frequently occurs for
            ImageExtractors that identify multiple target objects (e.g., faces)
            within a single ImageStim. If 'auto', the 'object_id' column is
            included if and only if it has a non-constant value.
        extractor_params (bool): If True, returns serialized extractor_params
            of the extractor, i.e. log_attributes at time of extraction.
            If format='wide', merge_results returns one column per
            extractor/feature combination, each named
            ExtractorName#FeatureName#extractor_params.
            If format='long', returns only one column named extractor_params.
        aggfunc (str, Callable): If format='wide' and extractor_names='drop',
            it's possible for name clashes between features to occur. In such
            cases, the aggfunc argument is passed onto pandas' pivot_table
            function, and specifies how to aggregate multiple values for the
            same index. Can be a callable or any string value recognized by
            pandas. By default (None), 'mean' will be used for numeric columns
            and 'first' will be used for object/categorical columns.
        invalid_results (str): Specifies desired action for treating elements
            of the passed in results argument that are not ExtractorResult
            objects. Valid values include:
                - 'ignore' will ignore them and merge the valid
                    ExtractorResults.
                - 'fail' will raise an exception on any invalid input
        to_df_kwargs: Additional keyword arguments forwarded to each
            result's to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.
        An empty DataFrame is returned when no valid results are found.

    Raises:
        ValueError: if invalid_results='fail' and a non-ExtractorResult
            element is encountered, or if extractor_names='multi' is
            combined with format='long'.
    '''

    results = flatten(results)

    # 'auto' means "request the columns now, decide later whether to keep
    # them", so to_df() is always asked for them in that case.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(
                r.to_df(timing=_timing,
                        metadata=metadata,
                        format='long',
                        extractor_name=True,
                        object_id=_object_id,
                        extractor_params=extractor_params,
                        **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError("At least one of the provided results was not an "
                             "ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if not dfs:
        return pd.DataFrame()

    # Reject the invalid combination before doing any of the merge work.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    # "Extractor#feature" label; also reused below to name the
    # extractor_params columns in wide format.
    unique_ext = data['extractor'] + '#' + data['feature'].astype(str)
    if extractor_names in ['prepend', 'multi']:
        data['feature'] = unique_ext

    if format == 'wide':
        ind_cols = {
            'stim_name', 'onset', 'order', 'duration', 'object_id', 'class',
            'filename', 'history', 'source_file'
        }
        ind_cols = list(ind_cols & set(data.columns))

        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        # The params pivot must be computed before `data` is replaced by the
        # feature pivot, since it reads the long-format columns.
        if extractor_params:
            data['unique_extractor'] = unique_ext.astype(
                str) + '#extractor_params'
            attrs = data.pivot_table(index=ind_cols,
                                     columns='unique_extractor',
                                     values='extractor_params',
                                     aggfunc='first')
        data = data.pivot_table(index=ind_cols,
                                columns='feature',
                                values='value',
                                aggfunc=aggfunc)
        if extractor_params:
            data = pd.concat([data, attrs], axis=1)
        data = data.reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        # Undo the NaN placeholder and restore the original index dtypes.
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if extractor_names != 'column' and 'extractor' in data.columns:
        data = data.drop('extractor', axis=1)

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Split "Extractor#feature" labels into MultiIndex levels.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])
    return data
Exemple #5
0
 def run(self, stim, merge=True, **merge_kwargs):
     ''' Executes the graph by running every root node on the input.

     Args:
         stim: Input passed unchanged to self.run_node for each root.
         merge (bool): If True, the collected results are passed to
             merge_results() and its return value is returned. If False,
             the flat list of results is returned.
         merge_kwargs: Optional keyword arguments forwarded to the
             merge_results() call. Omitting them gives the same call as
             before (merge_results(results)).

     Returns:
         The output of merge_results() when merge is truthy, otherwise the
         flattened list of results gathered from all root nodes.
     '''
     per_root = [self.run_node(n, stim) for n in self.roots]
     results = list(flatten(list(chain.from_iterable(per_root))))
     self._results = results  # For use in plotting
     if merge:
         return merge_results(results, **merge_kwargs)
     return results
Exemple #6
0
def merge_results(results, format='wide', timing=True, metadata=True,
                  extractor_names=True, object_id=True, aggfunc=None,
                  invalid_results='ignore', **to_df_kwargs):
    ''' Merges a list of ExtractorResults instances and returns a pandas DF.

    Args:
        results (list, tuple): A list of ExtractorResult instances to merge.
        format (str): Format to return the data in. Can be either 'wide' or
            'long'. In the wide case, every extracted feature is a column,
            and every Stim is a row. In the long case, every row contains a
            single Stim/Extractor/feature combination.
        timing (bool, str): Whether or not to include columns for onset,
            order, and duration. If 'auto', the columns are requested from
            every result and then dropped from the merged DataFrame when all
            onset values are null.
        metadata (bool): if True, includes Stim metadata columns in the
            returned DataFrame. These columns include 'stim_name', 'class',
            'filename', 'history', and 'source_file'. Note that these values
            are often long strings, so the returned DF will be considerably
            larger.
        extractor_names (str, bool): How to handle extractor names when
            returning results. The specific behavior depends on whether format
            is 'long' or 'wide'. Valid values include:

                - 'prepend' or True: In both 'long' and 'wide' formats,
                  feature names will be prepended with the Extractor name
                  (e.g., "FaceExtractor#face_likelihood").
                - 'drop' or False: In both 'long' and 'wide' formats, extractor
                  names will be omitted entirely from the result. Note that
                  this can create feature name conflicts when merging results
                  from multiple Extractors, so is generally discouraged.
                - 'column': In 'long' format, extractor name will be included
                  as a separate column. Only meaningful for 'long' format; in
                  'wide' format the column is not carried through the pivot.
                - 'multi': In 'wide' format, a MultiIndex will be used for the
                  columns, with the first level of the index containing the
                  Extractor name and the second level containing the feature
                  name. This value is invalid if format='long' (and will raise
                  an error).

        object_id (bool, str): If True, attempts to intelligently add an
            'object_id' column that differentiates between multiple objects in
            the results that may share onsets/orders/durations (and would
            otherwise be impossible to distinguish). This frequently occurs for
            ImageExtractors that identify multiple target objects (e.g., faces)
            within a single ImageStim. If 'auto', the 'object_id' column is
            included if and only if it has a non-constant value.
        aggfunc (str, Callable): If format='wide' and extractor_names='drop',
            it's possible for name clashes between features to occur. In such
            cases, the aggfunc argument is passed onto pandas' pivot_table
            function, and specifies how to aggregate multiple values for the
            same index. Can be a callable or any string value recognized by
            pandas. By default (None), 'mean' will be used for numeric columns
            and 'first' will be used for object/categorical columns.
        invalid_results (str): Specifies desired action for treating elements
            of the passed in results argument that are not ExtractorResult
            objects. Valid values include:
                - 'ignore' will ignore them and merge the valid
                    ExtractorResults.
                - 'fail' will raise an exception on any invalid input
        to_df_kwargs: Additional keyword arguments forwarded to each
            result's to_df() call.

    Returns: a pandas DataFrame. For format details, see 'format' argument.
        An empty DataFrame is returned when no valid results are found.

    Raises:
        ValueError: if invalid_results='fail' and a non-ExtractorResult
            element is encountered, or if extractor_names='multi' is
            combined with format='long'.
    '''

    results = flatten(results)

    # 'auto' means "request the columns now, decide later whether to keep
    # them", so to_df() is always asked for them in that case.
    _timing = True if timing == 'auto' else timing
    _object_id = True if object_id == 'auto' else object_id

    if extractor_names is True:
        extractor_names = 'prepend'
    elif extractor_names is False:
        extractor_names = 'drop'

    dfs = []
    for r in results:
        if isinstance(r, ExtractorResult):
            dfs.append(r.to_df(timing=_timing, metadata=metadata,
                               format='long', extractor_name=True,
                               object_id=_object_id, **to_df_kwargs))
        elif invalid_results == 'fail':
            raise ValueError("At least one of the provided results was not an "
                             "ExtractorResult. Set the invalid_results "
                             "parameter to 'ignore' if you wish to ignore "
                             "this.")

    if not dfs:
        return pd.DataFrame()

    # Reject the invalid combination before doing any of the merge work.
    if extractor_names == 'multi' and format == 'long':
        raise ValueError("Invalid extractor_names value 'multi'. When "
                         "format is 'long', extractor_names must be "
                         "one of 'drop', 'prepend', or 'column'.")

    data = pd.concat(dfs, axis=0).reset_index(drop=True)

    if object_id == 'auto' and data['object_id'].nunique() == 1:
        data = data.drop('object_id', axis=1)

    if extractor_names in ['prepend', 'multi']:
        data['feature'] = data['extractor'] + '#' + data['feature'].astype(str)

    if extractor_names != 'column':
        data = data.drop('extractor', axis=1)

    if format == 'wide':
        ind_cols = {'stim_name', 'onset', 'order', 'duration', 'object_id',
                    'class', 'filename', 'history', 'source_file'}
        ind_cols = list(ind_cols & set(data.columns))
        # pandas groupby/index operations can't handle NaNs in index, (see
        # issue at https://github.com/pandas-dev/pandas/issues/3729), so we
        # replace NaNs with a placeholder and then re-substitute after
        # pivoting.
        dtypes = data[ind_cols].dtypes
        data[ind_cols] = data[ind_cols].fillna('PlAcEholdER')

        # Set default aggfunc based on column type, otherwise bad things happen
        if aggfunc is None:
            aggfunc = 'mean' if is_numeric_dtype(data['value']) else 'first'

        data = data.pivot_table(index=ind_cols, columns='feature',
                                values='value', aggfunc=aggfunc).reset_index()
        data.columns.name = None  # vestigial--is set to 'feature'
        # Undo the NaN placeholder and restore the original index dtypes.
        data[ind_cols] = data[ind_cols].replace('PlAcEholdER', np.nan)
        data[ind_cols] = data[ind_cols].astype(dict(zip(ind_cols, dtypes)))

    if timing == 'auto' and 'onset' in data.columns:
        if data['onset'].isnull().all():
            data = data.drop(['onset', 'order', 'duration'], axis=1)

    if 'onset' in data.columns:
        key = [('onset', ''), ('order', ''), ('duration', '')] \
            if isinstance(data.columns, pd.MultiIndex) \
            else ['onset', 'order', 'duration']
        data = data.sort_values(key).reset_index(drop=True)

    if extractor_names == 'multi':
        # Split "Extractor#feature" labels into MultiIndex levels.
        data.columns = pd.MultiIndex.from_tuples(
            [c.split('#') for c in data.columns])
    return data