def _postprocess(self, stims, preds, tok, wds, ons, dur):
    preds = preds[0].numpy()[:, 1:-1, :]
    if self.return_softmax:
        preds = scipy.special.softmax(preds, axis=-1)
    out_idx = preds[0, self.mask_pos, :].argsort()[::-1]
    if self.top_n:
        sub_idx = out_idx[:self.top_n]
    elif self.target:
        sub_idx = self.tokenizer.convert_tokens_to_ids(self.target)
    elif self.threshold:
        sub_idx = np.where(preds[0, self.mask_pos, :] >= self.threshold)[0]
    else:
        sub_idx = out_idx
    out_idx = [idx for idx in out_idx if idx in sub_idx]
    feat = self.tokenizer.convert_ids_to_tokens(out_idx)
    feat = [f.capitalize() if len(f) == len(f.encode()) else f for f in feat]
    data = [listify(p) for p in preds[0, self.mask_pos, out_idx]]
    if self.return_masked_word:
        feat, data = self._return_masked_word(preds, feat, data)
    if self.return_input:
        data += [stims.name]
        feat += ['sequence']
    mask_ons = listify(stims.elements[self.mask_pos].onset)
    mask_dur = listify(stims.elements[self.mask_pos].duration)
    return data, feat, mask_ons, mask_dur

def __init__(self, functions=None, var_names=None, subset_idx=None, **kwargs):
    functions = listify(functions)
    if var_names is not None:
        var_names = listify(var_names)
        if len(var_names) != len(functions):
            raise ValueError('Length of var_names must match number of '
                             'functions')
    # Resolve string references (e.g. 'module.func') to callables; fall back
    # to eval for expressions such as lambdas.
    for idx, f in enumerate(functions):
        if isinstance(f, str):
            try:
                f_mod, f_func = f.rsplit('.', 1)
                functions[idx] = getattr(import_module(f_mod), f_func)
            except Exception:
                try:
                    functions[idx] = eval(f)
                except Exception:
                    raise ValueError(f"{f} is not a valid function")
    if var_names is None:
        var_names = [f.__name__ for f in functions]
    self.var_names = var_names
    self.functions = functions
    self.kwargs = kwargs
    self.subset_idx = subset_idx
    super().__init__()

def _preprocess(self, stim):
    x = listify(stim.data)
    if self.preprocessor_url_or_path:
        # Unpack the kwargs; passing the dict positionally would bind it to
        # KerasLayer's `trainable` argument.
        preprocessor = hub.KerasLayer(self.preprocessor_url_or_path,
                                      **self.preprocessor_kwargs)
        x = preprocessor(x)
    return x

def get_converter(in_type, out_type, *args, **kwargs):
    ''' Scans the list of available Converters and returns an instantiation
    of the first one whose input and output types match those passed in.

    Args:
        in_type (type): The type of input the converter must have.
        out_type (type): The type of output the converter must have.
        args, kwargs: Optional positional and keyword arguments to pass onto
            matching Converter's initializer.
    '''
    convs = pliers.converters.__all__

    # If config includes default converters for this combination, try them
    # first
    out_type = listify(out_type)[::-1]
    default_convs = config.get_option('default_converters')

    for ot in out_type:
        conv_str = '%s->%s' % (in_type.__name__, ot.__name__)
        if conv_str in default_convs:
            convs = list(default_convs[conv_str]) + convs

    for name in convs:
        cls = getattr(pliers.converters, name)
        if not inspect.isclass(cls) or not issubclass(cls, Converter):
            continue

        available = cls.available if issubclass(
            cls, EnvironmentKeyMixin) else True

        if cls._input_type == in_type and cls._output_type in out_type \
                and available:
            conv = cls(*args, **kwargs)
            return conv

    return None

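# Hedged usage sketch (not from the source): ask for a converter that maps
# VideoStim input to AudioStim output, then apply it; 'movie.mp4' is an
# assumed file path.
from pliers.stimuli import AudioStim, VideoStim

conv = get_converter(VideoStim, AudioStim)
if conv is not None:
    audio = conv.transform(VideoStim('movie.mp4'))
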
def __init__(self, api_key=None, model='general-v1.3', min_value=None,
             max_concepts=None, select_concepts=None, rate_limit=None,
             batch_size=None):
    verify_dependencies(['clarifai_client'])
    if api_key is None:
        try:
            api_key = os.environ['CLARIFAI_API_KEY']
        except KeyError:
            raise ValueError("A valid Clarifai API key "
                             "must be passed the first time a Clarifai "
                             "extractor is initialized.")
    self.api_key = api_key
    try:
        self.api = clarifai_client.ClarifaiApp(api_key=api_key)
        self.model = self.api.models.get(model)
    except clarifai_client.ApiError as e:
        logging.warning(str(e))
        self.api = None
        self.model = None
    self.model_name = model
    self.min_value = min_value
    self.max_concepts = max_concepts
    self.select_concepts = select_concepts
    if select_concepts:
        select_concepts = listify(select_concepts)
        self.select_concepts = [clarifai_client.Concept(concept_name=n)
                                for n in select_concepts]
    super(ClarifaiAPIExtractor, self).__init__(rate_limit=rate_limit)

def __init__(self, api_key=None, model='general-v1.3', min_value=None,
             max_concepts=None, select_concepts=None, rate_limit=None):
    verify_dependencies(['clarifai_client'])
    if api_key is None:
        try:
            api_key = os.environ['CLARIFAI_API_KEY']
        except KeyError:
            raise ValueError("A valid Clarifai API key "
                             "must be passed the first time a Clarifai "
                             "extractor is initialized.")
    self.api_key = api_key
    try:
        self.api = clarifai_client.ClarifaiApp(api_key=api_key)
        self.model = self.api.models.get(model)
    except clarifai_client.ApiError:
        self.api = None
        self.model = None
    self.model_name = model
    self.min_value = min_value
    self.max_concepts = max_concepts
    self.select_concepts = select_concepts
    if select_concepts:
        select_concepts = listify(select_concepts)
        self.select_concepts = [clarifai_client.Concept(concept_name=n)
                                for n in select_concepts]
    super(ClarifaiAPIExtractor, self).__init__(rate_limit=rate_limit)

def _transform(self, stim, *args, **kwargs):
    # Check if we are requesting faster than the rate limit,
    # if so, throttle by sleeping
    time_diff = time.time() - self._last_request_time
    if time_diff < self.rate_limit:
        time.sleep(self.rate_limit - time_diff)
    self._last_request_time = time.time()

    # Check if we are trying to transform a large amount of data
    self.transformed_stim_count += len(listify(stim))
    if not config.get_option('allow_large_jobs'):
        if not isiterable(stim) and stim.duration \
                and stim.duration > config.get_option('long_job'):
            raise ValueError("Attempted to run an API transformation "
                             "on a stimulus of duration %f, aborting. "
                             "To allow this transformation, set "
                             "config option 'allow_large_jobs' to "
                             "True." % stim.duration)

        if self.transformed_stim_count > config.get_option('large_job'):
            raise ValueError("Number of transformations using this %s "
                             "would exceed %d, aborting further "
                             "transformations. To allow, set config "
                             "option 'allow_large_jobs' to True." %
                             (self.__class__.__name__,
                              config.get_option('large_job')))

    if config.get_option('api_key_validation') and not self.validate_keys():
        raise ValueError("Error running %s, a provided environment key "
                         "was invalid or unauthorized. Please check that "
                         "you have authorized credentials for accessing "
                         "the target API." % self.__class__.__name__)

    return super(APITransformer, self)._transform(stim, *args, **kwargs)

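# Hedged usage sketch: the config options checked above can be toggled at
# runtime before invoking an API transformer on long or numerous stimuli.
from pliers import config

config.set_option('allow_large_jobs', True)
config.set_option('api_key_validation', False)
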
def _transform(self, stim, *args, **kwargs):
    new_stim = self._filter(stim, *args, **kwargs)
    if not isinstance(new_stim, self._input_type) and \
            not isinstance(listify(new_stim)[0], stim.__class__):
        raise ValueError("Filter must return a Stim of the same type as "
                         "its input.")
    return new_stim

def get_transformer(name, base=None, *args, **kwargs):
    ''' Scans the list of currently available Transformer classes and returns
    an instantiation of the first one whose name matches exactly
    (case-insensitive).

    Args:
        name (str): The name of the transformer to retrieve. Case-insensitive;
            e.g., 'stftextractor' or 'CornerDetectionExtractor'.
        base (str, list): Optional name of transformer modules to search.
            Valid values are 'converters', 'extractors', and 'filters'.
        args, kwargs: Optional positional or keyword arguments to pass onto
            the Transformer.
    '''
    name = name.lower()

    # Default to searching all kinds of Transformers
    if base is None:
        base = ['extractors', 'converters', 'filters']

    base = listify(base)

    for b in base:
        importlib.import_module('pliers.%s' % b)
        mod = getattr(pliers, b)
        classes = getattr(mod, '__all__')
        for cls_name in classes:
            if cls_name.lower() == name:
                cls = getattr(mod, cls_name)
                return cls(*args, **kwargs)

    raise KeyError("No transformer named '%s' found." % name)

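# Hedged usage sketch: look up an extractor by case-insensitive name and run
# it on an image; 'face.jpg' is an assumed file path.
from pliers.stimuli import ImageStim

ext = get_transformer('brightnessextractor')
result = ext.transform(ImageStim('face.jpg'))
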
def get_converter(in_type, out_type, *args, **kwargs):
    ''' Scans the list of available Converters and returns an instantiation
    of the first one whose input and output types match those passed in.

    Args:
        in_type (type): The type of input the converter must have.
        out_type (type): The type of output the converter must have.
        args, kwargs: Optional positional and keyword arguments to pass onto
            matching Converter's initializer.
    '''
    convs = pliers.converters.__all__

    # If config includes default converters for this combination, try them
    # first
    out_type = listify(out_type)[::-1]
    for ot in out_type:
        conv_str = '%s->%s' % (in_type.__name__, ot.__name__)
        if conv_str in config.default_converters:
            convs = list(config.default_converters[conv_str]) + convs

    for name in convs:
        cls = getattr(pliers.converters, name)
        if not issubclass(cls, Converter):
            continue
        if cls._input_type == in_type and cls._output_type in out_type \
                and cls.available:
            try:
                conv = cls(*args, **kwargs)
                return conv
            except ValueError:
                # Important for API converters
                pass

    return None

def _extract(self, stim):
    values = self._get_values(stim)

    if self._feature == 'beat_track':
        beats = np.array(values[1])
        values = beats

    values = values.T
    n_frames = len(values)
    feature_names = listify(self.get_feature_names())

    onsets = librosa.frames_to_time(range(n_frames),
                                    sr=stim.sampling_rate,
                                    hop_length=self.hop_length)
    onsets = onsets + stim.onset if stim.onset else onsets
    durations = [self.hop_length / float(stim.sampling_rate)] * n_frames

    return ExtractorResult(values, stim, self, features=feature_names,
                           onsets=onsets, durations=durations,
                           orders=list(range(n_frames)))

def _extract(self, stim):
    values = self.func(stim.data)
    feature_names = listify(self.get_feature_names())
    return ExtractorResult(values, stim, self, features=feature_names,
                           raw=values)

def _preprocess(self, stim):
    if self.transform_inp:
        return self.transform_inp(stim.data)
    else:
        if type(stim) == TextStim:
            return listify(stim.data)
        else:
            return stim.data

def _stim_matches_input_types(self, stim):
    # Checks if passed Stim meets all _input_type and _optional_input_type
    # specifications.
    mandatory = tuple(listify(self._input_type))
    optional = tuple(listify(self._optional_input_type))

    if isinstance(stim, CompoundStim):
        return stim.has_types(mandatory) or \
            (not mandatory and stim.has_types(optional, False))

    if len(mandatory) > 1:
        msg = ("Transformer of class %s requires multiple mandatory "
               "inputs, so the passed input Stim must be a CompoundStim"
               "--which it isn't." % self.__class__.__name__)
        raise ValueError(msg)

    return isinstance(stim, mandatory) or \
        (not mandatory and isinstance(stim, optional))

def _transform(self, stim, *args, **kwargs):
    stims = listify(stim)
    if all(self._stim_matches_input_types(s) for s in stims):
        result = super()._transform(stims, *args, **kwargs)
        if isiterable(stim):
            return result
        else:
            return result[0]
    else:
        return list(super()._iterate(stims, *args, **kwargs))

def has_types(self, types, all_=True):
    ''' Check whether the current component list matches all Stim types
    in the types argument.

    Args:
        types (Stim, list): a Stim class or iterable of Stim classes.
        all_ (bool): if True, all input types must match; if False, at
            least one input type must match.

    Returns:
        True if all passed types match at least one Stim in the component
        list, otherwise False.
    '''
    func = all if all_ else any
    return func([self.get_stim(t) for t in listify(types)])

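# Hedged usage sketch: querying a CompoundStim for the element types it
# contains; the image path is assumed.
from pliers.stimuli import AudioStim, CompoundStim, ImageStim, TextStim

stim = CompoundStim([ImageStim('face.jpg'), TextStim(text='hello')])
stim.has_types([ImageStim, TextStim])                # True only if both present
stim.has_types([ImageStim, AudioStim], all_=False)   # True if at least one present
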
def run_node(self, node, stim):
    if isinstance(node, string_types):
        node = self.nodes[node]

    result = node.transformer.transform(stim)
    if isinstance(node.transformer, Extractor):
        return listify(result)

    stim = result
    # If result is a generator, the first child will destroy the
    # iterable, so cache via list conversion
    if len(node.children) > 1 and isgenerator(stim):
        stim = list(stim)
    return list(chain(*[self.run_node(c, stim) for c in node.children]))

def has_types(self, types, all_=True):
    ''' Check whether the current component list matches all Stim types
    in the types argument.

    Args:
        types (Stim, list): a Stim class or iterable of Stim classes.
        all_ (bool): if True, all input types must match; if False, at
            least one input type must match.

    Returns:
        True if all passed types match at least one Stim in the component
        list, otherwise False.
    '''
    func = all if all_ else any
    return func([self.get_stim(t) for t in listify(types)])

def _postprocess(self, stims, preds, tok, wds, ons, dur):
    data = preds[0].numpy().squeeze()
    if self.return_softmax:
        data = scipy.special.softmax(data)
    data = [listify(d) for d in data.tolist()]
    tok = [' '.join(wds)]
    try:
        dur = ons[-1] + dur[-1] - ons[0]
    except Exception:
        dur = None
    ons = ons[0]
    feat = ['sent_pos', 'sent_neg']
    if self.return_input:
        data += tok
        feat += ['sequence']
    return data, feat, ons, dur

def _extract(self, stim):
    values = self._get_values(stim)
    values = values.T
    feature_names = listify(self.get_feature_names())
    n_frames = len(values)

    onsets = librosa.frames_to_time(range(n_frames),
                                    sr=stim.sampling_rate,
                                    hop_length=self.hop_length)
    onsets = onsets + stim.onset if stim.onset else onsets
    durations = [self.hop_length / float(stim.sampling_rate)] * n_frames

    return ExtractorResult(values, stim, self, features=feature_names,
                           onsets=onsets, durations=durations)

def __init__(self,
             pretrained_model='bert-base-uncased',
             tokenizer='bert-base-uncased',
             framework='pt',
             mask='MASK',
             top_n=None,
             threshold=None,
             target=None,
             return_softmax=False,
             return_masked_word=False,
             return_input=False,
             model_kwargs=None,
             tokenizer_kwargs=None):
    if any([top_n and target, top_n and threshold, threshold and target]):
        raise ValueError('top_n, threshold and target arguments '
                         'are mutually exclusive')
    if type(mask) not in [int, str]:
        raise ValueError('Mask must be a string or an integer.')

    super(BertLMExtractor, self).__init__(
        pretrained_model=pretrained_model,
        tokenizer=tokenizer,
        framework=framework,
        return_input=return_input,
        model_class='AutoModelWithLMHead',
        model_kwargs=model_kwargs,
        tokenizer_kwargs=tokenizer_kwargs)

    self.target = listify(target)
    if self.target:
        missing = set(self.target) - set(self.tokenizer.vocab.keys())
        if missing:
            logging.warning(f'{missing} not in vocabulary. Dropping.')
        present = set(self.target) & set(self.tokenizer.vocab.keys())
        self.target = list(present)
        if self.target == []:
            raise ValueError(
                'No valid target token. Import transformers'
                ' and run transformers.BertTokenizer.from_pretrained'
                f'(\'{tokenizer}\').vocab.keys() to see available tokens')

    self.mask = mask
    self.top_n = top_n
    self.threshold = threshold
    self.return_softmax = return_softmax
    self.return_masked_word = return_masked_word

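# Hedged usage sketch: mask the word at (0-indexed) position 2 of each input
# sequence and keep only the 10 highest-probability predictions for it; the
# exact indexing semantics of `mask` are assumed here.
ext = BertLMExtractor(mask=2, top_n=10)
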
def run_node(self, node, stim):
    ''' Executes the Transformer at a specific node.

    Args:
        node (str, Node): If a string, the name of the Node in the current
            Graph. Otherwise the Node instance to execute.
        stim (str, Stim, list): Any valid input to the Transformer stored
            at the target node.
    '''
    if isinstance(node, string_types):
        node = self.nodes[node]

    result = node.transformer.transform(stim)
    if node.is_leaf():
        return listify(result)

    stim = result
    # If result is a generator, the first child will destroy the
    # iterable, so cache via list conversion
    if len(node.children) > 1 and isgenerator(stim):
        stim = list(stim)
    return list(chain(*[self.run_node(c, stim) for c in node.children]))

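# Hedged usage sketch of the surrounding Graph API (node and file names are
# assumed): build a single-node graph and run it over an image stimulus;
# run() invokes run_node() on each root node internally.
from pliers.graph import Graph
from pliers.stimuli import ImageStim

g = Graph(nodes=['BrightnessExtractor'])
df = g.run([ImageStim('face.jpg')])
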
def _extract(self, stim):
    values = self._get_values(stim)

    if self._feature == 'beat_track':
        beats = np.array(values[1])
        values = beats

    values = values.T
    n_frames = len(values)
    feature_names = listify(self.get_feature_names())

    onsets = librosa.frames_to_time(range(n_frames),
                                    sr=stim.sampling_rate,
                                    hop_length=self.hop_length)
    onsets = onsets + stim.onset if stim.onset else onsets
    durations = [self.hop_length / float(stim.sampling_rate)] * n_frames

    return ExtractorResult(values, stim, self, features=feature_names,
                           onsets=onsets, durations=durations,
                           orders=list(range(n_frames)))

def _extract(self, stim):
    values = self.func(stim.data)
    feature_names = listify(self.get_feature_names())
    return ExtractorResult(values, stim, self, features=feature_names)

def env_keys(self):
    return listify(self._env_keys)

def _to_df(self, result):
    cols = listify(self._feature)
    return pd.DataFrame([[r] for r in result._data], columns=cols)

def get_feature_names(self, out):
    if self.features:
        return listify(self.features)
    else:
        return ['feature_' + str(i) for i in range(out.shape[-1])]

def _extract(self, stim):
    inp = self._preprocess(stim)
    out = self.model(inp)
    out = self._postprocess(out)
    features = self.get_feature_names(out)
    return ExtractorResult(listify(out), stim, self, features=features)