def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
        Union[List[List[str]], List[str]]:
    """Tokenize or detokenize strings, depending on the type structure of the passed arguments.

    Args:
        batch: a batch of documents to perform tokenizing/lemmatizing;
            or a batch of lists of tokens/lemmas to perform detokenizing

    Returns:
        a batch of lists of tokens/lemmas; or a batch of detokenized strings

    Raises:
        TypeError: If the first element of ``batch`` is neither ``List``, nor ``str``.

    """
    try:
        if isinstance(batch[0], str):
            if self.lemmas:
                return list(self._lemmatize(batch))
            else:
                return list(self._tokenize(batch))
        if isinstance(batch[0], list):
            return [detokenize(doc) for doc in batch]
    except Exception:
        # Persist state and shut down the multiprocessing manager before
        # propagating the error, so worker processes are not left hanging.
        self.save(self.save_path)
        self.manager.shutdown()
        raise
    raise TypeError(
        "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))
def __call__(self, batch):
    # Dispatch on the type of the first element: strings are tokenized
    # (or lemmatized), token lists are detokenized.
    if isinstance(batch[0], str):
        if self.lemmas:
            return list(self._lemmatize(batch))
        else:
            return list(self._tokenize(batch))
    if isinstance(batch[0], list):
        return [detokenize(doc) for doc in batch]
    raise TypeError(
        "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))
def __call__(self, dialogs: Sequence[Dialog]) -> Sequence[str]:
    new_responses = []
    for d in dialogs:
        # get tokens & tags
        response = d['utterances'][-1]
        ner_annotations = response['annotations']['ner']
        user_name = d['user']['profile']['name']
        # replace names with user name
        if ner_annotations and (response['active_skill'] == 'chitchat'):
            response_toks_norm, _ = \
                self.person_normalizer([ner_annotations['tokens']],
                                       [ner_annotations['tags']],
                                       [user_name])
            response_toks_norm = response_toks_norm[0]
            # detokenize
            new_responses.append(detokenize(response_toks_norm))
        else:
            new_responses.append(response['text'])
    return new_responses
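For context, here is a minimal, hypothetical Dialog payload this postprocessor could consume. The key layout mirrors the lookups in the method above; the NER annotation format, tag scheme, and example values are assumptions for illustration only.

# Hypothetical minimal dialog structure for the postprocessor above.
# Only the keys actually read by __call__ are included; the NER tag scheme
# and example values are assumptions, not taken from the original source.
dialog = {
    'user': {'profile': {'name': 'Alice'}},
    'utterances': [{
        'text': 'Nice to meet you , John !',
        'active_skill': 'chitchat',
        'annotations': {
            'ner': {
                'tokens': ['Nice', 'to', 'meet', 'you', ',', 'John', '!'],
                'tags': ['O', 'O', 'O', 'O', 'O', 'B-PER', 'O'],
            },
        },
    }],
}
# Passing [dialog] to the postprocessor would run person_normalizer over the
# NER tokens/tags with the user's name, then detokenize the normalized tokens.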
def __call__(self, batch: Union[List[str], List[List[str]]]) -> \
        Union[List[List[str]], List[str]]:
    """Tokenize or detokenize strings, depending on the type structure of the passed arguments.

    Args:
        batch: a batch of documents to perform tokenizing/lemmatizing;
            or a batch of lists of tokens/lemmas to perform detokenizing

    Returns:
        a batch of lists of tokens/lemmas; or a batch of detokenized strings

    Raises:
        TypeError: If the first element of ``batch`` is neither List nor str.

    """
    if isinstance(batch[0], str):
        if self.lemmas:
            return list(self._lemmatize(batch))
        else:
            return list(self._tokenize(batch))
    if isinstance(batch[0], list):
        return [detokenize(doc) for doc in batch]
    raise TypeError(
        "StreamSpacyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))
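Below is a minimal, self-contained sketch of the dispatch contract documented above: a batch of strings yields token lists, a batch of token lists yields detokenized strings, and anything else raises TypeError. ToyTokenizer and _detokenize are stand-ins invented for illustration, not the real StreamSpacyTokenizer or its detokenize helper.

# A toy mimic of the type-dispatch contract above, not the real tokenizer.
from typing import List, Union


def _detokenize(tokens: List[str]) -> str:
    # Naive whitespace join; a stand-in for the detokenize() helper used above.
    return ' '.join(tokens)


class ToyTokenizer:
    """str batch -> token lists; token-list batch -> strings; else TypeError."""

    def __call__(self, batch: Union[List[str], List[List[str]]]):
        if isinstance(batch[0], str):
            return [doc.split() for doc in batch]       # "tokenize"
        if isinstance(batch[0], list):
            return [_detokenize(doc) for doc in batch]  # "detokenize"
        raise TypeError(
            "ToyTokenizer.__call__() is not implemented for `{}`".format(type(batch[0])))


tokenizer = ToyTokenizer()
assert tokenizer(["Hello there"]) == [["Hello", "there"]]
assert tokenizer([["Hello", "there"]]) == ["Hello there"]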