def encode(self, x: str) -> str:
    # Tokenize the input sentence; optionally append a trailing space
    # symbol so that every encoded sentence ends with a word boundary.
    y = tokenize(x, space=self.space_symbol, non_lang_syms=self.non_lang_syms)
    if self.ends_with_space:
        return y + " " + self.space_symbol
    else:
        return y
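The `tokenize` helper called above is defined elsewhere; a minimal sketch consistent with how it is used here (character-level tokenization that maps spaces to a dedicated `space` symbol and keeps non-language symbols such as `<noise>` intact as single tokens) might look like this. The real helper's behavior may differ in detail.

import re
from typing import List, Optional


def tokenize(sent: str, space: str = '<space>',
             non_lang_syms: Optional[List[str]] = None) -> str:
    # Sketch only: collapse runs of whitespace, then emit one token per
    # character, except that each non-language symbol (e.g. '<noise>')
    # is kept as a single token and spaces become the `space` symbol.
    sent = ' '.join(sent.strip().split())
    pattern = re.compile('|'.join(map(re.escape, non_lang_syms))) \
        if non_lang_syms else None

    tokens: List[str] = []
    i = 0
    while i < len(sent):
        match = pattern.match(sent, i) if pattern else None
        if match:  # protected non-language symbol: keep it whole
            tokens.append(match.group(0))
            i = match.end()
        else:
            tokens.append(space if sent[i] == ' ' else sent[i])
            i += 1
    return ' '.join(tokens)

For example, `tokenize('ab <noise>', non_lang_syms=['<noise>'])` returns `'a b <space> <noise>'` under this sketch.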
def test_speech_tokenizer(self):
    for i, sent in enumerate(self.text):
        print('test sentence {}:'.format(i))
        print(sent)
        tokens = utils.tokenize(
            sent, space=self.dictionary.space_word,
            non_lang_syms=self.non_lang_syms,
        )

        # test :func:`~speech_tools.utils.tokenize` with
        # :func:`~AsrDictionary.encode_line`
        tensor = self.dictionary.encode_line(
            tokens, add_if_not_exist=False, append_eos=True,
        )
        reconstructed_tokens = self.dictionary.string(tensor)
        expected_tokens = ' '.join([
            token if self.dictionary.index(token) != self.dictionary.unk()
            else self.dictionary.unk_word
            for token in tokens.split(' ')
        ])
        self.assertEqual(reconstructed_tokens, expected_tokens)

        # test :func:`~speech_tools.utils.tokenize` with
        # :func:`~AsrDictionary.tokens_to_sentence`
        reconstructed_sent = self.dictionary.tokens_to_sentence(tokens)
        expected_sent = []
        words = sent.split(' ')
        for w in words:
            if w not in self.non_lang_syms:
                new_word = ''.join([
                    self.dictionary.unk_word if c in self.oovs else c
                    for c in w
                ])
                expected_sent.append(new_word)
            else:
                expected_sent.append(w)
        expected_sent = ' '.join(expected_sent)
        self.assertEqual(reconstructed_sent, expected_sent)
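The fixtures this test relies on (`self.text`, `self.non_lang_syms`, `self.oovs`, and `self.dictionary`) are built in the test's setup, which is not shown. A hypothetical `setUp` illustrating the shapes they are assumed to have; the names and sample values below are illustrative, not from the original suite:

import unittest


class TestSpeechUtils(unittest.TestCase):

    def setUp(self):
        # All values below are illustrative assumptions.
        self.non_lang_syms = ['<noise>', '<spnoise>']  # protected symbols
        self.oovs = {'q', 'z'}  # characters deliberately absent from the dict
        self.text = [
            'hello world <noise>',
            'a quizzical sentence <spnoise>',
        ]
        # self.dictionary is assumed to be an AsrDictionary exposing
        # space_word, unk_word, unk(), index(), encode_line(), string(),
        # and tokens_to_sentence(); its construction is omitted here.
        self.dictionary = ...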
from typing import List


def tokenizer(x: str) -> List[str]:
    # Closes over `subword_dict` from the enclosing scope.
    return tokenize(
        x, non_lang_syms=subword_dict.non_lang_syms).split(' ')
def tokenizer(x):
    return tokenize(x, non_lang_syms=subword_dict.non_lang_syms).split(" ")
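A hypothetical use of such a `tokenizer` closure, with `types.SimpleNamespace` standing in for the enclosing `subword_dict` object (assumed to expose `non_lang_syms`) and the `tokenize` sketch from above:

from types import SimpleNamespace

subword_dict = SimpleNamespace(non_lang_syms=['<noise>'])


def tokenizer(x):
    return tokenize(x, non_lang_syms=subword_dict.non_lang_syms).split(" ")


print(tokenizer('ab <noise>'))
# -> ['a', 'b', '<space>', '<noise>'] (with the tokenize sketch above)

In fairseq-style code, a closure like this is typically passed as the `line_tokenizer` argument of `Dictionary.encode_line`, which splits each raw line into symbols before mapping them to indices.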