class TestTrieDict(unittest.TestCase):
    """Unit tests for ``TrieDict``: tokenization, batch split/merge round-trips,
    and truthiness of an empty dictionary.
    """

    def setUp(self) -> None:
        super().setUp()
        # Sample sentence containing the dictionary entry '重要' twice.
        self.text = '第一个词语很重要,第二个词语也很重要'
        self.trie_dict = TrieDict({'重要': 'important'})

    def test_tokenize(self):
        # Both occurrences of '重要' should be located with their char offsets
        # and mapped to the dictionary value.
        expected = [(6, 8, 'important'), (16, 18, 'important')]
        self.assertEqual(expected, self.trie_dict.tokenize(self.text))

    def test_split_batch(self):
        # Splitting around dictionary hits, predicting char-by-char on the
        # remaining spans, then merging should re-insert the dictionary values.
        data = [self.text]
        new_data, new_data_belongs, parts = self.trie_dict.split_batch(data)
        predictions = [list(x) for x in new_data]
        merged = self.trie_dict.merge_batch(data, predictions, new_data_belongs, parts)
        self.assertSequenceEqual([[
            '第', '一', '个', '词', '语', '很', 'important', ',', '第', '二', '个', '词', '语', '也', '很',
            'important'
        ]], merged)

    def test_tokenize_2(self):
        # A set-backed TrieDict stores True as the value for every entry.
        t = TrieDict({'次世代', '生产环境'})
        self.assertSequenceEqual(
            t.tokenize('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'),
            [(15, 19, True), (21, 24, True)])

    def test_empty_dict(self):
        # An empty TrieDict is falsy; it becomes truthy once an entry is
        # added and falsy again after deletion.
        trie_dict = TrieDict()
        self.assertFalse(bool(trie_dict))
        trie_dict['one'] = 1
        self.assertTrue(bool(trie_dict))
        del trie_dict['one']
        self.assertFalse(bool(trie_dict))
def dict_force(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
    """Install the forced-segmentation dictionary.

    Args:
        dictionary: A ``DictInterface`` used as-is, a plain ``dict``/``set``
            which gets wrapped in a ``TrieDict``, or ``None`` to clear it.
    """
    # Wrap raw dict/set input; None passes through untouched so the
    # dictionary can be removed.
    needs_wrapping = dictionary is not None and not isinstance(dictionary, DictInterface)
    if needs_wrapping:
        dictionary = TrieDict(dictionary)
    # Keep the config and the tokenizer transform in sync.
    self.config.dict_force = dictionary
    self.tokenizer_transform.dict = dictionary
def dict_blacklist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
    """Install the blacklist dictionary.

    Args:
        dictionary: A ``DictInterface`` used as-is, a plain ``dict``/``set``
            which gets wrapped in a ``TrieDict``, or ``None`` to clear it.
    """
    # Only raw dict/set inputs need wrapping; None is stored unchanged.
    needs_wrapping = dictionary is not None and not isinstance(dictionary, DictInterface)
    if needs_wrapping:
        dictionary = TrieDict(dictionary)
    self.config.dict_blacklist = dictionary
def test_empty_dict(self):
    """An empty TrieDict is falsy; truthiness tracks insertion and deletion."""
    trie_dict = TrieDict()
    # Fresh dictionary holds nothing -> falsy.
    self.assertFalse(bool(trie_dict))
    # One entry -> truthy.
    trie_dict['one'] = 1
    self.assertTrue(bool(trie_dict))
    # Removing the last entry makes it falsy again.
    del trie_dict['one']
    self.assertFalse(bool(trie_dict))
def dict_combine(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
    """Install the combine dictionary.

    Args:
        dictionary: A ``DictInterface`` used as-is; a raw collection of
            string keys wrapped in a ``TrieDict``; a collection containing
            non-string keys (pre-tokenized tuples) wrapped in a
            ``TupleTrieDict`` after expanding string keys into every
            possible tokenization; or ``None`` to clear it.
    """
    if dictionary is not None and not isinstance(dictionary, DictInterface):
        all_strings = all(isinstance(k, str) for k in dictionary)
        if all_strings:
            dictionary = TrieDict(dictionary)
        else:
            # Mixed keys: expand each string into all of its possible
            # tokenizations, keep non-string (tuple) keys as-is.
            expanded = set()
            for key in dictionary:
                if isinstance(key, str):
                    expanded.update(possible_tokenization(key))
                else:
                    expanded.add(key)
            dictionary = TupleTrieDict(expanded)
    self.config.dict_combine = dictionary
def dict_whitelist(self, dictionary: Union[DictInterface, Union[Dict[str, Any], Set[str]]]):
    """Install the whitelist dictionary.

    Args:
        dictionary: A ``DictInterface`` used as-is, a plain ``dict``/``set``
            which gets wrapped in a ``TrieDict``, or ``None`` to clear the
            whitelist.
    """
    # Guard against None so the whitelist can be cleared by assigning None,
    # matching the behavior of dict_force / dict_blacklist / dict_combine.
    # Previously TrieDict(None) was attempted for a None input.
    if dictionary is not None and not isinstance(dictionary, DictInterface):
        dictionary = TrieDict(dictionary)
    self.config.dict_whitelist = dictionary
def setUp(self) -> None:
    """Prepare a sample sentence and a single-entry TrieDict for each test."""
    super().setUp()
    # The entry '重要' occurs twice in this sentence.
    self.text = '第一个词语很重要,第二个词语也很重要'
    self.trie_dict = TrieDict({'重要': 'important'})
def test_tokenize_2(self):
    """A set-backed TrieDict maps every entry to True and reports char spans."""
    t = TrieDict({'次世代', '生产环境'})
    sentence = '2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'
    self.assertSequenceEqual(
        t.tokenize(sentence),
        [(15, 19, True), (21, 24, True)])