def predict(self, content):
    """Tag each token of the input sentences.

    :param content: list of list of str. Each string is a token (word).
    :return: list of list of str. Each string is "word/tag".
    :raises ValueError: if no model has been loaded, or if the input is
        not a list of lists of strings.
    """
    if not hasattr(self, "pipeline"):
        raise ValueError("You have to load model first.")

    sentence_list = content
    # 1. Validate the input: every token must be a string.
    # BUGFIX: use isinstance() instead of `type(obj) == str` so str
    # subclasses are accepted as well.
    for sentence in sentence_list:
        if not all(isinstance(obj, str) for obj in sentence):
            raise ValueError("Input must be list of list of string.")

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field("words", sentence_list)

    # 3. Run the pipeline (annotates the dataset in place with a "tag" field).
    self.pipeline(dataset)

    def merge_tag(words_list, tags_list):
        # Pair every word with its predicted tag as "word/tag".
        return [
            [w + "/" + t for w, t in zip(words, tags)]
            for words, tags in zip(words_list, tags_list)
        ]

    output = dataset.field_arrays["tag"].content
    if isinstance(content, str):
        # NOTE(review): a bare str never passes the validation loop above,
        # so this branch looks unreachable in practice — kept for parity
        # with the sibling predict() implementations.
        return output[0]
    elif isinstance(content, list):
        return merge_tag(content, output)
def test_delete_field(self):
    """delete_field removes the named field and leaves the others intact."""
    dd = DataSet()
    dd.add_field("x", [[1, 2, 3]] * 10)
    dd.add_field("y", [[1, 2, 3, 4]] * 10)
    dd.delete_field("x")
    # assertNotIn/assertIn produce clearer failure messages than
    # assertFalse/assertTrue wrapped around a containment expression.
    self.assertNotIn("x", dd.field_arrays)
    self.assertIn("y", dd.field_arrays)
def test_add_field(self):
    """add_field accepts ragged string data with input/target flags set."""
    ds = DataSet({"x": [3, 4]})
    ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']],
                 is_input=True, is_target=True)
    # Smoke check: rendering the dataset must not raise on a ragged field.
    print(ds)
def predict(self, content):
    """Word-segmentation interface.

    :param content: str or List[str]. For example, "中文分词很重要!"
        yields "中文 分词 很 重要 !". If a List[str] such as
        ["中文分词很重要!", ...] is passed, the result is
        ["中文 分词 很 重要 !", ...].
    :return: str or List[str], matching the type of the input.
    :raises ValueError: if no model has been loaded, or if the input is
        neither a str nor a list.
    """
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Normalize the input into a list of sentences.
    if isinstance(content, str):
        sentence_list = [content]
    elif isinstance(content, list):
        sentence_list = content
    else:
        # BUGFIX: previously an unsupported type fell through with an
        # empty sentence_list and the method silently returned None.
        raise ValueError("Input must be str or list of str.")

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field('raw_sentence', sentence_list)

    # 3. Run the pipeline (annotates the dataset in place).
    self.pipeline(dataset)

    output = dataset.get_field('output').content
    if isinstance(content, str):
        # Unwrap the batch dimension for single-string input.
        return output[0]
    return output
def predict(self, content):
    """Run dependency parsing on raw text via the POS tagger + pipeline."""
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Parsing needs segmented, POS-tagged input, so tag first.
    # pos_out looks like: ['这里/NN 是/VB 分词/NN 结果/NN'.split()]
    pos_out = self.pos_tagger.predict(content)

    # 2. Build a dataset with separate word and POS fields, each
    # prefixed with a <BOS> token.
    dataset = DataSet()
    dataset.add_field('wp', pos_out)

    def first_parts(instance):
        # Word is everything before the first '/'.
        return ['<BOS>'] + [pair.split('/')[0] for pair in instance['wp']]

    def second_parts(instance):
        # POS tag is the second '/'-separated piece.
        return ['<BOS>'] + [pair.split('/')[1] for pair in instance['wp']]

    dataset.apply(first_parts, new_field_name='words')
    dataset.apply(second_parts, new_field_name='pos')
    dataset.rename_field("words", "raw_words")

    # 3. Run the parsing pipeline (annotates the dataset in place).
    self.pipeline(dataset)

    dataset.apply(lambda ins: [str(arc) for arc in ins['arc_pred']],
                  new_field_name='arc_pred')
    dataset.apply(
        lambda ins: [arc + '/' + label
                     for arc, label in zip(ins['arc_pred'],
                                           ins['label_pred_seq'])][1:],
        new_field_name='output')

    # Output like: [['2/top', '0/root', '4/nn', '2/dep']]
    return dataset.field_arrays['output'].content
def predict(self, content):
    """
    :param content: list of list of str. Each string is a token(word).
    :return answer: list of list of str. Each string is a tag.
    """
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Accept either a single sentence (str) or a batch (list).
    if isinstance(content, str):
        sentences = [content]
    elif isinstance(content, list):
        sentences = content
    else:
        sentences = []

    # 2. Wrap the sentences in a DataSet.
    dataset = DataSet()
    dataset.add_field('words', sentences)

    # 3. The pipeline annotates the dataset in place.
    self.pipeline(dataset)

    predictions = dataset['word_pos_output'].content
    if isinstance(content, str):
        # Unwrap the batch dimension for single-string input.
        return predictions[0]
    if isinstance(content, list):
        return predictions
def predict(self, content):
    """
    :param content: list of list of str. Each string is a token(word).
    :return answer: list of list of str. Each string is a tag.
    """
    if not hasattr(self, "pipeline"):
        raise ValueError("You have to load model first.")

    # 1. Normalize the input into a list of sentences.
    sentence_list = []
    if isinstance(content, str):
        sentence_list.append(content)
    elif isinstance(content, list):
        sentence_list = content

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field("words", sentence_list)

    # 3. Run the pipeline (annotates the dataset in place with a "tag" field).
    self.pipeline(dataset)

    def decode_tags(ins):
        # Convert BMES-style span tags back into "word/pos" strings.
        pred_tags = ins["tag"]
        chars = ins["words"]
        words = []
        start_idx = 0
        for idx, tag in enumerate(pred_tags):
            if tag[0] == "S":
                # BUGFIX: join the character span like the "E" branch does.
                # Previously this appended `chars[start_idx:idx+1] + "/" + ...`
                # which raises TypeError (list + str) when chars is a list;
                # "".join is also a safe no-op if chars is a plain str.
                words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
                start_idx = idx + 1
            elif tag[0] == "E":
                # End of a multi-char word: merge the span into one word.
                words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
                start_idx = idx + 1
        return words

    dataset.apply(decode_tags, new_field_name="tag_output")
    output = dataset.field_arrays["tag_output"].content
    if isinstance(content, str):
        return output[0]
    elif isinstance(content, list):
        return output
def test_add_append(self):
    """Equal-length fields coexist; a mismatched length is rejected."""
    data = DataSet()
    data.add_field("x", [[1, 2, 3]] * 10)
    data.add_field("y", [[1, 2, 3, 4]] * 10)
    data.add_field("z", [[5, 6]] * 10)

    self.assertEqual(len(data), 10)
    for name, row in (("x", [1, 2, 3]), ("y", [1, 2, 3, 4]), ("z", [5, 6])):
        self.assertEqual(data.field_arrays[name].content, [row] * 10)

    # Adding a field whose length differs from the dataset must fail.
    with self.assertRaises(RuntimeError):
        data.add_field("??", [[1, 2]] * 40)
def predict(self, content):
    """Parse sentences and return their predicted heads and labels."""
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Accept either a single sentence (str) or a batch (list).
    if isinstance(content, str):
        sentences = [content]
    elif isinstance(content, list):
        sentences = content
    else:
        sentences = []

    # 2. Wrap the sentences in a DataSet.
    dataset = DataSet()
    dataset.add_field('words', sentences)

    # 3. The pipeline annotates the dataset in place.
    self.pipeline(dataset)

    # Convert each per-instance 'heads' value (array-like with .tolist(),
    # presumably a numpy array — TODO confirm) to a plain Python list.
    for instance in dataset:
        instance['heads'] = instance['heads'].tolist()

    return dataset['heads'], dataset['labels']
def test_add_append(self):
    """Several equal-length fields can be added and read back verbatim."""
    data = DataSet()
    expected = {"x": [1, 2, 3], "y": [1, 2, 3, 4], "z": [5, 6]}
    for name, row in expected.items():
        data.add_field(name, [row] * 10)

    self.assertEqual(len(data), 10)
    for name, row in expected.items():
        self.assertEqual(data.field_arrays[name].content, [row] * 10)
def predict(self, content):
    """Run the loaded pipeline over raw sentence(s) and return the output."""
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Accept either a single sentence (str) or a batch (list).
    if isinstance(content, str):
        sentences = [content]
    elif isinstance(content, list):
        sentences = content
    else:
        sentences = []

    # 2. Wrap the raw sentences in a DataSet.
    dataset = DataSet()
    dataset.add_field('raw_sentence', sentences)

    # 3. The pipeline annotates the dataset in place.
    self.pipeline(dataset)

    results = dataset['output'].content
    if isinstance(content, str):
        # Unwrap the batch dimension for single-string input.
        return results[0]
    if isinstance(content, list):
        return results