def create_new_corpus(data_dict, corpus_vol, **kwargs):
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()

    if not corpus_vol:
        return
    if sem_nums > len(intents):
        return

    for i in range(corpus_vol):
        # sample `sem_nums` distinct intents for this document
        intent_sam = set()
        while len(intent_sam) < sem_nums:
            intent_sam.add(random.choice(list(intents)))

        spanset = SpanSet()
        sentences = []
        start_position = 0
        for intent in list(intent_sam):
            txt = random.choice(list(data_dict[intent]))
            sentences.append(txt)
            # 'noise' text is concatenated but does not produce an entity span
            if intent != 'noise':
                spanset.append(
                    Span(start=start_position,
                         end=start_position + len(txt),
                         entity=intent))
            start_position += len(txt)

        doc = Document(text=''.join(sentences),
                       label='|'.join(intent_sam),
                       span_set=spanset)
        new_corpus.append(doc)

    return new_corpus
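# A minimal usage sketch for create_new_corpus (hypothetical, not from this
# repo): data_dict maps each intent name to a pool of utterance strings; the
# reserved 'noise' intent contributes text without an entity span.
if __name__ == "__main__":
    fake_data = {
        "weather": ["看一下上海的天气。"],
        "music": ["来一首青花瓷。"],
        "noise": ["嗯。"],
    }
    demo_corpus = create_new_corpus(fake_data, corpus_vol=10, sem_nums=2)
    for demo_doc in demo_corpus:
        print(demo_doc)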
def two_add_link(map_data, file1, file2, link, domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    link_list = read_raw_data(link)
    len_all = max(len(list1), len(list2))
    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    doc_list = []
    dict_list = read_map(map_data)

    # generate as many samples as the longer input list
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        l3 = choice(link_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        # file1 + link + file2
        l = l1 + l3 + l2
        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = (dict_list[path1[:-4]] + ": " + path1[:-4] + "||" +
                       dict_list[path2[:-4]] + ": " + path2[:-4])
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + l3), end=len(l1 + l3 + l2end),
                 entity=path2[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        # file2 + link + file1 (reversed order)
        ll = l2 + l3 + l1
        doc2 = Document(ll)
        doc2.domain = domain
        doc2.intent = (dict_list[path1[:-4]] + ": " + path1[:-4] + "||" +
                       dict_list[path2[:-4]] + ": " + path2[:-4])
        span_list2 = [
            Span(start=0, end=len(l2end), entity=path2[:-4]),
            Span(start=len(l2 + l3), end=len(l2 + l3 + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

    doc_list = list(set(doc_list))  # deduplicate
    corpus = Corpus(doc_list)
    res_path = ("./data/" + path1[:-4] + '-' + path2[:-4] + '-' + 'link' +
                ".conllx")
    corpus.write_to_file(res_path)
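# A quick check of the offset arithmetic used above (hypothetical strings;
# line_end_remove is assumed to strip only the trailing newline): string
# lengths add under concatenation, so the second entity starts at len(l1 + l3).
_l1, _l3, _l2 = "打开空调\n", "然后", "关闭车窗\n"
_combined = _l1 + _l3 + _l2
assert _combined[0:len("打开空调")] == "打开空调"
assert _combined[len(_l1 + _l3):len(_l1 + _l3 + "关闭车窗")] == "关闭车窗"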
def test_tensorflow_inference(datadir):
    # TODO(howl-anderson): skip this test until the model file oversize issue
    # is solved
    return

    workshop_dir = datadir
    model_dir = os.path.join(workshop_dir, "./saved_model")

    inference = TensorFlowInference(model_dir)
    result = inference.infer("看一下上海的天气。")
    print(result)

    expected = (
        "看一下上海的天气。",
        Sequence(
            text=["看", "一", "下", "上", "海", "的", "天", "气", "。"],
            span_set=SpanSet([Span(3, 5, "地点", value=None, normal_value=None)]),
            id=None,
            label=None,
            extra_attr={},
        ),
        ["O", "O", "O", "B-地点", "L-地点", "O", "O", "O", "O"],
        False,
    )

    assert expected == result
def test_tensorflow_keras_h5_inference(datadir):
    workshop_dir = datadir
    model_file = os.path.join(workshop_dir, "./h5_model/model.h5")
    tag_lookup_file = os.path.join(workshop_dir, "./h5_model/tag_lookup_table.json")
    vocabulary_lookup_file = os.path.join(
        workshop_dir, "./h5_model/vocabulary_lookup_table.json")

    inference = Inference(model_file, tag_lookup_file, vocabulary_lookup_file)
    result = inference.infer("看一下上海的天气。")

    expected = (
        "看一下上海的天气。",
        Sequence(
            text=["看", "一", "下", "上", "海", "的", "天", "气", "。"],
            span_set=SpanSet(
                [Span(3, 5, "城市名", value=None, normal_value=None)]),
            id=None,
            label=None,
            extra_attr={},
        ),
        ["O", "O", "O", "B-城市名", "L-城市名", "O", "O", "O", "O"],
        False,
    )

    assert expected == result
def __init__(
    self,
    text: Union[List[str], str],
    span_set: SpanSet = None,
    id: Union[str, None] = None,
    label: Union[str, None] = None,
    extra_attr: Union[Mapping[str, Any], None] = None,
):
    # TODO:
    #   1. rename extra_attr to attr
    #   2. move label into attr
    #   3. span_set should be included in some column data

    # convert text from string to list, if needed
    if isinstance(text, str):
        text = list(text)

    self.text = text

    self._span_set = None
    self.span_set = span_set or SpanSet()

    self.id = id if id is not None else str(uuid.uuid4())
    self.label = label

    # for future usage
    self.extra_attr = extra_attr if extra_attr else {}

    self._compare_method = None
    self._hash_method = None
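# Minimal constructor usage sketch (assuming this is Sequence.__init__ as
# exercised by the tests in this repo; the label value is illustrative): a
# plain string is split into a character list and an id is auto-generated
# when omitted.
demo_seq = Sequence(
    "看一下上海的天气。",
    span_set=SpanSet([Span(3, 5, "地点")]),
    label="天气",
)
assert demo_seq.text[3:5] == ["上", "海"]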
def test_span_set_bind():
    seq = Sequence("abce", span_set=SpanSet([Span(1, 2, '1-2')]))

    result = seq.span_set[0].value
    expected = ["b"]

    assert result == expected
def _turn_training_data_to_offset(training_data):
    from tokenizer_tools.tagset.offset.sequence import Sequence
    from tokenizer_tools.tagset.offset.span import Span
    from tokenizer_tools.tagset.offset.span_set import SpanSet

    for example in training_data.training_examples:
        span_set = SpanSet()

        text = [i for i in example.text]  # needs to be a list of str, not a str
        intent = example.get("intent")
        for ent in example.get("entities", []):
            start, end, entity = ent["start"], ent["end"], ent["entity"]
            span_set.append(Span(start, end, entity))

        seq = Sequence(text, span_set, label=intent)

        yield seq
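# A hedged sketch of the training-data shape this generator expects (modelled
# on Rasa-style message objects; the Example and TrainingData classes below
# are stand-ins for illustration, not part of this repo):
class Example:
    def __init__(self, text, data):
        self.text = text
        self._data = data

    def get(self, key, default=None):
        return self._data.get(key, default)


class TrainingData:
    def __init__(self, examples):
        self.training_examples = examples


fake_training_data = TrainingData([
    Example("看一下上海的天气。",
            {"intent": "天气",
             "entities": [{"start": 3, "end": 5, "entity": "地点"}]})
])
for offset_seq in _turn_training_data_to_offset(fake_training_data):
    print(offset_seq.label)  # "天气"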
def test_difference(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = corpus_one.difference(corpus_two)

    expected = Corpus([
        Document(
            "王小明在台北新竹的清华大学读书。",
            span_set=SpanSet(
                [Span(0, 3, "PERSON"), Span(4, 8, "GPE"), Span(9, 13, "ORG")]),
            id="3",
        )
    ])

    assert result == expected
def one_add_noise(map_data, file1, noise, domain, pos=''):
    list1 = read_raw_data(file1)
    noise_list = read_raw_data(noise)
    len_all = max(len(list1), len(noise_list))
    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []

    # generate as many samples as the longer input list
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(noise_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        if pos == 'before':
            l = l2 + l1
            span_list1 = [
                Span(start=len(l2), end=len(l2 + l1end), entity=path1[:-4]),
            ]
        elif pos == 'after':
            l = l1 + l2
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]
        else:
            l = l1
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]

        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

    doc_list = list(set(doc_list))  # deduplicate
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + 'noise' + '_' + pos + ".conllx"
    corpus.write_to_file(res_path)
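# Illustrative call (file names hypothetical; the raw files are assumed to
# exist): prefix each sampled line from query.txt with a noise line and write
# ./data/query-noise_before.conllx. read_map must map "query" (path1[:-4])
# to its intent label.
one_add_noise("map.txt", "./raw/query.txt", "./raw/noise.txt",
              domain="assistant", pos='before')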
def one_to_conllx(map_data, file1, domain):
    list1 = read_raw_data(file1)
    path1 = os.path.basename(file1)
    dict_list = read_map(map_data)
    doc_list = []

    # one document per raw line
    for i in list1:
        doc1 = Document(i)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        lenx = line_end_remove(i)
        span_list1 = [
            Span(start=0, end=len(lenx), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)
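# Illustrative call (paths hypothetical): turns each line of query.txt into a
# single-entity Document and writes ./data/query.conllx.
one_to_conllx("map.txt", "./raw/query.txt", domain="assistant")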
def read_data(data_path, output_file):
    data_path = Path(data_path)

    doc_list = []
    for data_file in data_path.glob("*.json"):
        with jsonlines.open(str(data_file)) as reader:
            for obj in reader:
                text = [i for i in obj["content"]]
                doc = Document(text)
                doc.sub_function = obj["childFunction"]
                doc.domain = obj["domain"]
                doc.function = obj["function"]
                doc.intent = obj["intent"]

                span_list = []
                for entity in obj["marked"]:
                    record = entity["record"]
                    if not record:
                        continue
                    start = int(record[0])
                    end = int(record[-1]) + 1
                    entity_type = entity["titleIndex"]
                    span = Span(start, end, entity_type)
                    span_list.append(span)

                entities = SpanSet(span_list)
                doc.entities = entities

                doc_list.append(doc)

    corpus = Corpus(doc_list)
    corpus.write_to_file(output_file)
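# One record in the shape read_data expects (field values illustrative):
# "record" holds character indices, so the span end is the last index + 1;
# here ["3", "4"] yields Span(3, 5), covering "上海".
example_record = {
    "content": "看一下上海的天气。",
    "domain": "assistant",
    "function": "weather",
    "childFunction": "query",
    "intent": "查天气",
    "marked": [{"record": ["3", "4"], "titleIndex": "地点"}],
}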
def test_document_pattern():
    # construct a pattern object
    dp = DocumentPattern("name 的 goods".split())
    dp.entities = SpanSet([
        EntityPlaceholder(start=0, end=1, entity="name"),
        EntityPlaceholder(start=2, end=3, entity="goods"),
    ])

    # test if the render method works
    doc = dp.render(name=["Real", "Name"], goods=["RealGoods"])

    expected_doc_snippet = "[Real Name](name) 的 [RealGoods](goods)"
    result_doc = str(doc)
    assert expected_doc_snippet in result_doc

    # make sure the DocumentPattern itself is untouched
    expected_pattern_snippet = "<name> 的 <goods>"
    result_pattern = str(dp)
    assert expected_pattern_snippet in result_pattern
def test_check_overlap():
    # adjacent spans do not overlap: check passes
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    assert span_set.check_overlap()[0] == True

    # disjoint spans do not overlap: check passes
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(4, 6, 'entity'))
    assert span_set.check_overlap()[0] == True

    # nested spans overlap: check fails and reports the offending pair
    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    check_result = span_set.check_overlap()
    assert check_result[0] == False
    assert check_result[1] == [(Span(1, 4, 'entity'), Span(2, 3, 'entity'))]
def one_before_after_add_noise(map_data, file1, before_noise, after_noise,
                               domain):
    list1 = read_raw_data(file1)
    before_list = read_raw_data(before_noise)
    after_list = read_raw_data(after_noise)
    len_all = max(len(list1), len(before_list), len(after_list))
    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []

    # generate as many samples as the longest input list
    for i in range(0, len_all):
        l1 = choice(list1)
        before = choice(before_list)
        after = choice(after_list)
        l1end = line_end_remove(l1)

        # file1 alone
        text1 = l1
        doc1 = Document(text1)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        # before + file1
        text2 = before + l1
        doc2 = Document(text2)
        doc2.domain = domain
        doc2.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list2 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

        # before + file1 + after
        text3 = before + l1 + after
        doc3 = Document(text3)
        doc3.domain = domain
        doc3.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list3 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc3.entities = SpanSet(span_list3)
        doc_list.append(doc3)

        # file1 + after
        text4 = l1 + after
        doc4 = Document(text4)
        doc4.domain = domain
        doc4.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list4 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc4.entities = SpanSet(span_list4)
        doc_list.append(doc4)

    doc_list = list(set(doc_list))  # deduplicate
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)
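# Illustrative call (paths hypothetical): for each sampled line this emits
# the four combinations plain / before+x / before+x+after / x+after.
one_before_after_add_noise("map.txt", "./raw/query.txt",
                           "./raw/before.txt", "./raw/after.txt",
                           domain="assistant")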
# Standalone script: concatenate two randomly chosen documents from
# all_data.conllx, shifting the second document's spans past the first.
# (Imports assumed from tokenizer_tools' offset tagset modules.)
from random import choice

from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.tagset.offset.document import Document
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.span_set import SpanSet

corpus = Corpus.read_from_file("./data/all_data.conllx")

list1 = [doc for doc in corpus]
len_all = len(list1)

doc_list = []
for i in range(0, len_all):
    l1 = choice(list1)
    len1 = len(l1.text)
    span_list = []
    for span in l1.span_set:
        span_list.append(span)

    l2 = choice(list1)
    # shift the second document's spans by the first document's length
    for span in l2.span_set:
        span_ll = Span(start=len1 + span.start,
                       end=len1 + span.end,
                       entity=span.entity)
        span_list.append(span_ll)

    text = "".join(l1.text) + "".join(l2.text)
    doc1 = Document(text)
    doc1.entities = SpanSet(span_list)
    doc1.domain = l1.domain  # the first document's domain wins

    doc_list.append(doc1)

doc_list = list(set(doc_list))  # deduplicate
corpus = Corpus(doc_list)
corpus.write_to_file('./data/data_all.conllx')
def test_check_match():
    # both span values match the sentence text: check passes
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(2, 3, 'entity', '秋'))
    assert span_set.check_match('赛春秋')[0] == True

    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(4, 6, 'entity', '秋天'))
    assert span_set.check_match('赛春秋赛秋天')[0] == True

    # nested spans can both match
    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    assert span_set.check_match('赛赛春秋')[0] == True

    # nothing matches: check fails and reports every mismatched span
    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    check_result = span_set.check_match('不不不不')
    assert check_result[0] == False
    assert check_result[1] == [
        Span(1, 4, 'entity', '赛春秋'),
        Span(2, 3, 'entity', '春')
    ]
def two_add_before_link_after(map_data, file1, file2, before, link, after,
                              domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    before_list = read_raw_data(before)
    link_list = read_raw_data(link)
    after_list = read_raw_data(after)
    len_all = min(len(list1), len(list2))
    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    doc_list = []
    dict_list = read_map(map_data)
    intent = (dict_list[path1[:-4]] + ": " + path1[:-4] + "||" +
              dict_list[path2[:-4]] + ": " + path2[:-4])

    # generate as many samples as the shorter input list
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        link_line = choice(link_list)
        before_line = choice(before_list)
        before2_line = choice(before_list)
        after_line = choice(after_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        # Enumerate all sixteen combinations: optional leading noise,
        # optional link word and/or inner noise between the two utterances,
        # and optional trailing noise. (The original spelled these sixteen
        # documents out by hand; this loop builds the same set.)
        for prefix in ('', before_line):
            # a second noise sample is used between the utterances when a
            # leading noise sample is already present
            inner = before_line if prefix == '' else before2_line
            for mid in ('', link_line, inner, link_line + inner):
                for suffix in ('', after_line):
                    text = prefix + l1 + mid + l2 + suffix
                    doc = Document(text)
                    doc.domain = domain
                    doc.intent = intent
                    doc.entities = SpanSet([
                        Span(start=len(prefix),
                             end=len(prefix + l1end),
                             entity=path1[:-4]),
                        Span(start=len(prefix + l1 + mid),
                             end=len(prefix + l1 + mid + l2end),
                             entity=path2[:-4]),
                    ])
                    doc_list.append(doc)

    doc_list = list(set(doc_list))  # deduplicate
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + path2[:-4] + ".conllx"
    corpus.write_to_file(res_path)
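# Illustrative call (paths hypothetical): emits all sixteen noise/link
# combinations for every sampled pair of lines from a.txt and b.txt.
two_add_before_link_after("map.txt", "./raw/a.txt", "./raw/b.txt",
                          "./raw/before.txt", "./raw/link.txt",
                          "./raw/after.txt", domain="assistant")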
def fasttext_parser(data_generator_func):
    for sentence in data_generator_func():
        # fastText's text-classification format uses the "__label__" prefix;
        # the original "__label_" was missing an underscore
        data = '__label__{} {}'.format(sentence.label, ' '.join(sentence.text))
        yield data


if __name__ == "__main__":
    from uuid import UUID

    from tokenizer_tools.tagset.offset.sequence import Sequence
    from tokenizer_tools.tagset.offset.span import Span
    from tokenizer_tools.tagset.offset.span_set import SpanSet

    data = [
        Sequence(text='我要听周杰伦的青花瓷',
                 span_set=SpanSet([Span(7, 10, '地点')]),
                 id=UUID('59139985-367e-44c3-8540-b6340d07f79e'),
                 label='媒体'),
        Sequence(text='我要听周杰伦的夜曲',
                 # span end adjusted to 9: the original (7, 10) overran the
                 # nine-character text
                 span_set=SpanSet([Span(7, 9, '地点')]),
                 id=UUID('59139985-367e-44c3-8540-b6340d07f79e'),
                 label='媒体')
    ]

    def faked_data_generator_func():
        return data

    result = fasttext_parser(faked_data_generator_func)
    for i in result:
        print(i)
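# Sanity check of the output format produced above (standalone; mirrors the
# demo data): one whitespace-joined character sequence per line, preceded by
# fastText's default label prefix.
demo_line = '__label__{} {}'.format('媒体', ' '.join('我要听周杰伦的青花瓷'))
assert demo_line == '__label__媒体 我 要 听 周 杰 伦 的 青 花 瓷'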
def test_eq_():
    a = SpanSet()
    a.append(Span(1, 2, 'entity'))
    a.append(Span(2, 3, 'entity'))

    b = SpanSet()
    b.append(Span(1, 2, 'entity'))
    b.append(Span(2, 3, 'entity'))

    assert a == b

    c = SpanSet()  # empty SpanSet
    assert a != c

    d = SpanSet()  # same spans as `a`, appended in a different order
    d.append(Span(2, 3, 'entity'))
    d.append(Span(1, 2, 'entity'))
    assert a == d

    e = SpanSet()  # different spans from `a`
    e.append(Span(0, 1, 'entity'))
    e.append(Span(1, 2, 'entity'))
    assert a != e
def test_express_pattern(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")

    express_pattern = ExpressPattern(corpus)
    result = express_pattern.compute()

    expected = {
        ("<PERSON>", "在", "<GPE>", "的", "<ORG>", "读", "书", "。"): [
            Document(
                text=[
                    "王", "小", "明", "在", "北", "京", "的",
                    "清", "华", "大", "学", "读", "书", "。",
                ],
                span_set=SpanSet([
                    Span(0, 3, "PERSON", value=None, normal_value=None),
                    Span(4, 6, "GPE", value=None, normal_value=None),
                    Span(7, 11, "ORG", value=None, normal_value=None),
                ]),
                id="1",
                label=None,
                extra_attr={},
            ),
            Document(
                text=[
                    "王", "小", "明", "在", "台", "北", "新", "竹", "的",
                    "清", "华", "大", "学", "读", "书", "。",
                ],
                span_set=SpanSet([
                    Span(0, 3, "PERSON", value=None, normal_value=None),
                    Span(4, 8, "GPE", value=None, normal_value=None),
                    Span(9, 13, "ORG", value=None, normal_value=None),
                ]),
                id="3",
                label=None,
                extra_attr={},
            ),
        ],
        ("来", "一", "首", "<歌手名>", "的", "歌", "。"): [
            Document(
                text=["来", "一", "首", "蓝", "泽", "雨", "的", "歌", "。"],
                span_set=SpanSet(
                    [Span(3, 6, "歌手名", value=None, normal_value=None)]),
                id="2",
                label=None,
                extra_attr={},
            )
        ],
    }

    assert result == expected