def process_one_line(line, logger=sys.stderr):
    """
    Decode one BSON record and turn it into a Document plus a SentenceX.

    The decoded object is read through the keys ``text``, ``intent``, ``id``,
    ``domain`` and ``entities``; every entity supplies ``start``, ``end`` and
    ``entity``.

    :param line: raw record handed to ``bson.loads``
    :param logger: object with a ``write`` method that receives a message when
        a span cannot be built (defaults to ``sys.stderr``)
    :return: tuple ``(seq, sentence)``: the Document and the SentenceX built
        from the same record
    :raises CheckFailedError: when ``Span`` raises ``OffsetSpanCheckError``;
        the original error is attached as the cause
    """
    obj = bson.loads(line)

    text = obj['text']
    intent = obj['intent']
    doc_id = obj["id"]
    domain = obj["domain"]

    seq = Document(text, label=intent, id=doc_id)
    seq.domain = domain

    for entity in obj['entities']:
        start = int(entity['start'])  # original index starts at 0
        end = int(entity['end'])
        entity_type = entity['entity']

        try:
            span = Span(start, end, entity_type)  # may raise OffsetSpanCheckError
        except OffsetSpanCheckError as e:
            logger.write("{}\tspan init failed: {}\n".format(doc_id, e))
            # Chain the cause so the traceback still shows why the span failed.
            raise CheckFailedError from e

        seq.span_set.append(span)

    encoding = offset_to_biluo(seq)  # may raise AssertionError

    sentence = SentenceX(word_lines=text, attribute_lines=[encoding], id=seq.id)
    sentence.meta = {'domain': domain, 'label': intent}

    return seq, sentence
def _copy_structure_as_doc(self) -> Document:
    """
    Build a new Document mirroring this object's text, metadata and spans.

    The text and the span set are deep-copied, and the copied span set is
    bound to the new document.  The ``domain``, ``intent``, ``function`` and
    ``sub_function`` attributes are assigned as-is (no copy is made).

    :return: the newly built Document
    """
    clone = Document(copy.deepcopy(self.text))

    for name in ("domain", "intent", "function", "sub_function"):
        setattr(clone, name, getattr(self, name))

    clone.span_set = copy.deepcopy(self.span_set)
    clone.span_set.bind(clone)

    return clone
def two_add_link(map_data, file1, file2, link, domain):
    """
    Generate two-entity documents joined by a link phrase and write them out.

    For each round a random line is drawn from ``file1``, ``file2`` and
    ``link``; two documents are produced, ``line1 + link + line2`` and
    ``line2 + link + line1``, each carrying one span per file line.  The
    entity name of a span is the file's base name with its last four
    characters removed.  Duplicated documents are dropped through ``set``
    before the corpus is written to ``./data/<name1>-<name2>-link.conllx``.

    :param map_data: argument passed to ``read_map``; the result is indexed
        by the entity names to build the intent string
    :param file1: path of the first raw data file
    :param file2: path of the second raw data file
    :param link: path of the raw data file holding link phrases
    :param domain: value stored on every generated document's ``domain``
    """
    first_lines = read_raw_data(file1)
    second_lines = read_raw_data(file2)
    connectors = read_raw_data(link)

    # number of rounds: the longer of the two entity lists
    rounds = max(len(first_lines), len(second_lines))

    name1 = os.path.basename(file1)
    name2 = os.path.basename(file2)
    documents = []
    intent_map = read_map(map_data)

    tag1 = name1[:-4]
    tag2 = name2[:-4]

    for _ in range(rounds):
        a = choice(first_lines)
        b = choice(second_lines)
        joiner = choice(connectors)

        a_core = line_end_remove(a)
        b_core = line_end_remove(b)

        orderings = (
            ((a, a_core, tag1), (b, b_core, tag2)),
            ((b, b_core, tag2), (a, a_core, tag1)),
        )
        for (head, head_core, head_tag), (tail, tail_core, tail_tag) in orderings:
            doc = Document(head + joiner + tail)
            doc.domain = domain
            # the intent always lists file1 first, whatever the text order is
            doc.intent = (intent_map[tag1] + ": " + tag1 + "||"
                          + intent_map[tag2] + ": " + tag2)

            tail_start = len(head + joiner)
            doc.entities = SpanSet([
                Span(start=0, end=len(head_core), entity=head_tag),
                Span(start=tail_start, end=tail_start + len(tail_core),
                     entity=tail_tag),
            ])
            documents.append(doc)

    documents = list(set(documents))
    corpus = Corpus(documents)
    res_path = "./data/" + tag1 + '-' + tag2 + '-' + 'link' + ".conllx"
    corpus.write_to_file(res_path)
def to_offset(self, sequence, text, **kwargs):
    """
    Decode a tag sequence into a Document carrying the decoded spans.

    :param sequence: tag sequence handed to ``self.decode_to_offset``
    :param text: text used to construct the Document
    :param kwargs: ``label`` and ``id`` are popped and stored on the document
        (``None`` when absent); whatever remains becomes ``extra_attr``
    :return: the Document with its span set bound to it
    """
    document = Document(text)

    # each decoded item is indexed as (start, end, entity)
    for item in self.decode_to_offset(sequence):
        document.span_set.append(Span(item[0], item[1], item[2]))

    document.span_set.bind(document)

    document.label = kwargs.pop('label', None)
    document.id = kwargs.pop('id', None)
    document.extra_attr = kwargs

    return document
def inference_process(cls, model, input_data, input_ids):
    """
    Run the model on every input, one request per item, and collect a Corpus.

    :param model: object whose ``inference`` method accepts a ``Request``
    :param input_data: iterable of inputs, each sent alone in its own request
    :param input_ids: iterable of ids, paired with ``input_data`` by ``zip``
    :return: Corpus of the predicted documents, each with ``label`` and ``id`` set
    """
    results = []

    for sample, sample_id in zip(input_data, input_ids):
        response = model.inference(Request([sample]))

        predicted = response['data'][0].sequence
        if not isinstance(predicted, Document):
            # rebuild as a Document from the result's text and span set
            predicted = Document(predicted.text, predicted.span_set)

        predicted.label = response['cls'][0][0]
        predicted.id = sample_id
        results.append(predicted)

    return Corpus(results)
def conllz_to_offset(sentence_data: Sentence, raise_exception=False, attr_index=0) -> Tuple[Document, bool]:
    """
    Convert a tagged sentence into a Document of offset spans.

    :param sentence_data: sentence providing ``word_lines``, the tag attribute
        selected by ``attr_index``, ``meta`` and ``id``
    :param raise_exception: when true, a ``TagSetDecodeError`` is re-raised
        instead of being reported through the returned flag
    :param attr_index: index passed to ``get_attribute_by_index``
    :return: ``(document, failed)``; on a decode failure (and
        ``raise_exception`` false) the document has no decoded spans and
        ``failed`` is ``True``
    """
    decoder = BILUOSequenceEncoderDecoder()

    words = sentence_data.word_lines
    tags = sentence_data.get_attribute_by_index(attr_index)

    # work on a copy so popping 'label' does not touch the sentence's meta
    meta = copy.deepcopy(sentence_data.meta)

    try:
        document = decoder.to_offset(
            tags,
            words,
            label=meta.pop('label', None),
            id=sentence_data.id,
            **meta
        )
    except TagSetDecodeError:
        if raise_exception:
            raise
        # invalid tag sequence: hand back an empty result and flag the failure
        return Document(words), True

    return document, False
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    """
    Build a Corpus of documents made by concatenating random texts of several intents.

    Each document draws ``sem_nums`` distinct intents, picks one random text
    per intent, joins the texts and labels the document with the intents
    joined by ``'|'``.  Every text except those of the ``'noise'`` intent gets
    a span covering it, with the intent as entity.

    :param data_dict: mapping from intent to a collection of texts
    :param corpus_vol: number of documents to generate
    :param kwargs: must contain ``sem_nums``, the number of intents per document
    :return: the Corpus, or ``None`` when ``corpus_vol`` is falsy or
        ``sem_nums`` exceeds the number of intents
    """
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()

    if not corpus_vol or sem_nums > len(intents):
        return None

    for _ in range(corpus_vol):
        # keep drawing until enough distinct intents have been collected
        picked = set()
        while len(picked) < sem_nums:
            picked.add(random.choice(list(intents)))

        spans = SpanSet()
        pieces = []
        cursor = 0

        for intent in list(picked):
            chunk = random.choice(list(data_dict[intent]))
            pieces.append(chunk)
            if intent != 'noise':
                spans.append(
                    Span(start=cursor, end=cursor + len(chunk), entity=intent))
            cursor += len(chunk)

        new_corpus.append(
            Document(text=''.join(pieces),
                     label='|'.join(picked),
                     span_set=spans))

    return new_corpus
def test_bind():
    """Check the markdown rendering before and after a bound span's value is replaced."""
    document = Document("abc")
    document.span_set.append(Span(start=0, end=1, entity="a"))

    assert document.convert_to_md() == "[a](a) b c"

    # bind the span to its document, then replace the span's value
    first_span = document.span_set[0]
    first_span.bind(document)
    first_span.value = ["a", "a", "a"]

    assert document.convert_to_md() == "[a a a](a) b c"
def one_add_noise(map_data, file1, noise, domain, pos=''):
    """
    Generate single-entity documents, optionally padded with a noise phrase.

    For each round a random line is drawn from ``file1`` and from ``noise``.
    With ``pos == 'before'`` the noise precedes the line, with
    ``pos == 'after'`` it follows it, and with any other value it is left
    out.  The span covers the line with its ending removed by
    ``line_end_remove``.  Duplicated documents are dropped through ``set``
    before writing ``./data/<name>-noise_<pos>.conllx``.

    :param map_data: argument passed to ``read_map``; the result is indexed
        by the entity name to build the intent string
    :param file1: path of the raw data file; its base name minus the last
        four characters is the entity name
    :param noise: path of the raw data file holding noise phrases
    :param domain: value stored on every generated document's ``domain``
    :param pos: ``'before'``, ``'after'`` or anything else for no noise
    """
    samples = read_raw_data(file1)
    noises = read_raw_data(noise)

    # number of rounds: the longer of the two lists
    rounds = max(len(samples), len(noises))

    intent_map = read_map(map_data)
    file_name = os.path.basename(file1)
    tag = file_name[:-4]
    documents = []

    for _ in range(rounds):
        sample = choice(samples)
        noise_line = choice(noises)

        core = line_end_remove(sample)
        line_end_remove(noise_line)  # result is not used; call kept as in the original flow

        if pos == 'before':
            text, start = noise_line + sample, len(noise_line)
        elif pos == 'after':
            text, start = sample + noise_line, 0
        else:
            text, start = sample, 0

        spans = [Span(start=start, end=start + len(core), entity=tag)]

        doc = Document(text)
        doc.domain = domain
        doc.intent = intent_map[tag] + ": " + tag
        doc.entities = SpanSet(spans)
        documents.append(doc)

    documents = list(set(documents))
    corpus = Corpus(documents)
    res_path = "./data/" + tag + '-' + 'noise' + '_' + pos + ".conllx"
    corpus.write_to_file(res_path)
def test_contains__(datadir, tmpdir):
    """Check ``in`` / ``not in`` on a Corpus for a member and a non-member document."""
    corpus = Corpus()
    for member in (seq_one, seq_two):
        corpus.append(member)

    assert seq_one in corpus

    stranger = Document("")
    assert stranger not in corpus
def one_to_conllx(map_data, file1, domain):
    """
    Convert every line of a raw data file into a one-span document and write a corpus.

    The span starts at 0 and ends at the length of the line after
    ``line_end_remove``; its entity is the file's base name with the last four
    characters removed.  The corpus is written to ``./data/<name>.conllx``.

    :param map_data: argument passed to ``read_map``; the result is indexed
        by the entity name to build the intent string
    :param file1: path of the raw data file
    :param domain: value stored on every generated document's ``domain``
    """
    lines = read_raw_data(file1)
    file_name = os.path.basename(file1)
    intent_map = read_map(map_data)
    tag = file_name[:-4]

    documents = []
    # one document per input line, no sampling
    for line in lines:
        doc = Document(line)
        doc.domain = domain
        doc.intent = intent_map[tag] + ": " + tag

        core = line_end_remove(line)
        doc.entities = SpanSet([Span(start=0, end=len(core), entity=tag)])
        documents.append(doc)

    corpus = Corpus(documents)
    corpus.write_to_file("./data/" + tag + ".conllx")
def test_difference(datadir):
    """Check that ``Corpus.difference`` of the two fixture files yields the single expected document."""
    left = Corpus.read_from_file(datadir / "corpus_one.conllx")
    right = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = left.difference(right)

    remaining_spans = SpanSet([
        Span(0, 3, "PERSON"),
        Span(4, 8, "GPE"),
        Span(9, 13, "ORG"),
    ])
    remaining_doc = Document(
        "王小明在台北新竹的清华大学读书。",
        span_set=remaining_spans,
        id="3",
    )

    assert result == Corpus([remaining_doc])
def read_data(data_path, output_file):
    """
    Read every ``*.json`` JSON-lines file in a directory and write one corpus.

    Each record becomes a Document whose text is the list of items of
    ``content`` and whose ``sub_function``, ``domain``, ``function`` and
    ``intent`` come from the record's ``childFunction``, ``domain``,
    ``function`` and ``intent`` keys.  Every entry of ``marked`` with a
    non-empty ``record`` becomes a span from the first recorded position to
    the last one plus one, typed by ``titleIndex``.

    :param data_path: directory searched (non-recursively) for ``*.json`` files
    :param output_file: destination passed to ``Corpus.write_to_file``
    """
    documents = []

    for json_file in Path(data_path).glob("*.json"):
        with jsonlines.open(str(json_file)) as reader:
            for record in reader:
                doc = Document(list(record["content"]))
                doc.sub_function = record["childFunction"]
                doc.domain = record["domain"]
                doc.function = record["function"]
                doc.intent = record["intent"]

                spans = []
                for marked in record["marked"]:
                    positions = marked["record"]
                    if positions:
                        # last recorded position is inclusive, span end is exclusive
                        spans.append(
                            Span(int(positions[0]),
                                 int(positions[-1]) + 1,
                                 marked["titleIndex"]))

                doc.entities = SpanSet(spans)
                documents.append(doc)

    Corpus(documents).write_to_file(output_file)
def test_express_pattern(datadir):
    """Check that ``ExpressPattern.compute`` groups the fixture documents by their expression pattern."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    result = ExpressPattern(corpus).compute()

    def make_doc(text, spans, doc_id):
        # text is split into a list of single characters
        return Document(
            text=list(text),
            span_set=SpanSet([
                Span(start, end, entity, value=None, normal_value=None)
                for start, end, entity in spans
            ]),
            id=doc_id,
            label=None,
            extra_attr={},
        )

    expected = {
        ("<PERSON>", "在", "<GPE>", "的", "<ORG>", "读", "书", "。"): [
            make_doc(
                "王小明在北京的清华大学读书。",
                [(0, 3, "PERSON"), (4, 6, "GPE"), (7, 11, "ORG")],
                "1",
            ),
            make_doc(
                "王小明在台北新竹的清华大学读书。",
                [(0, 3, "PERSON"), (4, 8, "GPE"), (9, 13, "ORG")],
                "3",
            ),
        ],
        ("来", "一", "首", "<歌手名>", "的", "歌", "。"): [
            make_doc("来一首蓝泽雨的歌。", [(3, 6, "歌手名")], "2"),
        ],
    }

    assert result == expected
# Build a corpus of document pairs: each new document is the text of two
# randomly drawn documents glued together, carrying the spans of both.
corpus = Corpus.read_from_file("./data/all_data.conllx")
list1 = [doc for doc in corpus]
len_all = len(list1)

doc_list = []
for i in range(len_all):
    l1 = choice(list1)
    len1 = len(l1.text)

    # spans of the first document are reused unchanged
    span_list = [span for span in l1.span_set]

    l2 = choice(list1)
    # spans of the second document are shifted by the length of the first text
    span_list.extend(
        Span(start=len1 + span.start, end=len1 + span.end, entity=span.entity)
        for span in l2.span_set
    )

    text = "".join(l1.text) + "".join(l2.text)
    doc1 = Document(text)
    doc1.entities = SpanSet(span_list)
    doc1.domain = l1.domain  # the domain is taken from the first document only
    doc_list.append(doc1)

# drop duplicated documents before writing
doc_list = list(set(doc_list))
corpus = Corpus(doc_list)
corpus.write_to_file('./data/data_all.conllx')
import filecmp

from tokenizer_tools.tagset.offset.document import Document
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.tagset.offset.document_compare_ways import DocumentCompareWays
from tokenizer_tools.tagset.offset.span_set import SpanSet

# Fixture documents shared by the tests in this module.
seq_one = Document("王小明在北京的清华大学读书。", id="1")
seq_one.span_set.append(Span(0, 3, "PERSON", "王小明"))
seq_one.span_set.append(Span(4, 6, "GPE", "北京"))
seq_one.span_set.append(Span(7, 11, "ORG", "清华大学"))

seq_two = Document("来一首蓝泽雨的歌。", id="2")
seq_two.span_set.append(Span(3, 6, "歌手名", "蓝泽雨"))

# Keep the module-level name ``seq`` bound to the last fixture built.
seq = seq_two


def test_read_from_file(datadir):
    """Check that the fixture file loads as the two fixture documents, in order."""
    loaded = Corpus.read_from_file(datadir / "output.conllx")

    assert len(loaded) == 2
    for index, fixture in enumerate((seq_one, seq_two)):
        assert loaded[index] == fixture


def test_write_to_file(datadir, tmpdir):
    # NOTE(review): only these two statements are visible in this view;
    # the test may be longer in the full file - confirm before relying on it.
    corpus = Corpus()
    corpus.append(seq_one)
def offset_to_biluo(sequence: Document) -> List[str]:
    """
    Encode the spans of a document as a list of BILUO tags.

    The result starts as one ``'O'`` per item of ``sequence.text``; for every
    span, the slice ``[span.start:span.end]`` is replaced by the encoding that
    ``BILUOEncoderDecoder(span.entity)`` produces for the covered text.

    :param sequence: document providing ``text`` and ``span_set``
    :return: list of tags
    """
    tags = ['O'] * len(sequence.text)

    for span in sequence.span_set:
        covered = slice(span.start, span.end)
        tags[covered] = BILUOEncoderDecoder(span.entity).encode(
            sequence.text[covered])

    return tags


if __name__ == "__main__":
    # small demo: build a document, check its spans, print the encoding
    seq = Document("王小明在北京的清华大学读书。")
    for span_args in (
        (0, 3, 'PERSON', '王小明'),
        (4, 6, 'GPE', '北京'),
        (7, 11, 'ORG', '清华大学'),
    ):
        seq.span_set.append(Span(*span_args))

    print(seq.check_span_set())
    print(offset_to_biluo(seq))
# Segment orderings generated per round, in output order.  "file1" and
# "file2" are the entity-bearing segments; every other name is filler text.
_TWO_FILE_LAYOUTS = (
    ("file1", "file2"),
    ("file1", "link", "file2"),
    ("before", "file1", "file2"),
    ("before", "file1", "file2", "after"),
    ("before", "file1", "link", "file2"),
    ("before", "file1", "link", "file2", "after"),
    ("file1", "file2", "after"),
    ("file1", "link", "file2", "after"),
    ("file1", "before", "file2"),
    ("file1", "before", "file2", "after"),
    ("file1", "link", "before", "file2"),
    ("file1", "link", "before", "file2", "after"),
    ("before", "file1", "before2", "file2"),
    ("before", "file1", "before2", "file2", "after"),
    ("before", "file1", "link", "before2", "file2"),
    ("before", "file1", "link", "before2", "file2", "after"),
)


def _compose_layout_doc(layout, segments, cores, entities, domain, intent):
    """Concatenate the segments named by ``layout`` into a Document with one span per entity segment."""
    text = None
    offsets = {}
    for name in layout:
        piece = segments[name]
        # a segment starts where the text accumulated so far ends
        offsets[name] = 0 if text is None else len(text)
        text = piece if text is None else text + piece

    doc = Document(text)
    doc.domain = domain
    doc.intent = intent
    # the span covers the segment with its line ending removed
    doc.entities = SpanSet([
        Span(start=offsets[name],
             end=offsets[name] + len(cores[name]),
             entity=entities[name])
        for name in layout if name in entities
    ])
    return doc


def two_add_before_link_after(map_data, file1, file2, before, link, after, domain):
    """
    Generate two-entity documents in 16 filler layouts and write them as a corpus.

    For each round one random line is drawn from ``file1``, ``file2`` and
    ``link``, two from ``before`` and one from ``after``.  The lines are
    combined in every ordering of ``_TWO_FILE_LAYOUTS``; each resulting
    document carries one span for the ``file1`` line and one for the
    ``file2`` line, both measured on the line after ``line_end_remove``.
    Duplicated documents are dropped through ``set`` before the corpus is
    written to ``./data/<name1>-<name2>.conllx``.

    The entity names are the files' base names with their last four
    characters removed (presumably a ``.txt``-like extension - confirm
    against the callers).

    :param map_data: argument passed to ``read_map``; the result is indexed
        by the entity names to build the intent string
    :param file1: path of the first raw data file
    :param file2: path of the second raw data file
    :param before: path of the raw data file with leading filler phrases
    :param link: path of the raw data file with linking phrases
    :param after: path of the raw data file with trailing filler phrases
    :param domain: value stored on every generated document's ``domain``
    """
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    before_list = read_raw_data(before)
    link_list = read_raw_data(link)
    after_list = read_raw_data(after)

    # number of rounds: the shorter of the two entity lists
    len_all = min(len(list1), len(list2))

    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    entity1 = path1[:-4]
    entity2 = path2[:-4]
    entities = {"file1": entity1, "file2": entity2}

    doc_list = []
    dict_list = read_map(map_data)

    for _ in range(len_all):
        # Draw order is kept stable so seeded runs reproduce the same samples.
        l1 = choice(list1)
        l2 = choice(list2)
        link_text = choice(link_list)
        before_text = choice(before_list)
        before2_text = choice(before_list)
        after_text = choice(after_list)

        segments = {
            "file1": l1,
            "file2": l2,
            "link": link_text,
            "before": before_text,
            "before2": before2_text,
            "after": after_text,
        }
        cores = {
            "file1": line_end_remove(l1),
            "file2": line_end_remove(l2),
        }

        intent = (dict_list[entity1] + ": " + entity1 + "||"
                  + dict_list[entity2] + ": " + entity2)

        for layout in _TWO_FILE_LAYOUTS:
            doc_list.append(
                _compose_layout_doc(layout, segments, cores, entities,
                                    domain, intent))

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + entity1 + '-' + entity2 + ".conllx"
    corpus.write_to_file(res_path)
def one_before_after_add_noise(map_data, file1, before_noise, after_noise, domain):
    """
    Generate single-entity documents with optional leading/trailing noise.

    For each round a random line is drawn from ``file1``, ``before_noise`` and
    ``after_noise`` and four documents are produced: the line alone,
    noise + line, noise + line + noise, and line + noise.  The single span
    covers the line with its ending removed by ``line_end_remove``.
    Duplicated documents are dropped through ``set`` before the corpus is
    written to ``./data/<name>.conllx``.

    :param map_data: argument passed to ``read_map``; the result is indexed
        by the entity name to build the intent string
    :param file1: path of the raw data file; its base name minus the last
        four characters is the entity name
    :param before_noise: path of the raw data file with leading noise phrases
    :param after_noise: path of the raw data file with trailing noise phrases
    :param domain: value stored on every generated document's ``domain``
    """
    samples = read_raw_data(file1)
    prefixes = read_raw_data(before_noise)
    suffixes = read_raw_data(after_noise)

    # number of rounds: the longest of the three lists
    rounds = max(len(samples), len(prefixes), len(suffixes))

    intent_map = read_map(map_data)
    file_name = os.path.basename(file1)
    tag = file_name[:-4]
    documents = []

    for _ in range(rounds):
        sample = choice(samples)
        prefix = choice(prefixes)
        suffix = choice(suffixes)

        core = line_end_remove(sample)
        # results are not used; calls kept as in the original flow
        line_end_remove(prefix)
        line_end_remove(suffix)

        # (text, offset at which the entity line starts)
        variants = (
            (sample, 0),
            (prefix + sample, len(prefix)),
            (prefix + sample + suffix, len(prefix)),
            (sample + suffix, 0),
        )
        for text, start in variants:
            doc = Document(text)
            doc.domain = domain
            doc.intent = intent_map[tag] + ": " + tag
            doc.entities = SpanSet(
                [Span(start=start, end=start + len(core), entity=tag)])
            documents.append(doc)

    documents = list(set(documents))
    corpus = Corpus(documents)
    res_path = "./data/" + tag + ".conllx"
    corpus.write_to_file(res_path)