def read_txt(file: str, number: int = -1):
    # number is accepted but unused here
    digit2zero = False
    insts = []
    sentences = []
    predictions = []
    sections = []
    sentence = ''
    prediction = []
    true_label = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '':
                if sentence.strip() != '':
                    # the first token/prediction of each block encodes the section heading
                    sentences.append(' '.join(sentence.split(' ')[1:]))
                    sections.append(prediction[0])
                    predictions.append(prediction[1:])
                    inst = Instance(input=Sentence(words=sentence.split()[1:],
                                                   heading=prediction[0]))
                    inst.output = true_label
                    inst.prediction = prediction[1:]
                    insts.append(inst)
                    sentence = ''
                    prediction = []
                    true_label = []
            else:
                if digit2zero:
                    sentence += re.sub(r'\d', '0', line.split()[0]) + ' '
                else:
                    sentence += line.split()[0] + ' '
                prediction.append(line.split()[1])
                if len(line.split()) == 3:
                    true_label.append(line.split()[2])
    if sentence.strip() != '':  # flush the last block if the file has no trailing blank line
        sentences.append(' '.join(sentence.split(' ')[1:]))
        sections.append(prediction[0])
        predictions.append(prediction[1:])
        inst = Instance(input=Sentence(words=sentence.split()[1:],
                                       heading=prediction[0]))
        inst.output = true_label
        inst.prediction = prediction[1:]
        insts.append(inst)
    return sentences, sections, predictions, insts

def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
    # single-element feature vector: length of the longest common subsequence
    return [lcs.llcs(s1.lowercase_tokens(), s2.lowercase_tokens())]

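# A minimal, self-contained sketch of the llcs helper used above: the length
# of the longest common subsequence of two token lists. The project's lcs
# module is assumed elsewhere, so this stand-in may differ from it in details.
def llcs_sketch(a, b):
    m, n = len(a), len(b)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if a[i - 1] == b[j - 1]:
                d[i][j] = d[i - 1][j - 1] + 1  # extend the common subsequence
            else:
                d[i][j] = max(d[i - 1][j], d[i][j - 1])
    return d[m][n]

print(llcs_sketch("the cat sat".split(), "the dog sat".split()))  # 2
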
def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
    # single-element feature vector: Levenshtein distance in word-vector space
    return [self.wv_levenshtein(s1.lowercase_tokens(), s2.lowercase_tokens())]

def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
    # single-element feature vector: token-level Levenshtein distance with
    # configurable deletion/insertion/substitution costs
    return [
        levenshtein(
            s1.lowercase_tokens(),
            s2.lowercase_tokens(),
            self.del_cost,
            self.insert_cost,
            self.subs_cost,
            self.normalize,
        )
    ]

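# Hedged sketch of the token-level Levenshtein recurrence the method above
# relies on; the real levenshtein() comes from the surrounding codebase and
# may normalise differently, so this only illustrates the idea.
def levenshtein_sketch(a, b, del_cost=1.0, insert_cost=1.0, subs_cost=1.0, normalize=False):
    m, n = len(a), len(b)
    d = [[0.0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        d[i][0] = i * del_cost
    for j in range(1, n + 1):
        d[0][j] = j * insert_cost
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            d[i][j] = min(
                d[i - 1][j] + del_cost,      # delete a[i-1]
                d[i][j - 1] + insert_cost,   # insert b[j-1]
                d[i - 1][j - 1] + (0.0 if a[i - 1] == b[j - 1] else subs_cost),
            )
    # one plausible normalisation: divide by the longer sequence
    return d[m][n] / max(m, n, 1) if normalize else d[m][n]

print(levenshtein_sketch("a b c".split(), "a x c".split()))  # 1.0 (one substitution)
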
def postprocess(self):
    self.data = list()
    senttokens = list()
    for token in self.output_data:
        if token == "</s>":
            self.data.append(Sentence(senttokens))
            senttokens = list()
        else:
            tok, tag, lemma = token.split("\t")
            senttokens.append(Token(word=tok, xpos=tag, lemma=lemma))
    if senttokens:  # add last sentence
        self.data.append(Sentence(senttokens))

def read_txt(self, file: str, number: int = -1) -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words), labels)
                inst.set_id(len(insts))
                insts.append(inst)
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            word, label = line.split('<|>')
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
    print("number of sentences: {}".format(len(insts)))
    return insts

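# Illustrative input layout for read_txt above (a made-up example inferred
# from the parsing code, not from a real corpus): one "word<|>label" pair per
# line, a blank line ends each sentence, and number=-1 means "read everything".
#
#   John<|>B-PER
#   lives<|>O
#   in<|>O
#   Berlin<|>B-LOC
#
#   (blank line ends the sentence)
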
def read_txt(self, file: str, number: int = -1, category: str = "all") -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        ori_words = []
        labels = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                # the first token of each block names its category; guard against
                # consecutive blank lines leaving words empty
                if words and (category == "all" or words[0] == category):
                    insts.append(Instance(Sentence(words[1:], ori_words[1:]), labels[1:]))
                words = []
                ori_words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            ls = line.split()
            word, label = ls[0], ls[-1]
            if len(ls) == 1:
                label = "O"  # unlabeled token defaults to "O"
            ori_words.append(word)
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
    print("number of sentences: {}".format(len(insts)))
    return insts

def read_extraction_results(file: str, number: int = -1, digit2zero: bool = True) -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        ground_truth = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                inst = Instance(Sentence(words), labels)
                inst.ground_truth = ground_truth
                insts.append(inst)
                words = []
                labels = []
                ground_truth = []
                if len(insts) == number:
                    break
                continue
            _, word, gold_label, predicted_segment_label = line.split()
            if digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            labels.append(predicted_segment_label)
            ground_truth.append(gold_label)
    print("number of sentences: {}".format(len(insts)))
    return insts

def read_txt_with_extraction(self, file: str, extraction_file: str, number: int = -1) -> List[Instance]:
    print("Reading file: " + file)
    print("Reading file: " + extraction_file)
    # the extraction file must be line-aligned with the main file
    with open(extraction_file, 'r', encoding='utf-8') as f_extract:
        extract_lines = f_extract.readlines()
    i = -1
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        boundaries = []
        for line in tqdm(f.readlines()):
            i += 1
            extract_line = extract_lines[i].rstrip()
            line = line.rstrip()
            if line == "":
                insts.append(Instance(Sentence(words), labels, boundaries))
                words = []
                labels = []
                boundaries = []
                if len(insts) == number:
                    break
                continue
            word, label = line.split()
            # only the predicted label of the extraction line is used
            _, _, _, predicted_label = extract_line.split()
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
            boundaries.append(predicted_label)
    print("number of sentences: {}".format(len(insts)))
    return insts

def read_conll(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    num_entity = 0
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                insts.append(Instance(Sentence(words), labels))
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            vals = line.split()
            word = vals[1]    # CoNLL-style layout: word in column 2 ...
            label = vals[10]  # ... and the label in column 11
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
            if label.startswith("B-"):
                num_entity += 1
    print("number of sentences: {}, number of entities: {}".format(len(insts), num_entity))
    return insts

def read_txt(self, file: str, number: int = -1) -> List[Instance]:
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            # a blank line marks the boundary between sentences
            if line == "":
                if len(words) == 0:
                    continue
                insts.append(Instance(Sentence(words), labels))
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            word = line.split()[0]
            label = line.split()[1]
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
    print("number of sentences: {}".format(len(insts)))
    return insts

def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent.rstrip().split("\n"):
            (index, word, lemma, upos, xpos, feats,
             head, deprel, deps, misc) = tok.split("\t")
            mytokens.append(
                Token(
                    id=index,
                    word=word,
                    lemma=lemma,
                    # don't write out gold pos
                    # upos=upos,
                    xpos=xpos,
                    feats=str(Morph.from_parzu(xpos + "|" + feats)),
                    head=head,
                    deprel=deprel,
                    deps=deps,
                    misc=misc,
                ))
        self.data.append(Sentence(mytokens))

def three_class_data_iter(which, n=None):
    assert which in ["dev", "test", "train"]
    f = base_path / f"snli_1.0_{which}.jsonl"
    with f.open() as i:
        yielded = 0
        for line in i:
            if n is not None and yielded >= n:
                break
            loaded = json.loads(line)
            if loaded["gold_label"] == "-":
                continue  # skip pairs without annotator consensus
            yield ((Sentence(loaded["sentence1"], loaded["sentence1_parse"]),
                    Sentence(loaded["sentence2"], loaded["sentence2_parse"])),
                   loaded["gold_label"])
            yielded += 1

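# For reference, each snli_1.0_*.jsonl line is a JSON object; the fields the
# iterator above uses look like this (an abridged, illustrative object --
# real SNLI lines carry more keys, e.g. annotator_labels and pairID):
#
#   {"gold_label": "entailment",
#    "sentence1": "A man is eating.",
#    "sentence1_parse": "(ROOT (S (NP (DT A) (NN man)) (VP (VBZ is) (VP (VBG eating))) (. .)))",
#    "sentence2": "Someone eats.",
#    "sentence2_parse": "(ROOT (S (NP (NN Someone)) (VP (VBZ eats)) (. .)))"}
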
def postprocess(self):
    self.data = list()
    for sent_doc in self.output_data:
        self.data.append(
            Sentence(
                Token(word=str(tok),
                      xpos=tok.tag_,
                      upos=tok.pos_,
                      lemma=tok.lemma_) for tok in sent_doc))

def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
    B = nx.Graph()
    s1_tokens = s1.lowercase_tokens()
    s2_tokens = s2.lowercase_tokens()
    top_nodes = [(0, idx) for idx in range(len(s1_tokens))]
    bottom_nodes = [(1, idx) for idx in range(len(s2_tokens))]
    B.add_nodes_from(top_nodes, bipartite=0)
    B.add_nodes_from(bottom_nodes, bipartite=1)
    for idx1, t1 in enumerate(s1_tokens):
        for idx2, t2 in enumerate(s2_tokens):
            # negate the similarity because we do minimum-weight matching
            B.add_edge((0, idx1), (1, idx2), weight=-self.sim(t1, t2))
    matching = nx.bipartite.matching.minimum_weight_full_matching(B)
    # the matching dict contains both directions of every edge; keep each once
    edges = [(v_from, v_to) for v_from, v_to in matching.items() if v_from[0] == 0]
    sum_sim = sum(-B[v_from][v_to]["weight"] for v_from, v_to in edges)
    return [sum_sim / len(s2_tokens)]

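# Self-contained toy version of the matching above, with exact token match
# standing in for self.sim; Sentence/lowercase_tokens() are assumed elsewhere,
# so plain token lists are used here. Requires networkx (and scipy, which
# backs the matching routine).
import networkx as nx

def matching_similarity_sketch(tokens1, tokens2):
    B = nx.Graph()
    B.add_nodes_from([(0, i) for i in range(len(tokens1))], bipartite=0)
    B.add_nodes_from([(1, j) for j in range(len(tokens2))], bipartite=1)
    for i, t1 in enumerate(tokens1):
        for j, t2 in enumerate(tokens2):
            # negative weight so minimum-weight matching maximises total similarity
            B.add_edge((0, i), (1, j), weight=-(1.0 if t1 == t2 else 0.0))
    matching = nx.bipartite.matching.minimum_weight_full_matching(B)
    edges = [(u, v) for u, v in matching.items() if u[0] == 0]
    return sum(-B[u][v]["weight"] for u, v in edges) / len(tokens2)

print(matching_similarity_sketch(["a", "b"], ["b", "a"]))  # 1.0: perfect match
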
def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        senttokens = list()
        for tok in sent.split("\n"):
            token, tag = tok.split("\t")
            stts = rftag2stts(tag)
            senttokens.append(
                Token(word=token, xpos=stts, feats=str(Morph.from_rftag(tag))))
        self.data.append(Sentence(senttokens))

def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(
                Token(
                    id=str(rel.dep().index()),
                    word=rel.dep().word(),
                    # don't write out gold pos
                    # xpos=rel.dep().tag(),
                    head=str(rel.gov().index()),
                    deprel=str(rel.reln()),
                ) for rel in sent.typedDependencies()))

def text_to_instances(self,
                      tokens: List[str],
                      annotations: Optional[List[Dict[str, Any]]] = None,
                      **metadata) -> Iterator[Instance]:
    # (Optional/Iterator from typing) avoid a mutable default argument
    if annotations is None:
        annotations = []
    metadata["og_tokens"] = tokens
    if self.subword_converter is not None:
        tokens, tokidx2bpeidxs = self.subword_converter(tokens)
    else:
        tokidx2bpeidxs = {i: [i] for i in range(len(tokens))}
    metadata["tokidx2bpeidxs"] = tokidx2bpeidxs
    tags = self.get_tags(tokens, annotations, metadata)
    # long documents are split into the largest sub-documents that fit
    for tokens, tags, metadata in self.as_maximal_subdocs(tokens, tags, metadata):
        inst = Instance(Sentence(tokens), tags)
        inst.metadata = metadata
        yield inst

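# Hedged illustration of the subword bookkeeping above: if a (hypothetical)
# BPE converter splits "playing" into "play" + "##ing", the mapping from
# original token indices to subword indices would look like:
#
#   tokens         = ["he", "was", "playing"]
#   bpe tokens     = ["he", "was", "play", "##ing"]
#   tokidx2bpeidxs = {0: [0], 1: [1], 2: [2, 3]}
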
def postprocess(self):
    self.data = list()
    for sent in self.output_data.sentences:
        self.data.append(
            Sentence(
                Token(
                    id=tok.index,
                    word=tok.text,
                    lemma=tok.lemma,
                    feats=tok.feats,
                    head=str(tok.governor),
                    deprel=tok.dependency_relation,
                ) for tok in sent.words))

def postprocess(self):
    self.data = list()
    for sent in self.output_data.sents:
        self.data.append(
            Sentence(
                Token(
                    word=tok.text,
                    lemma=tok.lemma_,
                    # upos=tok.pos_,
                    # xpos=tok.tag_,
                    # head is 1-based and relative to the sentence start
                    head=str(tok.head.i - sent[0].i + 1),
                    deprel=tok.dep_,
                ) for tok in sent))

def postprocess(self):
    self.data = list()
    for sent in self.output_data:
        mytokens = list()
        for tok in sent:
            text, rftmorph, stts, lemma = tok
            mytokens.append(
                Token(
                    word=text,
                    xpos=stts,
                    feats=str(Morph.from_rftag(rftmorph)),
                    lemma=lemma,
                ))
        self.data.append(Sentence(mytokens))

def postprocess(self):
    self.data = list()
    for sent in self.output_data.rstrip().split("\n\n"):
        mytokens = list()
        for token_entry in sent.split("\n"):
            tok, tag, lemma = token_entry.split("\t")
            maintag = tag.split(".")[0]
            # small correction: map the bare "$" main tag to the STTS tag "$."
            stts = "$." if maintag == "$" else maintag
            mytokens.append(
                Token(
                    word=tok,
                    xpos=stts,
                    lemma=lemma,
                    feats=str(Morph.from_tigertag(tag)),
                ))
        self.data.append(Sentence(mytokens))

def read_txt(self, file: str, number: int = -1) -> List[Instance]:
    count_0 = 0
    print("Reading file: " + file)
    insts = []
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                assert len(words) == len(labels)
                inst = Instance(Sentence(words), labels)
                inst.set_id(len(insts))
                insts.append(inst)
                words = []
                labels = []
                if len(insts) == number:
                    break
                continue
            x = line.split()
            if len(x) == 1:
                word, label = '&', x[0]  # token missing: use '&' as a placeholder word
            elif len(x) == 2:
                word, label = x[0], x[1]
            else:
                print(x)
                continue  # skip malformed lines instead of reusing stale word/label
            if self.digit2zero:
                # replace all digits with 0; re.subn also reports how many were replaced
                word, n_replaced = re.subn(r'\d', '0', word)
                count_0 += n_replaced
            words.append(word)
            self.vocab.add(word)
            labels.append(label)
    print("numbers being replaced by zero:", count_0)
    print("number of sentences: {}".format(len(insts)))
    return insts

def read_txt(self, file: str, number: int = 5) -> List[Instance]:
    print("Reading file: " + file)
    insts = []
    # pre-computed sentence vectors are loaded from a pickle whose name is
    # derived from the data path (prefix 'vec_' inserted at a fixed offset)
    vec_file = file[:9] + 'vec_' + file[9:-4] + '.pkl'
    print(vec_file)
    with open(vec_file, 'rb') as f_vec:
        all_vecs = pickle.load(f_vec)
    with open(file, 'r', encoding='utf-8') as f:
        sents = []
        ori_sents = []
        labels = []
        types = []
        sent_idx = 0
        review_idx = []
        reply_idx = []
        labels_pair = []
        max_review_id = 0
        count_review = 0
        count_reply = 0
        argu_sent_review = 0
        argu_sent_reply = 0
        argu_review = 0
        argu_reply = 0
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                vecs = all_vecs[len(insts)]  # read the pre-computed vectors
                num_tokens = [len(vecs[i]) for i in range(len(vecs))]
                inst = Instance(Sentence(sents, ori_sents), labels, vecs, types,
                                review_idx, reply_idx, labels_pair,
                                max_review_id, num_tokens)
                insts.append(inst)
                sents = []
                ori_sents = []
                labels = []
                types = []
                sent_idx = 0
                review_idx = []
                reply_idx = []
                labels_pair = []
                max_review_id = 0
                if len(insts) == number:
                    break
                continue
            ls = line.split('\t')
            if ls[1] == 'O':
                sent, label, label_pair, sent_type = ls[0], ls[1], 0, ls[-2]
            else:
                # collapse the tag to "B-0"/"I-0" and read the pair id from
                # the third column (e.g. "B-3" -> 3)
                sent, label, label_pair, sent_type = (
                    ls[0], ls[1][:2] + '0', int(ls[2][2:]), ls[-2])
            ori_sents.append(sent)
            if sent_type == 'Review':
                count_review += 1
                type_id = 0
                if label[0] != 'O':
                    review_idx.append(sent_idx)
                    argu_sent_review += 1
                    if label[0] == 'B':
                        argu_review += 1
                max_review_id += 1
            else:
                type_id = 1
                count_reply += 1
                reply_idx.append(sent_idx)
                if label[0] != 'O':
                    argu_sent_reply += 1
                    if label[0] == 'B':
                        argu_reply += 1
            types.append(type_id)
            sent_idx += 1
            sents.append(sent)
            self.vocab.add(sent)
            labels.append(label)
            labels_pair.append(label_pair)
    print('review, reply, review_argu, reply_argu, review_sent_argu, reply_sent_argu',
          count_review, count_reply, argu_review, argu_reply,
          argu_sent_review, argu_sent_reply)
    print("number of sentences: {}".format(len(insts)))
    return insts

def distance(self, s1: Sentence, s2: Sentence) -> List[float]:
    # single-element feature vector: tree edit distance between the parses
    return [simple_distance(s1.tree(), s2.tree())]

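# Hedged usage sketch: simple_distance above looks like the zss library's
# tree edit distance, assuming s1.tree()/s2.tree() return zss-compatible
# nodes. A toy call with hand-built trees:
from zss import Node, simple_distance

t1 = Node("S", [Node("NP"), Node("VP")])
t2 = Node("S", [Node("NP"), Node("VP", [Node("PP")])])
print(simple_distance(t1, t2))  # 1: a single node insertion
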
def postprocess(self):
    """Re-format output_data so that it conforms to the eval format."""
    self.data = list()
    for sent in self.output_data:
        self.data.append(
            Sentence(Token(word=tok, xpos=tag) for tok, tag in sent))

def postprocess(self):
    self.data = list()
    for sent in self.output_data.sentence:
        self.data.append(
            Sentence(Token(word=tok.word, xpos=tok.pos) for tok in sent.token))

def sents_to_insts(self, sentences: List[str]) -> List[Instance]:
    insts = []
    for sentence in sentences:
        words = sentence.split()
        insts.append(Instance(Sentence(words)))
    return insts

def sent_to_insts(self, sentence: str) -> List[Instance]:
    words = sentence.split()
    return [Instance(Sentence(words))]

def read_trigger_txt(self, file: str, number: int = -1) -> Tuple[List[Instance], int, int]:
    label_vocab = dict()
    print("Reading file: " + file)
    insts = []
    max_length = 0
    with open(file, 'r', encoding='utf-8') as f:
        words = []
        labels = []
        word_index = 0
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                # walk the label/index pairs to group multi-word entities by type
                entity_dict = dict()
                for ent in labels:
                    if ent[0].startswith("B-") or ent[0].startswith("I-") or ent[0].startswith("T-"):
                        key = ent[0].split("-")[1]
                        # each entry is [entity word, word index]
                        entity_dict.setdefault(key, []).append([words[ent[1]], ent[1]])
                # digit keys mark trigger spans; any other key is the entity type
                # (this assumes every sentence carries at least one non-digit type)
                trigger_positions = []
                trigger_keys = []
                for key in entity_dict:
                    if key in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
                        trigger_positions.append([i[1] for i in entity_dict[key]])
                        trigger_keys.append(" ".join(i[0] for i in entity_dict[key]))
                    else:
                        if key not in label_vocab:
                            label_vocab[key] = len(label_vocab)
                        trigger_label = label_vocab[key]
                # trigger tokens ("T-...") are masked to "O" in the final label sequence
                final_labels = []
                for label in labels:
                    if label[0].startswith("T"):
                        final_labels.append("O")
                    else:
                        final_labels.append(label[0])
                # one instance per trigger span
                for trigger_position, trigger_key in zip(trigger_positions, trigger_keys):
                    insts.append(
                        Instance(Sentence(words), final_labels, None,
                                 trigger_label, trigger_position, trigger_key))
                if len(words) > max_length:
                    max_length = len(words)
                words = []
                labels = []
                word_index = 0
                if len(insts) == number:
                    break
                continue
            word, label = line.split()
            if self.digit2zero:
                word = re.sub(r'\d', '0', word)  # replace every digit with 0
            words.append(word)
            self.vocab.add(word)
            labels.append([label, word_index])
            word_index += 1
    print("number of sentences: {}".format(len(insts)))
    return insts, max_length, len(label_vocab)

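# Illustrative input for read_trigger_txt (hedged -- this layout is inferred
# from the parsing code, not from documentation): one "word label" pair per
# line; ordinary entities use B-/I-<type> tags, while trigger tokens use
# B-/I-/T-<digit> tags whose digit groups them into one trigger span:
#
#   John     B-PER
#   was      O
#   born     T-0
#   in       T-0
#   Boston   O
#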