def get_sentence_boundaries(path_tok, path_conll):
    """
    :type path_tok: str
    :type path_conll: str
    :rtype: list of (int, int)

    Compute sentence boundaries based on the tokenized file and the
    sentence-split file.
    """
    edus = read_edus(path_tok)  # list of list of int
    sentences = read_sentences(path_conll)  # list of list of str

    # Assign an EDU ID to each token in the sentence list.
    tokens_with_edu_ids = utils.flatten_lists(edus)
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(sentences))
    sentences_with_edu_ids = assign_edu_ids_to_sentences(sentences, tokens_with_edu_ids)

    # Adjustment
    sentences_with_edu_ids = adjust(sentences_with_edu_ids, n_edus=len(edus))
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(sentences_with_edu_ids))

    # Compute boundaries
    bnds = compute_boundaries(sentences_with_edu_ids)

    # Check
    test_boundaries(bnds, n_edus=len(edus))
    return bnds
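# The helpers read_edus, assign_edu_ids_to_sentences, compute_boundaries, and
# test_boundaries are defined elsewhere in the repo. As a rough illustration of
# the data flow only, here is a minimal sketch of what compute_boundaries might
# do, assuming each sentence is a list of per-token EDU IDs after assignment
# (the function body and input shape below are assumptions, not the repo's code):
def compute_boundaries_sketch(sentences_with_edu_ids):
    """Map each sentence (a list of per-token EDU IDs) to the
    (first EDU ID, last EDU ID) pair it covers."""
    return [(ids[0], ids[-1]) for ids in sentences_with_edu_ids]

# e.g. [[0, 0, 1], [1, 2, 2]] -> [(0, 1), (1, 2)]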
def get_paragraph_boundaries(path_tok, path_tok2):
    """
    :type path_tok: str
    :type path_tok2: str
    :rtype: list of tuple of int

    Compute paragraph boundaries based on the tokenized file and the
    paragraph-split file.
    """
    edus = read_edus(path_tok)  # list of list of int
    paragraphs = read_paragraphs(path_tok2)  # list of list of str

    # Assign an EDU ID to each token in the paragraph list.
    tokens_with_edu_ids = utils.flatten_lists(edus)
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(paragraphs))
    paragraphs_with_edu_ids = assign_edu_ids_to_sentences(paragraphs, tokens_with_edu_ids)

    # Adjustment
    paragraphs_with_edu_ids = adjust(paragraphs_with_edu_ids, n_edus=len(edus))
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(paragraphs_with_edu_ids))

    # Compute boundaries
    bnds = compute_boundaries(paragraphs_with_edu_ids)

    # Check
    test_boundaries(bnds, n_edus=len(edus))
    return bnds
def evaluate_entity_label(pred, label, classes):
    pred = flatten_lists(pred)
    label = flatten_lists(label)
    assert len(pred) == len(label)

    cla = [i.split('-')[-1] for i in classes if i != 'O']
    cla = list(set(cla))
    cla2ind = dict((c, ind) for ind, c in enumerate(cla))

    pred_entities = np.zeros(len(cla), dtype=int)   # TP + FP
    label_entities = np.zeros(len(cla), dtype=int)  # TP + FN
    acc = np.zeros(len(cla), dtype=int)             # TP

    # Count entities in the gold labels.
    index = 0
    while index < len(label):
        label_tag = label[index]
        if label_tag == 'O':
            index += 1
        else:
            c = cla2ind[label_tag.split('-')[-1]]
            next_tag = 'I' + label_tag[1:]
            j = index + 1
            # Check the bound before indexing so that an entity ending at
            # the last position does not raise an IndexError.
            while j < len(label) and label[j] == next_tag:
                j += 1
            label_entities[c] += 1
            label_entity = ''.join(label[index:j])
            pred_entity = ''.join(pred[index:j])
            if label_entity == pred_entity:
                acc[c] += 1
            index = j

    # Count entities in the predicted tags.
    index = 0
    while index < len(pred):
        pred_tag = pred[index]
        if pred_tag == 'O':
            index += 1
        elif pred_tag.split('-')[0] == 'B':
            c = cla2ind[pred_tag.split('-')[-1]]
            next_tag = 'I' + pred_tag[1:]
            j = index + 1
            while j < len(pred) and pred[j] == next_tag:
                j += 1
            pred_entities[c] += 1
            index = j
        else:
            index += 1

    units = []
    TP = acc
    FP = pred_entities - acc
    FN = label_entities - acc
    TN = acc.sum() - acc
    for c, ind in cla2ind.items():
        units.append(Eval_unit(TP[ind], FP[ind], FN[ind], TN[ind], c))
    return units
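# A toy run of the entity-level evaluator. flatten_lists and Eval_unit come
# from elsewhere in the repo; the minimal stand-ins below are assumptions made
# only so the demo is self-contained and runnable.
import numpy as np

def flatten_lists(lists):
    # Assumed behavior: [[a, b], [c]] -> [a, b, c]
    return [x for sub in lists for x in sub]

class Eval_unit:
    # Assumed container; the real class likely also derives P/R/F1.
    def __init__(self, tp, fp, fn, tn, name):
        self.tp, self.fp, self.fn, self.tn, self.name = tp, fp, fn, tn, name

classes = ['B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'O']
label = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
pred = [['B-PER', 'I-PER', 'O'], ['B-PER', 'O']]
for u in evaluate_entity_label(pred, label, classes):
    print(u.name, u.tp, u.fp, u.fn)
# PER: TP=1, FP=1, FN=0; LOC: TP=0, FP=0, FN=1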
def main():
    config = utils.Config()

    filenames = os.listdir(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [n.replace(".paragraph.boundaries", ".edus") for n in filenames]
    filenames.sort()

    for filename in filenames:
        # Paths
        path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                 "tmp.preprocessing", filename + ".tokenized")
        path_conll = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                  "tmp.preprocessing",
                                  filename.replace(".edus", ".sentences.conll"))
        path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed", filename + ".postags")

        # Read
        edus = utils.read_lines(path_edus, process=lambda line: line.split())  # list of list of str
        tokens_e = utils.flatten_lists(edus)  # list of str

        sentences = utils.read_conll(
            path_conll,
            keys=["ID", "FORM", "LEMMA", "POSTAG", "_1", "HEAD", "DEPREL"])  # list of list of {str: str}
        conll_lines = utils.flatten_lists(sentences)  # list of {str: str}
        tokens_s = [conll_line["FORM"] for conll_line in conll_lines]  # list of str
        postags_s = [conll_line["POSTAG"] for conll_line in conll_lines]  # list of str

        # Check that the tokens in the EDU file and the CoNLL file line up.
        for token_e, token_s in zip(tokens_e, tokens_s):
            if token_e != token_s:
                raise ValueError("Error! %s != %s" % (token_e, token_s))

        # Create the POS-tag version of the EDUs.
        postag_i = 0
        edus_postag = []
        for edu in edus:
            edu_postag = [postags_s[postag_i + i] for i in range(len(edu))]
            edus_postag.append(edu_postag)
            postag_i += len(edu)

        # Write
        with open(path_out, "w") as f:
            for edu_postag in edus_postag:
                f.write("%s\n" % " ".join(edu_postag))
def evaluate(tag_lists, target_tag_lists):
    """Evaluate tagging accuracy."""
    correct_count = 0.
    # Flatten the nested lists.
    tag_lists = flatten_lists(tag_lists)
    target_tag_lists = flatten_lists(target_tag_lists)
    assert len(tag_lists) == len(target_tag_lists)
    for pred, tgt in zip(tag_lists, target_tag_lists):
        if pred == tgt:
            correct_count += 1.
    return correct_count / len(tag_lists)
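# A quick sanity check, assuming a flatten_lists helper is in scope (see the
# stand-in sketched after the entity-level evaluator above):
pred = [['B-PER', 'O'], ['O', 'B-LOC']]
gold = [['B-PER', 'O'], ['B-LOC', 'B-LOC']]
print(evaluate(pred, gold))  # 3 of 4 tags match -> 0.75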
def evaluate_single_label(pred, label, classes):
    pred = flatten_lists(pred)
    label = flatten_lists(label)
    matrix = confusion_matrix(pred, label, classes)
    TP = np.diag(matrix)
    FP = matrix.sum(axis=1) - TP
    FN = matrix.sum(axis=0) - TP
    TN = matrix.sum() - TP - FN - FP
    unit_list = []
    for i, cla in enumerate(classes):
        unit_list.append(Eval_unit(TP[i], FP[i], FN[i], TN[i], cla))
    return unit_list
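# confusion_matrix here is a project-local helper, not sklearn's (sklearn's
# signature is confusion_matrix(y_true, y_pred, labels=...)). A minimal sketch
# consistent with how it is used above, assuming rows index predictions and
# columns index gold labels (the orientation is an assumption inferred from
# the FP/FN axis sums):
import numpy as np

def confusion_matrix(pred, label, classes):
    # matrix[i][j] counts tokens predicted as classes[i] whose gold label is
    # classes[j]; row sums then give predicted counts (TP+FP) and column sums
    # give gold counts (TP+FN), matching the code above.
    idx = {c: i for i, c in enumerate(classes)}
    matrix = np.zeros((len(classes), len(classes)), dtype=int)
    for p, l in zip(pred, label):
        matrix[idx[p], idx[l]] += 1
    return matrix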
def __init__(self, golden_tags, predict_tags, remove_O=True):
    self.golden_tags = flatten_lists(golden_tags)
    self.predict_tags = flatten_lists(predict_tags)

    if remove_O:
        self._remove_Otags()

    self.tagset = set(self.golden_tags)
    self.correct_tags_number = self.count_correct_tags()
    self.predict_tags_counter = Counter(self.predict_tags)
    self.golden_tags_counter = Counter(self.golden_tags)

    self.precision_scores = self.cal_precision()
    self.recall_scores = self.cal_recall()
    self.f1_scores = self.cal_f1()
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble the predictions of multiple models by majority vote."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred.append(ensemble_tag)

    tag_lists = flatten_lists(targets)
    assert len(pred) == len(tag_lists)

    print("The ensemble results of the four models are as follows:")
    _print_metrics(tag_lists, pred)
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble the predictions of multiple models by majority vote."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    print("The ensemble results of the four models are as follows:")
    metrics = Metrics(targets, pred_tags, remove_O=remove_O)
    metrics.report_scores(dtype='ensemble')
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble the predictions of multiple models by majority vote."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    print("The results of ensembling the four models are as follows:")
    metrics = Metrics(targets, pred_tags, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()
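# The voting step is the heart of all the ensemble_evaluate variants above.
# A self-contained illustration with toy tags and no Metrics dependency
# (ties break in favor of the tag encountered first, per Counter semantics):
from collections import Counter

# Per-model flattened predictions for the same four tokens.
model_a = ['B-PER', 'O', 'O', 'B-LOC']
model_b = ['B-PER', 'O', 'B-LOC', 'B-LOC']
model_c = ['O', 'O', 'B-LOC', 'B-LOC']

ensembled = [Counter(votes).most_common(1)[0][0]
             for votes in zip(model_a, model_b, model_c)]
print(ensembled)  # ['B-PER', 'O', 'B-LOC', 'B-LOC']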
def convert_edus(edus, raw_lines):
    """
    :type edus: list of str
    :type raw_lines: list of str
    :rtype: list of str
    """
    # Map each non-whitespace character of the EDUs to its EDU index.
    edu_positions = []
    for edu_i in range(len(edus)):
        raw = []
        for char_i in range(len(edus[edu_i])):
            if edus[edu_i][char_i] == " ":
                continue
            raw.append(edu_i)
        edu_positions.append(raw)
    edu_positions = utils.flatten_lists(edu_positions)

    flatten_raw_lines = list("".join(utils.flatten_lists(raw_lines)))

    # Assign an EDU index to every character of the raw text; spaces inherit
    # the current EDU index.
    result_positions = [-1 for _ in flatten_raw_lines]
    result_i = 0
    cur_char_i = 0
    cur_edu_i = 0
    for char in flatten_raw_lines:
        if char == " ":
            result_positions[result_i] = cur_edu_i
        else:
            edu_i = edu_positions[cur_char_i]
            result_positions[result_i] = edu_i
            cur_char_i += 1
            assert edu_i == cur_edu_i or edu_i == cur_edu_i + 1
            cur_edu_i = edu_i
        result_i += 1

    # Rebuild each EDU from the raw character span it covers.
    new_edus = []
    for edu_i in range(len(edus)):
        b = result_positions.index(edu_i)
        e = b + result_positions.count(edu_i)
        new_edu = "".join(flatten_raw_lines[b:e]).strip()
        new_edus.append(new_edu)
    return new_edus
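# Demo on made-up data: convert_edus realigns tokenized EDUs with the raw
# text's original spacing. It only needs utils.flatten_lists, stubbed here
# (as an assumption about its behavior) so the snippet runs standalone.
import types
utils = types.SimpleNamespace(flatten_lists=lambda ls: [x for sub in ls for x in sub])

edus = ["Hello ,", "world !"]         # tokenized EDUs
raw_lines = ["Hello, world!"]         # raw text covering the same characters
print(convert_edus(edus, raw_lines))  # ['Hello,', 'world!']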
def compute_span_vectors(
        self, edus, edus_postag, sbnds, pbnds,
        padded_edu_vectors, mask_bwd, mask_fwd, batch_spans):
    """
    :type edus: list of list of str
    :type edus_postag: list of list of str
    :type sbnds: list of (int, int)
    :type pbnds: list of (int, int)
    :type padded_edu_vectors: Variable(shape=(n_edus+2, bilstm_dim), dtype=np.float32)
    :type mask_bwd: Variable(shape=(1, bilstm_dim), dtype=np.float32)
    :type mask_fwd: Variable(shape=(1, bilstm_dim), dtype=np.float32)
    :type batch_spans: list of list of (int, int)
    :rtype: Variable(shape=(batch_size * n_spans, bilstm_dim + tempfeat_dim), dtype=np.float32)
    """
    batch_size = len(batch_spans)
    n_spans = len(batch_spans[0])
    total_spans = batch_size * n_spans
    for spans in batch_spans:
        assert len(spans) == n_spans

    # Reshape
    flatten_batch_spans = utils.flatten_lists(batch_spans)  # total_spans * (int, int)
    # NOTE that indices in batch_spans should be shifted by +1 due to the boundary padding
    bm1_indices = [(b - 1) + 1 for b, e in flatten_batch_spans]  # total_spans * int
    b_indices = [b + 1 for b, e in flatten_batch_spans]          # total_spans * int
    e_indices = [e + 1 for b, e in flatten_batch_spans]          # total_spans * int
    ep1_indices = [(e + 1) + 1 for b, e in flatten_batch_spans]  # total_spans * int

    # Feature extraction
    bm1_padded_edu_vectors = F.get_item(padded_edu_vectors, bm1_indices)  # (total_spans, bilstm_dim)
    b_padded_edu_vectors = F.get_item(padded_edu_vectors, b_indices)      # (total_spans, bilstm_dim)
    e_padded_edu_vectors = F.get_item(padded_edu_vectors, e_indices)      # (total_spans, bilstm_dim)
    ep1_padded_edu_vectors = F.get_item(padded_edu_vectors, ep1_indices)  # (total_spans, bilstm_dim)
    mask_bwd = F.broadcast_to(mask_bwd, (total_spans, self.bilstm_dim))   # (total_spans, bilstm_dim)
    mask_fwd = F.broadcast_to(mask_fwd, (total_spans, self.bilstm_dim))   # (total_spans, bilstm_dim)
    span_vectors = mask_bwd * (e_padded_edu_vectors - bm1_padded_edu_vectors) \
                   + mask_fwd * (b_padded_edu_vectors - ep1_padded_edu_vectors)  # (total_spans, bilstm_dim)

    # Template features
    tempfeat_vectors = self.template_feature_extractor.extract_batch_features(
        edus=edus,
        edus_postag=edus_postag,
        sbnds=sbnds,
        pbnds=pbnds,
        spans=flatten_batch_spans)  # (total_spans, tempfeat_dim)
    tempfeat_vectors = utils.convert_ndarray_to_variable(tempfeat_vectors, seq=False)  # (total_spans, tempfeat_dim)
    span_vectors = F.concat([span_vectors, tempfeat_vectors], axis=1)  # (total_spans, bilstm_dim + tempfeat_dim)
    return span_vectors
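# The index arithmetic is easiest to see on a toy case. A numpy sketch with
# plain arrays instead of Chainer Variables; the dimensions and the exact
# half-and-half mask layout are made up for illustration:
import numpy as np

n_edus, dim = 4, 6
# Padded encodings: row 0 and row n_edus+1 are boundary padding,
# so EDU i lives at row i+1 (hence the +1 shift above).
padded = np.random.randn(n_edus + 2, dim).astype(np.float32)

b, e = 1, 2  # span covering EDUs 1..2
bm1, bb, ee, ep1 = (b - 1) + 1, b + 1, e + 1, (e + 1) + 1  # rows 1, 2, 3, 4

# With halves selected by the masks, the span feature is a pair of
# differences over the running encodings (a subtraction-based span feature).
mask_bwd = np.concatenate([np.ones(dim // 2), np.zeros(dim // 2)]).astype(np.float32)
mask_fwd = 1.0 - mask_bwd
span_vec = mask_bwd * (padded[ee] - padded[bm1]) + mask_fwd * (padded[bb] - padded[ep1])
print(span_vec.shape)  # (6,)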
def __init__(self, golden_tags, predict_tags, remove_O=False):
    # [[t1, t2], [t3, t4], ...] --> [t1, t2, t3, t4, ...]
    self.golden_tags = flatten_lists(golden_tags)
    self.predict_tags = flatten_lists(predict_tags)

    if remove_O:
        # Remove the O tags; keep only entity tags.
        self._remove_Otags()

    self.tagset = set(self.golden_tags)
    self.correct_tags_number = self.count_correct_tags()
    self.predict_tags_counter = Counter(self.predict_tags)
    self.golden_tags_counter = Counter(self.golden_tags)

    self.precision_scores = self.cal_precision()
    self.recall_scores = self.cal_recall()
    self.f1_scores = self.cal_f1()
def __init__(self, golden_tags, predict_tags, remove_O=False):
    self.golden_tags = flatten_lists(golden_tags)
    self.predict_tags = flatten_lists(predict_tags)

    if remove_O:
        # Do not count non-entity ('O') tags.
        self._remove_Otags()

    # All tags that appear in the gold data.
    self.tagset = set(self.golden_tags)
    self.correct_tags_number = self.count_correct_tags()
    self.predict_tags_count = Counter(self.predict_tags)
    self.golden_tags_count = Counter(self.golden_tags)

    # Precision
    self.precision_scores = self.cal_precision()
    # Recall
    self.recall_scores = self.cal_recall()
    # F1
    self.f1_scores = self.cal_f1()
def ensemble_evaluate(results, targets):
    """Ensemble the predictions of multiple models by majority vote."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    correct = 0.
    for pred, tgt in zip(pred_tags, targets):
        if pred == tgt:
            correct += 1.
    accuracy = correct / len(targets)
    print("The ensemble accuracy of the four models is {:.2f}%".format(accuracy * 100))
def __init__(self, golden_tags, predict_tags, remove_O=False):
    # [[t1, t2], [t3, t4], ...] --> [t1, t2, t3, t4, ...]
    self.golden_tags = flatten_lists(golden_tags)
    self.predict_tags = flatten_lists(predict_tags)

    if remove_O:
        # Drop the O tags; we only care about entity tags.
        self._remove_Otags()

    # Helper variables for the metric computations.
    self.tagset = set(self.golden_tags)
    self.correct_tags_number = self.count_correct_tags()
    self.predict_tags_counter = Counter(self.predict_tags)
    self.golden_tags_counter = Counter(self.golden_tags)

    # Compute precision.
    self.precision_scores = self.cal_precision()
    # Compute recall.
    self.recall_scores = self.cal_recall()
    # Compute F1 scores.
    self.f1_scores = self.cal_f1()
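# The helper methods referenced by the Metrics constructors above are defined
# elsewhere in the class. Minimal sketches consistent with the attributes they
# read and produce; these bodies are assumptions, not the repo's actual code.
class MetricsSketch:
    def count_correct_tags(self):
        # Per-tag counts of positions where the prediction matches gold.
        correct = {}
        for gold, pred in zip(self.golden_tags, self.predict_tags):
            if gold == pred:
                correct[gold] = correct.get(gold, 0) + 1
        return correct

    def cal_precision(self):
        # precision(tag) = #correctly-predicted tag / #times tag was predicted
        # (max(..., 1) guards against division by zero for never-predicted tags).
        return {tag: self.correct_tags_number.get(tag, 0)
                     / max(self.predict_tags_counter[tag], 1)
                for tag in self.tagset}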
def main():
    config = utils.Config()

    filenames = os.listdir(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [n.replace(".paragraph.boundaries", ".edus") for n in filenames]
    filenames.sort()

    with open(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                           "tmp.preprocessing", "filelist.corenlp2.txt"), "w") as ff:
        for filename in filenames:
            # Paths
            path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                     "tmp.preprocessing", filename + ".tokenized")
            path_sbnds = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                      "preprocessed",
                                      filename.replace(".edus", ".sentence.noproj.boundaries"))
            path_sents = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                      "tmp.preprocessing",
                                      filename.replace(".edus", ".sentences"))

            # Read
            edus = utils.read_lines(path_edus, process=lambda line: line.split())  # list of list of str
            # Use tuple(...) rather than a bare generator so each boundary is a
            # reusable (int, int) pair, matching the annotation below.
            sbnds = utils.read_lines(
                path_sbnds,
                process=lambda line: tuple(int(x) for x in line.split()))  # list of (int, int)

            # Create sentences based on the sentence boundaries.
            sentences = []
            for begin_i, end_i in sbnds:
                sentence = edus[begin_i:end_i + 1]  # list of list of str
                sentence = utils.flatten_lists(sentence)  # list of str
                sentences.append(sentence)

            # Write
            with open(path_sents, "w") as fs:
                for sentence in sentences:
                    fs.write("%s\n" % " ".join(sentence))
            ff.write("%s\n" % path_sents)
def predict_health_cb(data, vectoriser, classifier):
    """Predict health labels for CB.

    Args:
        data (:obj:`list` of :obj:`tuple`): Crunchbase IDs and list of categories.

    Return:
        output (:obj:`list` of :obj:`dict`): Crunchbase IDs and bool.
    """
    with open(vectoriser, 'rb') as h:
        vec = pickle.load(h)
    with open(classifier, 'rb') as h:
        clf = pickle.load(h)

    # Store index.
    data_idx = [tup[0] for tup in data]
    # Join each ID's categories into one document so exactly one prediction is
    # produced per ID; flattening across IDs would misalign the zip below
    # whenever an ID has more or fewer than one category.
    docs = [' '.join(tup[1]) for tup in data]
    labels = clf.predict(vec.transform(docs))
    return [{'id': id_, 'is_health': pred} for id_, pred in zip(data_idx, labels)]
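# A round-trip smoke test, assuming the pickled objects are a scikit-learn
# text vectoriser and classifier. The file names, training data, and labels
# below are all made up for the demo.
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Train and pickle a toy vectoriser/classifier pair.
texts = ['health biotech', 'fintech payments', 'medical devices', 'ecommerce retail']
y = [True, False, True, False]
vec = CountVectorizer().fit(texts)
clf = LogisticRegression().fit(vec.transform(texts), y)
for name, obj in [('vec.pkl', vec), ('clf.pkl', clf)]:
    with open(name, 'wb') as h:
        pickle.dump(obj, h)

data = [('org-1', ['health', 'biotech']), ('org-2', ['ecommerce'])]
print(predict_health_cb(data, 'vec.pkl', 'clf.pkl'))
# e.g. [{'id': 'org-1', 'is_health': True}, {'id': 'org-2', 'is_health': False}]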
def process(path_in, path_out):
    utils.mkdir(path_out)

    nlp_no_ssplit = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
    nlp_no_ssplit.tokenizer = nlp_no_ssplit.tokenizer.tokens_from_list
    nlp_no_ssplit.add_pipe(prevent_sentence_boundary_detection,
                           name="prevent-sbd", before="parser")

    filenames = os.listdir(path_in)
    filenames = [n for n in filenames if n.endswith(".edu.txt.dep")]
    filenames.sort()

    skip_count = 0
    for filename in pyprind.prog_bar(filenames):
        edus, sents, sbnds, disc_arcs = read_data(os.path.join(path_in, filename))
        if edus is None:
            print("Skipped %s" % filename)
            skip_count += 1
            continue
        assert len(sents) == len(sbnds)

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".edus.tokens")), "w") as f:
            for edu in edus:
                f.write("%s\n" % " ".join(edu))

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".sbnds")), "w") as f:
            for begin_i, end_i in sbnds:
                f.write("%d %d\n" % (begin_i, end_i))

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".pbnds")), "w") as f:
            n_sents = len(sents)
            f.write("0 %d\n" % (n_sents - 1))

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".arcs")), "w") as f:
            disc_arcs = sorted(disc_arcs, key=lambda x: x[1])
            disc_arcs = ["%d-%d-%s" % (h, d, l) for h, d, l in disc_arcs]
            f.write("%s\n" % " ".join(disc_arcs))

        sents_postags = []
        sents_arcs = []
        for sent in sents:
            doc = nlp_no_ssplit(sent)
            sents_ = list(doc.sents)
            assert len(sents_) == 1
            sent = sents_[0]

            postags = [token.tag_ for token in sent]

            arcs = []
            found_root = False
            for token in sent:
                head = token.head.i + 1
                dep = token.i + 1
                label = token.dep_
                if head == dep:
                    assert label == "ROOT"
                    assert not found_root  # Only one token can be the root of the dependency graph
                    head = 0
                    found_root = True
                arcs.append((head, dep, label))
            assert found_root
            arcs = ["%d-%d-%s" % (h, d, l) for h, d, l in arcs]

            sents_postags.append(postags)
            sents_arcs.append(arcs)

        postags = utils.flatten_lists(sents_postags)  # List[str]
        arcs = utils.flatten_lists(sents_arcs)  # List[str]

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".edus.postags")), "w") as fp,\
             open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".edus.arcs")), "w") as fa:
            begin_tok_i = 0
            for edu in edus:
                length = len(edu)
                sub_postags = " ".join(postags[begin_tok_i:begin_tok_i + length])
                fp.write("%s\n" % sub_postags)
                sub_arcs = " ".join(arcs[begin_tok_i:begin_tok_i + length])
                fa.write("%s\n" % sub_arcs)
                begin_tok_i += length

    print("Processed %d files; %d files were skipped." % (len(filenames) - skip_count, skip_count))
def main(args):
    path = args.path

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in pyprind.prog_bar(filenames):
        edus = utils.read_lines(os.path.join(path, filename),
                                process=lambda line: line.split())  # List[List[str]]
        sents = utils.read_lines(os.path.join(path, filename.replace(".edus.tokens", ".sents.tokens")),
                                 process=lambda line: line.split())  # List[List[str]]
        sents_postags = utils.read_lines(os.path.join(path, filename.replace(".edus.tokens", ".sents.postags")),
                                         process=lambda line: line.split())  # List[List[str]]
        sents_arcs = utils.read_lines(os.path.join(path, filename.replace(".edus.tokens", ".sents.arcs")),
                                      process=lambda line: line.split())  # List[List[str]]
        postags = utils.flatten_lists(sents_postags)  # List[str]
        arcs = utils.flatten_lists(sents_arcs)  # List[str]

        # Ending positions of gold EDUs
        edu_end_positions = []
        tok_i = 0
        for edu in edus:
            length = len(edu)
            edu_end_positions.append(tok_i + length - 1)
            tok_i += length

        # Ending positions of sentences
        sent_end_positions = []
        tok_i = 0
        for sent in sents:
            length = len(sent)
            sent_end_positions.append(tok_i + length - 1)
            tok_i += length

        # Every sentence ending position must also be a gold-EDU ending position.
        assert set(sent_end_positions) == set(edu_end_positions) & set(sent_end_positions)

        # Sentence boundaries
        sbnds = []
        tok_i = 0
        sent_i = 0
        begin_edu_i = 0
        for end_edu_i, edu in enumerate(edus):
            tok_i += len(edu)
            if tok_i - 1 == sent_end_positions[sent_i]:
                sbnds.append((begin_edu_i, end_edu_i))
                sent_i += 1
                begin_edu_i = end_edu_i + 1
        assert sent_i == len(sent_end_positions)

        with open(os.path.join(path, filename.replace(".edus.tokens", ".sbnds")), "w") as f:
            for begin_i, end_i in sbnds:
                f.write("%d %d\n" % (begin_i, end_i))

        # Extract the POS tags and dependency arcs corresponding to each EDU.
        with open(os.path.join(path, filename.replace(".edus.tokens", ".edus.postags")), "w") as fp,\
             open(os.path.join(path, filename.replace(".edus.tokens", ".edus.arcs")), "w") as fa:
            begin_tok_i = 0
            for edu in edus:
                length = len(edu)
                sub_postags = " ".join(postags[begin_tok_i:begin_tok_i + length])
                fp.write("%s\n" % sub_postags)
                sub_arcs = " ".join(arcs[begin_tok_i:begin_tok_i + length])
                fa.write("%s\n" % sub_arcs)
                begin_tok_i += length
            assert begin_tok_i - 1 == edu_end_positions[-1]
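# A worked toy case for the sentence-boundary computation above, using made-up
# data: two sentences covering three EDUs.
edus = [['John', 'left', ','], ['smiling', '.'], ['Mary', 'stayed', '.']]
sents = [['John', 'left', ',', 'smiling', '.'], ['Mary', 'stayed', '.']]

# Token offset of the last token in each sentence: [4, 7].
sent_ends, tok_i = [], 0
for sent in sents:
    tok_i += len(sent)
    sent_ends.append(tok_i - 1)

# Group EDUs until the cumulative token count reaches each sentence end.
sbnds, tok_i, sent_i, begin = [], 0, 0, 0
for edu_i, edu in enumerate(edus):
    tok_i += len(edu)
    if tok_i - 1 == sent_ends[sent_i]:
        sbnds.append((begin, edu_i))
        sent_i += 1
        begin = edu_i + 1
print(sbnds)  # [(0, 1), (2, 2)]: sentence 0 spans EDUs 0-1, sentence 1 is EDU 2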