import os
from types import SimpleNamespace

import spacy
from tqdm import tqdm

# Assumed import: get_encoder is taken to be fairseq's GPT-2 BPE helper,
# matching the encoder.json / vocab.bpe arguments used below.
from fairseq.data.encoders.gpt2_bpe_utils import get_encoder

# TRACKING_ENTITY_LIST, entity_match, count_lines_in_text_file and
# update_bio_labels are assumed to be defined elsewhere in this module.


def select_summary_sentences(nlp, intro, abstract, filter_level):
    """Drop abstract sentences whose tracked entities cannot be matched in the intro."""
    if not (intro and abstract):
        return ""
    if filter_level <= 1:
        return abstract
    doc = nlp(abstract)
    # Start with every sentence selected, then deselect any sentence holding
    # a tracked entity that entity_match cannot find in the source intro.
    sentences_select = {sent.text: True for sent in doc.sents}
    for e in doc.ents:
        if e[0].ent_type_ in TRACKING_ENTITY_LIST:
            match_result = entity_match(e.text, intro, 2)
            if not match_result:
                sentences_select[e.sent.text] = False
    result = [sent.text for sent in doc.sents if sentences_select[sent.text]]
    return " ".join(result)
def select_example(nlp, intro, abstract, filter_level):
    """Return True iff every tracked entity in the abstract is matched in the intro."""
    if not (intro and abstract):
        return False
    if filter_level <= 1:
        return True
    doc = nlp(abstract)
    for e in doc.ents:
        if e[0].ent_type_ in TRACKING_ENTITY_LIST:
            match_result = entity_match(e.text, intro, 2)
            if not match_result:
                # One unmatched entity is enough to reject the example.
                return False
    return True
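# Usage sketch for the two filters above (hypothetical variable names;
# any filter_level above 1 enables the entity-matching checks):
#
#     nlp = spacy.load("en_core_web_lg")
#     if select_example(nlp, intro_text, abstract_text, filter_level=2):
#         clean_abstract = select_summary_sentences(
#             nlp, intro_text, abstract_text, filter_level=2)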
def ent_count_match(nlp, base, parent, is_scispacy=False):
    """Perform NER on `base`, then count how many of its entities match in `parent`."""
    doc = nlp(base)
    ent_count_base = 0
    en_count_in_base_parent = 0
    if is_scispacy:
        # scispacy entities are counted without type filtering (match level 1).
        for e in doc.ents:
            ent_count_base += 1
            match_result = entity_match(e.text, parent, 1)
            if match_result:
                en_count_in_base_parent += 1
    else:
        # Standard spaCy entities are filtered to the tracked types (match level 2).
        for e in doc.ents:
            if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                ent_count_base += 1
                match_result = entity_match(e.text, parent, 2)
                if match_result:
                    en_count_in_base_parent += 1
    return ent_count_base, en_count_in_base_parent
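# Usage sketch: with base=summary and parent=source, the ratio of matched to
# total entities gives an entity-precision-style score (hypothetical names):
#
#     n_ents, n_matched = ent_count_match(nlp, summary_text, source_text)
#     precision = n_matched / n_ents if n_ents else 1.0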
def create_ent_augmented_target(source_file, target_file, out_text_file,
                                out_bpe_file, tokenizer_dir,
                                special_token=50009, max_len=1024):
    """Prefix each BPE-encoded target with its source-matched entities and a separator token."""
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)
    nlp = spacy.load("en_core_web_lg")
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(out_bpe_file, 'w') as out_bpe_f, \
            open(out_text_file, 'w') as out_text_f:
        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            doc = nlp(tline)
            # Collect target entities that can be matched back in the source.
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    match_result = entity_match(e.text, sline, 2)
                    if match_result:
                        entities_per_example.append(match_result[0])
            target_bpe = bpe.encode(tline)
            # Augmented target: "<entities> <special_token> <target>", or just
            # "<special_token> <target>" when no entity was matched.
            if entities_per_example:
                entity_bpe = bpe.encode(", ".join(entities_per_example))
                augmented_target_bpe = entity_bpe + [special_token] + target_bpe
            else:
                augmented_target_bpe = [special_token] + target_bpe
            out_text_f.write("{}".format(entities_per_example) + '\n')
            out_bpe_f.write(
                ' '.join(map(str, augmented_target_bpe[:max_len - 1])) + '\n')
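# Usage sketch (assumed file names; tokenizer_dir must hold the GPT-2
# encoder.json / vocab.bpe pair expected by get_encoder):
#
#     create_ent_augmented_target(
#         "train.source", "train.target",
#         "train.entities", "train.target.bpe",
#         tokenizer_dir="gpt2_bpe")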
def create_ent_labels(source_file, target_file, out_file, tokenizer_dir,
                      first_only=False):
    """Write per-token BIO labels over the BPE-encoded source for entities matched from the target."""
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)
    nlp = spacy.load("en_core_web_lg")
    entities_found = []
    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(out_file, 'w') as out_f:
        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            tokens = bpe.encode(sline)
            labels = [0] * len(tokens)  # all source tokens start as "outside"
            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    entity_new = {'text': e.text, 'type': e[0].ent_type_}
                    match_result = entity_match(e.text, sline, 2)
                    entity_new['match_result'] = match_result
                    labels = update_bio_labels(labels, sline, match_result,
                                               tokens, bpe,
                                               first_only=first_only)
                    entities_per_example.append(entity_new)
            out_f.write(" ".join([str(i) for i in labels]) + '\n')
            entities_found.append(entities_per_example)
    return entities_found
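# Usage sketch (assumed file names): writes one line of space-separated
# integer labels per source line and returns the entities found per example.
#
#     ents = create_ent_labels("train.source", "train.target",
#                              "train.labels", tokenizer_dir="gpt2_bpe")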