def get_closest_containee(targets, components):
    """ Link each target predicted as 'Contains' to its closest containee
    (component) in the same sentence. """
    sent2entities = get_sent2entities(targets, components)
    new_rels = []
    for sentid in sent2entities:
        components, targets = sent2entities[sentid]['Components'], sent2entities[sentid]['Targets']
        for target in targets:
            tidx = (target.sent_start_idx, target.sent_end_idx)
            closest_cidx = None
            min_dist = None
            if target.pred_relation_label == 'Contains':
                # find the closest component that is also predicted as 'Contains'
                for component in components:
                    if component.pred_relation_label != 'Contains':
                        continue
                    cidx = (component.sent_start_idx, component.sent_end_idx)
                    dist = get_word_dist(tidx, cidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # break tie by choosing the following component
                        is_closest = closest_cidx[0] < cidx[0]
                    if is_closest:
                        min_dist = dist
                        closest_cidx = cidx
            for component in components:
                if (target.pred_relation_label == 'Contains'
                        and component.pred_relation_label == 'Contains'
                        and closest_cidx[0] == component.sent_start_idx):
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)
    return new_rels
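
# `get_word_dist` and `get_sent2entities` are imported from elsewhere in this
# repo. The sketch below only illustrates the behavior the heuristics in this
# file appear to rely on; it is an assumption, not the repo's implementation.
# It treats each span as a (start, end) token-index pair with an exclusive end.
def _get_word_dist_sketch(span_a, span_b):
    (s1, e1), (s2, e2) = span_a, span_b
    if e1 <= s2:      # span_a precedes span_b
        return s2 - e1
    if e2 <= s1:      # span_b precedes span_a
        return s1 - e2
    return 0          # overlapping spans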
def get_closest_target(targets, components):
    """ Link each containee to its closest target in the same sentence. """
    sent2entities = get_sent2entities(targets, components)
    new_rels = []
    for sentid in sent2entities:
        components, targets = sent2entities[sentid]['Components'], sent2entities[sentid]['Targets']
        for component in components:
            cidx = (component.sent_start_idx, component.sent_end_idx)
            closest_targetid = None
            closest_tidx = None
            min_dist = None
            if component.pred_relation_label == 'Contains':
                # find the closest target and assign it
                for target in targets:
                    tidx = (target.sent_start_idx, target.sent_end_idx)
                    dist = get_word_dist(cidx, tidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # break tie by choosing the preceding target
                        is_closest = closest_tidx[0] > tidx[0]
                    if is_closest:
                        min_dist = dist
                        closest_targetid = target.span_id
                        closest_tidx = tidx
            for target in targets:
                if target.span_id == closest_targetid and component.pred_relation_label == 'Contains':
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)
    return new_rels
def get_closest_component(targets, components):
    """ Link each container (target) to its closest component in the same sentence. """
    sent2entities = get_sent2entities(targets, components)
    new_rels = []
    for sentid in sent2entities:
        components, targets = sent2entities[sentid]['Components'], sent2entities[sentid]['Targets']
        for target in targets:
            tidx = (target.sent_start_idx, target.sent_end_idx)
            closest_cidx = None
            min_dist = None
            if target.pred_relation_label == 'Contains':
                # find the closest component and assign it
                for component in components:
                    cidx = (component.sent_start_idx, component.sent_end_idx)
                    dist = get_word_dist(cidx, tidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # break tie by choosing the following component
                        is_closest = closest_cidx[0] < cidx[0]
                    if is_closest:
                        min_dist = dist
                        closest_cidx = cidx
            for component in components:
                if target.pred_relation_label == 'Contains' and component.sent_start_idx == closest_cidx[0]:
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)
    return new_rels
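
# An illustrative wrapper showing how the three nearest-neighbor linking
# heuristics above might be selected at prediction time. This wrapper and the
# `strategy` argument are assumptions for illustration; they are not part of
# the original pipeline.
def link_predicted_spans(targets, components, strategy="closest_containee"):
    """ Dispatch to one of the nearest-neighbor linking heuristics (sketch). """
    if strategy == "closest_containee":
        return get_closest_containee(targets, components)
    if strategy == "closest_target":
        return get_closest_target(targets, components)
    if strategy == "closest_component":
        return get_closest_component(targets, components)
    raise ValueError(f"Unknown linking strategy: {strategy}")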
def make_instances(ann_files, text_files, corenlp_files, outdir):
    """
    Extract gold relation instances from gold annotations.

    Args:
        ann_files: a list of .ann files
        text_files: a list of text files
        corenlp_files: a list of .json files, each containing the CoreNLP
            parsing output for the corresponding file in text_files
        outdir: output directory
    """
    # Collect all valid gold relations. This is instance-based, relying only on
    # character offsets, and it only extracts spans that co-occur with a Target
    # in the same sentence.
    gold_relins = []
    for text_file, ann_file, corenlp_file in zip(text_files, ann_files, corenlp_files):
        intrasent_gold_relations = [
            (e1, e2, relation)
            for e1, e2, relation in extract_intrasent_goldrelations_from_ann(
                ann_file, corenlp_file=corenlp_file)
            if relation == 'Contains' and e1['label'] == 'Target'
            and e2['label'] in ['Element', 'Mineral']
        ]
        for e1, e2, relation in intrasent_gold_relations:
            span1 = Span_Instance(e1['venue'], e1['year'], e1['docname'],
                                  e1['doc_start_char'], e1['doc_end_char'],
                                  e1['text'], 'Target')
            span2 = Span_Instance(e2['venue'], e2['year'], e2['docname'],
                                  e2['doc_start_char'], e2['doc_end_char'],
                                  e2['text'], 'Component')
            rel = Rel_Instance(span1, span2, 'Contains')
            gold_relins.append(rel)

    if not exists(outdir):
        os.makedirs(outdir)
    outfile = join(outdir, "gold_relins.pkl")
    print(f"Saving the evaluation set ({len(gold_relins)} relations) to {outfile}")
    with open(outfile, "wb") as f:
        pickle.dump(gold_relins, f)
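
# A sketch of how make_instances might be invoked. The venue name, directory
# layout, and glob pattern below are assumptions for illustration only; they
# mirror the relative paths used in main() but are not required by this repo.
def _make_instances_example():
    from glob import glob
    ann_files = sorted(glob("../../../corpus-LPSC/lpsc15/*.ann"))          # hypothetical venue
    text_files = [f.replace(".ann", ".txt") for f in ann_files]
    corenlp_files = ["../../../parse/lpsc15/" + os.path.basename(f) + ".json"
                     for f in text_files]                                   # e.g. XXXX.txt.json
    make_instances(ann_files, text_files, corenlp_files, outdir="./eval_set")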
def extract_gold_relations_for_pure(ann_file, text_file, corenlp_file, use_component):
    """ Extract intra-sentence gold 'Contains' relations between a Target and a
    component-type entity, as Rel_Instance objects. """
    accept_ner2 = ['Component'] if use_component else ['Element', 'Mineral']
    venue, year, docname, _ = get_docid(text_file)
    doc = json.load(open(corenlp_file, 'r'))
    intrasent_gold_relations = [
        (e1, e2, relation)
        for e1, e2, relation in extract_intrasent_goldrelations_from_ann(
            ann_file, doc=doc, use_component=use_component)
        if relation == 'Contains' and e1['label'] == 'Target'
        and e2['label'] in accept_ner2
    ]
    gold_relins = []
    for e1, e2, relation in intrasent_gold_relations:
        sentid = e1['sentid']
        tokens = doc['sentences'][sentid]['tokens']
        sent_toks = [token['word'] for token in tokens]
        ner1 = e1['label']
        ner2 = e2['label']
        span1 = Span_Instance(venue, year, docname, e1['doc_start_char'],
                              e1['doc_end_char'], e1['text'], ner1)
        span2 = Span_Instance(venue, year, docname, e2['doc_start_char'],
                              e2['doc_end_char'], e2['text'], ner2)
        rel = Rel_Instance(span1, span2, label_str=relation)
        gold_relins.append(rel)
    return gold_relins
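
# Illustrative use of extract_gold_relations_for_pure over a document
# collection. This helper is a sketch; the file lists are assumed to be
# aligned, as in make_instances above.
def _collect_pure_gold_relations(ann_files, text_files, corenlp_files, use_component=False):
    gold = []
    for ann_file, text_file, corenlp_file in zip(ann_files, text_files, corenlp_files):
        gold.extend(extract_gold_relations_for_pure(ann_file, text_file,
                                                    corenlp_file, use_component))
    return gold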
def main(args):
    use_component = args.use_component
    outfile = args.outfile
    outdir = "/".join(outfile.split("/")[:-1])
    if not exists(outdir):
        os.makedirs(outdir)

    corenlp_dir = "../../../parse/"
    ann_dir = "../../../corpus-LPSC"
    accept_ner2 = ['Component'] if use_component else ['Element', 'Mineral']

    relins = []  # store prediction instances

    # load predictions (one JSON document per line)
    predicted_docs = []
    for line in open(args.pred_file, "r").read().strip().split("\n"):
        predicted_docs.append(json.loads(line))

    for docidx, predicted_doc in enumerate(predicted_docs):
        venue, year, docname = predicted_doc['doc_key'].split(',')
        if "lpsc" in venue:
            doc = json.load(open(join(corenlp_dir, venue, docname + ".txt.json"), "r"))
            ann_file = join(ann_dir, venue, docname + ".ann")
            text_file = join(ann_dir, venue, docname + ".txt")
        else:
            doc = json.load(open(join(corenlp_dir, venue, f"{year}_{docname}.txt.json"), "r"))
            ann_file = join(ann_dir, venue, f"{year}_{docname}.ann")
            text_file = join(ann_dir, venue, f"{year}_{docname}.txt")

        # map character offsets to gold NER labels
        gold_entities = [
            e for e in extract_gold_entities_from_ann(ann_file, use_component)
            if e['label'] in ['Target'] + accept_ner2
        ]
        offset2ner = {}
        for e in gold_entities:
            ner = e['label']
            # collapse Element/Mineral into a single Component class if requested
            if use_component and ner != 'Target':
                ner = 'Component'
            offset2ner[(e['doc_start_char'], e['doc_end_char'])] = ner

        for sentid, sent_relations in enumerate(predicted_doc['predicted_relations']):
            # token indices in the prediction file are document-level; subtract
            # the cumulative token count to get sentence-level indices
            cumu_toks = sum(len(doc['sentences'][s]['tokens']) for s in range(sentid))
            sent_toks = [w['word'] for w in doc['sentences'][sentid]['tokens']]
            for tok_sidx1, tok_eidx1, tok_sidx2, tok_eidx2, logit, relation in sent_relations:
                # end indices are inclusive
                score = softmax(logit)
                tok_sidx1, tok_eidx1, tok_sidx2, tok_eidx2 = (
                    tok_sidx1 - cumu_toks, tok_eidx1 - cumu_toks,
                    tok_sidx2 - cumu_toks, tok_eidx2 - cumu_toks)
                tokens = doc['sentences'][sentid]['tokens']
                tok1_doc_start_char, tok1_doc_end_char = find_doc_offset(tokens, tok_sidx1, tok_eidx1)
                tok2_doc_start_char, tok2_doc_end_char = find_doc_offset(tokens, tok_sidx2, tok_eidx2)
                ner1 = offset2ner.get((tok1_doc_start_char, tok1_doc_end_char), "")
                ner2 = offset2ner.get((tok2_doc_start_char, tok2_doc_end_char), "")
                if ner1 != "Target" or ner2 not in accept_ner2:
                    continue
                text1 = " ".join(t['word'] for t in tokens[tok_sidx1:tok_eidx1 + 1])
                text2 = " ".join(t['word'] for t in tokens[tok_sidx2:tok_eidx2 + 1])
                span1 = Span_Instance(venue, year, docname, tok1_doc_start_char,
                                      tok1_doc_end_char, text1, ner1,
                                      sent_toks=sent_toks)
                span2 = Span_Instance(venue, year, docname, tok2_doc_start_char,
                                      tok2_doc_end_char, text2, ner2,
                                      sent_toks=sent_toks)
                rel = Rel_Instance(span1, span2)
                rel.pred_relation_label = relation
                rel.pred_score = [score[1], score[0]]
                relins.append(rel)

    # sanity check that all NER labels are valid
    assert all([(rel.span1.ner_label in ['Target'] + accept_ner2)
                and (rel.span2.ner_label in ['Target'] + accept_ner2)
                for rel in relins])
    print("possible ner labels: ",
          set([rel.span1.ner_label for rel in relins] +
              [rel.span2.ner_label for rel in relins]))
    num_contains = len([rel for rel in relins if rel.pred_relation_label == 'Contains'])
    num_other = len([rel for rel in relins if rel.pred_relation_label == 'O'])
    print(f"{len(relins)} in predictions, {num_contains} Contains and {num_other} O's")
    print(f"saving converted prediction file to {outfile}")
    with open(outfile, "wb") as f:
        pickle.dump(relins, f)
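
# A minimal entry point consistent with the attributes main() reads
# (args.pred_file, args.outfile, args.use_component). The flag names and the
# store_true default are inferred assumptions; the real script's parser may differ.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--pred_file", required=True,
                        help="JSONL file of predicted relations, one document per line")
    parser.add_argument("--outfile", required=True,
                        help="path of the output .pkl file of Rel_Instance objects")
    parser.add_argument("--use_component", action="store_true",
                        help="treat Element/Mineral spans as a single Component class")
    main(parser.parse_args())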