Example 1
def get_closest_containee(targets, components):
    """
    Link each target to its closest containee in the same sentence.
    """
    sent2entities = get_sent2entities(targets, components)
    new_rels = []
    for sentid in sent2entities:
        components = sent2entities[sentid]['Components']
        targets = sent2entities[sentid]['Targets']
        for target in targets:
            tidx = (target.sent_start_idx, target.sent_end_idx)
            closest_cidx = None
            min_dist = None
            if target.pred_relation_label == 'Contains':
                for component in components:
                    if component.pred_relation_label != 'Contains':
                        continue
                    cidx = (component.sent_start_idx, component.sent_end_idx)
                    dist = get_word_dist(tidx, cidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # break tie by choosing the following component
                        is_closest = closest_cidx[0] < cidx[0]
                    if is_closest:
                        min_dist = dist
                        closest_cidx = cidx

            for component in components:
                if (target.pred_relation_label == 'Contains'
                        and component.pred_relation_label == 'Contains'
                        and closest_cidx[0] == component.sent_start_idx):
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)
    return new_rels
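
All three heuristics in this file rely on the helper get_word_dist, which is not shown here. Below is a minimal sketch of what it plausibly computes, assuming each span is an inclusive (start, end) tuple of token indices; this implementation is an assumption, not the project's actual helper.

def get_word_dist(span1, span2):
    # Hypothetical helper (assumed implementation): word gap between two
    # sentence-level spans, each an inclusive (start, end) token-index
    # tuple; overlapping spans get distance 0.
    (s1, e1), (s2, e2) = span1, span2
    if s2 > e1:
        return s2 - e1
    if s1 > e2:
        return s1 - e2
    return 0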
Example 2
def get_closest_target(targets, components):
    """
    Link each containee to its closest target in the same sentence. 
    """
    sent2entities = get_sent2entities(targets, components)

    new_rels = []
    for sentid in sent2entities:
        components = sent2entities[sentid]['Components']
        targets = sent2entities[sentid]['Targets']

        for component in components:
            cidx = (component.sent_start_idx, component.sent_end_idx)
            closest_targetid = None
            closest_tidx = None
            min_dist = None
            if component.pred_relation_label == 'Contains':
                # find closest target and assign
                for target in targets:
                    tidx = (target.sent_start_idx, target.sent_end_idx)
                    dist = get_word_dist(cidx, tidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # If there is a tie, choose the preceding target
                        is_closest = closest_tidx[0] > tidx[0]
                    if is_closest:
                        min_dist = dist
                        closest_targetid = target.span_id
                        closest_tidx = tidx

            for target in targets:
                if target.span_id == closest_targetid and component.pred_relation_label == 'Contains':
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)

    return new_rels
Example 3
def get_closest_component(targets, components):
    """
    for each container, link it to its closest component 
    """
    sent2entities = get_sent2entities(targets, components)

    new_rels = []
    for sentid in sent2entities:
        components = sent2entities[sentid]['Components']
        targets = sent2entities[sentid]['Targets']
        for target in targets:
            tidx = (target.sent_start_idx, target.sent_end_idx)
            closest_cidx = None
            min_dist = None
            if target.pred_relation_label == 'Contains':
                # find closest component and assign
                for component in components:
                    cidx = (component.sent_start_idx, component.sent_end_idx)
                    dist = get_word_dist(cidx, tidx)
                    is_closest = False
                    if min_dist is None:
                        is_closest = True
                    elif dist < min_dist:
                        is_closest = True
                    elif dist == min_dist:
                        # break tie by choosing the following component
                        is_closest = closest_cidx[0] < cidx[0]

                    if is_closest:
                        min_dist = dist
                        closest_cidx = cidx

            for component in components:
                if (target.pred_relation_label == 'Contains'
                        and component.sent_start_idx == closest_cidx[0]):
                    rel = Rel_Instance(target, component)
                    rel.pred_relation_label = 'Contains'
                    new_rels.append(rel)

    return new_rels
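
A usage sketch for the three heuristics above. Everything here is hypothetical scaffolding: get_sent2entities is mocked as a single-sentence bucket, Rel_Instance is a minimal stand-in holding the two spans, the span objects expose only the attributes the heuristics read, and the get_word_dist sketch given earlier is assumed.

from types import SimpleNamespace

def get_sent2entities(targets, components):
    # Hypothetical mock: put everything into one sentence (id 0); the
    # real helper presumably buckets entities by sentence id.
    return {0: {'Targets': targets, 'Components': components}}

class Rel_Instance:
    # Minimal stand-in for the project's class: just hold the two spans.
    def __init__(self, span1, span2, label_str=None):
        self.span1, self.span2 = span1, span2
        self.pred_relation_label = label_str

def mock_span(span_id, start, end, label='Contains'):
    # Stand-in exposing only the attributes the heuristics read.
    return SimpleNamespace(span_id=span_id, sent_start_idx=start,
                           sent_end_idx=end, pred_relation_label=label)

targets = [mock_span('t1', 5, 5)]
components = [mock_span('c1', 0, 1), mock_span('c2', 9, 10)]

# Both components are 4 words from the target under the assumed
# get_word_dist, so the tie-break keeps only the following one ('c2').
rels = get_closest_component(targets, components)
print([(r.span1.span_id, r.span2.span_id) for r in rels])  # [('t1', 'c2')]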
Example 4
def make_instances(ann_files, text_files, corenlp_files, outdir):
    """
    This function extracts gold relation instances from gold annotation. 
    
    Args:
        ann_files:
            a list of ann files
        text_files:
            a list of text files 
        corenlp_files:
            a list of .json files that contains parsing output dictionary from CoreNLP for each text files in text_files. 
        outdir:
            output directory 
    """

    # First collect all valid entities for predictions. This is
    # instance-based, relying only on character offsets, and it extracts
    # only spans that co-occur with a target in a sentence.
    gold_relins = []

    for text_file, ann_file, corenlp_file in zip(text_files, ann_files,
                                                 corenlp_files):

        intrasent_gold_relations = [
            (e1, e2, relation)
            for e1, e2, relation in extract_intrasent_goldrelations_from_ann(
                ann_file, corenlp_file=corenlp_file)
            if relation == 'Contains' and e1['label'] == 'Target'
            and e2['label'] in ['Element', 'Mineral']
        ]

        for e1, e2, relation in intrasent_gold_relations:

            span1 = Span_Instance(e1['venue'], e1['year'], e1['docname'],
                                  e1['doc_start_char'], e1['doc_end_char'],
                                  e1['text'], 'Target')

            span2 = Span_Instance(e2['venue'], e2['year'], e2['docname'],
                                  e2['doc_start_char'], e2['doc_end_char'],
                                  e2['text'], 'Component')

            rel = Rel_Instance(span1, span2, 'Contains')
            gold_relins.append(rel)

    if not exists(outdir):
        os.makedirs(outdir)

    outfile = join(outdir, "gold_relins.pkl")
    print(
        f"Saving the evaluation set ({len(gold_relins)} relations) to {outfile}"
    )
    with open(outfile, "wb") as f:
        pickle.dump(gold_relins, f)
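
A hypothetical invocation, assuming the three lists are aligned per document (the paths below are made up for illustration):

ann_files = ['corpus-LPSC/lpsc15/1234.ann']
text_files = ['corpus-LPSC/lpsc15/1234.txt']
corenlp_files = ['parse/lpsc15/1234.txt.json']
make_instances(ann_files, text_files, corenlp_files, 'out/gold')
# -> writes out/gold/gold_relins.pkl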
Example 5
def extract_gold_relations_for_pure(ann_file, text_file, corenlp_file,
                                    use_component):
    """
    Extract intra-sentence gold Contains relations from an .ann file.
    """

    accept_ner2 = ['Component'] if use_component else ['Element', 'Mineral']

    venue, year, docname, _ = get_docid(text_file)

    with open(corenlp_file, 'r') as f:
        doc = json.load(f)

    intrasent_gold_relations = [
        (e1, e2, relation)
        for e1, e2, relation in extract_intrasent_goldrelations_from_ann(
            ann_file, doc=doc, use_component=use_component)
        if relation == 'Contains' and e1['label'] == 'Target'
        and e2['label'] in accept_ner2
    ]

    gold_relins = []
    for e1, e2, relation in intrasent_gold_relations:

        sentid = e1['sentid']
        tokens = doc['sentences'][sentid]['tokens']

        sent_toks = [token['word'] for token in tokens]

        ner1 = e1["label"]
        ner2 = e2['label']

        span1 = Span_Instance(venue, year, docname, e1['doc_start_char'],
                              e1['doc_end_char'], e1['text'], ner1)

        span2 = Span_Instance(venue, year, docname, e2['doc_start_char'],
                              e2['doc_end_char'], e2['text'], ner2)

        rel = Rel_Instance(span1, span2, label_str=relation)

        gold_relins.append(rel)

    return gold_relins
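
A hypothetical per-document call (again with made-up paths); with use_component=False, only Element and Mineral spans are accepted as the second entity of a relation:

relins = extract_gold_relations_for_pure(
    'corpus-LPSC/lpsc15/1234.ann',
    'corpus-LPSC/lpsc15/1234.txt',
    'parse/lpsc15/1234.txt.json',
    use_component=False)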
Example 6
def main(args):
    use_component = args.use_component
    outfile = args.outfile
    outdir = "/".join(outfile.split("/")[:-1])

    if not exists(outdir):
        os.makedirs(outdir)

    corenlp_dir = "../../../parse/"
    ann_dir = "../../../corpus-LPSC"
    accept_ner2 = ['Component'] if use_component else ['Element', 'Mineral']

    relins = []  # store prediction instances

    # load predictions (one JSON document per line)
    with open(args.pred_file, "r") as f:
        predicted_docs = [json.loads(line)
                          for line in f.read().strip().split("\n")]

    for docidx, predicted_doc in enumerate(predicted_docs):

        venue, year, docname = predicted_doc['doc_key'].split(',')

        if "lpsc" in venue:
            doc = json.load(
                open(join(corenlp_dir, venue, docname + ".txt.json"), "r"))
            ann_file = join(ann_dir, venue, docname + ".ann")
            text_file = join(ann_dir, venue, docname + ".txt")
        else:
            doc = json.load(
                open(join(corenlp_dir, venue, f"{year}_{docname}.txt.json"),
                     "r"))
            ann_file = join(ann_dir, venue, f"{year}_{docname}.ann")
            text_file = join(ann_dir, venue, f"{year}_{docname}.txt")

        # map each character-offset span to its gold NER label

        gold_entities = [
            e for e in extract_gold_entities_from_ann(ann_file, use_component)
            if e['label'] in ['Target'] + accept_ner2
        ]

        offset2ner = {}
        for e in gold_entities:
            ner = e['label']
            if use_component and ner != 'Target':
                ner = 'Component'

            offset2ner[(e['doc_start_char'], e['doc_end_char'])] = ner

        for sentid, sent_relations in enumerate(
                predicted_doc['predicted_relations']):

            cumu_toks = sum(
                [len(doc['sentences'][s]['tokens']) for s in range(sentid)])
            sent_toks = [w['word'] for w in doc['sentences'][sentid]['tokens']]

            for tok_sidx1, tok_eidx1, tok_sidx2, tok_eidx2, logit, relation in sent_relations:  # eidx is inclusive
                score = softmax(logit)
                tok_sidx1 -= cumu_toks
                tok_eidx1 -= cumu_toks
                tok_sidx2 -= cumu_toks
                tok_eidx2 -= cumu_toks

                tokens = doc['sentences'][sentid]['tokens']

                tok1_doc_start_char, tok1_doc_end_char = find_doc_offset(
                    tokens, tok_sidx1, tok_eidx1)

                tok2_doc_start_char, tok2_doc_end_char = find_doc_offset(
                    tokens, tok_sidx2, tok_eidx2)

                ner1 = offset2ner.get((tok1_doc_start_char, tok1_doc_end_char),
                                      "")

                ner2 = offset2ner.get((tok2_doc_start_char, tok2_doc_end_char),
                                      "")

                if ner1 != "Target" or ner2 not in accept_ner2:
                    continue

                text1 = " ".join(
                    [t['word'] for t in tokens[tok_sidx1:tok_eidx1 + 1]])
                text2 = " ".join(
                    [t['word'] for t in tokens[tok_sidx2:tok_eidx2 + 1]])

                span1 = Span_Instance(venue,
                                      year,
                                      docname,
                                      tok1_doc_start_char,
                                      tok1_doc_end_char,
                                      text1,
                                      ner1,
                                      sent_toks=sent_toks)

                span2 = Span_Instance(venue,
                                      year,
                                      docname,
                                      tok2_doc_start_char,
                                      tok2_doc_end_char,
                                      text2,
                                      ner2,
                                      sent_toks=sent_toks)

                rel = Rel_Instance(span1, span2)
                rel.pred_relation_label = relation
                rel.pred_score = [score[1], score[0]]
                relins.append(rel)

    # sanity check all ner labels are valid
    assert all([(rel.span1.ner_label in ['Target'] + accept_ner2)
                and (rel.span2.ner_label in ['Target'] + accept_ner2)
                for rel in relins])
    print(
        "possible ner labels: ",
        set([rel.span1.ner_label
             for rel in relins] + [rel.span2.ner_label for rel in relins]))
    n_contains = len([r for r in relins if r.pred_relation_label == 'Contains'])
    n_other = len([r for r in relins if r.pred_relation_label == 'O'])
    print(f"{len(relins)} in predictions, {n_contains} Contains and "
          f"{n_other} O's")
    print(f"saving converted prediction file to {outfile}")
    with open(outfile, "wb") as f:
        pickle.dump(relins, f)
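
main depends on two helpers not defined in this file. Below are minimal sketches under stated assumptions: find_doc_offset is assumed to map inclusive sentence-level token indices to document character offsets via the characterOffsetBegin/characterOffsetEnd fields that CoreNLP attaches to each token, and softmax is taken to be the standard normalization applied to the logit pair.

import math

def find_doc_offset(tokens, tok_sidx, tok_eidx):
    # Assumed behavior: return the document-level character span covering
    # tokens tok_sidx..tok_eidx (inclusive), using the character offsets
    # CoreNLP stores on each token.
    return (tokens[tok_sidx]['characterOffsetBegin'],
            tokens[tok_eidx]['characterOffsetEnd'])

def softmax(logit):
    # Standard numerically stable softmax over a flat list of logits.
    m = max(logit)
    exps = [math.exp(x - m) for x in logit]
    total = sum(exps)
    return [e / total for e in exps]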