import os
from types import SimpleNamespace

import spacy
from tqdm import tqdm

# TRACKING_ENTITY_LIST, entity_match, count_lines_in_text_file,
# update_bio_labels, and get_encoder are defined elsewhere in the project.


def select_summary_sentences(nlp, intro, abstract, filter_level):
    if not (intro and abstract):
        return ""
    if filter_level <= 1:
        return abstract

    doc = nlp(abstract)
    en_count_in_summary = 0
    sentences_select = {}
    for sent in doc.sents:
        sentences_select[sent.text] = True
    for e in doc.ents:
        if e[0].ent_type_ in TRACKING_ENTITY_LIST:
            en_count_in_summary += 1
            # Try to match the entity back into the intro (level-2 match).
            match_result = entity_match(e.text, intro, 2)
            if not match_result:
                sentences_select[e.sent.text] = False
                # print("ENTITY NOT FOUND: {}".format(e.text))
                # print(">>source>>", intro)
                # print(">>summary>>", abstract)
                # break
    # Keep only sentences whose tracked entities all matched the intro.
    result = []
    for sent in doc.sents:
        if sentences_select[sent.text]:
            result.append(sent.text)
    return " ".join(result)
def select_example(nlp, intro, abstract, filter_level):
    if not (intro and abstract):
        return False
    if filter_level <= 1:
        return True

    doc = nlp(abstract)
    en_count_in_summary = 0
    select = True
    for e in doc.ents:
        if e[0].ent_type_ in TRACKING_ENTITY_LIST:
            en_count_in_summary += 1
            # Check the entity against the intro; one miss rejects the example.
            match_result = entity_match(e.text, intro, 2)
            if not match_result:
                select = False
                # print("ENTITY NOT FOUND: {}".format(e.text))
                # print(">>source>>", intro)
                # print(">>summary>>", abstract)
                break
    return select
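The same check can filter a whole corpus. A small sketch, assuming parallel lists intros and abstracts (names invented) and the nlp pipeline from the sketch above:

kept = [
    (intro, abstract)
    for intro, abstract in zip(intros, abstracts)
    if select_example(nlp, intro, abstract, 2)
]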
def ent_count_match(nlp, base, parent, is_scispacy=False):
    # Perform NER on base, then match each entity in parent.
    doc = nlp(base)
    ent_count_base = 0
    en_count_in_base_parent = 0
    if is_scispacy:
        # scispaCy pipelines: count every entity and match at level 1.
        for e in doc.ents:
            ent_count_base += 1
            match_result = entity_match(e.text, parent, 1)
            if match_result:
                en_count_in_base_parent += 1
    else:
        # General-purpose pipelines: count only tracked entity types
        # and match at level 2.
        for e in doc.ents:
            if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                ent_count_base += 1
                match_result = entity_match(e.text, parent, 2)
                if match_result:
                    en_count_in_base_parent += 1
    return ent_count_base, en_count_in_base_parent
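entity_match itself is not shown in this listing. From the call sites one can infer that it takes (entity_text, source_text, level) and returns something truthy whose first element is a matched string (create_ent_augmented_target below indexes match_result[0]). A purely illustrative stand-in under those assumptions, not the project's actual implementation:

def entity_match_sketch(entity_text, source, level=1):
    # Hypothetical stand-in for entity_match, for illustration only.
    # Level 1: case-insensitive substring match of the full entity.
    # Level 2: additionally accept a match on any single token.
    needle = entity_text.strip().lower()
    haystack = source.lower()
    if needle and needle in haystack:
        return (entity_text,)
    if level >= 2:
        for tok in needle.split():
            if tok in haystack:
                return (tok,)
    return None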
Example #4
def create_ent_augmented_target(source_file,
                                target_file,
                                out_text_file,
                                out_bpe_file,
                                tokenizer_dir,
                                special_token=50009,
                                max_len=1024):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)

    nlp = spacy.load("en_core_web_lg")

    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)

    with open(source_file, 'r') as s_f, \
        open(target_file, 'r') as t_f, \
        open(out_bpe_file, 'w') as out_bpe_f, \
        open(out_text_file, 'w') as out_text_f:

        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()

            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    # Record entities that can be matched in the source line.
                    match_result = entity_match(e.text, sline, 2)
                    if match_result:
                        entities_per_example.append(match_result[0])
            target_bpe = bpe.encode(tline)
            if entities_per_example:
                entity_bpe = bpe.encode(", ".join(entities_per_example))
                augmented_target_bpe = entity_bpe + [
                    special_token,
                ] + target_bpe
            else:
                augmented_target_bpe = [
                    special_token,
                ] + target_bpe
            out_text_f.write("{}".format(entities_per_example) + '\n')
            out_bpe_f.write(
                ' '.join(map(str, augmented_target_bpe[:max_len - 1])) + '\n')
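The augmented target is laid out as [entity BPE ids] + special_token + [target BPE ids]; a toy illustration with invented ids:

entity_bpe = [8241, 11, 3437]   # ids for the matched-entity prefix (invented)
target_bpe = [464, 27039, 373]  # ids for the target sentence (invented)
augmented = entity_bpe + [50009] + target_bpe
# -> [8241, 11, 3437, 50009, 464, 27039, 373], truncated to max_len - 1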
Example #5
def create_ent_labels(source_file,
                      target_file,
                      out_file,
                      tokenizer_dir,
                      first_only=False):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    assert n_s == n_t, \
        "Number of lines not consistent: {}, {}".format(n_s, n_t)

    nlp = spacy.load("en_core_web_lg")
    entities_found = []

    encoder_args = SimpleNamespace(
        encoder_json=os.path.join(tokenizer_dir, "encoder.json"),
        vocab_bpe=os.path.join(tokenizer_dir, "vocab.bpe"),
        keep_empty=True)
    bpe = get_encoder(encoder_args.encoder_json, encoder_args.vocab_bpe)

    with open(source_file, 'r') as s_f, \
        open(target_file, 'r') as t_f, \
        open(out_file, 'w') as out_f:

        for _ in tqdm(range(n_s)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            tokens = bpe.encode(sline)
            labels = [0] * len(tokens)

            doc = nlp(tline)
            entities_per_example = []
            for e in doc.ents:
                if e[0].ent_type_ in TRACKING_ENTITY_LIST:
                    entity_new = {'text': e.text, 'type': e[0].ent_type_}
                    # Match the entity in the source line and project the
                    # match onto token-level labels.
                    match_result = entity_match(e.text, sline, 2)
                    entity_new['match_result'] = match_result
                    labels = update_bio_labels(labels,
                                               sline,
                                               match_result,
                                               tokens,
                                               bpe,
                                               first_only=first_only)
                    entities_per_example.append(entity_new)
            out_f.write(" ".join([str(i) for i in labels]) + '\n')
            entities_found.append(entities_per_example)
    return entities_found
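A sketch of how this might be invoked; the paths are illustrative, and tokenizer_dir must contain GPT-2-style encoder.json and vocab.bpe files:

entities = create_ent_labels(
    "train.source",        # one source document per line
    "train.target",        # one target summary per line
    "train.ent_labels",    # output: space-separated token labels
    "gpt2_bpe",            # directory with encoder.json / vocab.bpe
    first_only=True)       # label only the first match per entity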