Code example #1
import re

import merpy

# assumption: the original script defines the replacement string used by
# re.sub below; a single space is a reasonable stand-in
repl = " "

def process_multiple_docs_lexicons_sp(docs, lexicons):
    """
    Iterate through a list of documents and tag each one with every lexicon
    """
    # create one empty list for each doc; note that [[]] * len(docs) would
    # alias the same list object len(docs) times
    output_entities = [[] for _ in docs]
    # doc_dict is only needed by the commented-out multiprocessing variant
    doc_dict = {i: d for i, d in enumerate(docs)}
    for idoc, doc in enumerate(docs):
        # reset per document so entities from earlier docs are not
        # re-attributed to later ones
        doc_results = []
        if sum(map(str.isalnum, doc)) < 5:  # must have at least 5 alnum
            print("no words", doc)
            continue
        # collapse runs of two or more non-alphanumeric characters
        doc = re.sub(r"[^A-Za-z0-9 ]{2,}", repl, doc)
        for l in lexicons:
            doc_results += merpy.get_entities(doc, l)
        for e in doc_results:
            # doc_entities = merpy.get_entities_mp(doc_dict, lex, n_cores=10)
            # print(lex, entities)
            # for e in l_entities:
            if len(e) > 2:
                entity = [int(e[0]), int(e[1]), e[2]]
                if len(e) > 3:  # URI
                    entity.append(e[3])
                if entity not in output_entities[idoc]:
                    output_entities[idoc].append(entity)
    for i in range(len(output_entities)):
        output_entities[i] = sorted(output_entities[i])
    return output_entities
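
A minimal usage sketch for the function above; the lexicon name and documents are placeholders. Each MER entity comes back as [start, end, text] with an optional URI as fourth element, which is what the int() casts and the len(e) > 3 check rely on.

import merpy

merpy.download_lexicons()
merpy.process_lexicon("hp")

docs = ["the patient shows nystagmus and ataxia", "ok"]
results = process_multiple_docs_lexicons_sp(docs, ["hp"])
# results[0] holds entries like [18, 27, 'nystagmus', <uri>] (hypothetical);
# results[1] stays [] since "ok" has fewer than 5 alphanumeric characters
print(results)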
Code example #2
File: mer_class.py Project: umar1196/Text-mining-
import merpy

def merpy_function(article_abstract):
    '''
    Run MER. Terms are initialized at the beginning of the script with the
    line: merpy.generate_lexicon("hp")
    :param article_abstract: variable containing the abstract text
    :return: a nested list with each identified term, its position and URI
    '''
    document = article_abstract
    entities = merpy.get_entities(document, "hp")

    return entities
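
A sketch of the setup the docstring refers to, using the lexicon calls shown in example #3; the abstract text is a placeholder.

import merpy

merpy.download_lexicons()
merpy.process_lexicon("hp")  # Human Phenotype Ontology lexicon

abstract = "The patient presented with nystagmus and progressive ataxia."
print(merpy_function(abstract))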
Code example #3
import os

import merpy

def annotations(corpus_path, data_path):
    """
    Annotate every file in corpus_path with several MER lexicons and print
    the sorted entities found in each one.

    :param corpus_path: directory containing the text files to annotate
    :param data_path: directory containing the custom term lists
    """

    merpy.download_lexicons()
    merpy.process_lexicon("hp")
    merpy.process_lexicon("doid")
    merpy.process_lexicon("radlex")

    # process_list is a project-specific helper, assumed to build and process
    # a custom lexicon from a term list under data_path
    process_list(data_path, 'chebi')
    process_list(data_path, 'medical_devices')
    process_list(data_path, 'temporal_list')
    process_list(data_path, 'population_vocabulary')

    for f in os.listdir(corpus_path):

        with open(os.path.join(corpus_path, f), 'r', encoding='utf-8') as file_to_annotate:
            file_content = file_to_annotate.read()

        entities_hp = merpy.get_entities(file_content, "hp")
        # keep only mentions actually linked to an HP URI
        clean_hp_list = [hp for hp in entities_hp if 'HP' in hp[-1]]

        entities_doid = merpy.get_entities(file_content, "doid")
        entities_radlex = merpy.get_entities(file_content, "radlex")
        entities_devices = merpy.get_entities(file_content, "medical_devices")
        entities_chebi = merpy.get_entities(file_content, "chebi")
        # 'temporal' and 'population' are assumed to be the lexicon names
        # produced by the process_list calls above
        entities_temporal = merpy.get_entities(file_content, "temporal")
        entities_population = merpy.get_entities(file_content, "population")

        entities = (clean_hp_list + entities_doid + entities_devices +
                    entities_chebi + entities_radlex + entities_temporal +
                    entities_population)
        # MER returns [['']] when nothing is found; drop those placeholders
        entities_clean = [x for x in entities if x != ['']]

        entities_sorted = sorted(entities_clean, key=lambda e: int(e[0]))

        print('\n' + f + '\n')
        for entity in entities_sorted:
            print(entity)

    return
Code example #4
import merpy

def map_to_ontology(entity_text, ontology):
    """
    Run merpy on an entity string to get its ontology URI; results are cached
    in the module-level normalization_dic (a dict of dicts keyed by lexicon)
    """
    if entity_text in normalization_dic[ontology]:
        return normalization_dic[ontology][entity_text]
    else:
        matches = merpy.get_entities(entity_text, ontology)
        if len(matches) > 1:
            print(entity_text, ontology, matches)
        if len(matches) == 0 or len(matches[-1]) < 4:
            print("no matches", entity_text, ontology, matches)
            normalization_dic[ontology][entity_text] = entity_text
            return entity_text
        else:  # take the last match (TODO: pick the longest match instead)
            best_match = matches[-1][3]
            normalization_dic[ontology][entity_text] = best_match
        return best_match
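
A usage sketch, assuming normalization_dic is a module-level dict of per-ontology caches; the entity string is a placeholder.

import merpy

normalization_dic = {"hp": {}}
merpy.process_lexicon("hp")

print(map_to_ontology("nystagmus", "hp"))
# a second call returns the cached value without re-running MER
print(map_to_ontology("nystagmus", "hp"))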
Code example #5
import ast

import merpy

def call_mer_2nd_run(l_mer_terms, l_text, lexicon):
    for i in range(len(l_mer_terms)):
        if l_mer_terms[i][0] == ' ':
            l_aux = merpy.get_entities(l_text[i], lexicon)

            print('>>>>>', l_aux)  #DEBUG

            if len(l_aux) > 0:
                l_aux_2 = []
                if lexicon == 'decs_parlex':
                    # j is used here so the outer index i is still valid when
                    # l_mer_terms[i][0] is updated below
                    for j in range(len(l_aux)):
                        #Spanish term
                        try:
                            if l_aux[j][2] not in l_aux_2:
                                l_aux_2.append(l_aux[j][2])

                            try:
                                #black magic to convert the string
                                #representation back into a list
                                l_aux[j][3] = ast.literal_eval(l_aux[j][3])

                                #DEBUG - decs_parlex
                                #print(l_mer_data[0]) #DeCS code
                                #print(l_mer_data[1]) #Parent Info Tuple
                                #print(l_mer_data[1][0]) #Parent DeCS code (if any)
                                #print(l_mer_data[1][1]) #Parent Name (if any)

                                #Spanish Parent
                                if (l_aux[j][3][1][1] != '-'
                                        and l_aux[j][3][1][1] not in l_aux_2):
                                    l_aux_2.append(l_aux[j][3][1][1])

                            except IndexError:
                                l_aux_2.append(' ')

                        except IndexError:
                            l_aux_2.append(' ')

                l_mer_terms[i][0] = l_aux_2

    return l_mer_terms
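
For context, a sketch of the mapping format that the ast.literal_eval call above decodes; it mirrors the decsparlex mappings built in example #12, with hypothetical codes.

import ast

# a mapping value is stored as str([decs_code, (parent_code, parent_name)])
raw = str(['29059', ('29065', 'nervios craneales')])  # hypothetical entry
l_mer_data = ast.literal_eval(raw)
print(l_mer_data[0])     # DeCS code
print(l_mer_data[1][0])  # parent DeCS code (or '-')
print(l_mer_data[1][1])  # parent name (or '-')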
Code example #6
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 15:57:57 2020

@author: André
"""

import merpy
import pandas as pd

decs_data = pd.read_csv('../mesinesp_data/DeCS_data.tsv', sep='\t')

conv_dict = {}
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])

    # map every surface form (synonyms + Spanish preferred term) to the code
    for i in l_terms:
        conv_dict[i] = str(row['#DeCS_code'])

merpy.create_lexicon(conv_dict.keys(), "decslex")
merpy.create_mappings(conv_dict, "decslex")
merpy.show_lexicons()
merpy.process_lexicon("decslex")

#DEBUG
print(merpy.get_entities("lo nervio abducens es una aurelia aurita", "decslex"))
Code example #7
import merpy
import ssmpy
import urllib.request

# Download the Human Disease Ontology OWL file
doid_link = 'http://purl.obolibrary.org/obo/doid.owl'
with urllib.request.urlopen(doid_link) as response, open('doid.owl',
                                                         'wb') as out_file:
    data = response.read()
    out_file.write(data)

# build the semantic base (doid.db) used below by get_similarities
ssmpy.create_semantic_base("doid.owl", "doid.db",
                           "http://purl.obolibrary.org/obo/",
                           "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                           '')
ssmpy.semantic_base("doid.db")

merpy.download_lexicon(doid_link, "doid", "owl")
merpy.process_lexicon("doid", "owl")
document = "zoophilia zoophobia zaspopathy"
entities = merpy.get_entities(document, "doid")
print(entities)
print(merpy.get_similarities(entities, 'doid.db'))
Code example #8
import merpy

with open('../bioasq_data/mesh_terms_synonyms.txt',
          encoding='utf-8') as finput_terms:
    l_terms_syn = finput_terms.readlines()

dict_terms_synonyms = {}
for i in l_terms_syn:
    aux = i.split('\t')
    dict_terms_synonyms[aux[0]] = aux[1].replace('\n', '')

conv_dict = {}
for key, values in dict_terms_synonyms.items():
    l_synonyms = values.split(',')
    if key not in l_synonyms:
        l_synonyms.append(key)

    for i in l_synonyms:
        # dict_terms (term -> MeSH ID) is assumed to be built earlier in the
        # original script
        conv_dict[i.strip()] = dict_terms.get(key)

merpy.create_lexicon(conv_dict.keys(), "meshlex")
merpy.create_mappings(conv_dict, "meshlex")
merpy.show_lexicons()
merpy.process_lexicon("meshlex")

#DEBUG
print(merpy.get_entities("I like abdominal injuries", "meshlex"))
print(merpy.get_entities("I like Calcimycin", "meshlex"))
print(
    merpy.get_entities(
        "I like Calcimycin it is a good aurelia aurita and Temefos is awesome! abate lowercase",
        "meshlex"))
Code example #9
import json
import os

import merpy

def build_relations_dict():
    """Iterates over all sentences in the train and dev sets, recognizes
    CIE-O-3, ICD10-CM and DeCS entities, and establishes a relation between
    entities that co-occur in a given sentence.

    Ensures:
        a dict with ES ICD10-CM <-> CIE-O-3 relations stored in
        './tmp/relations_cieo3_icd10cm.json', and a dict with ES DeCS <->
        CIE-O-3 relations stored in './tmp/relations_cieo3_esdecs.json'
    """

    #Create CIE-O-3 lexicon (load_cieo3 and the other load_* helpers are
    #project-specific loaders defined elsewhere)
    lexicon_name = "cieo3"
    ontology_graph, name_to_id, synonym_to_id = load_cieo3()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    #Create ICD10-CM lexicon
    lexicon_name = "icd10cmes"
    ontology_graph, name_to_id = load_spanish_icd10cm()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    #Create DECS lexicon
    lexicon_name = "es_decs"
    ontology_graph, name_to_id, synonym_to_id = load_es_decs()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    filenames_1 = [
        "./data/datasets/train-set-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/train-set-to-publish/cantemist-norm/")
    ]
    filenames_2 = [
        "./data/datasets/dev-set1-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set1-to-publish/cantemist-norm/")
    ]
    filenames_3 = [
        "./data/datasets/dev-set2-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set2-to-publish/cantemist-norm/")
    ]
    filenames_4 = [
        "./data/datasets/test-background-set-to-publish/" + input_file
        for input_file in os.listdir(
            "./data/datasets/test-background-set-to-publish/")
    ]

    filenames = filenames_1 + filenames_2 + filenames_3  # + filenames_4

    relations_1, relations_2 = dict(), dict()
    doc_count = 0

    for doc in filenames:

        if doc.endswith(".txt"):
            #if doc == "cc_onco1016.txt":
            doc_count += 1
            print("DOC_COUNT:", doc_count)
            with open(doc, 'r') as doc_file:
                text = doc_file.read()

            # split_single and Sentence are assumed to come from segtok and
            # flair respectively, imported elsewhere in the project
            sentences = [Sentence(sent) for sent in split_single(text)]

            for sentence in sentences:
                sent_text = sentence.to_original_text()
                cieo3_entities = merpy.get_entities(sent_text, "cieo3")
                icd10cm_entities = merpy.get_entities(sent_text, "icd10cmes")
                es_decs_entities = merpy.get_entities(sent_text, "es_decs")

                if icd10cm_entities != [['']] and cieo3_entities != [['']]:
                    icd10cm_codes = [entity[3] for entity in icd10cm_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]

                    for code in cieo3_codes:

                        if code in relations_1:
                            current_values = relations_1[code]
                            current_values.extend(icd10cm_codes)
                            relations_1[code] = current_values
                        else:
                            relations_1[code] = icd10cm_codes

                if es_decs_entities != [['']] and cieo3_entities != [['']]:
                    es_decs_codes = [entity[3] for entity in es_decs_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]

                    for code in cieo3_codes:

                        if code in relations_2:
                            current_values = relations_2[code]
                            current_values.extend(es_decs_codes)
                            relations_2[code] = current_values
                        else:
                            relations_2[code] = es_decs_codes

    #Output the relations into json files
    with open("./tmp/relations_cieo3_icd10cm.json", 'w') as d_file:
        json.dump(relations_1, d_file)

    with open("./tmp/relations_cieo3_esdecs.json", 'w') as b_file:
        json.dump(relations_2, b_file)
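
A short sketch of how the stored relations could be read back; keys and values are placeholder codes.

import json

with open("./tmp/relations_cieo3_icd10cm.json") as f:
    relations = json.load(f)

# each CIE-O-3 code maps to every ICD10-CM code that co-occurred with it in
# some sentence, duplicates included
for cieo3_code, icd10cm_codes in relations.items():
    print(cieo3_code, set(icd10cm_codes))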
Code example #10
import os

import merpy

def annotate_documents(task, subset, name_to_id, output_dir):
    """Recognise entities (NER) and link them to the respective CIE-O-3 code
    (normalisation), if available, using merpy"""

    lexicon_name = "cieo3"
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    dataset_dir = ""

    if subset == "train":
        dataset_dir = "./data/datasets/train-set-to-publish/cantemist-norm/"

    elif subset == "dev1":
        dataset_dir = "./data/datasets/dev-set1-to-publish/cantemist-norm/"

    elif subset == "dev2":
        dataset_dir = "./data/datasets/dev-set2-to-publish/cantemist-norm/"

    elif subset == "test":
        dataset_dir = "./data/datasets/test-background-set-to-publish"

    doc_count = 0
    doc_annot_ratio = 0.0
    doc_w_annotations_count = 0
    total_entity_count = 0
    linked_mentions = 0
    # each document is assumed to have a .txt/.ann pair, hence the / 2
    total_doc_count = int(len(os.listdir(dataset_dir)) / 2)

    for doc in os.listdir(dataset_dir):

        if doc[-3:] == "txt":

            doc_count += 1
            output_string = ""
            print(f"Annotating {doc_count} of {total_doc_count} documents")

            with open(dataset_dir + doc, 'r') as input_file:
                text = input_file.read()
                doc_entity_count = 0

                entities = merpy.get_entities(text, lexicon_name)

                for entity in entities:

                    if entity != ['']:
                        total_entity_count += 1
                        doc_entity_count += 1

                        if len(entity) == 4:  # linked mention with CIE-O-3 code
                            linked_mentions += 1
                            output_string += (
                                f"T{doc_entity_count}\tMORFOLOGIA_NEOPLASIA "
                                f"{entity[0]} {entity[1]}\t{entity[2]}\n")

                            if task == "norm":
                                output_string += (
                                    f"#{doc_entity_count}\tAnnotatorNotes\t"
                                    f"T{doc_entity_count}\t{entity[3]}\n")

                        elif len(entity) == 3:  # mention without CIE-O-3 code
                            output_string += (
                                f"T{doc_entity_count}\tMORFOLOGIA_NEOPLASIA "
                                f"{entity[0]} {entity[1]}\t{entity[2]}\n")

                            if task == "norm":
                                output_string += (
                                    f"#{doc_entity_count}\tAnnotatorNotes "
                                    f"T{doc_entity_count}\tNA\n")

                if doc_entity_count > 0:
                    doc_w_annotations_count += 1

                elif doc_entity_count == 0:
                    output_string = "NA\tNA NA NA\tNA\n"

                    if task == "norm":
                        output_string += (
                            f"#{doc_entity_count}\tAnnotatorNotes "
                            f"T{doc_entity_count}\tNA\n")

            # output annotations file
            output_filename = output_dir + doc[:-4] + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        doc_annot_ratio = doc_w_annotations_count / total_doc_count
        mentions_ratio = total_entity_count / doc_w_annotations_count
        doc_linked_ratio = linked_mentions / doc_w_annotations_count
        linked_ratio = linked_mentions / total_entity_count

    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "TOTAL DOCUMENTS: " + str(total_doc_count) + "\n"
    output_str += "DOCS WITH ANNOTATIONS: " + str(
        doc_w_annotations_count) + "\n"
    output_str += "RATIO OF DOCS WITH ANNOTATIONS: " + str(
        doc_annot_ratio) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(total_entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(
        doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "./mer_annotations/" + task + "/" + task + "_" + subset + "_stats"

    with open(file_name, "w") as output:
        output.write(output_str)
Code example #11
import os

import merpy

def annotate_documents(language, name_to_id):
    """
    Recognise entities (Named Entity Recognition) and link them to the
    respective ICD10-CM code (Named Entity Linking), if available.

    Requires:
        language: str, "pt", "en" or "es" for Portuguese, English or Spanish,
        respectively

    Ensures:
        for each abstract in the 'scielo_abstracts' dir, an annotation file
        in the 'mer_annotations' dir plus an overall statistics file about
        the annotation process
    """

    lexicon_name = "icd10cm_" + language
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    abstracts_dir = "./scielo_abstracts/"
    doc_w_ann_count = 0
    entity_count = 0
    linked_mentions = 0

    for abstract in os.listdir(abstracts_dir):

        if abstract[-2:] == language:
            output_string = ""

            with open(abstracts_dir + abstract, 'r') as input_file:
                text = input_file.read()
                document_ent_count = 0

                entities = merpy.get_entities(text, lexicon_name)

                for entity in entities:

                    if entity != ['']:
                        entity_count += 1
                        document_ent_count += 1

                        if len(entity) == 4:  # linked mention with ICD code
                            linked_mentions += 1
                            output_string += (
                                f"T{document_ent_count}\t{entity[0]} "
                                f"{entity[1]}\t{entity[2]}\t{entity[3]}\n")

                        elif len(entity) == 3:  # mention without ICD code
                            output_string += (
                                f"T{document_ent_count}\t{entity[0]} "
                                f"{entity[1]}\t{entity[2]}\n")

                if document_ent_count > 0:
                    doc_w_ann_count += 1

            output_filename = "./mer_annotations/" + language + "/" + abstract + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        mentions_ratio = entity_count / doc_w_ann_count
        doc_linked_ratio = linked_mentions / doc_w_ann_count
        linked_ratio = linked_mentions / entity_count

    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "DOCUMENTS WITH ANNOTATIONS: " + str(doc_w_ann_count) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(
        doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "mer_annotation_stats_" + language

    with open(file_name, "w") as output:
        output.write(output_str)
Code example #12
# Builds a DeCS lexicon whose mapping values also carry parent-term info
# (decoded with ast.literal_eval in example #5). Assumes decs_data,
# dict_child_par and decs_dict were loaded earlier, as in example #6.
conv_dict = {}
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])

    parent = dict_child_par.get(str(row['#DeCS_code']))
    if parent is None:
        parent = '-'
        parent_info = '-'
    else:
        parent_info = decs_dict.get(parent)
        if parent_info is None:
            parent_info = '-'

    parent_tup = (parent, parent_info)

    for i in l_terms:
        # store the mapping as a string; consumers decode it back with
        # ast.literal_eval (see call_mer_2nd_run in example #5)
        conv_dict[i] = str([row['#DeCS_code'], parent_tup])

merpy.create_lexicon(conv_dict.keys(), "decsparlex")
merpy.create_mappings(conv_dict, "decsparlex")
merpy.show_lexicons()
merpy.process_lexicon("decsparlex")

#DEBUG
print(merpy.get_entities("lo nervio abducens es un gran temefós",
                         "decsparlex"))
Code example #13
import os
import sys
from glob import glob

import merpy
import pandas as pd

def get_entity_dict(ent):
	ent_dict = {
		"offsets": "LIVB {} {}".format(ent[0], ent[1]),
		"text": ent[2].replace("\n", " "),
	}
	return ent_dict

if len(sys.argv) >= 3:

	input_dir = sys.argv[1]
	output_dir = sys.argv[2]

	result = []
	for document in glob(os.path.join(input_dir, "*.txt")):

		with open(document, 'r') as f:
			data = f.read()

		# the "ncbi" lexicon is assumed to have been processed beforehand
		entities = merpy.get_entities(data, "ncbi")
		entities = [get_entity_dict(ent) for ent in entities if len(ent) == 3]
		df = pd.DataFrame(entities)
		df = df.dropna()
		df = df.loc[df.astype(str).drop_duplicates().index]
		df = df.reset_index(drop=True)
		df = df.rename("T{}".format)
		ann_filename = os.path.basename(document).split(".")[0]+".ann"
		df.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)
Code example #14
import json
import sys

import merpy
from tqdm import tqdm

# Notice that this script requires:
#
#   - that the MER's source code is changed, to allow the use of
#     <skos:prefLabel> and <skos:altLabel> properties (used by OCHV)
#   - that a lexicon (whose name is given as the third argument) has been
#     processed by MER
#
# The Dockerfile in this repository takes care of that.

in_filename, lexicon_name, out_filename = sys.argv[1:]

with open(in_filename) as f:
    data = json.load(f)

with open(out_filename, 'w') as f:
    for key, text in tqdm(data.items()):
        annotations = merpy.get_entities(text, lexicon_name)
        json.dump({key: annotations}, f)
        f.write('\n')
        f.flush()
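
A hypothetical invocation (script and file names are placeholders); the input JSON maps document keys to their text, and the output has one JSON object per line.

# python annotate.py abstracts.json ochv annotations.jsonl
#
# abstracts.json:    {"doc1": "some text ...", "doc2": "..."}
# annotations.jsonl: {"doc1": [["0", "9", "some term", "uri"], ...]}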
Code example #15
total_sents = 0
total_docs = 0
total_entities = 0
missing_texts = 0
# can be parallelized; nlp (a spaCy pipeline), pmid_to_abst (pmid -> abstract
# text) and document_entities (pmid -> entity list) are built earlier
for pmid in document_entities:
    if pmid == "":
        continue
    if pmid not in pmid_to_abst:
        print("missing this abstract:", pmid)
        #import pdb; pdb.set_trace()
        missing_texts += 1
        continue
    total_docs += 1
    doc = nlp(pmid_to_abst[pmid])
    merpy.create_lexicon(document_entities[pmid], "biomarker" + pmid)
    merpy.process_lexicon("biomarker" + pmid)
    doc_entities = merpy.get_entities(pmid_to_abst[pmid], "biomarker" + pmid)
    entity_spans = []
    for e in doc_entities:
        try:
            int(e[0]), int(e[1])
        except ValueError:
            print("ERROR", e)
            continue
        entity_spans.append(doc.char_span(int(e[0]), int(e[1]), label="GPE"))
    entity_spans = [e for e in entity_spans if e is not None]
    try:
        doc.ents = entity_spans[:]
    except Exception:
        # spaCy raises here e.g. when spans overlap; drop into the debugger
        import pdb
        pdb.set_trace()
    total_entities += len(entity_spans)
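
The fragment above assumes roughly this setup, sketched with placeholder data:

import merpy
import spacy

nlp = spacy.load("en_core_web_sm")  # any spaCy pipeline with a tokenizer
pmid_to_abst = {"12345": "BRCA1 expression was elevated in these samples."}
document_entities = {"12345": ["BRCA1"]}  # pmid -> biomarker entity strings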