def __init__(self):
    """Build the scispaCy pipeline used by this object.

    Loads the ``en_core_sci_lg`` model, appends an abbreviation detector
    and then a UMLS entity linker, and stores the linker on ``self.linker``
    and the finished pipeline on ``self.nlp``.
    """
    pipeline = spacy.load('en_core_sci_lg')

    # The abbreviation detector is added before the linker, which is
    # created with resolve_abbreviations=True.
    pipeline.add_pipe(AbbreviationDetector(pipeline))

    self.linker = EntityLinker(resolve_abbreviations=True, name="umls")
    pipeline.add_pipe(self.linker)

    self.nlp = pipeline
def set_up(self):  # pylint: disable=import-outside-toplevel
    """Load the language model and optionally attach a UMLS entity linker.

    ``self._load_lang_model()`` is always called first (presumably it
    populates ``self.nlp`` -- confirm against its definition). The linker
    is appended to ``self.nlp`` only when ``"umls_link"`` is listed in
    ``self.processors``.
    """
    self._load_lang_model()

    if "umls_link" not in self.processors:
        return

    # Imported lazily so scispacy is only needed when UMLS linking is
    # actually requested.
    from scispacy.linking import EntityLinker

    self.nlp.add_pipe(EntityLinker(resolve_abbreviations=True, name="umls"))
def umls_entlink(self):
    """Append an abbreviation detector and a UMLS entity linker to ``self.nlp``.

    The abbreviation detector is added first, followed by a linker created
    with ``resolve_abbreviations=True``.
    """
    pipeline = self.nlp
    pipeline.add_pipe(AbbreviationDetector(pipeline))
    pipeline.add_pipe(EntityLinker(resolve_abbreviations=True, name="umls"))
def setUp(self):
    """Prepare a spaCy pipeline and an entity linker backed by test fixtures.

    Sets ``self.nlp`` to the ``en_core_web_sm`` model and ``self.linker`` to
    an ``EntityLinker`` whose candidate generator is built from the UMLS
    fixture files under ``tests/fixtures``.
    """
    super().setUp()
    self.nlp = spacy.load("en_core_web_sm")

    kb = UmlsKnowledgeBase(
        "tests/fixtures/umls_test_fixture.json",
        "tests/fixtures/test_umls_tree.tsv",
    )

    # The index is built with a temporary directory as its working location;
    # only the returned objects are used afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        index_parts = create_tfidf_ann_index(tmp_dir, kb)
    concept_aliases, vectorizer, ann_index = index_parts

    generator = CandidateGenerator(ann_index, vectorizer, concept_aliases, kb)
    self.linker = EntityLinker(
        candidate_generator=generator,
        filter_for_definitions=False,
    )
def get_types(input_filename, output_filename):
    """Count the UMLS types linked to the entities found in a text file.

    Each line of ``input_filename`` is run through the ``en_core_sci_sm``
    pipeline with a UMLS entity linker attached. For every candidate KB
    entry of every detected entity, each of the entry's types is tallied
    and a tab-separated row (input line, type label) is written to
    ``blender_log.tsv``. Once the input is exhausted, the tallies are
    written to ``output_filename`` as tab-separated (type, count) rows in
    the order the types were first seen.

    Args:
        input_filename: Path of the text file to read, one text per line.
        output_filename: Path of the TSV file that receives the counts.
    """
    nlp = spacy.load("en_core_sci_sm")
    type_counts = {}

    # Labels for each type id, as returned by read_type_ids; indexed by the
    # type strings found on the KB entities.
    tui_label_dict = read_type_ids("tui_labels.tsv")

    # Creating the linker is slow the first time: it downloads roughly 1GB
    # of data and loads a large JSON knowledge base. Later runs are faster
    # because the downloads are cached.
    # NOTE(review): resolve_abbreviations requires the AbbreviationDetector
    # pipe to already be in the pipeline, and this function never adds one,
    # so the flag may have no effect here -- confirm.
    linker = EntityLinker(resolve_abbreviations=True, name="umls")
    nlp.add_pipe(linker)

    # Context managers guarantee both files are flushed and closed, even if
    # processing fails part-way through.
    with open("blender_log.tsv", "w") as log_file, \
            open(input_filename) as input_file:
        for count, line in enumerate(input_file, start=1):
            if count % 1000 == 0:
                print("count " + str(count))

            # Strip only the line terminator; slicing off the last character
            # would eat real text when the final line has no newline.
            text = line.rstrip("\n")
            doc = nlp(text)

            for entity in doc.ents:
                for umls_ent in entity._.kb_ents:
                    # umls_ent[0] is the concept id used as the KB key.
                    kb_entry = linker.kb.cui_to_entity[umls_ent[0]]
                    for typ in kb_entry.types:
                        type_counts[typ] = type_counts.get(typ, 0) + 1
                        log_file.write(
                            text + "\t" + tui_label_dict[typ] + "\n"
                        )

    with open(output_filename, "w") as output_file:
        for typ, total in type_counts.items():
            output_file.write(typ + "\t" + str(total) + '\n')
import argparse
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
import math
import itertools
import timeit

disease_chemical_model = 'en_ner_bc5cdr_md'

# Pipeline setup follows the example code at https://github.com/allenai/scispacy
nlp = spacy.load(disease_chemical_model)
# The abbreviation detector is currently left out of the pipeline:
# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe(abbreviation_pipe)
linker = EntityLinker(name="umls")
nlp.add_pipe(linker)


# Nuance example:
#   input:  'Sickle cell-hemoglobin SS disease'
#   output: (Sickle, SS)
# Each of the two entities gets 5 UMLS concept ids, ordered by descending
# relevance score. The top concept id for "Sickle" is correct
# ('C0002895', 0.7860167026519775), but the top one for "SS" is not
# ('C0039101', 1.0). For that reason all ids are returned.
def get_concept_ids(concept_text):
    """Return the concept ids of every linked entity in ``concept_text``.

    The text is run through the module-level ``nlp`` pipeline; the first
    element of each ``kb_ents`` candidate of each entity is collected into
    one flat list, entity by entity.
    """
    parsed = nlp(concept_text)
    concept_ids = []
    for mention in parsed.ents:
        # Entities without candidates simply contribute nothing.
        concept_ids.extend(candidate[0] for candidate in mention._.kb_ents)
    return concept_ids
""" import spacy from scispacy.linking import EntityLinker nlp = spacy.load("en_core_sci_sm") # This line takes a while, because we have to download ~1GB of data # and load a large JSON file (the knowledge base). Be patient! # Thankfully it should be faster after the first time you use it, because # the downloads are cached. # NOTE: The resolve_abbreviations parameter is optional, and requires that # the AbbreviationDetector pipe has already been added to the pipeline. Adding # the AbbreviationDetector pipe and setting resolve_abbreviations to True means # that linking will only be performed on the long form of abbreviations. linker = EntityLinker(resolve_abbreviations=True, name="umls") nlp.add_pipe(linker) doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \ inherited motor neuron disease caused by the expansion \ of a polyglutamine tract within the androgen receptor (AR). \ SBMA can be caused by this easily.") # Let's look at a random entity! entity = doc.ents[1] print("Name: ", entity) for umls_ent in entity._.kb_ents: print(linker.kb.cui_to_entity[umls_ent[0]])
def load_linker():
    """Create and return an ``EntityLinker`` named "umls".

    The linker is constructed with ``resolve_abbreviations=True``; it is not
    attached to any pipeline here.
    """
    return EntityLinker(resolve_abbreviations=True, name="umls")