Esempio n. 1
0
 def __init__(self):
     """Assemble the spaCy pipeline and keep references to it and its linker.

     Loads the ``en_core_sci_lg`` model, appends an ``AbbreviationDetector``
     and then an ``EntityLinker`` named ``"umls"`` (created with
     ``resolve_abbreviations=True``). The linker is stored on
     ``self.linker`` and the finished pipeline on ``self.nlp``.
     """
     pipeline = spacy.load('en_core_sci_lg')
     # The abbreviation detector goes into the pipeline ahead of the linker.
     pipeline.add_pipe(AbbreviationDetector(pipeline))
     self.linker = EntityLinker(resolve_abbreviations=True, name="umls")
     pipeline.add_pipe(self.linker)
     self.nlp = pipeline
Esempio n. 2
0
    def set_up(self):
        """Prepare ``self.nlp``, optionally attaching a UMLS entity linker.

        Calls ``self._load_lang_model()`` first. When ``"umls_link"`` is
        listed in ``self.processors``, an ``EntityLinker`` named ``"umls"``
        (with ``resolve_abbreviations=True``) is appended to ``self.nlp``.
        """
        # pylint: disable=import-outside-toplevel
        self._load_lang_model()
        if "umls_link" in self.processors:
            # Imported here so scispacy is only needed when linking is requested.
            from scispacy.linking import EntityLinker

            self.nlp.add_pipe(
                EntityLinker(resolve_abbreviations=True, name="umls")
            )
Esempio n. 3
0
 def umls_entlink(self):
     """
     Append an abbreviation detector and a UMLS entity linker to ``self.nlp``.

     The ``AbbreviationDetector`` is added first, followed by an
     ``EntityLinker`` named ``"umls"`` created with
     ``resolve_abbreviations=True``.
     """
     pipeline = self.nlp
     pipeline.add_pipe(AbbreviationDetector(pipeline))
     pipeline.add_pipe(EntityLinker(resolve_abbreviations=True, name="umls"))
Esempio n. 4
0
    def setUp(self):
        """Create the spaCy pipeline and a fixture-backed ``EntityLinker``.

        Sets ``self.nlp`` to the ``en_core_web_sm`` model and ``self.linker``
        to an ``EntityLinker`` whose candidate generator is built from the
        UMLS fixture files under ``tests/fixtures``.
        """
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json",
            "tests/fixtures/test_umls_tree.tsv",
        )
        # The index is built inside a scratch directory that is deleted when
        # the ``with`` block exits; the returned objects are used afterwards.
        with tempfile.TemporaryDirectory() as scratch_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(
                scratch_dir, knowledge_base
            )
        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)

        self.linker = EntityLinker(
            candidate_generator=generator,
            filter_for_definitions=False,
        )
Esempio n. 5
0
def get_types(input_filename, output_filename):
    """Count the entity types linked to the entities found in a text file.

    Every line of ``input_filename`` is run through the ``en_core_sci_sm``
    pipeline with an ``EntityLinker`` named ``"umls"`` attached. For each
    candidate concept of each detected entity, every type of that concept is
    counted. A row holding the input line and the type's label (looked up in
    the mapping returned by ``read_type_ids("tui_labels.tsv")``), separated
    by a tab, is written to ``blender_log.tsv`` for each counted type.

    After the whole input has been processed, one tab-separated
    ``type, count`` row per type is written to ``output_filename``.

    Args:
        input_filename: Path of the text file to read, one text per line.
        output_filename: Path of the file that receives the type counts.

    Raises:
        KeyError: If a linked type has no entry in the label mapping, or a
            candidate concept id is missing from the linker's knowledge base.
    """
    nlp = spacy.load("en_core_sci_sm")
    tui_label_dict = read_type_ids("tui_labels.tsv")

    # Creating the linker takes a while, because we have to download ~1GB of
    # data and load a large JSON file (the knowledge base). It should be
    # faster after the first use, because the downloads are cached.
    # NOTE: resolve_abbreviations is optional and requires that the
    # AbbreviationDetector pipe has already been added to the pipeline; with
    # both in place, linking is only performed on the long form of
    # abbreviations.
    # NOTE(review): no AbbreviationDetector is added in this function, so the
    # flag presumably has no effect here -- confirm against scispacy.
    linker = EntityLinker(resolve_abbreviations=True, name="umls")
    nlp.add_pipe(linker)

    type_count_dict = {}
    # Context managers guarantee the files are flushed and closed even when
    # processing fails part-way through.
    with open("blender_log.tsv", "w") as log_file, \
            open(input_filename) as input_file:
        for count, line in enumerate(input_file, start=1):
            if count % 1000 == 0:
                print("count " + str(count))
            # Strip only the line terminator; slicing off the last character
            # would corrupt a final line that has no trailing newline.
            text = line.rstrip("\n")
            doc = nlp(text)
            for entity in doc.ents:
                for umls_ent in entity._.kb_ents:
                    # umls_ent[0] is the concept id used as knowledge-base key.
                    types_list = linker.kb.cui_to_entity[umls_ent[0]].types
                    for typ in types_list:
                        type_count_dict[typ] = type_count_dict.get(typ, 0) + 1
                        log_file.write(
                            text + "\t" + tui_label_dict[typ] + "\n"
                        )

    with open(output_filename, "w") as output_file:
        for typ, total in type_count_dict.items():
            output_file.write(typ + "\t" + str(total) + "\n")
import argparse
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
import math
import itertools
import timeit

# Name of the spaCy model loaded below (presumably a disease/chemical NER
# model, judging by the variable name -- confirm).
disease_chemical_model = 'en_ner_bc5cdr_md'

# Pipeline setup, based on the example code at https://github.com/allenai/scispacy
nlp = spacy.load(disease_chemical_model)
# Abbreviation detection is currently disabled; the two lines below would add
# an AbbreviationDetector to the pipeline if uncommented.
# abbreviation_pipe = AbbreviationDetector(nlp)
# nlp.add_pipe(abbreviation_pipe)
# The linker is created without resolve_abbreviations and appended to the
# pipeline; both `nlp` and `linker` are module-level globals.
linker = EntityLinker(name="umls")
nlp.add_pipe(linker)

# Caveat, illustrated with an example:
#   input:             'Sickle cell-hemoglobin SS disease'
#   detected entities: (Sickle, SS)
# Each of the two entities gets 5 UMLS concept ids, ordered by relevance score (descending).
# The most relevant concept id for 'Sickle' is correct ('C0002895', 0.7860167026519775),
# but the most relevant one for 'SS' is not ('C0039101', 1.0).


# return all the ids
def get_concept_ids(concept_text):
    """Return the concept ids of every linker candidate found in *concept_text*.

    The text is processed with the module-level ``nlp`` pipeline. For each
    detected entity, the first element of every item in its ``kb_ents``
    extension is collected. The result is one flat list in entity order;
    entities without candidates contribute nothing.
    """
    doc = nlp(concept_text)
    return [
        candidate[0]
        for span in doc.ents
        for candidate in span._.kb_ents
    ]
Esempio n. 7
0
"""

import spacy
from scispacy.linking import EntityLinker

nlp = spacy.load("en_core_sci_sm")

# Building the linker is slow the first time: ~1GB of data has to be
# downloaded and a large JSON file (the knowledge base) loaded. Later runs
# should be faster because the downloads are cached.
# NOTE: resolve_abbreviations is optional and requires that the
# AbbreviationDetector pipe has already been added to the pipeline. With both
# in place, linking is only performed on the long form of abbreviations.
linker = EntityLinker(resolve_abbreviations=True, name="umls")
nlp.add_pipe(linker)

# The backslash continuations keep the leading spaces of each continued line
# inside the string that is passed to the pipeline.
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

# Inspect the second entity the pipeline detected.
entity = doc.ents[1]
print("Name: ", entity)

# Print the knowledge-base record of each candidate concept for that entity.
for candidate in entity._.kb_ents:
    print(linker.kb.cui_to_entity[candidate[0]])
Esempio n. 8
0
def load_linker():
    """Create and return an ``EntityLinker`` named ``"umls"``.

    The linker is constructed with ``resolve_abbreviations=True``; it is not
    added to any pipeline here.
    """
    return EntityLinker(resolve_abbreviations=True, name="umls")