def __init__(
    self,
    nlp: Language = None,
    name: str = "scispacy_linker",
    candidate_generator: CandidateGenerator = None,
    resolve_abbreviations: bool = True,
    k: int = 30,
    threshold: float = 0.7,
    no_definition_threshold: float = 0.95,
    filter_for_definitions: bool = True,
    max_entities_per_mention: int = 5,
    linker_name: str = None,
):
    """Initialise the linker pipeline component.

    Registers the span extensions the linker writes to, builds (or adopts)
    a candidate generator, and stores the linking hyperparameters on the
    instance for use at call time.
    """
    # TODO(Mark): Remove in scispacy v1.0.
    Span.set_extension("umls_ents", default=[], force=True)
    Span.set_extension("kb_ents", default=[], force=True)

    # Fall back to a generator for the named knowledge base when the
    # caller did not supply one.
    if not candidate_generator:
        candidate_generator = CandidateGenerator(name=linker_name)
    self.candidate_generator = candidate_generator

    # Linking hyperparameters.
    self.resolve_abbreviations = resolve_abbreviations
    self.k = k
    self.threshold = threshold
    self.no_definition_threshold = no_definition_threshold
    self.filter_for_definitions = filter_for_definitions
    self.max_entities_per_mention = max_entities_per_mention

    # The knowledge base is owned by the candidate generator.
    self.kb = self.candidate_generator.kb
    # TODO(Mark): Remove in scispacy v1.0. This is for backward compatability only.
    self.umls = self.kb
def test_candidate_generation(self):
    """Candidate generation returns ranked concept ids, exact-match
    candidates with similarity 1.0, and empty lists for zero vectors."""
    fixture_kb = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
    with tempfile.TemporaryDirectory() as tmp_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(tmp_dir, fixture_kb)
        generator = CandidateGenerator(index, vectorizer, aliases, fixture_kb)

        hits = generator(['(131)I-Macroaggregated Albumin'], 10)
        found_ids = [candidate.concept_id for candidate in hits[0]]
        assert found_ids == ['C0000005', 'C0000102', 'C0000084']

        # The mention was an exact match, so should have a distance of zero to a concept:
        expected = MentionCandidate(
            concept_id='C0000005',
            aliases=['(131)I-Macroaggregated Albumin'],
            similarities=[1.0],
        )
        assert hits[0][0] == expected

        # Test we don't crash with zero vectors
        assert generator(['ZZZZ'], 10) == [[]]
def test_empty_list(self):
    """An empty batch of mentions yields an empty list of results."""
    fixture_kb = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
    with tempfile.TemporaryDirectory() as tmp_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(tmp_dir, fixture_kb)
        generator = CandidateGenerator(index, vectorizer, aliases, fixture_kb)
        assert generator([], 10) == []
def setUp(self):
    """Build a spaCy pipeline and a UMLS entity linker over test fixtures."""
    super().setUp()
    self.nlp = spacy.load("en_core_web_sm")
    fixture_kb = UmlsKnowledgeBase(
        "tests/fixtures/umls_test_fixture.json",
        "tests/fixtures/test_umls_tree.tsv",
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(tmp_dir, fixture_kb)
        generator = CandidateGenerator(index, vectorizer, aliases, fixture_kb)
        # Definition filtering is disabled so the small fixture KB still links.
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
def __init__(self,
             candidate_generator: CandidateGenerator = None,
             resolve_abbreviations: bool = True,
             k: int = 30,
             threshold: float = 0.7,
             filter_for_definitions: bool = True,
             max_entities_per_mention: int = 5):
    """Initialise the legacy UMLS linker: register the span extension,
    adopt or build a candidate generator, and store hyperparameters."""
    Span.set_extension("umls_ents", default=[], force=True)

    # Use the supplied generator, or construct a default one.
    if not candidate_generator:
        candidate_generator = CandidateGenerator()
    self.candidate_generator = candidate_generator

    self.resolve_abbreviations = resolve_abbreviations
    self.k = k
    self.threshold = threshold
    self.filter_for_definitions = filter_for_definitions
    self.max_entities_per_mention = max_entities_per_mention

    # The UMLS knowledge base comes from the candidate generator.
    self.umls = self.candidate_generator.umls
def init_umls_nlp_linker(base_dir: str = ''):
    """Build a UMLS entity linker from serialized artifacts and add it to
    the (module-level) `nlp` pipeline.

    Parameters
    ----------
    base_dir : str, default ''
        Directory/prefix where the artifact files live. Defaults to the
        current directory for backward compatibility.

    Returns
    -------
    The constructed ``UmlsEntityLinker``, after it has been added to ``nlp``.
    """
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    # NOTE(review): cached_path is applied to the vectorizer/aliases paths
    # below but not to these two — confirm this asymmetry is intentional.
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))
    # Close the file handle deterministically instead of leaking it.
    with open(cached_path(base_dir + 'concept_aliases.json')) as aliases_file:
        ann_concept = json.load(aliases_file)
    umlsknowlegebase = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umlsknowlegebase)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    # NOTE(review): `nlp` is not defined in this function — presumably a
    # module-level spaCy pipeline; verify it exists before this is called.
    nlp.add_pipe(linker)
    return linker
import os
from multiprocessing import Pool
import multiprocessing as multi
import pickle
import scispacy
from scispacy.linking import EntityLinker
from spacy.symbols import ORTH
import time
import re
from spacy.language import Language
import pdb
import copy
from tqdm import tqdm
from scispacy.candidate_generation import CandidateGenerator

# Module-level MeSH candidate generator shared by the functions below.
# NOTE(review): "Genrator" is a typo kept for compatibility — other code in
# this file may reference this name.
MeshCandidateGenrator = CandidateGenerator(name='mesh')
# The knowledge base backing the generator.
KB = MeshCandidateGenrator.kb
# Linking hyperparameters used by this script.
K = 100  # number of nearest-neighbour candidates requested per mention
Resolve_abbreviations = True
Threshold = 0.3
No_definition_threshold = 0.95
Filter_for_definitions = True
Max_entities_per_mention = 30


def candidate_dui_generator(mention_strings):
    """For each mention string, generate MeSH candidates and score them.

    NOTE(review): this function appears truncated in the visible chunk —
    the loop body continues past this excerpt; the sorting/return logic
    presumably follows.
    """
    # Batch query: one candidate list per mention string, K candidates each.
    batch_candidates = MeshCandidateGenrator(mention_strings, K)
    batched_sorted_candidates = list()
    for candidates in batch_candidates:
        predicted = []
        for cand in candidates:
            # Best similarity across this candidate's matched aliases.
            score = max(cand.similarities)