コード例 #1
0
ファイル: linking.py プロジェクト: swipswaps/scispacy
    def __init__(
        self,
        nlp: Language = None,
        name: str = "scispacy_linker",
        candidate_generator: CandidateGenerator = None,
        resolve_abbreviations: bool = True,
        k: int = 30,
        threshold: float = 0.7,
        no_definition_threshold: float = 0.95,
        filter_for_definitions: bool = True,
        max_entities_per_mention: int = 5,
        linker_name: str = None,
    ):
        """Register the span extensions and store the linker settings.

        `nlp` and `name` are accepted but are not read anywhere in this
        constructor. When `candidate_generator` is falsy, a new
        `CandidateGenerator(name=linker_name)` is created in its place. The
        generator's `kb` attribute is exposed as both `self.kb` and
        `self.umls`; every other argument is stored unchanged on the instance
        under its own name.
        """
        # TODO(Mark): Remove in scispacy v1.0.
        for extension_name in ("umls_ents", "kb_ents"):
            Span.set_extension(extension_name, default=[], force=True)

        # Fall back to a generator selected by `linker_name` when none is given.
        if not candidate_generator:
            candidate_generator = CandidateGenerator(name=linker_name)
        self.candidate_generator = candidate_generator
        self.kb = self.candidate_generator.kb

        # TODO(Mark): Remove in scispacy v1.0. Alias kept for backward
        # compatibility only.
        self.umls = self.kb

        # Plain settings, stored as given.
        self.resolve_abbreviations = resolve_abbreviations
        self.k = k
        self.threshold = threshold
        self.no_definition_threshold = no_definition_threshold
        self.filter_for_definitions = filter_for_definitions
        self.max_entities_per_mention = max_entities_per_mention
コード例 #2
0
    def test_candidate_generation(self):
        """Check candidate ids, the exact-match candidate, and the no-match case."""
        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as index_dir:
            index_parts = create_tfidf_ann_index(index_dir, knowledge_base)
            concept_aliases, vectorizer, index = index_parts

        generator = CandidateGenerator(
            index, vectorizer, concept_aliases, knowledge_base)

        mention = '(131)I-Macroaggregated Albumin'
        candidates = generator([mention], 10)

        found_ids = [candidate.concept_id for candidate in candidates[0]]
        assert found_ids == ['C0000005', 'C0000102', 'C0000084']

        # The mention is an exact alias of the first concept, so the top
        # candidate carries that alias with a similarity of 1.0.
        assert candidates[0][0] == MentionCandidate(
            concept_id='C0000005',
            aliases=[mention],
            similarities=[1.0])

        # A mention that yields a zero vector must not crash; it simply has no
        # candidates.
        candidates = generator(['ZZZZ'], 10)
        assert candidates == [[]]
コード例 #3
0
    def test_empty_list(self):
        """An empty list of mentions produces an empty list of results."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as index_dir:
            index_parts = create_tfidf_ann_index(index_dir, knowledge_base)
            concept_aliases, vectorizer, index = index_parts

        generator = CandidateGenerator(
            index, vectorizer, concept_aliases, knowledge_base
        )

        no_mentions = []
        assert generator(no_mentions, 10) == []
コード例 #4
0
    def setUp(self):
        """Load the spaCy pipeline and build a linker over the test fixtures."""
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json",
            "tests/fixtures/test_umls_tree.tsv",
        )
        with tempfile.TemporaryDirectory() as index_dir:
            index_parts = create_tfidf_ann_index(index_dir, knowledge_base)
            concept_aliases, vectorizer, index = index_parts

        generator = CandidateGenerator(
            index, vectorizer, concept_aliases, knowledge_base
        )

        # The linker under test is created with definition filtering disabled.
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
コード例 #5
0
    def __init__(self,
                 candidate_generator: CandidateGenerator = None,
                 resolve_abbreviations: bool = True,
                 k: int = 30,
                 threshold: float = 0.7,
                 filter_for_definitions: bool = True,
                 max_entities_per_mention: int = 5):
        """Register the `umls_ents` span extension and store the settings.

        When `candidate_generator` is falsy, a default-constructed
        `CandidateGenerator()` is used instead. The generator's `umls`
        attribute is exposed as `self.umls`; the remaining arguments are
        stored unchanged on the instance under their own names.
        """
        Span.set_extension("umls_ents", default=[], force=True)

        # Fall back to a default generator when the caller supplies none.
        if not candidate_generator:
            candidate_generator = CandidateGenerator()
        self.candidate_generator = candidate_generator
        self.umls = self.candidate_generator.umls

        # Plain settings, stored as given.
        self.resolve_abbreviations = resolve_abbreviations
        self.k = k
        self.threshold = threshold
        self.filter_for_definitions = filter_for_definitions
        self.max_entities_per_mention = max_entities_per_mention
コード例 #6
0
def init_umls_nlp_linker():
    """Build a UMLS entity linker from local artefacts and add it to `nlp`.

    Loads the approximate-nearest-neighbour index, the TF-IDF vectorizer, the
    concept alias list and the UMLS knowledge base from files located under
    `base_dir` (currently the empty string, i.e. paths relative to the working
    directory), wires them into a `CandidateGenerator`, and wraps that in a
    `UmlsEntityLinker` limited to one entity per mention.

    The linker is appended to the pipeline of `nlp`, a name this function
    does not define itself and therefore expects to exist at module level.

    Returns:
        The `UmlsEntityLinker` instance that was added to `nlp`.
    """
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))
    # Open the alias file in a context manager so the handle is closed as soon
    # as parsing finishes (it was previously left open).
    with open(cached_path(base_dir + 'concept_aliases.json')) as aliases_file:
        ann_concept = json.load(aliases_file)
    umlsknowlegebase = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umlsknowlegebase)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    nlp.add_pipe(linker)
    return linker
コード例 #7
0
import os
from multiprocessing import Pool
import multiprocessing as multi
import pickle
import scispacy
from scispacy.linking import EntityLinker
from spacy.symbols import ORTH
import time
import re
from spacy.language import Language
import pdb
import copy
from tqdm import tqdm
from scispacy.candidate_generation import CandidateGenerator

# Module-level candidate generator created with name='mesh'. It is built once
# when this module is imported and is called by candidate_dui_generator below.
MeshCandidateGenrator = CandidateGenerator(name='mesh')
# Knowledge base object exposed by the generator above.
KB=MeshCandidateGenrator.kb
# Passed as the second argument to MeshCandidateGenrator in
# candidate_dui_generator (presumably the number of neighbours to retrieve --
# TODO confirm against CandidateGenerator).
K=100
# NOTE(review): the settings below are not referenced in the visible part of
# this file. Their names mirror scispacy entity-linker options -- confirm
# where they are consumed.
Resolve_abbreviations = True
Threshold = 0.3
No_definition_threshold = 0.95
Filter_for_definitions = True
Max_entities_per_mention  = 30

def candidate_dui_generator(mention_strings):
    batch_candidates = MeshCandidateGenrator(mention_strings, K)
    batched_sorted_candidates = list()
    for candidates in batch_candidates:
        predicted = []
        for cand in candidates:
            score = max(cand.similarities)