def test_candidate_generation(self):
    """Candidates for an exact-match mention are ranked as expected; an all-zero query yields none."""
    knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
    with tempfile.TemporaryDirectory() as scratch_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

    generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)

    mention = '(131)I-Macroaggregated Albumin'
    results = generator([mention], 10)
    canonical_ids = [candidate.concept_id for candidate in results[0]]
    assert canonical_ids == ['C0000005', 'C0000102', 'C0000084']

    # The mention is an exact alias match, so the top candidate must carry
    # that alias with a similarity of 1.0.
    expected_top = MentionCandidate(concept_id='C0000005',
                                    aliases=[mention],
                                    similarities=[1.0])
    assert results[0][0] == expected_top

    # A mention that vectorizes to all zeros must not crash the generator.
    results = generator(['ZZZZ'], 10)
    assert results == [[]]
def test_empty_list(self):
    """An empty batch of mentions produces an empty list of results."""
    knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
    with tempfile.TemporaryDirectory() as scratch_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

    generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)
    no_mentions = []
    results = generator(no_mentions, 10)
    assert results == []
def setUp(self):
    """Load the small English spaCy model and build an entity linker over the UMLS test fixture."""
    super().setUp()
    self.nlp = spacy.load("en_core_web_sm")

    knowledge_base = UmlsKnowledgeBase(
        "tests/fixtures/umls_test_fixture.json",
        "tests/fixtures/test_umls_tree.tsv",
    )
    # The index files are only needed while the index is being built; the
    # returned objects are used after the temporary directory is gone.
    with tempfile.TemporaryDirectory() as scratch_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

    generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)
    self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
def test_create_index(self):
    """The fixture index has the expected size and the vectorizer the expected settings."""
    knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
    with tempfile.TemporaryDirectory() as scratch_dir:
        aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

    # Number of deduplicated aliases + canonical ids
    expected_size = 93
    assert len(aliases) == expected_size
    assert len(index) == expected_size

    params = vectorizer.get_params()
    expected_params = {
        "analyzer": "char_wb",
        "min_df": 10,
        "ngram_range": (3, 3),
    }
    for name, value in expected_params.items():
        assert params[name] == value
def __init__(self,
             ann_index: FloatIndex = None,
             tfidf_vectorizer: TfidfVectorizer = None,
             ann_concept_aliases_list: List[str] = None,
             umls: UmlsKnowledgeBase = None,
             verbose: bool = False,
             ef_search: int = 200) -> None:
    """
    Store the components used for candidate generation, loading a default
    for every component that is not supplied.

    Parameters
    ----------
    ann_index : FloatIndex, optional.
        Nearest-neighbour index. When None, it is loaded with
        ``load_approximate_nearest_neighbours_index(ef_search=ef_search)``.
    tfidf_vectorizer : TfidfVectorizer, optional.
        When None, loaded with joblib from ``DEFAULT_PATHS["tfidf_vectorizer"]``.
    ann_concept_aliases_list : List[str], optional.
        When None, read as JSON from ``DEFAULT_PATHS["concept_aliases_list"]``.
    umls : UmlsKnowledgeBase, optional.
        When None, a default ``UmlsKnowledgeBase()`` is constructed.
    verbose : bool, optional (default = False).
        Stored on the instance as ``self.verbose``.
    ef_search : int, optional (default = 200).
        Only used when the default index has to be loaded.
    """
    # Defaults are chosen with ``is None`` rather than ``or`` so that a
    # supplied-but-falsy argument (e.g. an empty alias list or an empty
    # index) is respected instead of being silently replaced by the default.
    if ann_index is None:
        ann_index = load_approximate_nearest_neighbours_index(ef_search=ef_search)
    self.ann_index = ann_index

    if tfidf_vectorizer is None:
        tfidf_vectorizer = joblib.load(cached_path(DEFAULT_PATHS["tfidf_vectorizer"]))
    self.vectorizer = tfidf_vectorizer

    if ann_concept_aliases_list is None:
        # Use a context manager so the file handle is closed deterministically.
        with open(cached_path(DEFAULT_PATHS["concept_aliases_list"])) as aliases_file:
            ann_concept_aliases_list = json.load(aliases_file)
    self.ann_concept_aliases_list = ann_concept_aliases_list

    if umls is None:
        umls = UmlsKnowledgeBase()
    self.umls = umls

    self.verbose = verbose
def init_umls_nlp_linker():
    """
    Build a UMLS entity linker from local artefact files and add it to the
    ``nlp`` pipeline that is in scope where this function is defined.

    All artefact paths are formed by prefixing ``base_dir`` (currently the
    empty string, so the bare file names are used) to fixed file names.

    Returns
    -------
    The ``UmlsEntityLinker`` that was added to ``nlp``.
    """
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'

    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))

    # Read the alias list through a context manager so the file handle is
    # closed as soon as the JSON has been parsed.
    with open(cached_path(base_dir + 'concept_aliases.json')) as aliases_file:
        ann_concept = json.load(aliases_file)

    umls_knowledge_base = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')

    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umls_knowledge_base)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    nlp.add_pipe(linker)
    return linker
def create_tfidf_ann_index(
        out_path: str,
        umls: UmlsKnowledgeBase = None
) -> Tuple[List[str], TfidfVectorizer, FloatIndex]:
    """
    Build tfidf vectorizer and ann index.

    Warning: Running this function on the whole of UMLS requires ~ 200GB of RAM ...
    TODO: Make this not take 200GB of RAM.

    Four files are written under ``out_path``: ``tfidf_vectorizer.joblib``,
    ``nmslib_index.bin``, ``tfidf_vectors_sparse.npz`` and
    ``concept_aliases.json``.

    Parameters
    ----------
    out_path: str, required.
        The path where the various model pieces will be saved.
    umls : UmlsKnowledgeBase, optional.
        The umls kb items to generate the index and vectors for.
        When None, a default ``UmlsKnowledgeBase()`` is constructed.

    Returns
    -------
    A tuple of (aliases kept in the index, fitted tfidf vectorizer, ann index).
    Aliases whose tfidf vector is all zeros are dropped before indexing, so
    the returned alias list lines up with the rows added to the index.
    """
    tfidf_vectorizer_path = f'{out_path}/tfidf_vectorizer.joblib'
    ann_index_path = f'{out_path}/nmslib_index.bin'
    tfidf_vectors_path = f'{out_path}/tfidf_vectors_sparse.npz'
    uml_concept_aliases_path = f'{out_path}/concept_aliases.json'

    if umls is None:
        umls = UmlsKnowledgeBase()

    # nmslib hyperparameters (very important)
    # guide: https://github.com/nmslib/nmslib/blob/master/python_bindings/parameters.md
    # Default values resulted in very low recall.

    # `M` is set to the maximum recommended value. Improves recall at the
    # expense of longer indexing time.
    # NOTE(review): per the nmslib guide M bounds the number of graph
    # neighbours kept per point - confirm against the guide linked above.
    m_parameter = 100
    # `C` for Construction. Set to the maximum recommended value
    # Improves recall at the expense of longer indexing time
    construction = 2000
    num_threads = 60  # set based on the machine
    index_params = {
        'M': m_parameter,
        'indexThreadQty': num_threads,
        'efConstruction': construction,
        'post': 0,
    }

    print(
        f'No tfidf vectorizer on {tfidf_vectorizer_path} or ann index on {ann_index_path}'
    )
    umls_concept_aliases = numpy.array(list(umls.alias_to_cuis.keys()))

    # NOTE: here we are creating the tf-idf vectorizer with float32 type, but we can serialize the
    # resulting vectors using float16, meaning they take up half the memory on disk. Unfortunately
    # we can't use the float16 format to actually run the vectorizer, because of this bug in sparse
    # matrix representations in scipy: https://github.com/scipy/scipy/issues/7408
    print(f'Fitting tfidf vectorizer on {len(umls_concept_aliases)} aliases')
    tfidf_vectorizer = TfidfVectorizer(analyzer='char_wb',
                                       ngram_range=(3, 3),
                                       min_df=10,
                                       dtype=numpy.float32)
    start_time = datetime.datetime.now()
    uml_concept_alias_tfidfs = tfidf_vectorizer.fit_transform(umls_concept_aliases)
    print(f'Saving tfidf vectorizer to {tfidf_vectorizer_path}')
    joblib.dump(tfidf_vectorizer, tfidf_vectorizer_path)
    end_time = datetime.datetime.now()
    total_time = end_time - start_time
    print(
        f'Fitting and saving vectorizer took {total_time.total_seconds()} seconds'
    )

    print('Finding empty (all zeros) tfidf vectors')
    # True for rows that have at least one non-zero tfidf weight.
    non_empty_mask = numpy.array(
        uml_concept_alias_tfidfs.sum(axis=1) != 0).reshape(-1, )

    deleted_aliases = umls_concept_aliases[~non_empty_mask]
    number_of_empty_tfidfs = len(deleted_aliases)
    total_number_of_tfidfs = uml_concept_alias_tfidfs.shape[0]

    print(
        f'Deleting {number_of_empty_tfidfs}/{total_number_of_tfidfs} aliases because their tfidf is empty'
    )
    # remove empty tfidf vectors, otherwise nmslib will crash
    umls_concept_aliases = umls_concept_aliases[non_empty_mask].tolist()
    uml_concept_alias_tfidfs = uml_concept_alias_tfidfs[non_empty_mask]
    print(deleted_aliases)

    print(
        f'Saving list of concept ids and tfidfs vectors to {uml_concept_aliases_path} and {tfidf_vectors_path}'
    )
    # Close the file explicitly so the alias list is fully flushed to disk
    # before this function returns.
    with open(uml_concept_aliases_path, "w") as aliases_file:
        json.dump(umls_concept_aliases, aliases_file)
    scipy.sparse.save_npz(tfidf_vectors_path,
                          uml_concept_alias_tfidfs.astype(numpy.float16))

    print(
        f'Fitting ann index on {len(umls_concept_aliases)} aliases (takes 2 hours)'
    )
    start_time = datetime.datetime.now()
    ann_index = nmslib.init(method='hnsw',
                            space='cosinesimil_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)
    ann_index.addDataPointBatch(uml_concept_alias_tfidfs)
    ann_index.createIndex(index_params, print_progress=True)
    ann_index.saveIndex(ann_index_path)
    end_time = datetime.datetime.now()
    elapsed_time = end_time - start_time
    print(f'Fitting ann index took {elapsed_time.total_seconds()} seconds')

    return umls_concept_aliases, tfidf_vectorizer, ann_index
# The larger 'en_core_sci_lg' model can be loaded here instead.
sci_nlp = spacy.load('en_core_sci_md')

# Semantic type ids (TUIs) collected for filtering.
st21pv_set = {
    'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037',
    'T038', 'T058', 'T062', 'T074', 'T082', 'T091', 'T092',
    'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204',
}


def cui2ent(cui):
    """Return the knowledge-base entity for ``cui``, or None if the CUI is unknown."""
    try:
        entity = umls_kb.cui_to_entity[cui]
    except KeyError:
        entity = None
    return entity


@lru_cache(maxsize=None)
def cui2st(cui):
    """Return the type of ``cui`` whose node has the highest ``level`` in ``umls_tree``.

    Returns None when the CUI is not in the knowledge base. Results are
    memoized per CUI.
    """
    entity = cui2ent(cui)
    if entity is None:
        return None

    def node_level(type_id):
        return umls_tree.get_node_from_id(type_id).level

    return max(entity.types, key=node_level)


umls_kb = UmlsKnowledgeBase()
all_cuis = set(umls_kb.cui_to_entity.keys())