def test_candidate_generation(self):
        """
        Check the candidates returned for a mention that exactly matches an
        alias in the fixture, and that a mention with a zero tfidf vector
        yields an empty candidate list instead of crashing.
        """
        knowledge_base = UmlsKnowledgeBase(
            "tests/fixtures/umls_test_fixture.json")
        # The index files are written to a throwaway directory; only the
        # objects returned by the builder are used after it is removed.
        with tempfile.TemporaryDirectory() as scratch_dir:
            index_parts = create_tfidf_ann_index(scratch_dir, knowledge_base)
        aliases, vectorizer, index = index_parts

        generator = CandidateGenerator(index, vectorizer, aliases,
                                       knowledge_base)

        exact_mention = '(131)I-Macroaggregated Albumin'
        results = generator([exact_mention], 10)
        top_candidates = results[0]

        found_ids = [candidate.concept_id for candidate in top_candidates]
        assert found_ids == ['C0000005', 'C0000102', 'C0000084']

        # The mention is an exact match for an alias, so the best candidate
        # should carry that alias with a similarity of 1.0.
        best_candidate = top_candidates[0]
        assert best_candidate == MentionCandidate(
            concept_id='C0000005',
            aliases=[exact_mention],
            similarities=[1.0])

        # Test we don't crash with zero vectors
        zero_vector_results = generator(['ZZZZ'], 10)
        assert zero_vector_results == [[]]
Esempio n. 2
0
    def test_empty_list(self):
        """An empty list of mentions should produce an empty list of results."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as scratch_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)

        no_mentions = []
        assert generator(no_mentions, 10) == []
Esempio n. 3
0
    def setUp(self):
        """Load the `en_core_web_sm` pipeline and build a linker over the UMLS test fixtures."""
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        concepts_fixture = "tests/fixtures/umls_test_fixture.json"
        type_tree_fixture = "tests/fixtures/test_umls_tree.tsv"
        knowledge_base = UmlsKnowledgeBase(concepts_fixture, type_tree_fixture)

        # Index files go to a throwaway directory; the returned objects are
        # what the candidate generator is built from.
        with tempfile.TemporaryDirectory() as scratch_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

        generator = CandidateGenerator(index, vectorizer, aliases, knowledge_base)
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
Esempio n. 4
0
    def test_create_index(self):
        """Building the index on the fixture gives 93 entries and the expected tfidf settings."""
        knowledge_base = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json")
        with tempfile.TemporaryDirectory() as scratch_dir:
            aliases, vectorizer, index = create_tfidf_ann_index(scratch_dir, knowledge_base)

        # Number of deduplicated aliases + canonical ids
        expected_size = 93
        assert len(aliases) == expected_size
        assert len(index) == expected_size

        actual_params = vectorizer.get_params()
        expected_params = {
            "analyzer": "char_wb",
            "min_df": 10,
            "ngram_range": (3, 3),
        }
        for param_name, expected_value in expected_params.items():
            assert actual_params[param_name] == expected_value
Esempio n. 5
0
    def __init__(self,
                 ann_index: FloatIndex = None,
                 tfidf_vectorizer: TfidfVectorizer = None,
                 ann_concept_aliases_list: List[str] = None,
                 umls: UmlsKnowledgeBase = None,
                 verbose: bool = False,
                 ef_search: int = 200) -> None:
        """
        Store the components used for candidate generation, loading a default
        for every component that is not supplied (or is falsy).

        Parameters
        ----------
        ann_index : FloatIndex, optional.
            Nearest neighbours index. When omitted, one is obtained from
            `load_approximate_nearest_neighbours_index(ef_search=ef_search)`.
        tfidf_vectorizer : TfidfVectorizer, optional.
            When omitted, loaded with joblib from the cached
            `DEFAULT_PATHS["tfidf_vectorizer"]` file.
        ann_concept_aliases_list : List[str], optional.
            When omitted, read as JSON from the cached
            `DEFAULT_PATHS["concept_aliases_list"]` file.
        umls : UmlsKnowledgeBase, optional.
            When omitted, a default `UmlsKnowledgeBase()` is constructed.
        verbose : bool, optional (default = False).
            Stored on the instance as `self.verbose`.
        ef_search : int, optional (default = 200).
            Only forwarded to the index loader when `ann_index` is omitted.
        """
        self.ann_index = ann_index or load_approximate_nearest_neighbours_index(ef_search=ef_search)

        self.vectorizer = tfidf_vectorizer or joblib.load(cached_path(DEFAULT_PATHS["tfidf_vectorizer"]))

        if ann_concept_aliases_list:
            self.ann_concept_aliases_list = ann_concept_aliases_list
        else:
            # Open the alias file in a context manager so the handle is closed
            # as soon as the JSON has been parsed.
            with open(cached_path(DEFAULT_PATHS["concept_aliases_list"])) as aliases_file:
                self.ann_concept_aliases_list = json.load(aliases_file)

        self.umls = umls or UmlsKnowledgeBase()
        self.verbose = verbose
Esempio n. 6
0
def init_umls_nlp_linker(base_dir: str = ''):
    """
    Build a `UmlsEntityLinker` from locally stored artefacts and add it to the
    module-level `nlp` pipeline.

    NOTE(review): `nlp` is not defined in this function; it is expected to be a
    pipeline object available at module level - confirm against the caller.

    Parameters
    ----------
    base_dir : str, optional (default = '').
        Directory containing the artefact files. The default of '' resolves
        every file name relative to the current working directory.

    Returns
    -------
    The `UmlsEntityLinker` that was added to `nlp`.
    """
    import os  # local import so this block does not depend on the file's import section

    def artefact(file_name):
        # os.path.join('', name) == name, so the default reproduces bare file names.
        return os.path.join(base_dir, file_name)

    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=artefact('tfidf_vectors_sparse.npz'),
        ann_index_path=artefact('nmslib_index.bin'))
    vectorizer = joblib.load(cached_path(artefact('tfidf_vectorizer.joblib')))

    # Use a context manager so the alias file handle is closed after parsing.
    with open(cached_path(artefact('concept_aliases.json'))) as aliases_file:
        concept_aliases = json.load(aliases_file)

    knowledge_base = UmlsKnowledgeBase(
        file_path=artefact('umls_2017_aa_cat0129.json'),
        types_file_path=artefact('umls_semantic_type_tree.tsv'))
    candidate_generator = CandidateGenerator(
        ann_index=ann_index,
        tfidf_vectorizer=vectorizer,
        ann_concept_aliases_list=concept_aliases,
        umls=knowledge_base)
    linker = UmlsEntityLinker(candidate_generator=candidate_generator,
                              max_entities_per_mention=1)
    nlp.add_pipe(linker)
    return linker
Esempio n. 7
0
def create_tfidf_ann_index(
    out_path: str,
    umls: UmlsKnowledgeBase = None
) -> Tuple[List[str], TfidfVectorizer, FloatIndex]:
    """
    Build tfidf vectorizer and ann index.

    Warning: Running this function on the whole of UMLS requires ~ 200GB of RAM ...
    TODO: Make this not take 200GB of RAM.

    Parameters
    ----------
    out_path: str, required.
        The path where the various model pieces will be saved.
    umls : UmlsKnowledgeBase, optional.
        The umls kb items to generate the index and vectors for.

    Returns
    -------
    A tuple of:
        - the list of aliases that have a non-empty tfidf vector (the entries
          that were added to the index, in the same order),
        - the fitted tfidf vectorizer,
        - the nmslib index built over the tfidf vectors of those aliases.
    """
    tfidf_vectorizer_path = f'{out_path}/tfidf_vectorizer.joblib'
    ann_index_path = f'{out_path}/nmslib_index.bin'
    tfidf_vectors_path = f'{out_path}/tfidf_vectors_sparse.npz'
    uml_concept_aliases_path = f'{out_path}/concept_aliases.json'

    umls = umls or UmlsKnowledgeBase()

    # nmslib hyperparameters (very important)
    # guide: https://github.com/nmslib/nmslib/blob/master/python_bindings/parameters.md
    # Default values resulted in very low recall.

    # set to the maximum recommended value. Improves recall at the expense of longer indexing time.
    # TODO: This variable name is so hot because I don't actually know what this parameter does.
    m_parameter = 100
    # `C` for Construction. Set to the maximum recommended value
    # Improves recall at the expense of longer indexing time
    construction = 2000
    num_threads = 60  # set based on the machine
    index_params = {
        'M': m_parameter,
        'indexThreadQty': num_threads,
        'efConstruction': construction,
        'post': 0
    }

    print(
        f'No tfidf vectorizer on {tfidf_vectorizer_path} or ann index on {ann_index_path}'
    )
    # A numpy array (rather than a list) so it can be filtered with a boolean mask below.
    umls_concept_aliases = numpy.array(list(umls.alias_to_cuis.keys()))

    # NOTE: here we are creating the tf-idf vectorizer with float32 type, but we can serialize the
    # resulting vectors using float16, meaning they take up half the memory on disk. Unfortunately
    # we can't use the float16 format to actually run the vectorizer, because of this bug in sparse
    # matrix representations in scipy: https://github.com/scipy/scipy/issues/7408
    print(f'Fitting tfidf vectorizer on {len(umls_concept_aliases)} aliases')
    tfidf_vectorizer = TfidfVectorizer(analyzer='char_wb',
                                       ngram_range=(3, 3),
                                       min_df=10,
                                       dtype=numpy.float32)
    start_time = datetime.datetime.now()
    uml_concept_alias_tfidfs = tfidf_vectorizer.fit_transform(
        umls_concept_aliases)
    print(f'Saving tfidf vectorizer to {tfidf_vectorizer_path}')
    joblib.dump(tfidf_vectorizer, tfidf_vectorizer_path)
    end_time = datetime.datetime.now()
    total_time = (end_time - start_time)
    print(
        f'Fitting and saving vectorizer took {total_time.total_seconds()} seconds'
    )

    print('Finding empty (all zeros) tfidf vectors')
    # True for rows whose tfidf entries sum to something other than zero,
    # i.e. the aliases that are kept.
    has_nonzero_tfidf = numpy.array(
        uml_concept_alias_tfidfs.sum(axis=1) != 0).reshape(-1, )
    deleted_aliases = umls_concept_aliases[~has_nonzero_tfidf]
    number_of_empty_tfidfs = len(deleted_aliases)
    total_number_of_tfidfs = uml_concept_alias_tfidfs.shape[0]

    print(
        f'Deleting {number_of_empty_tfidfs}/{total_number_of_tfidfs} aliases because their tfidf is empty'
    )
    # remove empty tfidf vectors, otherwise nmslib will crash
    umls_concept_aliases = umls_concept_aliases[has_nonzero_tfidf].tolist()
    uml_concept_alias_tfidfs = uml_concept_alias_tfidfs[has_nonzero_tfidf]
    print(deleted_aliases)

    print(
        f'Saving list of concept ids and tfidfs vectors to {uml_concept_aliases_path} and {tfidf_vectors_path}'
    )
    # Write through a context manager so the alias file is flushed and closed
    # before the function moves on.
    with open(uml_concept_aliases_path, "w") as aliases_file:
        json.dump(umls_concept_aliases, aliases_file)
    scipy.sparse.save_npz(tfidf_vectors_path,
                          uml_concept_alias_tfidfs.astype(numpy.float16))

    print(
        f'Fitting ann index on {len(umls_concept_aliases)} aliases (takes 2 hours)'
    )
    start_time = datetime.datetime.now()
    ann_index = nmslib.init(method='hnsw',
                            space='cosinesimil_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)
    ann_index.addDataPointBatch(uml_concept_alias_tfidfs)
    ann_index.createIndex(index_params, print_progress=True)
    ann_index.saveIndex(ann_index_path)
    end_time = datetime.datetime.now()
    elapsed_time = end_time - start_time
    print(f'Fitting ann index took {elapsed_time.total_seconds()} seconds')

    return umls_concept_aliases, tfidf_vectorizer, ann_index
Esempio n. 8
0
# spaCy pipeline used by this script; the commented line is an alternative model.
sci_nlp = spacy.load('en_core_sci_md')
# sci_nlp = spacy.load('en_core_sci_lg')

# Set of semantic type ids (presumably the "ST21pv" subset, going by the
# name - confirm against the data source).
st21pv_set = {
    'T005', 'T007', 'T017', 'T022', 'T031', 'T033', 'T037',
    'T038', 'T058', 'T062', 'T074', 'T082', 'T091', 'T092',
    'T097', 'T098', 'T103', 'T168', 'T170', 'T201', 'T204',
}


def cui2ent(cui):
    """Return the entity stored under `cui` in `umls_kb.cui_to_entity`, or None if the key is missing."""
    try:
        entity = umls_kb.cui_to_entity[cui]
    except KeyError:
        entity = None
    return entity


@lru_cache(maxsize=None)
def cui2st(cui):
    """
    Return the type of the entity for `cui` whose node in `umls_tree` has the
    greatest `level`, or None when `cui2ent` finds no entity.

    Results are memoised per `cui` with no size limit.
    """
    entity = cui2ent(cui)
    if entity is None:
        return None

    def node_level(type_id):
        return umls_tree.get_node_from_id(type_id).level

    return max(entity.types, key=node_level)


# Knowledge base built with its default arguments, plus the set of every CUI it contains.
umls_kb = UmlsKnowledgeBase()
all_cuis = set(umls_kb.cui_to_entity)