Example #1
0
def q_umls_d_wiki() -> None:
    """Retrieve TF-IDF nearest-neighbour CUI candidates for every mention.

    Steps:
      1. Load the train, test and dev mention docs and reduce each mention's
         ``"text"`` to its unique whitespace tokens minus ``en_stops``.
      2. Load the MRCONSO table and, for every entry that has ``"ENG"``
         aliases, join its unique aliases (minus ``en_stops``) into one string.
      3. Fit a character n-gram TF-IDF vectorizer on the alias strings and
         project the mention strings into the same space.
      4. Look up the 64 nearest alias vectors (cosine metric) per mention.

    Side effects:
      * ``ns_balltree.pkl`` -- pickled ``(neighbour_indices, cuis, mention_ids)``.
      * ``mm_tfidf_candidates.json`` -- one JSON object per line with the keys
        ``"mention_id"`` and ``"tfidf_candidates"`` (candidate ids, in the
        order returned by the neighbour search).

    Returns:
      None.
    """
    # Splits are merged in this order, so on a duplicate mention id the later
    # split wins (train < test < dev).
    mentions = {}
    for split in ("train", "test", "dev"):
        for mention_id, doc in get_mention_docs(split).items():
            # Going through a set drops duplicate tokens and the original
            # token order before the text is re-joined.
            mentions[mention_id] = ' '.join(set(doc["text"].split()) - en_stops)

    mrconso = get_mrconso()
    # Entries without English aliases are skipped entirely.
    aliases = {
        cui: " ".join(set(entry["alias"]["ENG"]) - en_stops)
        for cui, entry in mrconso.items()
        if "ENG" in entry["alias"]
    }

    # Sorted id lists fix the row order of the two matrices below, so row i of
    # a matrix corresponds to index i of the matching id list.
    mention_ids = sorted(mentions)
    cuis = sorted(aliases)

    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5), max_features=100000)
    print(vectorizer)
    # The vocabulary / IDF weights are learned from the aliases only; mentions
    # are merely transformed with the fitted vectorizer.
    X_cui = vectorizer.fit_transform([aliases[cui] for cui in cuis])
    X_mention = vectorizer.transform([mentions[mention_id] for mention_id in mention_ids])
    print(X_cui.shape, X_mention.shape)

    # NOTE(review): the pickle below is named "balltree", but with
    # algorithm='auto' on sparse TF-IDF input the search is presumably
    # brute-force (leaf_size would then be unused) -- confirm against the
    # NearestNeighbors documentation.
    nn_index = NN(n_neighbors=64, algorithm='auto', metric='cosine', leaf_size=64, n_jobs=10)
    print("fitting nn...")
    nn_index.fit(X_cui)
    print("finding nbrs...")
    neighbor_idx = nn_index.kneighbors(X_mention, return_distance=False)

    # Keep the raw index matrix together with both id lists so the indices can
    # be mapped back to ids without re-running the search.
    with open('ns_balltree.pkl', 'wb') as fout:
        pickle.dump((neighbor_idx, cuis, mention_ids), fout)

    # JSON-lines output: one record per mention, candidates as ids rather than
    # row indices.
    with open('mm_tfidf_candidates.json', 'w') as fout:
        for mention_id, row in zip(mention_ids, neighbor_idx):
            candidates = [cuis[idx] for idx in row]
            fout.write(json.dumps({"mention_id" : mention_id, "tfidf_candidates": candidates}))
            fout.write('\n')