def q_umls_d_wiki():
    """Generate TF-IDF nearest-neighbor candidate CUIs for UMLS mentions.

    Pipeline:
      1. Load mention docs for the train/test/dev splits and strip English
         stopwords from each mention's text.
      2. Load MRCONSO and build a stopword-stripped English alias string
         per CUI (CUIs without English aliases are skipped).
      3. Fit a char_wb (1-5)-gram TF-IDF vectorizer on the CUI alias
         strings and project the mention texts into the same space.
      4. Find the 64 nearest CUIs per mention by cosine distance.

    Side effects:
      - writes ``ns_balltree.pkl``: pickled ``(neighbor_indices, cuis,
        mention_ids)`` tuple;
      - writes ``mm_tfidf_candidates.json``: one JSON object per line with
        ``mention_id`` and its ``tfidf_candidates`` CUI list;
      - prints progress to stdout.
    """
    # Merge all three splits into one mention-id -> cleaned-text map.
    # Update order (train, test, dev) matters only if ids collide across
    # splits; it matches the original behavior (dev wins last).
    mentions = {}
    for split in ("train", "test", "dev"):
        split_docs = get_mention_docs(split)
        mentions.update(
            {mid: ' '.join(set(doc["text"].split()) - en_stops)
             for mid, doc in split_docs.items()}
        )

    # One stopword-stripped English alias string per CUI.
    mrconso = get_mrconso()
    aliases = {cui: " ".join(set(rec["alias"]["ENG"]) - en_stops)
               for cui, rec in mrconso.items() if "ENG" in rec["alias"]}

    # Sorted orderings make matrix row indices map back to ids
    # deterministically across runs.
    mention_ids = sorted(mentions)
    cuis = sorted(aliases)

    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5),
                                 max_features=100000)
    print(vectorizer)
    X_cui = vectorizer.fit_transform([aliases[cid] for cid in cuis])
    X_mention = vectorizer.transform([mentions[mid] for mid in mention_ids])
    print(X_cui.shape, X_mention.shape)

    # Index CUI vectors, then query each mention for its 64 nearest CUIs.
    # (Renamed from `nbrs`, which the original reused for both the model
    # and the per-mention candidate list.)
    nn_index = NN(n_neighbors=64, algorithm='auto', metric='cosine',
                  leaf_size=64, n_jobs=10)
    print("fitting nn...")
    nn_index.fit(X_cui)
    print("finding nbrs...")
    ns = nn_index.kneighbors(X_mention, return_distance=False)

    with open('ns_balltree.pkl', 'wb') as fout:
        pickle.dump((ns, cuis, mention_ids), fout)

    # JSONL output: one candidate record per mention, rows of `ns` are
    # aligned with `mention_ids` by construction.
    with open('mm_tfidf_candidates.json', 'w') as fout:
        for row, mention_id in enumerate(mention_ids):
            candidates = [cuis[idx] for idx in ns[row]]
            fout.write(json.dumps({"mention_id": mention_id,
                                   "tfidf_candidates": candidates}))
            fout.write('\n')