def mcsm(embedding_list, embedding_type_list, type_list=t_list, k=40, lang_range=['ENG'], check_intersection=False):
    """Run the MCSM evaluation for every embedding in embedding_list,
    restricted to CUIs whose semantic type is in type_list."""
    if check_intersection:
        # Cache the CUI intersection of all embeddings in intersection.txt.
        if not os.path.exists("intersection.txt"):
            intersection_cui = get_intersection(embedding_list, embedding_type_list)
            with open("intersection.txt", "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")
        else:
            with open("intersection.txt", "r", encoding="utf-8") as f:
                lines = f.readlines()
            intersection_cui = [line.strip() for line in lines]

    umls = UMLS("../../umls", source_range='SNOMEDCT_US', lang_range=lang_range)

    if check_intersection:
        cui_list = [cui for cui in intersection_cui
                    if cui in umls.cui2sty and umls.cui2sty[cui] in type_list]
    else:
        cui_list = [cui for cui, sty in umls.cui2sty.items() if sty in type_list]

    opt = []
    for index, embedding in enumerate(embedding_list):
        if embedding_type_list[index].lower() == "cui":
            opt.append(mcsm_cui(embedding, umls, cui_list, type_list, k))
        if embedding_type_list[index].lower() == "word":
            opt.append(mcsm_word(embedding, umls, cui_list, type_list, k))
        if embedding_type_list[index].lower() == "bert":
            opt.append(mcsm_bert(embedding, umls, cui_list, type_list, k, summary_method="MEAN"))
            opt.append(mcsm_bert(embedding, umls, cui_list, type_list, k, summary_method="CLS"))
    return opt
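# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original code). The embedding
# paths and the semantic types below are hypothetical placeholders; only the
# mcsm() signature is taken from the function above.
if __name__ == "__main__":
    example_types = ['Disease or Syndrome', 'Pharmacologic Substance']  # hypothetical type_list
    results = mcsm(
        embedding_list=["../embeddings/cui2vec.pkl", "../models/some_bert"],  # hypothetical paths
        embedding_type_list=["cui", "bert"],
        type_list=example_types,
        k=40,
        check_intersection=True,
    )
    for scores in results:
        print(scores)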
def __init__(self, umls_folder, model_name_or_path, lang, json_save_path=None, max_lui_per_cui=8, max_length=32):
    self.umls = UMLS(umls_folder, lang_range=lang)
    self.len = len(self.umls.rel)
    self.max_lui_per_cui = max_lui_per_cui
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    self.json_save_path = json_save_path
    self.calculate_class_count()
def __init__(self, umls_folder, model_name_or_path, lang, json_save_path=None, max_lui_per_cui=8, max_length=32, negative_sampling=True, debug=False):
    self.debug = debug
    self.umls = UMLS(umls_folder, lang_range=lang, debug=self.debug)
    self.len = len(self.umls.rel)
    self.max_lui_per_cui = max_lui_per_cui
    self.max_length = max_length
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    self.json_save_path = json_save_path
    self.calculate_class_count()
    self.negative_sampling = negative_sampling
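# ---------------------------------------------------------------------------
# Minimal sketch of the tokenizer set-up used by the constructors above
# (illustrative, not part of the original code). Any Hugging Face checkpoint
# can stand in for model_name_or_path; "bert-base-cased" is just an example,
# and whether the dataset later pads/truncates exactly like this is an
# assumption.
from transformers import AutoTokenizer

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    encoded = tokenizer("myocardial infarction",
                        max_length=32,          # mirrors the max_length=32 default above
                        truncation=True,
                        padding="max_length")
    print(len(encoded["input_ids"]))  # 32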
def mrm_ccs(embedding_list, embedding_type_list, k=40, check_intersection=False):
    """Run the MRM evaluation against fine- and coarse-grained ICD-9/CCS
    groupings for every embedding in embedding_list."""
    cui_to_icd9, icd9_to_cui = get_icd9_cui_mappings()

    if check_intersection:
        # Cache the CUI intersection of all embeddings in intersection.txt.
        if not os.path.exists("intersection.txt"):
            intersection_cui = get_intersection(embedding_list, embedding_type_list)
            with open("intersection.txt", "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")
        else:
            with open("intersection.txt", "r", encoding="utf-8") as f:
                lines = f.readlines()
            intersection_cui = [line.strip() for line in lines]

    umls = UMLS("../../umls", only_load_dict=True)

    if check_intersection:
        cui_list = [cui for cui in intersection_cui if cui in cui_to_icd9]
    else:
        cui_list = list(cui_to_icd9.keys())

    icd9_list = [cui_to_icd9[cui] for cui in cui_list]
    icd9_set = set(icd9_list)

    icd9_pair = get_icd9_pairs(icd9_set)
    icd9_coarse_pair = get_coarse_icd9_pairs(icd9_set)
    icd9_to_description = get_icd9_to_description()
    #icd9_reverse_dict_pair = get_icd9_reverse_dict(icd9_pair)
    #icd9_reverse_dict_coarse_pair = get_icd9_reverse_dict(icd9_coarse_pair)
    #ipdb.set_trace()

    # Type labels: only ICD-9 codes that appear in a pair are used as centers.
    # icd9_to_check = set(icd9_pairs.keys())
    # icd9_to_check.intersection_update(set(icd9_to_idx.keys()))
    pair_center_label = []
    #pair_label = []
    coarse_pair_center_label = []
    #coarse_pair_label = []
    for cui in cui_list:
        if cui_to_icd9[cui] in icd9_pair:
            pair_center_label.append(1)
        else:
            pair_center_label.append(0)
        #pair_label.append(icd9_reverse_dict_pair[cui_to_icd9[cui]])
        if cui_to_icd9[cui] in icd9_coarse_pair:
            coarse_pair_center_label.append(1)
        else:
            coarse_pair_center_label.append(0)
        #coarse_pair_label.append(icd9_reverse_dict_coarse_pair[cui_to_icd9[cui]])

    # Generate a textual description for every CUI, preferring the ICD-9
    # description, then the ICD-9 tree, then the UMLS term string.
    description = []
    for cui in cui_list:
        if cui in cui_to_icd9 and cui_to_icd9[cui] in icd9_to_description:
            description.append(icd9_to_description[cui_to_icd9[cui]])
        elif cui in cui_to_icd9 and tree.find(cui_to_icd9[cui]):
            description.append(tree.find(cui_to_icd9[cui]).description)
        elif cui in umls.cui2str:
            description.append(list(umls.cui2str[cui])[0])
        else:
            description.append("")
            print(f"Cannot find description for {cui}")
    #ipdb.set_trace()

    opt = []
    for index, embedding in enumerate(embedding_list):
        print("*************************")
        if embedding_type_list[index].lower() == "cui":
            opt.append(mrm_ccs_cui(embedding, icd9_list, cui_list,
                                   pair_center_label, icd9_pair, k))
            opt.append(mrm_ccs_cui(embedding, icd9_list, cui_list,
                                   coarse_pair_center_label, icd9_coarse_pair, k))
        if embedding_type_list[index].lower() == "word":
            opt.append(mrm_ccs_word(embedding, icd9_list, description,
                                    pair_center_label, icd9_pair, k))
            opt.append(mrm_ccs_word(embedding, icd9_list, description,
                                    coarse_pair_center_label, icd9_coarse_pair, k))
        if embedding_type_list[index].lower() == "bert":
            opt.append(mrm_ccs_bert(embedding, icd9_list, description,
                                    pair_center_label, icd9_pair, k, summary_method="MEAN"))
            opt.append(mrm_ccs_bert(embedding, icd9_list, description,
                                    coarse_pair_center_label, icd9_coarse_pair, k, summary_method="MEAN"))
            opt.append(mrm_ccs_bert(embedding, icd9_list, description,
                                    pair_center_label, icd9_pair, k, summary_method="CLS"))
            opt.append(mrm_ccs_bert(embedding, icd9_list, description,
                                    coarse_pair_center_label, icd9_coarse_pair, k, summary_method="CLS"))
    return opt
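# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original code). The embedding
# paths below are hypothetical placeholders; only the mrm_ccs() signature is
# taken from the function above.
if __name__ == "__main__":
    results = mrm_ccs(
        embedding_list=["../embeddings/cui2vec.pkl", "../models/some_bert"],  # hypothetical paths
        embedding_type_list=["cui", "bert"],
        k=40,
        check_intersection=False,
    )
    for scores in results:
        print(scores)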
]
train_input_1 = [
    cui2id.get(cui, use_embedding_count - 1) for cui in cui_train_1
]
train_y = [rel2id[rel] for rel in rel_train]

test_input_0 = [
    cui2id.get(cui, use_embedding_count - 1) for cui in cui_test_0
]
test_input_1 = [
    cui2id.get(cui, use_embedding_count - 1) for cui in cui_test_1
]
test_y = [rel2id[rel] for rel in rel_test]

# Find the standard term name for each CUI.
if embedding_type != 'cui':
    umls = UMLS("../../umls", only_load_dict=True)
    cui2str = {}
    #ipdb.set_trace()
    for cui in cui_set:
        standard_term = umls.search(code=cui, max_number=1)
        if standard_term is not None:
            cui2str[cui] = standard_term[0]
        else:
            cui2str[cui] = cui

# Handle word-type embeddings: tokenize the term strings.
if embedding_type == 'word':
    from nltk.tokenize import word_tokenize
    cui2tokenize = {}
def mrm_ndfrt(embedding_list, embedding_type_list, concept_filename, k=40, check_intersection=True):
    """Run the MRM evaluation against the NDF-RT may-treat / may-prevent
    drug-disease relations listed in concept_filename."""
    if check_intersection:
        # Cache the CUI intersection of all embeddings in intersection.txt.
        if not os.path.exists("intersection.txt"):
            intersection_cui = get_intersection(embedding_list, embedding_type_list)
            with open("intersection.txt", "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")
        else:
            with open("intersection.txt", "r", encoding="utf-8") as f:
                lines = f.readlines()
            intersection_cui = [line.strip() for line in lines]

    query_to_targets, cui_list = get_drug_diseases_to_check(concept_filename)
    umls = UMLS("../../umls", only_load_dict=True)  # source_range='SNOMEDCT_US')#, only_load_dict=True)

    if check_intersection:
        cui_list = [cui for cui in cui_list if cui in intersection_cui]
    #cui_list = [cui for cui in umls.cui2str if umls.cui2sty[cui] in sty_list]
    #cui_list = [cui for cui in cui_list if cui in umls.sty_list]

    """
    for cui in cui_list:
        if not cui in umls.cui2str:
            print(cui)
    ipdb.set_trace()
    """

    opt = []

    """
    # Origin
    print("ORIGIN")
    for index, embedding in enumerate(embedding_list):
        if embedding_type_list[index].lower() == "cui":
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "origin"))
        if embedding_type_list[index].lower() == "word":
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "origin"))
        if embedding_type_list[index].lower() == "bert":
            #opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
            #                          query_to_targets, k, "origin", summary_method="MEAN"))
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "origin", summary_method="CLS"))

    # For UMLSBert
    for index, embedding in enumerate(embedding_list):
        if embedding_type_list[index].lower() == "bert":
            print("BETA")
            beta_path = os.path.join(embedding, "run", "1000000", "rel embedding")
            if os.path.exists(beta_path):
                if concept_filename.find('treat') >= 0:
                    method = "may_treat"
                else:
                    method = "may_prevent"
                #opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                #                          query_to_targets, k, method, summary_method="MEAN"))
                opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                          query_to_targets, k, method, summary_method="CLS"))

    # For average and max
    print("ALL")
    for index, embedding in enumerate(embedding_list):
        if embedding_type_list[index].lower() == "cui":
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "all"))
        if embedding_type_list[index].lower() == "word":
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "all"))
        if embedding_type_list[index].lower() == "bert":
            #opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
            #                          query_to_targets, k, "all", summary_method="MEAN"))
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "all", summary_method="CLS"))
    """

    for index, embedding in enumerate(embedding_list):
        if embedding_type_list[index].lower() == "cui":
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "origin"))
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "all"))
        if embedding_type_list[index].lower() == "word":
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "origin"))
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "all"))
        if embedding_type_list[index].lower() == "bert":
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "origin", summary_method="CLS"))
            # Relation ("beta") embeddings are only evaluated when they exist.
            beta_path = os.path.join(embedding, "run", "1000000", "rel embedding")
            if os.path.exists(beta_path):
                if concept_filename.find('treat') >= 0:
                    method = "may_treat"
                else:
                    method = "may_prevent"
                opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                          query_to_targets, k, method, summary_method="CLS"))
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "all", summary_method="CLS"))
    return opt
from ndfrt_analysis import get_drug_diseases_to_check

import sys
sys.path.append("../../pretrain")
from load_umls import UMLS

query_to_targets = get_drug_diseases_to_check("may_treat_cui.txt")
query_to_targets_1 = get_drug_diseases_to_check("may_prevent_cui.txt")

# Collect every CUI that appears as a drug query or a disease target.
cui_set = set()
for query, targets in query_to_targets.items():
    cui_set.update([query])
    cui_set.update(targets)
print(len(cui_set))

umls = UMLS("../../umls", source_range='SNOMEDCT_US')

# Count how many of these CUIs are covered by SNOMED CT and collect their
# semantic types.
sty_set = set()
count = 0
for cui in cui_set:
    if cui in umls.cui2sty:
        count += 1
        sty_set.update([umls.cui2sty[cui]])
print(count)
print(len(sty_set))
print(sty_set)

# Count how many CUIs in the whole vocabulary share one of those semantic types.
count = 0
for cui in umls.cui2sty:
    if umls.cui2sty[cui] in sty_set:
        count += 1
print(count)
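# ---------------------------------------------------------------------------
# Optional follow-up sketch (not in the original script): persist the collected
# semantic types so they can be reused elsewhere, e.g. as a candidate type list
# for the MCSM evaluation; "ndfrt_semantic_types.txt" is a hypothetical output
# file name.
with open("ndfrt_semantic_types.txt", "w", encoding="utf-8") as f:
    for sty in sorted(sty_set):
        f.write(sty + "\n")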