def load_encounter():
    encounters1 = set()
    encounters2 = set()
    cnt = 0
    with codecs.open("SUTTER_ENCOUNTER_DETAIL_V1.tab", "r", encoding='utf-8', errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(encounters1))
            cnt += 1
            x = line.strip().split("\t")
            encounters1.add(x[1])
    cnt = 0
    with codecs.open("SUTTER_ORDER_MED_DETAIL_V1.tab", "r", encoding='utf-8', errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(encounters2))
            cnt += 1
            x = line.strip().split("\t")
            encounters2.add(x[9])
    dump(encounters1.intersection(encounters2), "valid_encounters")
def order_encounters(name):
    import random
    print(name)
    encounters = load(name + '.pkl')
    orders = ["voc", "random", "freq", "rare"]
    ordered = [[] for _ in range(len(orders))]
    counters = dd(int)
    for enc in encounters:
        for code in enc[1]:
            counters[code] += 1
    vocab = sorted(list(counters.keys()))
    code_to_vocab_index = {}
    for i in range(len(vocab)):
        code_to_vocab_index[vocab[i]] = i
    for enc in encounters:
        if len(enc[1]) == 0:
            continue
        for i, order in enumerate(orders):
            enc_1 = list(set(enc[1]))
            if order == "voc":
                enc_1 = sorted(enc_1, key=lambda x: code_to_vocab_index[x])
                ordered[i].append((enc[0], enc_1))
            elif order == "random":
                random.shuffle(enc_1)
                ordered[i].append((enc[0], enc_1))
            elif order == "freq":
                enc_1 = sorted(enc_1, key=lambda x: counters[x], reverse=True)
                ordered[i].append((enc[0], enc_1))
            elif order == "rare":
                enc_1 = sorted(enc_1, key=lambda x: counters[x])
                ordered[i].append((enc[0], enc_1))
    for i, order in enumerate(orders):
        dump(ordered[i], name + "_" + order + ".pkl")
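# Illustration only (hypothetical codes, not from the data): with counts
# {"A": 5, "B": 1, "C": 3}, the four orderings of ["C", "A", "B"] would be
#   "voc"    -> ["A", "B", "C"]   (sorted by position in the code vocabulary)
#   "freq"   -> ["A", "C", "B"]   (most frequent code first)
#   "rare"   -> ["B", "C", "A"]   (least frequent code first)
#   "random" -> some shuffled permutation of ["A", "B", "C"]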
def build_ground_truth():
    rx_to_ndc = load("rx_to_ndc.pkl")
    rx_to_icd = dd(list)
    icd_to_ndc = dd(list)
    with open("./data/MEDI_11242015.csv") as f_in:
        cnt = -1
        for line in f_in:
            cnt += 1
            if cnt == 0:
                continue
            x = line.strip().split(",")
            rx_to_icd[x[0]].append(x[5])
    for rx in set(rx_to_icd.keys()).intersection(set(rx_to_ndc.keys())):
        for ndc in rx_to_ndc[rx]:
            for icd in rx_to_icd[rx]:
                codes = icd9.get_children(icd)
                for code in codes:
                    icd_to_ndc[code].append(ndc)
    for icd in icd_to_ndc:
        icd_to_ndc[icd] = list(set(icd_to_ndc[icd]))
    dump(dict(icd_to_ndc), "icd_to_ndc.pkl")
    ndc_to_icd = dd(list)
    for icd in icd_to_ndc:
        for ndc in icd_to_ndc[icd]:
            ndc_to_icd[ndc].append(icd)
    for ndc in ndc_to_icd:
        ndc_to_icd[ndc] = list(set(ndc_to_icd[ndc]))
    dump(dict(ndc_to_icd), "ndc_to_icd.pkl")
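# Note on the parsing above: the code assumes column 0 of MEDI_11242015.csv holds
# the RxNorm CUI and column 5 the ICD-9 indication code. If a different MEDI
# release (with a different column layout) is used, these indices may need to change.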
def extract_mapping():
    medications = load_medication_details()
    cnt = 0
    records = {}
    with codecs.open("SUTTER_ORDER_MED_DETAIL_V1.tab", "r", encoding='utf-8', errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(records), line)
            cnt += 1
            x = line.strip().split("\t")
            if x[-1] in medications and medications[x[-1]][7] != "":
                records[x[1]] = (x[2], medications[x[-1]][7])
    diag_cnt = dd(int)
    diag_drug_pair_cnt = dd(int)
    drug_cnt = dd(int)
    for diag, drug in records.values():
        diag_drug_pair_cnt[(diag, drug[:6])] += 1
        diag_cnt[diag] += 1
        drug_cnt[drug[:6]] += 1
    sorted_diag_drug_pair = sorted(diag_drug_pair_cnt.items(), key=lambda x: x[1], reverse=True)
    diag_to_drug = dd(list)
    drug_to_diag = dd(list)
    for diag, drug in diag_drug_pair_cnt:
        diag_to_drug[diag].append(drug)
        drug_to_diag[drug].append(diag)
    dump((dict(diag_to_drug), dict(drug_to_diag)), "diag_drug_mapping.pkl")
def convert_ground_truth():
    gt = load("icd_to_ndc.pkl")
    diag_vocab = load("diag_vocab.pkl")
    drug_vocab = load("drug_vocab.pkl")
    gt_index = {}
    for c in gt:
        code = normalize_icd(c)
        if code in diag_vocab:
            diag = diag_vocab[code]
            # avoid clobbering earlier entries if two ICD codes normalize to the same index
            gt_index.setdefault(diag, [])
            for code1 in gt[c]:
                if code1 in drug_vocab:
                    drug = drug_vocab[code1]
                    gt_index[diag].append(drug)
    for icd in gt_index:
        gt_index[icd] = list(set(gt_index[icd]))
    dump(gt_index, "icd_to_ndc_index.pkl")
    ndc_to_icd = dd(list)
    for icd in gt_index:
        for ndc in gt_index[icd]:
            ndc_to_icd[ndc].append(icd)
    for ndc in ndc_to_icd:
        ndc_to_icd[ndc] = list(set(ndc_to_icd[ndc]))
    dump(dict(ndc_to_icd), "ndc_to_icd_index.pkl")
def clean_encounter(diag_pres):
    encounters = []
    for p in diag_pres:
        pres = set()
        for med in p[1]:
            pres.add(med[-1])
        encounters.append((p[0], list(pres)))
    dump(encounters, "sutter_encounters.pkl")
def get_data():
    dataset = "sutter"
    level = 2
    import os
    results = dd(list)
    xs = {}
    for file in os.listdir("/home/yzhang3151/project/AutoPrescribe2/data"):
        if file.endswith(".pkl") and file.startswith("%s_%s" % (dataset, level)):
            d = file.split("_")
            results[d[2]].append((int(d[5][1:]), int(d[6][1:]), float(d[7].replace("jacc", ""))))
    for k in results:
        xs[k] = [y[2] for y in sorted(results[k], key=lambda x: (x[0], x[1]))]
    dump(xs, "traj_%s_%s.pkl" % (dataset, level))
    xs = load("traj_%s_%s.pkl" % (dataset, level))
    name_mapping = {
        "voc": "Vocabulary",
        "random": "Random",
        "freq": "Frequent first",
        "rare": "Rare first"
    }
    line_type = {
        "random": "-",
        "freq": "--",
        "rare": "s",
        "voc": "^"
    }
    fig, ax = plt.subplots(figsize=(8, 4))
    for k in name_mapping:
        line, = ax.plot([x for x in xs[k]][:len(xs["random"])], line_type[k],
                        linewidth=2, label=name_mapping[k])
    # x = np.linspace(0, 10, 500)
    # dashes = [10, 5, 100, 5]  # 10 points on, 5 off, 100 on, 5 off
    # line1, = ax.plot(x, np.sin(x), '--', linewidth=2,
    #                  label='Dashes set retroactively')
    # line1.set_dashes(dashes)
    # line2, = ax.plot(x, -1 * np.sin(x), dashes=[30, 5, 10, 5],
    #                  label='Dashes set proactively')
    ax.set_xlabel("Epochs", fontsize=20)
    ax.set_ylabel("Jaccard Coefficient", fontsize=20)
    for tick in ax.xaxis.get_major_ticks():
        tick.label.set_fontsize(15)
    for tick in ax.yaxis.get_major_ticks():
        tick.label.set_fontsize(15)
    ax.legend(loc='lower right', fontsize=20)
    # plt.show()
    fig.tight_layout()
    plt.savefig("traj_%s_%s.pdf" % (dataset, level))
def load_mapping():
    diag_drug_mapping = load("diag_drug_mapping.pkl")
    diag_to_drug = {}
    drug_to_diag = {}
    for diag in diag_drug_mapping[0]:
        diag_to_drug[diag.replace(".", "")] = diag_drug_mapping[0][diag]
    for drug in diag_drug_mapping[1]:
        drug_to_diag[drug] = []
        for diag in diag_drug_mapping[1][drug]:
            drug_to_diag[drug].append(diag.replace(".", ""))
    dump((diag_to_drug, drug_to_diag), "mimic_diag_drug_mapping.pkl")
def sort_encounter():
    train = load("mimic_encounter_gpi.train.pkl")
    test = load("mimic_encounter_gpi.dev.pkl")
    sorted_train = []
    sorted_test = []
    for d in train:
        sorted_train.append((d[0], sorted(d[1])))
    for d in test:
        sorted_test.append((d[0], sorted(d[1])))
    dump(sorted_train, "mimic_encounter_gpi_sorted.train.pkl")
    dump(sorted_test, "mimic_encounter_gpi_sorted.dev.pkl")
def load_ndc_gpi_mapping():
    with open("data/ndw_v_product.txt") as f_in:
        title = next(f_in).strip().split("|")
        data = []
        for line in f_in:
            data.append(line.strip().split("|"))
    ndc_to_gpi_6 = {}
    for d in data:
        ndc_to_gpi_6[d[1]] = d[58]
    dump(ndc_to_gpi_6, "ndc_to_gpi_6.pkl")
def load_rx_to_ndc():
    drugs = {}
    in_to_drug = dd(list)
    in_to_pin = dd(list)
    with codecs.open(get_path("rxnorm.csv"), "r", "utf-8") as f_in:
        reader = csv.reader(f_in)
        cnt = 0
        for row in reader:
            if cnt != 0:
                drug = {
                    "rx": row[0],
                    "tty": row[1],
                    "ndc": [],
                    "name": row[3],
                    "va_classes": row[4],
                    "treating": row[5].split(";"),
                    "ingredients": row[6].split(";")
                }
                if row[2] != '':
                    for code in row[2].strip("[").strip("]").split(","):
                        if code is not None and code != 'None':
                            drug["ndc"].append(code.strip().strip("'"))
                drugs[row[0]] = drug
                for ing in drug["ingredients"]:
                    in_to_drug[ing].append(drug)
                    if drug["tty"] == "PIN":
                        in_to_pin[ing].append(drug["rx"])
            cnt += 1
    rx_to_ndc = dd(list)
    for rx in drugs:
        for ndc in drugs[rx]["ndc"]:
            rx_to_ndc[rx].append(ndc)
    for ing in in_to_drug:
        for drug in in_to_drug[ing]:
            for ndc in drug["ndc"]:
                rx_to_ndc[ing].append(ndc)
    for ing in in_to_pin:
        for pin in in_to_pin[ing]:
            for ndc in rx_to_ndc[ing]:
                rx_to_ndc[pin].append(ndc)
    ndc_to_rx = dd(list)
    for rx in rx_to_ndc:
        for ndc in rx_to_ndc[rx]:
            ndc_to_rx[ndc].append(rx)
    dump(rx_to_ndc, "rx_to_ndc.pkl")
    dump(ndc_to_rx, "ndc_to_rx.pkl")
def fit(self, train_set):
    freq = dd(lambda: dd(int))
    for pair in train_set:
        for t0 in pair[0]:
            for t1 in pair[1]:
                freq[t0][t1] += 1
    for tk in freq:
        sorted_freq = sorted(freq[tk].items(), key=lambda x: x[1], reverse=True)
        self.freq[tk] = sorted_freq
    dump(dict(self.freq), self.data + "_freq.pkl")
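# A minimal sketch of the matching predict() used by the training/evaluation
# scripts below. This is an assumption about the interface, not necessarily the
# repo's exact implementation: it presumes the first constructor argument is
# stored as self.k, i.e. how many of the most frequently co-prescribed drug
# codes to take per input diagnosis code.
def predict(self, diags):
    prediction = set()
    for diag in diags:
        if diag not in self.freq:
            continue
        # self.freq[diag] is a list of (drug_code, count) pairs, most frequent first
        for drug, _ in self.freq[diag][:self.k]:
            prediction.add(drug)
    return list(prediction)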
def train_mimic():
    train_set = load("mimic_encounter_gpi.train.pkl")
    test_set = load("mimic_encounter_gpi.dev.pkl")
    mfm = MostFreqMatch(3, "mimic")
    mfm.fit(train_set)
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "mimic_result_freq.pkl")
def get_icd_to_gpi_map():
    icd_to_ndc = load("icd_to_ndc.pkl")
    ndc_to_gpi = load("ndc_to_gpi_6.pkl")
    icd_to_gpi = dd(set)
    for icd in icd_to_ndc:
        for ndc in icd_to_ndc[icd]:
            if ndc in ndc_to_gpi:
                icd_to_gpi[icd].add(ndc_to_gpi[ndc])
    gpi_to_icd = dd(set)
    for icd in icd_to_gpi:
        for gpi in icd_to_gpi[icd]:
            gpi_to_icd[gpi].add(icd)
    dump((dict(icd_to_gpi), dict(gpi_to_icd)), "icd_gpi_map.pkl")
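# Background note: GPI (Generic Product Identifier) codes are 14-digit hierarchical
# drug identifiers; the 6-digit prefix used here ("GPI-6") identifies the therapeutic
# subclass, so the mapping above links ICD-9 diagnosis codes to drug subclasses
# rather than to individual products.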
def eval_sutter():
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    test_set = load("sutter_encounter.dev.pkl")
    rb = RuleBased(1)
    rb.load()
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = rb.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "sutter_result_rule.pkl")
def gen_vocab(encounters, level):
    diag_vocab = {}
    drug_vocab = {}
    cnt1 = 0
    cnt2 = 0
    for p in encounters:
        for diag in p[0]:
            if diag not in diag_vocab:
                diag_vocab[diag] = cnt1
                cnt1 += 1
        for drug in p[1]:
            if drug[:level] not in drug_vocab:
                drug_vocab[drug[:level]] = cnt2
                cnt2 += 1
    dump(diag_vocab, "mimic_diag_vocab.pkl")
    dump(drug_vocab, "mimic_drug_vocab_%s.pkl" % level)
def join_prescription(meds, medications):
    prescriptions = []
    for eid in meds:
        pres = []
        for med in meds[eid]:
            if med[-1] not in medications:
                continue
            med_detail = medications[med[-1]]
            pres.append(tuple(list(med) + [med_detail[2], med_detail[3], med_detail[6], med_detail[7]]))
        prescriptions.append(pres)
    dump(prescriptions, "prescriptions.pkl")
def join_encounters(valid_encounters, meds, medications):
    pairs = []
    for eid in valid_encounters:
        pairs.append((valid_encounters[eid], meds[eid]))
    diag_pres = []
    for p in pairs:
        pres = []
        for med in p[1]:
            med_detail = medications[med[-1]]
            pres.append(tuple(list(med) + [med_detail[2], med_detail[3], med_detail[6], med_detail[7]]))
        diag_pres.append((p[0], pres))
    dump(pairs, "diagnosis_prescription_pairs.pkl")
def train():
    import numpy as np
    from sklearn import metrics
    level = 2
    data = "mimic"
    # input_vocab = load("sutter_diag_vocab.pkl")
    # output_vocab = load("sutter_drug_vocab_%s.pkl" % level)
    input_vocab = load("%s_diag_vocab.pkl" % data)
    output_vocab = load("%s_drug_vocab_%s.pkl" % (data, level))
    train_encounters = load("%s_encounters_%s.train.pkl" % (data, level))
    test_encounters = load("%s_encounters_%s.test.pkl" % (data, level))
    # test_set = []
    # train_set = []
    # for enc in test_encounters:
    #     test_set.append((enc[0], [output_vocab[code] for code in enc[1]]))
    mfm = MostFreqMatch(3, data)
    mfm.fit(train_encounters)
    # output_dim = len(output_vocab)
    # test_y = np.zeros((len(test_set), output_dim))
    # test_result = np.zeros((len(test_set), output_dim))
    # for i, pair in enumerate(test_set):
    #     for j in pair[1]:
    #         test_y[i, j] = 1
    #     for code in pair[0]:
    #         if code in mfm.freq:
    #             for tk in mfm.freq[code][:5]:
    #                 test_result[output_vocab[tk[0]]] += 1
    # auc = metrics.roc_auc_score(test_y, test_result, 'micro')
    results = []
    prediction_list = []
    truth_list = []
    for item in test_encounters:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    # name the result file after the dataset being evaluated
    dump(results, "%s_result_freq_%s.pkl" % (data, level))
def eval_freq():
    level = 2
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_%s.pkl" % level)
    test_set = load("sutter_encounters.test_%s.pkl" % level)
    mfm = MostFreqMatch(1)
    mfm.load("sutter_freq.pkl")
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "sutter_result_freq_%s.pkl" % level)
    evaluator = Evaluator()
    evaluator.eval(mfm)
    evaluator.eval_golden(mfm)
def get_encounter_level(encounters, level, sorted_diag_rank):
    new_encounters = []
    for enc in encounters:
        input = []
        output = []
        for code in enc[0]:
            if len(code) > 0:
                input.append(code.replace(".", ""))
        for code in enc[1]:
            if len(code) > 0:
                output.append(code[:level])
        new_encounters.append((input, output))
    new_encounters_clean = clean_encounters(new_encounters, sorted_diag_rank)
    print(len(new_encounters_clean), len(new_encounters))
    dump(new_encounters_clean, "mimic_encounters_%s.pkl" % level)
    dump(new_encounters_clean[:int(len(new_encounters_clean) * 0.8)], "mimic_encounters_%s.train.pkl" % level)
    dump(new_encounters_clean[int(len(new_encounters_clean) * 0.8):], "mimic_encounters_%s.test.pkl" % level)
    gen_vocab(new_encounters_clean, level)
# truth_list = []
# prediction_list = []
# for line in open("seq2seq.h256.txt"):
#     if cnt % 3 == 1:
#         truth = set(line.strip().split("T: ")[1].split(" "))
#         truth_list.append(truth)
#     if cnt % 3 == 2:
#         result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
#         prediction_list.append(result)
#     cnt += 1
# cnt = 0
results = []
input = []
truth = []
cnt = 0
for line in open("mimic_unsort_seq2seq.h256.txt"):
    if cnt % 3 == 0:
        input = set(line.strip().split("S: ")[1].split(" "))
    if cnt % 3 == 1:
        if len(line.strip().split("T: ")) <= 1:
            truth = []
            # keep the 3-line (S/T/Gen) alignment even when the target line is empty
            cnt += 1
            continue
        truth = set(line.strip().split("T: ")[1].split(" "))
    if cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((input, truth, result))
    cnt += 1
dump(results, "mimic_unsort_result_seq2seq.pkl")
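# The seq2seq output files parsed above (and in the similar blocks below) are
# assumed to consist of repeating three-line records: a source line of diagnosis
# codes ("S: ..."), a target line of ground-truth drug codes ("T: ..."), and a
# generated line terminated by an END token ("Gen: ... END"). The code values
# shown here are illustrative, not taken from the data:
#   S: 4019 25000
#   T: 3620 2710
#   Gen: 3620 2710 END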
# truth_list = []
# prediction_list = []
# for line in open("seq2seq.h256.txt"):
#     if cnt % 3 == 1:
#         truth = set(line.strip().split("T: ")[1].split(" "))
#         truth_list.append(truth)
#     if cnt % 3 == 2:
#         result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
#         prediction_list.append(result)
#     cnt += 1
# cnt = 0
results = []
input = []
truth = []
cnt = 0
for line in open("sutter_sorted_seq2seq.h256.txt"):
    if cnt % 3 == 0:
        input = set(line.strip().split("S: ")[1].split(" "))
    if cnt % 3 == 1:
        if len(line.strip().split("T: ")) <= 1:
            truth = []
            # keep the 3-line (S/T/Gen) alignment even when the target line is empty
            cnt += 1
            continue
        truth = set(line.strip().split("T: ")[1].split(" "))
    if cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((input, truth, result))
    cnt += 1
dump(results, "sutter_sorted_result_seq2seq.pkl")
def do_train(self):
    p = self.processor
    config = self.config
    min_measure = 1e6
    print("start train")
    for epoch in range(self.config.max_epoch):
        for step, (source_inputs, target_inputs, target_outputs,
                   source_mask_inputs, target_mask_inputs, refs) in enumerate(p.gen_batch(p.train_data)):
            self.train_fn(source_inputs, target_inputs, target_outputs,
                          source_mask_inputs, target_mask_inputs)
            if step % config.print_loss_per == 0:
                train_loss = self.comp_loss(p.train_data)
                dev_loss = self.comp_loss(p.dev_data)
                if dev_loss < min_measure:
                    min_measure = dev_loss
                    self.save_params(config.saved_model_file + "_%s_%s" % (epoch, int(step / 1000) * 1000))
                print('epoch', epoch, 'step', step)
                print('train', train_loss, 'dev', dev_loss, 'min', min_measure)
                self.do_eval()
            if step % 500 == 0:
                self.do_eval(training=False,
                             filename='%s_%s_%s_seq2seq_e%s_s%s.txt' % (config.name, config.level, config.order, epoch, step),
                             max_batch=10000)
                cnt = 0
                results = []
                input = []
                truth = []
                for line in open('%s_%s_%s_seq2seq_e%s_s%s.txt' % (config.name, config.level, config.order, epoch, step)):
                    if cnt % 3 == 0:
                        input = list(set(line.strip().split("S: ")[1].split(" ")))
                    if cnt % 3 == 1:
                        if len(line.strip().split("T: ")) <= 1:
                            truth = []
                            # keep the 3-line (S/T/Gen) alignment even when the target line is empty
                            cnt += 1
                            continue
                        truth = list(set(line.strip().split("T: ")[1].split(" ")))
                    if cnt % 3 == 2:
                        result = set(line.strip().split("Gen: ")[1].split("END")[0].strip().split(" "))
                        if '' in result:
                            result.remove('')
                        if len(truth) > 0:
                            results.append((input, truth, result))
                    cnt += 1
                input_list, truth_list, prediction_list = eval_utils.get_results(results)
                jaccard = eval_utils.get_average_jaccard(truth_list, prediction_list)
                acc = eval_utils.get_average_accuracy(truth_list, prediction_list)
                print("jaccard", jaccard, "acc", acc)
                dump(results,
                     "%s_%s_%s_result_seq2seq_e%s_s%s_jacc%s_acc%s.pkl" % (
                         config.name, config.level, config.order, epoch, step,
                         round(jaccard, 5), round(acc, 5)))
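# A plausible sketch of one of the eval_utils helpers called above. The real
# eval_utils module is not shown here, so this is an assumption that illustrates
# the intended metric (per-encounter Jaccard coefficient averaged over encounters)
# rather than the repo's exact implementation.
def get_average_jaccard(truth_list, prediction_list):
    scores = []
    for truth, pred in zip(truth_list, prediction_list):
        t, p = set(truth), set(pred)
        union = t | p
        # |T ∩ P| / |T ∪ P|; treat two empty sets as a perfect match
        scores.append(len(t & p) / float(len(union)) if union else 1.0)
    return sum(scores) / len(scores)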
def clean_mimic(ndc_to_gpi):
    mimic_data = load("mimic_episodes.pkl")
    clean_mimic_data = []
    for d in mimic_data:
        drugs = []
        for drug in d[1]:
            if len(drug) != 11:
                continue
            drugs.append(drug)
        clean_mimic_data.append((d[0], drugs))
    random.shuffle(clean_mimic_data)
    mimic_train = clean_mimic_data[:49000]
    mimic_dev = clean_mimic_data[49000:]
    dump(mimic_train, "mimic_encounter.train.pkl")
    dump(mimic_dev, "mimic_encounter.dev.pkl")
    mimic_data_gpi = []
    for d in clean_mimic_data:
        drugs = []
        for drug in d[1]:
            if drug in ndc_to_gpi:
                drugs.append(ndc_to_gpi[drug])
            else:
                print(drug)
        mimic_data_gpi.append((d[0], list(set(drugs))))
    dump(mimic_data_gpi, "mimic_encounter_gpi.pkl")
    dump(mimic_data_gpi[:49000], "mimic_encounter_gpi.train.pkl")
    dump(mimic_data_gpi[49000:], "mimic_encounter_gpi.dev.pkl")
    mimic_diag_vocab = {}
    mimic_drug_vocab = {}
    for d in mimic_data_gpi:
        for diag in d[0]:
            if diag not in mimic_diag_vocab:
                mimic_diag_vocab[diag] = len(mimic_diag_vocab)
        for drug in d[1]:
            if drug not in mimic_drug_vocab:
                mimic_drug_vocab[drug] = len(mimic_drug_vocab)
    dump(mimic_diag_vocab, "mimic_diag_vocab.pkl")
    dump(mimic_drug_vocab, "mimic_drug_vocab.pkl")
# truth_list = []
# prediction_list = []
# for line in open("seq2seq.h256.txt"):
#     if cnt % 3 == 1:
#         truth = set(line.strip().split("T: ")[1].split(" "))
#         truth_list.append(truth)
#     if cnt % 3 == 2:
#         result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
#         prediction_list.append(result)
#     cnt += 1
# cnt = 0
results = []
input = []
truth = []
cnt = 0
for line in open('sutter_%s_%s_seq2seq.txt' % (config.level, config.order)):
    if cnt % 3 == 0:
        input = set(line.strip().split("S: ")[1].split(" "))
    if cnt % 3 == 1:
        if len(line.strip().split("T: ")) <= 1:
            truth = []
            # keep the 3-line (S/T/Gen) alignment even when the target line is empty
            cnt += 1
            continue
        truth = set(line.strip().split("T: ")[1].split(" "))
    if cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((input, truth, result))
    cnt += 1
dump(results, "sutter_%s_%s_result_seq2seq.pkl" % (config.level, config.order))
def test():
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    test_encounters = load("sutter_encounter.dev.pkl")
    test_set = []
    for enc in test_encounters:
        test_set.append(([input_vocab[code] for code in enc[0]],
                         [output_vocab[code] for code in enc[1]]))
    mlp = load_model("sutter_mlp_model.h5")
    input_dim = len(input_vocab)
    output_dim = len(output_vocab)
    test_x = np.zeros((len(test_set), input_dim))
    test_y = np.zeros((len(test_set), output_dim))
    for i, pair in enumerate(test_set):
        for j in pair[0]:
            test_x[i, j] = 1
        for j in pair[1]:
            test_y[i, j] = 1
    index_to_source = {}
    index_to_target = {}
    for token in input_vocab:
        index_to_source[input_vocab[token]] = token
    for token in output_vocab:
        index_to_target[output_vocab[token]] = token
    for i in range(1, 10):
        threshold = float(i) / 500.0
        labels, results = mlp.predict(test_x)
        results[results >= threshold] = 1
        results[results < threshold] = 0
        jaccard = metrics.jaccard_similarity_score(test_y, results)
        print(threshold, jaccard)
    labels, results = mlp.predict(test_x)
    results[results >= 0.012] = 1
    results[results < 0.012] = 0
    cnts, indices = results.nonzero()
    jaccard = metrics.jaccard_similarity_score(test_y, results)
    zero_one = metrics.jaccard_similarity_score(test_y, results)
    outputs = [[] for i in range(len(test_set))]
    for i, cnt in enumerate(cnts):
        outputs[cnt].append(index_to_target[indices[i]])
    merge = []
    for i, item in enumerate(outputs):
        print(test_encounters[i][0])
        print(test_encounters[i][1])
        print(outputs[i])
        print("")
        merge.append(list(test_encounters[i]) + [outputs[i]])
    from utils.data import dump
    # name the result file after the Sutter data evaluated here
    dump(merge, "sutter_result_mlp_0.012.pkl")
    truth_list = []
    prediction_list = []
    for enc in merge:
        truth_list.append(enc[1])
        prediction_list.append(enc[2])
def train():
    level = 6
    data = "mimic"
    input_vocab = load("%s_diag_vocab.pkl" % data)
    output_vocab = load("%s_drug_vocab_%s.pkl" % (data, level))
    train_encounters = load("%s_encounters_%s.train.pkl" % (data, level))
    test_encounters = load("%s_encounters_%s.test.pkl" % (data, level))
    test_set = []
    train_set = []
    for enc in train_encounters:
        train_set.append(([input_vocab[code] for code in enc[0]],
                          [output_vocab[code] for code in enc[1]]))
    for enc in test_encounters:
        if len(enc[1]) == 0:
            continue
        test_set.append(([input_vocab[code] for code in enc[0]],
                         [output_vocab[code] for code in enc[1]]))
    mlp = MLP(data=data, level=level)
    mlp.load_data(train_set, test_set[:5000], len(input_vocab), len(output_vocab))
    mlp.build_model()
    mlp.fit(20)
    input_dim = len(input_vocab)
    output_dim = len(output_vocab)
    test_x = np.zeros((len(test_set), input_dim))
    test_y = np.zeros((len(test_set), output_dim))
    for i, pair in enumerate(test_set):
        for j in pair[0]:
            test_x[i, j] = 1
        if len(pair[1]) == 0:
            print(i, pair)
        for j in pair[1]:
            test_y[i, j] = 1
    index_to_source = {}
    index_to_target = {}
    for token in input_vocab:
        index_to_source[input_vocab[token]] = token
    for token in output_vocab:
        index_to_target[output_vocab[token]] = token
    import copy
    labels, rs = mlp.predict(test_x)
    auc = metrics.roc_auc_score(test_y, rs, 'micro')
    for i in range(1, 20):
        results = copy.deepcopy(rs)
        threshold = float(i) / 500.0
        results[results >= threshold] = 1
        results[results < threshold] = 0
        jaccard = metrics.jaccard_similarity_score(test_y, results)
        acc = metrics.accuracy_score(test_y, results)  # metrics.auc(test_y, results)
        print(threshold, round(auc, 4), round(jaccard, 4), round(acc, 4))
    labels, results = mlp.predict(test_x)
    results[results >= 0.012] = 1
    results[results < 0.012] = 0
    cnts, indices = results.nonzero()
    jaccard = metrics.jaccard_similarity_score(test_y, results)
    zero_one = metrics.jaccard_similarity_score(test_y, results)
    outputs = [[] for i in range(len(test_set))]
    for i, cnt in enumerate(cnts):
        outputs[cnt].append(index_to_target[indices[i]])
    merge = []
    for i, item in enumerate(outputs):
        print(test_encounters[i][0])
        print(test_encounters[i][1])
        print(outputs[i])
        print("")
        merge.append(list(test_encounters[i]) + [outputs[i]])
    from utils.data import dump
    dump(merge, "mimic_result_mlp_0.012.pkl")
    truth_list = []
    prediction_list = []
    for enc in merge:
        truth_list.append(enc[1])
        prediction_list.append(enc[2])