Example #1
def load_encounter():
    encounters1 = set()
    encounters2 = set()
    cnt = 0
    with codecs.open("SUTTER_ENCOUNTER_DETAIL_V1.tab",
                     "r",
                     encoding='utf-8',
                     errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(encounters1))
            cnt += 1
            x = line.strip().split("\t")
            encounters1.add(x[1])
    cnt = 0
    with codecs.open("SUTTER_ORDER_MED_DETAIL_V1.tab",
                     "r",
                     encoding='utf-8',
                     errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(encounters2))
            cnt += 1
            x = line.strip().split("\t")
            encounters2.add(x[9])

    dump(encounters1.intersection(encounters2), "valid_encounters")
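These snippets call a handful of helpers the source never defines: dd is evidently collections.defaultdict (used as dd(int), dd(list), dd(set)), while dump/load behave like pickle wrappers. A minimal sketch of the assumed helpers follows; later examples additionally assume numpy as np, matplotlib.pyplot as plt, csv, and sklearn's metrics module.

import pickle
from collections import defaultdict as dd

def dump(obj, name):
    # Assumed helper: pickle an object to disk (some call sites omit ".pkl").
    with open(name if name.endswith(".pkl") else name + ".pkl", "wb") as f:
        pickle.dump(obj, f)

def load(name):
    # Assumed helper: inverse of dump().
    with open(name if name.endswith(".pkl") else name + ".pkl", "rb") as f:
        return pickle.load(f)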
Example #2
def order_encounters(name):
    import random
    print(name)
    encounters = load(name + '.pkl')
    orders = ["voc", "random", "freq", 'rare']
    ordered = [[] for _ in range(len(orders))]
    counters = dd(int)
    for enc in encounters:
        for code in enc[1]:
            counters[code] += 1
    vocab = sorted(counters.keys())
    code_to_vocab_index = {code: i for i, code in enumerate(vocab)}
    for enc in encounters:
        if len(enc[1]) == 0:
            continue
        for i, order in enumerate(orders):
            enc_1 = list(set(enc[1]))
            if order == "voc":
                enc_1 = sorted(enc_1, key=lambda x: code_to_vocab_index[x])
            elif order == "random":
                random.shuffle(enc_1)
            elif order == "freq":
                enc_1 = sorted(enc_1, key=lambda x: counters[x], reverse=True)
            elif order == "rare":
                enc_1 = sorted(enc_1, key=lambda x: counters[x])
            ordered[i].append((enc[0], enc_1))
    for i, order in enumerate(orders):
        dump(ordered[i], name + "_" + order + ".pkl")
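For intuition, here is a tiny standalone demo of the three deterministic orderings on hypothetical codes and counts ("random" is just an in-place shuffle):

counters = {"A": 5, "B": 1, "C": 3}  # hypothetical code frequencies
codes = ["C", "A", "B"]
print(sorted(codes))                                           # voc  -> ['A', 'B', 'C']
print(sorted(codes, key=lambda c: counters[c], reverse=True))  # freq -> ['A', 'C', 'B']
print(sorted(codes, key=lambda c: counters[c]))                # rare -> ['B', 'C', 'A']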
Example #3
def build_ground_truth():
    rx_to_ndc = load("rx_to_ndc.pkl")
    rx_to_icd = dd(list)
    icd_to_ndc = dd(list)

    with open("./data/MEDI_11242015.csv") as f_in:
        cnt = -1
        for line in f_in:
            cnt += 1
            if cnt == 0:
                continue
            x = line.strip().split(",")
            rx_to_icd[x[0]].append(x[5])

    for rx in set(rx_to_icd.keys()).intersection(set(rx_to_ndc.keys())):
        for ndc in rx_to_ndc[rx]:
            for icd in rx_to_icd[rx]:
                codes = icd9.get_children(icd)
                for code in codes:
                    icd_to_ndc[code].append(ndc)

    for icd in icd_to_ndc:
        icd_to_ndc[icd] = list(set(icd_to_ndc[icd]))
    dump(dict(icd_to_ndc), "icd_to_ndc.pkl")

    ndc_to_icd = dd(list)
    for icd in icd_to_ndc:
        for ndc in icd_to_ndc[icd]:
            ndc_to_icd[ndc].append(icd)
    for ndc in ndc_to_icd:
        ndc_to_icd[ndc] = list(set(ndc_to_icd[ndc]))
    dump(dict(ndc_to_icd), "ndc_to_icd.pkl")
Example #4
def extract_mapping():
    medications = load_medication_details()
    cnt = 0
    records = {}
    with codecs.open("SUTTER_ORDER_MED_DETAIL_V1.tab",
                     "r",
                     encoding='utf-8',
                     errors='ignore') as f_in:
        next(f_in)
        for line in f_in:
            if cnt % 100000 == 0:
                print(cnt, len(records), line)
            cnt += 1
            x = line.strip().split("\t")
            if x[-1] in medications and medications[x[-1]][7] != "":
                records[x[1]] = (x[2], medications[x[-1]][7])

    diag_cnt = dd(int)
    diag_drug_pair_cnt = dd(int)
    drug_cnt = dd(int)
    for diag, drug in records.values():
        diag_drug_pair_cnt[(diag, drug[:6])] += 1
        diag_cnt[diag] += 1
        drug_cnt[drug[:6]] += 1

    sorted_diag_drug_pair = sorted(diag_drug_pair_cnt.items(),
                                   key=lambda x: x[1],
                                   reverse=True)

    diag_to_drug = dd(list)
    drug_to_diag = dd(list)
    for diag, drug in diag_drug_pair_cnt:
        diag_to_drug[diag].append(drug)
        drug_to_diag[drug].append(diag)
    dump((dict(diag_to_drug), dict(drug_to_diag)), "diag_drug_mapping.pkl")
Example #5
def convert_ground_truth():
    gt = load("icd_to_ndc.pkl")
    diag_vocab = load("diag_vocab.pkl")
    drug_vocab = load("drug_vocab.pkl")
    gt_index = {}
    for c in gt:
        code = normalize_icd(c)
        if code in diag_vocab:
            diag = diag_vocab[code]
            gt_index[diag] = []
            for code1 in gt[c]:
                if code1 in drug_vocab:
                    drug = drug_vocab[code1]
                    gt_index[diag].append(drug)
    for icd in gt_index:
        gt_index[icd] = list(set(gt_index[icd]))
    dump(gt_index, "icd_to_ndc_index.pkl")

    ndc_to_icd = dd(list)
    for icd in gt_index:
        for ndc in gt_index[icd]:
            ndc_to_icd[ndc].append(icd)
    for ndc in ndc_to_icd:
        ndc_to_icd[ndc] = list(set(ndc_to_icd[ndc]))
    dump(dict(ndc_to_icd), "ndc_to_icd_index.pkl")
Example #6
def clean_encounter(diag_pres):
    encounters = []
    for p in diag_pres:
        pres = set()
        for med in p[1]:
            pres.add(med[-1])
        encounters.append((p[0], list(pres)))
    dump(encounters, "sutter_encounters.pkl")
Example #7
def get_data():
    dataset = "sutter"
    level = 2
    import os
    results = dd(list)
    xs = {}
    for file in os.listdir("/home/yzhang3151/project/AutoPrescribe2/data"):
        if file.endswith(".pkl") and file.startswith("%s_%s" % (dataset, level)):
            d = file.split("_")
            results[d[2]].append((int(d[5][1:]), int(d[6][1:]), float(d[7].replace("jacc", ""))))

    for k in results:
        xs[k] = [y[2] for y in sorted(results[k], key=lambda x: (x[0], x[1]))]

    dump(xs, "traj_%s_%s.pkl" % (dataset, level))

    xs = load("traj_%s_%s.pkl" % (dataset, level))

    name_mapping = {
        "voc": "Vocabulary",
        "random": "Random",
        "freq": "Frequent first",
        "rare": "Rare first"
    }
    line_type = {
        "random": "-",
        "freq": "--",
        "rare": "s",
        "voc": "^"
    }


    fig, ax = plt.subplots(figsize=(8, 4))

    for k in name_mapping:
        ax.plot(xs[k][:len(xs["random"])], line_type[k], linewidth=2, label=name_mapping[k])

    ax.set_xlabel("Epochs", fontsize=20)
    ax.set_ylabel("Jaccard Coefficient", fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.legend(loc='lower right', fontsize=20)
    # plt.show()
    fig.tight_layout()
    plt.savefig("traj_%s_%s.pdf" % (dataset, level))
Example #8
def load_mapping():
    diag_drug_mapping = load("diag_drug_mapping.pkl")
    diag_to_drug = {}
    drug_to_diag = {}
    for diag in diag_drug_mapping[0]:
        diag_to_drug[diag.replace(".", "")] = diag_drug_mapping[0][diag]
    for drug in diag_drug_mapping[1]:
        drug_to_diag[drug] = []
        for diag in diag_drug_mapping[1][drug]:
            drug_to_diag[drug].append(diag.replace(".", ""))
    dump((diag_to_drug, drug_to_diag), "mimic_diag_drug_mapping.pkl")
Example #9
def sort_encounter():
    train = load("mimic_encounter_gpi.train.pkl")
    test = load("mimic_encounter_gpi.dev.pkl")
    sorted_train = []
    sorted_test = []
    for d in train:
        sorted_train.append((d[0], sorted(d[1])))
    for d in test:
        sorted_test.append((d[0], sorted(d[1])))
    dump(sorted_train, "mimic_encounter_gpi_sorted.train.pkl")
    dump(sorted_test, "mimic_encounter_gpi_sorted.dev.pkl")
Example #10
def load_ndc_gpi_mapping():
    with open("data/ndw_v_product.txt") as f_in:
        title = next(f_in).strip().split("|")  # header row (not used below)
        data = [line.strip().split("|") for line in f_in]

    ndc_to_gpi_6 = {}
    for d in data:
        ndc_to_gpi_6[d[1]] = d[58]

    dump(ndc_to_gpi_6, "ndc_to_gpi_6.pkl")
Example #11
def load_rx_to_ndc():
    drugs = {}
    in_to_drug = dd(list)
    in_to_pin = dd(list)
    with codecs.open(get_path("rxnorm.csv"), "r", "utf-8") as f_in:
        reader = csv.reader(f_in)
        next(reader)  # skip the header row
        for row in reader:
            drug = {
                "rx": row[0],
                "tty": row[1],
                "ndc": [],
                "name": row[3],
                "va_classes": row[4],
                "treating": row[5].split(";"),
                "ingredients": row[6].split(";")
            }
            # row[2] holds a stringified list of NDC codes, possibly containing 'None'
            if row[2] != '':
                for code in row[2].strip("[").strip("]").split(","):
                    if code != 'None':
                        drug["ndc"].append(code.strip().strip("'"))
            drugs[row[0]] = drug

            for ing in drug["ingredients"]:
                in_to_drug[ing].append(drug)
                if drug["tty"] == "PIN":
                    in_to_pin[ing].append(drug["rx"])

    rx_to_ndc = dd(list)
    for rx in drugs:
        for ndc in drugs[rx]["ndc"]:
            rx_to_ndc[rx].append(ndc)

    for ing in in_to_drug:
        for drug in in_to_drug[ing]:
            for ndc in drug["ndc"]:
                rx_to_ndc[ing].append(ndc)

    for ing in in_to_pin:
        for pin in in_to_pin[ing]:
            for ndc in rx_to_ndc[ing]:
                rx_to_ndc[pin].append(ndc)

    ndc_to_rx = dd(list)
    for rx in rx_to_ndc:
        for ndc in rx_to_ndc[rx]:
            ndc_to_rx[ndc].append(rx)

    dump(rx_to_ndc, "rx_to_ndc.pkl")
    dump(ndc_to_rx, "ndc_to_rx.pkl")
Example #12
    def fit(self, train_set):
        # Count how often each target code co-occurs with each source code.
        freq = dd(lambda: dd(int))
        for pair in train_set:
            for t0 in pair[0]:
                for t1 in pair[1]:
                    freq[t0][t1] += 1
        for tk in freq:
            sorted_freq = sorted(freq[tk].items(),
                                 key=lambda x: x[1],
                                 reverse=True)
            self.freq[tk] = sorted_freq

        dump(dict(self.freq), self.data + "_freq.pkl")
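fit() stores, for every source code, its co-occurring target codes sorted by descending frequency; the matching predict() is not shown in the source. A plausible sketch, assuming the constructor's first argument (the 3 in MostFreqMatch(3, "mimic") below) is stored as self.k, the number of top targets taken per source code:

    def predict(self, source_codes):
        # Hypothetical counterpart to fit(): union the k most frequent
        # co-occurring target codes over all known source codes.
        prediction = set()
        for code in source_codes:
            if code in self.freq:
                for target, _count in self.freq[code][:self.k]:
                    prediction.add(target)
        return list(prediction)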
Example #13
def train_mimic():
    train_set = load("mimic_encounter_gpi.train.pkl")
    test_set = load("mimic_encounter_gpi.dev.pkl")
    mfm = MostFreqMatch(3, "mimic")
    mfm.fit(train_set)
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "mimic_result_freq.pkl")
Example #14
def get_icd_to_gpi_map():
    icd_to_ndc = load("icd_to_ndc.pkl")
    ndc_to_gpi = load("ndc_to_gpi_6.pkl")
    icd_to_gpi = dd(set)
    for icd in icd_to_ndc:
        for ndc in icd_to_ndc[icd]:
            if ndc in ndc_to_gpi:
                icd_to_gpi[icd].add(ndc_to_gpi[ndc])
    gpi_to_icd = dd(set)
    for icd in icd_to_gpi:
        for gpi in icd_to_gpi[icd]:
            gpi_to_icd[gpi].add(icd)

    dump((dict(icd_to_gpi), dict(gpi_to_icd)), "icd_gpi_map.pkl")
Example #15
def eval_sutter():
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    test_set = load("sutter_encounter.dev.pkl")
    rb = RuleBased(1)
    rb.load()
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = rb.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "sutter_result_rule.pkl")
Example #16
def gen_vocab(encounters, level):
    diag_vocab = {}
    drug_vocab = {}
    cnt1 = 0
    cnt2 = 0
    for p in encounters:
        for diag in p[0]:
            if diag not in diag_vocab:
                diag_vocab[diag] = cnt1
                cnt1 += 1
        for drug in p[1]:
            if drug[:level] not in drug_vocab:
                drug_vocab[drug[:level]] = cnt2
                cnt2 += 1
    dump(diag_vocab, "mimic_diag_vocab.pkl")
    dump(drug_vocab, "mimic_drug_vocab_%s.pkl" % level)
Example #17
def join_prescription(meds, medications):
    prescriptions = []
    for eid in meds:
        pres = []
        for med in meds[eid]:
            if med[-1] not in medications:
                continue
            med_detail = medications[med[-1]]
            pres.append(
                tuple(
                    list(med) + [
                        med_detail[2], med_detail[3], med_detail[6],
                        med_detail[7]
                    ]))
        prescriptions.append(pres)
    dump(prescriptions, "prescriptions.pkl")
Example #18
def join_encounters(valid_encounters, meds, medications):
    pairs = []
    for eid in valid_encounters:
        pairs.append((valid_encounters[eid], meds[eid]))
    diag_pres = []
    for p in pairs:
        pres = []
        for med in p[1]:
            med_detail = medications[med[-1]]
            pres.append(
                tuple(
                    list(med) + [
                        med_detail[2], med_detail[3], med_detail[6],
                        med_detail[7]
                    ]))
        diag_pres.append((p[0], pres))
    dump(diag_pres, "diagnosis_prescription_pairs.pkl")
Example #19
def train():
    import numpy as np
    from sklearn import metrics
    level = 2
    data = "mimic"
    # input_vocab = load("sutter_diag_vocab.pkl")
    # output_vocab = load("sutter_drug_vocab_%s.pkl" % level)
    input_vocab = load("%s_diag_vocab.pkl" % (data))
    output_vocab = load("%s_drug_vocab_%s.pkl" % (data, level))
    train_encounters = load("%s_encounters_%s.train.pkl" % (data, level))
    test_encounters = load("%s_encounters_%s.test.pkl" % (data, level))

    mfm = MostFreqMatch(3, "mimic")
    mfm.fit(train_encounters)

    results = []
    prediction_list = []
    truth_list = []
    for item in test_encounters:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "sutter_result_freq_%s.pkl" % level)
Example #20
def eval_freq():
    level = 2
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_%s.pkl" % level)
    test_set = load("sutter_encounters.test_%s.pkl" % level)
    mfm = MostFreqMatch(1)
    mfm.load("sutter_freq.pkl")
    results = []
    prediction_list = []
    truth_list = []
    for item in test_set:
        prediction = mfm.predict(item[0])
        prediction_list.append(prediction)
        truth_list.append(item[1])
        results.append((item[0], item[1], prediction))
    dump(results, "sutter_result_freq_%s.pkl" % level)

    evaluator = Evaluator()
    evaluator.eval(mfm)
    evaluator.eval_golden(mfm)
Example #21
def get_encounter_level(encounters, level, sorted_diag_rank):
    new_encounters = []
    for enc in encounters:
        inputs = []
        outputs = []
        for code in enc[0]:
            if len(code) > 0:
                inputs.append(code.replace(".", ""))
        for code in enc[1]:
            if len(code) > 0:
                outputs.append(code[:level])
        new_encounters.append((inputs, outputs))
    new_encounters_clean = clean_encounters(new_encounters, sorted_diag_rank)
    print(len(new_encounters_clean), len(new_encounters))
    dump(new_encounters_clean, "mimic_encounters_%s.pkl" % level)
    dump(new_encounters_clean[:int(len(new_encounters_clean) * 0.8)],
         "mimic_encounters_%s.train.pkl" % level)
    dump(new_encounters_clean[int(len(new_encounters_clean) * 0.8):],
         "mimic_encounters_%s.test.pkl" % level)
    gen_vocab(new_encounters_clean, level)
Example #22
results = []
source = set()
truth = set()
# the file repeats 3-line records: "S: ..." (source), "T: ..." (truth), "Gen: ..." (generated)
for cnt, line in enumerate(open("mimic_unsort_seq2seq.h256.txt")):
    if cnt % 3 == 0:
        source = set(line.strip().split("S: ")[1].split(" "))
    elif cnt % 3 == 1:
        parts = line.strip().split("T: ")
        truth = set(parts[1].split(" ")) if len(parts) > 1 else set()
    elif cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((source, truth, result))
dump(results, "mimic_unsort_result_seq2seq.pkl")
Example #23
results = []
source = set()
truth = set()
for cnt, line in enumerate(open("sutter_sorted_seq2seq.h256.txt")):
    if cnt % 3 == 0:
        source = set(line.strip().split("S: ")[1].split(" "))
    elif cnt % 3 == 1:
        parts = line.strip().split("T: ")
        truth = set(parts[1].split(" ")) if len(parts) > 1 else set()
    elif cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((source, truth, result))
dump(results, "sutter_sorted_result_seq2seq.pkl")
Example #24
    def do_train(self):
        p = self.processor
        config = self.config

        min_measure = 1e6
        print("start train")

        for epoch in range(self.config.max_epoch):
            for step, (source_inputs, target_inputs, target_outputs,
                       source_mask_inputs, target_mask_inputs,
                       refs) in enumerate(p.gen_batch(p.train_data)):
                self.train_fn(source_inputs, target_inputs, target_outputs,
                              source_mask_inputs, target_mask_inputs)
                if step % config.print_loss_per == 0:
                    train_loss = self.comp_loss(p.train_data)
                    dev_loss = self.comp_loss(p.dev_data)
                    if dev_loss < min_measure:
                        min_measure = dev_loss
                        self.save_params(config.saved_model_file + "_%s_%s" %
                                         (epoch, int(step / 1000) * 1000))
                    print('epoch', epoch, 'step', step)
                    print('train', train_loss, 'dev', dev_loss, 'min',
                          min_measure)
                    self.do_eval()
                if step % 500 == 0:
                    out_file = '%s_%s_%s_seq2seq_e%s_s%s.txt' % (
                        config.name, config.level, config.order, epoch, step)
                    self.do_eval(training=False,
                                 filename=out_file,
                                 max_batch=10000)
                    results = []
                    source = []
                    truth = []
                    # parse the freshly written 3-line eval records
                    for cnt, line in enumerate(open(out_file)):
                        if cnt % 3 == 0:
                            source = list(
                                set(line.strip().split("S: ")[1].split(" ")))
                        elif cnt % 3 == 1:
                            parts = line.strip().split("T: ")
                            truth = (list(set(parts[1].split(" ")))
                                     if len(parts) > 1 else [])
                        elif cnt % 3 == 2:
                            result = set(line.strip().split("Gen: ")[1].split(
                                "END")[0].strip().split(" "))
                            result.discard('')
                            if len(truth) > 0:
                                results.append((source, truth, result))
                    input_list, truth_list, prediction_list = eval_utils.get_results(
                        results)
                    jaccard = eval_utils.get_average_jaccard(
                        truth_list, prediction_list)
                    acc = eval_utils.get_average_accuracy(
                        truth_list, prediction_list)
                    print("jaccard", jaccard, "acc", acc)
                    dump(
                        results,
                        "%s_%s_%s_result_seq2seq_e%s_s%s_jacc%s_acc%s.pkl" %
                        (config.name, config.level, config.order, epoch, step,
                         round(jaccard, 5), round(acc, 5)))
Example #25
def clean_mimic(ndc_to_gpi):
    mimic_data = load("mimic_episodes.pkl")
    clean_mimic_data = []
    for d in mimic_data:
        drugs = []
        for drug in d[1]:
            if len(drug) != 11:  # keep only 11-digit NDC codes
                continue
            drugs.append(drug)
        clean_mimic_data.append((d[0], drugs))
    random.shuffle(clean_mimic_data)
    mimic_train = clean_mimic_data[:49000]
    mimic_dev = clean_mimic_data[49000:]
    dump(mimic_train, "mimic_encounter.train.pkl")
    dump(mimic_dev, "mimic_encounter.dev.pkl")

    mimic_data_gpi = []
    for d in clean_mimic_data:
        drugs = []
        for drug in d[1]:
            if drug in ndc_to_gpi:
                drugs.append(ndc_to_gpi[drug])
            else:
                print(drug)
        mimic_data_gpi.append((d[0], list(set(drugs))))
    dump(mimic_data_gpi, "mimic_encounter_gpi.pkl")
    dump(mimic_data_gpi[:49000], "mimic_encounter_gpi.train.pkl")
    dump(mimic_data_gpi[49000:], "mimic_encounter_gpi.dev.pkl")

    mimic_diag_vocab = {}
    mimic_drug_vocab = {}
    for d in mimic_data_gpi:
        for diag in d[0]:
            if diag not in mimic_diag_vocab:
                mimic_diag_vocab[diag] = len(mimic_diag_vocab)
        for drug in d[1]:
            if drug not in mimic_drug_vocab:
                mimic_drug_vocab[drug] = len(mimic_drug_vocab)
    dump(mimic_diag_vocab, "mimic_diag_vocab.pkl")
    dump(mimic_drug_vocab, "mimic_drug_vocab.pkl")
Example #26
results = []
source = set()
truth = set()
for cnt, line in enumerate(open('sutter_%s_%s_seq2seq.txt' % (config.level, config.order))):
    if cnt % 3 == 0:
        source = set(line.strip().split("S: ")[1].split(" "))
    elif cnt % 3 == 1:
        parts = line.strip().split("T: ")
        truth = set(parts[1].split(" ")) if len(parts) > 1 else set()
    elif cnt % 3 == 2:
        result = set(line.strip().split("Gen: ")[1].replace("END", "").strip().split(" "))
        if len(truth) > 0:
            results.append((source, truth, result))
dump(results, "sutter_%s_%s_result_seq2seq.pkl" % (config.level, config.order))
Example #27
def test():
    input_vocab = load("sutter_diag_vocab.pkl")
    output_vocab = load("sutter_drug_vocab_3.pkl")
    test_encounters = load("sutter_encounter.dev.pkl")
    test_set = []
    for enc in test_encounters:
        test_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    mlp = load_model("sutter_mlp_model.h5")

    input_dim = len(input_vocab)
    output_dim = len(output_vocab)
    test_x = np.zeros((len(test_set), input_dim))
    test_y = np.zeros((len(test_set), output_dim))

    for i, pair in enumerate(test_set):
        for j in pair[0]:
            test_x[i, j] = 1
        for j in pair[1]:
            test_y[i, j] = 1

    index_to_source = {}
    index_to_target = {}
    for token in input_vocab:
        index_to_source[input_vocab[token]] = token
    for token in output_vocab:
        index_to_target[output_vocab[token]] = token

    labels, probs = mlp.predict(test_x)
    for i in range(1, 10):
        threshold = float(i) / 500.0
        results = (probs >= threshold).astype(int)
        # mean per-sample Jaccard between predicted and true drug sets
        jaccard = metrics.jaccard_score(test_y, results, average='samples')
        print(threshold, jaccard)

    results = (probs >= 0.012).astype(int)
    cnts, indices = results.nonzero()
    jaccard = metrics.jaccard_score(test_y, results, average='samples')
    zero_one = metrics.zero_one_loss(test_y, results)


    outputs = [[] for i in range(len(test_set))]
    for i, cnt in enumerate(cnts):
        outputs[cnt].append(index_to_target[indices[i]])

    merge = []
    for i, item in enumerate(outputs):
        print(test_encounters[i][0])
        print(test_encounters[i][1])
        print(outputs[i])
        print("")

        merge.append(list(test_encounters[i]) + [outputs[i]])

    from utils.data import dump
    dump(merge, "sutter_result_mlp_0.012.pkl")

    truth_list = []
    prediction_list = []
    for enc in merge:
        truth_list.append(enc[1])
        prediction_list.append(enc[2])
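The function stops after collecting truth_list and prediction_list. A minimal sketch of the average-Jaccard evaluation these lists appear destined for, mirroring the eval_utils.get_average_jaccard call in Example #24 (whose implementation is not shown in the source):

def average_jaccard(truth_list, prediction_list):
    # Mean |T ∩ P| / |T ∪ P| across encounters; two empty sets score 1.0.
    scores = []
    for truth, pred in zip(truth_list, prediction_list):
        t, p = set(truth), set(pred)
        scores.append(len(t & p) / float(len(t | p)) if t | p else 1.0)
    return sum(scores) / len(scores)

print(average_jaccard(truth_list, prediction_list))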
Example #28
def train():
    level = 6
    data = "mimic"
    input_vocab = load("%s_diag_vocab.pkl" % data)
    output_vocab = load("%s_drug_vocab_%s.pkl" % (data, level))
    train_encounters = load("%s_encounters_%s.train.pkl" % (data, level))
    test_encounters = load("%s_encounters_%s.test.pkl" % (data, level))
    test_set = []
    train_set = []
    for enc in train_encounters:
        train_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    for enc in test_encounters:
        if len(enc[1]) == 0:
            continue
        test_set.append(([input_vocab[code] for code in enc[0]], [output_vocab[code] for code in enc[1]]))
    mlp = MLP(data=data, level=level)
    mlp.load_data(train_set, test_set[:5000], len(input_vocab), len(output_vocab))
    mlp.build_model()
    mlp.fit(20)

    input_dim = len(input_vocab)
    output_dim = len(output_vocab)
    test_x = np.zeros((len(test_set), input_dim))
    test_y = np.zeros((len(test_set), output_dim))

    for i, pair in enumerate(test_set):
        for j in pair[0]:
            test_x[i, j] = 1
        if len(pair[1]) == 0:
            print(i, pair)
        for j in pair[1]:
            test_y[i, j] = 1

    index_to_source = {}
    index_to_target = {}
    for token in input_vocab:
        index_to_source[input_vocab[token]] = token
    for token in output_vocab:
        index_to_target[output_vocab[token]] = token

    labels, rs = mlp.predict(test_x)
    auc = metrics.roc_auc_score(test_y, rs, average='micro')

    for i in range(1, 20):
        threshold = float(i) / 500.0
        results = (rs >= threshold).astype(int)
        jaccard = metrics.jaccard_score(test_y, results, average='samples')
        acc = metrics.accuracy_score(test_y, results)
        # report AUC alongside Jaccard and subset accuracy at each threshold
        print(threshold, round(auc, 4), round(jaccard, 4), round(acc, 4))

    results = (rs >= 0.012).astype(int)
    cnts, indices = results.nonzero()
    jaccard = metrics.jaccard_score(test_y, results, average='samples')
    zero_one = metrics.zero_one_loss(test_y, results)


    outputs = [[] for i in range(len(test_set))]
    for i, cnt in enumerate(cnts):
        outputs[cnt].append(index_to_target[indices[i]])

    merge = []
    for i, item in enumerate(outputs):
        print(test_encounters[i][0])
        print(test_encounters[i][1])
        print(outputs[i])
        print("")

        merge.append(list(test_encounters[i]) + [outputs[i]])

    from utils.data import dump
    dump(merge, "mimic_result_mlp_0.012.pkl")

    truth_list = []
    prediction_list = []
    for enc in merge:
        truth_list.append(enc[1])
        prediction_list.append(enc[2])