Example #1
def eval(model, data_eval, voc_size, epoch):
    eval_len = len(data_eval)
    # evaluate
    print('')
    model.eval()
    y_pred_prob = np.zeros((eval_len, voc_size[-1]))
    y_gt = y_pred_prob.copy()
    y_pred = y_pred_prob.copy()

    for step, input in enumerate(data_eval):
        pred_prob, target = model(input)
        pred_prob = torch.sigmoid(pred_prob[0]).detach().cpu().numpy()
        target = target[0].detach().cpu().numpy()

        pred = pred_prob.copy()
        pred[pred >= 0.35] = 1
        pred[pred < 0.35] = 0
        y_pred[step, :] = pred
        y_pred_prob[step, :] = pred_prob
        y_gt[step, :] = target

        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    js, auc, p_1, p_3, p_5, f1, auprc = metric(y_gt, y_pred, y_pred_prob)
    llprint(
        '\tJS: %.4f, AUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f, AUPRC: %.4f\n'
        % (js, auc, p_1, p_3, p_5, f1, auprc))
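
These examples share scaffolding that the snippets do not show. A minimal sketch of the assumed imports plus `llprint`, reconstructed here from its usage (a carriage-return-friendly progress writer):

import sys

import numpy as np
import torch
import torch.nn.functional as F

def llprint(message):
    # write without a trailing newline so '\r' progress lines overwrite in place
    sys.stdout.write(message)
    sys.stdout.flush()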
Example #2
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()
    smm_record = []
    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    case_study = defaultdict(dict)
    med_cnt = 0
    visit_cnt = 0
    for step, input in enumerate(data_eval):
        if len(input) < 2:  # skip patients with fewer than two visits
            continue
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []
        for i in range(1, len(input)):

            y_pred_label_tmp = []
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[input[i][2]] = 1
            y_gt.append(y_gt_tmp)

            target_output1 = model(input[:i])

            target_output1 = torch.sigmoid(target_output1).detach().cpu().numpy()[0]
            y_pred_prob.append(target_output1)
            y_pred_tmp = target_output1.copy()
            y_pred_tmp[y_pred_tmp >= 0.3] = 1
            y_pred_tmp[y_pred_tmp < 0.3] = 0
            y_pred.append(y_pred_tmp)
            for idx, value in enumerate(y_pred_tmp):
                if value == 1:
                    y_pred_label_tmp.append(idx)
            y_pred_label.append(y_pred_label_tmp)
            med_cnt += len(y_pred_label_tmp)
            visit_cnt += 1

        smm_record.append(y_pred_label)
        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = multi_label_metric(np.array(y_gt), np.array(y_pred),
                                                                                   np.array(y_pred_prob))
        case_study[adm_ja] = {'ja': adm_ja, 'patient': input, 'y_label': y_pred_label}
        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rEval--Epoch: %d, Step: %d/%d' % (epoch, step, len(data_eval)))

    dill.dump(case_study, open(os.path.join('saved', model_name, 'case_study.pkl'), 'wb'))
    # ddi rate
    ddi_rate = ddi_rate_score(smm_record)

    llprint('\tDDI Rate: %.4f, Jaccard: %.4f,  PRAUC: %.4f, AVG_PRC: %.4f, AVG_RECALL: %.4f, AVG_F1: %.4f\n' % (
        ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(avg_r), np.mean(avg_f1)
    ))
    print('avg med', med_cnt / visit_cnt)

    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(avg_r), np.mean(avg_f1)
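
The hard 0.3 cut-off above is how per-drug sigmoid probabilities become a medication set; `np.where` yields the same label list in one call. A minimal sketch of the pattern, assuming a 1-D probability vector:

import numpy as np

def probs_to_labels(pred_prob, threshold=0.3):
    """Binarize sigmoid outputs and list the indices predicted positive."""
    y_pred = (pred_prob >= threshold).astype(float)
    return y_pred, np.where(y_pred == 1)[0]

y_pred, labels = probs_to_labels(np.array([0.1, 0.7, 0.35]))
# y_pred -> [0., 1., 1.]; labels -> [1, 2]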
Example #3
def eval(model, data_eval, voc_size, epoch):
    model.eval()

    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    smm_record = []
    med_cnt, visit_cnt = 0, 0

    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []

        for adm_index, adm in enumerate(input):
            output_logits = model(adm)

            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            # prediction prob
            output_logits = output_logits.detach().cpu().numpy()

            # prediction med set
            out_list, sorted_predict = sequence_output_process(
                output_logits, [voc_size[2], voc_size[2] + 1])
            y_pred_label.append(sorted(sorted_predict))
            y_pred_prob.append(np.mean(output_logits[:, :-2], axis=0))

            # prediction label
            y_pred_tmp = np.zeros(voc_size[2])
            y_pred_tmp[out_list] = 1
            y_pred.append(y_pred_tmp)
            visit_cnt += 1
            med_cnt += len(sorted_predict)

        smm_record.append(y_pred_label)

        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = \
                sequence_metric(np.array(y_gt), np.array(y_pred), np.array(y_pred_prob), np.array(y_pred_label))
        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rtest step: {} / {}'.format(step, len(data_eval)))

    # ddi rate
    ddi_rate = ddi_rate_score(smm_record,
                              path='../data/output/ddi_A_final.pkl')

    llprint(
        '\nDDI Rate: {:.4}, Jaccard: {:.4},  PRAUC: {:.4}, AVG_PRC: {:.4}, AVG_RECALL: {:.4}, AVG_F1: {:.4}, AVG_MED: {:.4}\n'
        .format(ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p),
                np.mean(avg_r), np.mean(avg_f1), med_cnt / visit_cnt))

    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(
        avg_r), np.mean(avg_f1), med_cnt / visit_cnt
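
`sequence_output_process` is an external helper; judging only from its call sites (per-step logits in, a de-duplicated label list plus a decode-ordered list out), a plausible greedy-decode sketch follows. This is an assumption about its behavior, not the project's actual implementation:

import numpy as np

def sequence_output_process(output_logits, filter_token):
    # output_logits: (seq_len, vocab); filter_token: special ids (e.g. START/END)
    ranked = np.argsort(output_logits, axis=-1)[:, ::-1]  # per step, best first
    out_list, sorted_predict = [], []
    for step_ranking in ranked:
        for label in step_ranking:
            if label in filter_token:
                break  # a special token ends this step's decode
            if label not in out_list:  # keep the first occurrence of each med
                out_list.append(int(label))
                sorted_predict.append(int(label))
                break
    return out_list, sorted_predict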
Example #4
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()
    smm_record = []
    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []
        input1_hidden, input2_hidden, target_hidden = None, None, None
        prev_target = None
        for adm_idx, adm in enumerate(input):

            target_output1, [input1_hidden,
                             input2_hidden, target_hidden] = model(
                                 adm, prev_target,
                                 [input1_hidden, input2_hidden, target_hidden])
            prev_target = adm[2]

            y_pred_label_tmp = []
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            target_output1 = torch.sigmoid(
                target_output1).detach().cpu().numpy()[0]
            y_pred_prob.append(target_output1)
            y_pred_tmp = target_output1.copy()
            y_pred_tmp[y_pred_tmp >= 0.5] = 1
            y_pred_tmp[y_pred_tmp < 0.5] = 0
            y_pred.append(y_pred_tmp)
            for idx, value in enumerate(y_pred_tmp):
                if value == 1:
                    y_pred_label_tmp.append(idx)
            y_pred_label.append(y_pred_label_tmp)
        smm_record.append(y_pred_label)
        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = multi_label_metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))
        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))
    # ddi rate
    ddi_rate = ddi_rate_score(smm_record)

    llprint(
        '\tDDI Rate: %.4f, Jaccard: %.4f,  PRAUC: %.4f, AVG_PRC: %.4f, AVG_RECALL: %.4f, AVG_F1: %.4f\n'
        % (ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p),
           np.mean(avg_r), np.mean(avg_f1)))

    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(
        avg_r), np.mean(avg_f1)
Example #5
def eval(model, data_eval, voc_size, epoch):
    model.eval()

    smm_record = []
    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    med_cnt, visit_cnt = 0, 0

    for step, input in enumerate(data_eval):
        y_gt, y_pred, y_pred_prob, y_pred_label = [], [], [], []

        if len(input) < 2: continue  # skip single-visit patients
        for i in range(1, len(input)):
            target_output = model(input[:i])

            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[input[i][2]] = 1
            y_gt.append(y_gt_tmp)

            # prediction prob
            target_output = torch.sigmoid(target_output).detach().cpu().numpy()[0]
            y_pred_prob.append(target_output)

            # prediction med set
            y_pred_tmp = target_output.copy()
            y_pred_tmp[y_pred_tmp >= 0.4] = 1
            y_pred_tmp[y_pred_tmp < 0.4] = 0
            y_pred.append(y_pred_tmp)

            # prediction label
            y_pred_label_tmp = np.where(y_pred_tmp == 1)[0]
            y_pred_label.append(y_pred_label_tmp)
            med_cnt += len(y_pred_label_tmp)
            visit_cnt += 1

        smm_record.append(y_pred_label)
        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 =\
                multi_label_metric(np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))

        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rtest step: {} / {}'.format(step, len(data_eval)))

    # ddi rate
    ddi_rate = ddi_rate_score(smm_record,
                              path='../data/output/ddi_A_final.pkl')

    llprint(
        '\nDDI Rate: {:.4}, Jaccard: {:.4},  PRAUC: {:.4}, AVG_PRC: {:.4}, AVG_RECALL: {:.4}, AVG_F1: {:.4}, AVG_MED: {:.4}\n'
        .format(ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p),
                np.mean(avg_r), np.mean(avg_f1), med_cnt / visit_cnt))

    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(
        avg_r), np.mean(avg_f1), med_cnt / visit_cnt
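
The Jaccard score reported here compares, visit by visit, the predicted medication set against the ground truth. A minimal sketch of that one component of `multi_label_metric`, under the same multi-hot matrix inputs:

import numpy as np

def jaccard_score_rows(y_gt, y_pred):
    """Mean Jaccard over visits; each row is a multi-hot medication vector."""
    scores = []
    for gt_row, pred_row in zip(y_gt, y_pred):
        target = set(np.where(gt_row == 1)[0])
        out = set(np.where(pred_row == 1)[0])
        union = target | out
        scores.append(len(target & out) / len(union) if union else 0.0)
    return float(np.mean(scores))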
Example #6
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()

    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    records = []
    med_cnt, visit_cnt = 0, 0
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []
        i1_state, i2_state, i3_state = None, None, None
        for adm in input:
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            output_logits, i1_state, i2_state, i3_state = model(
                adm, i1_state, i2_state, i3_state)
            output_logits = output_logits.detach().cpu().numpy()

            out_list, sorted_predict = sequence_output_process(
                output_logits, [voc_size[2], voc_size[2] + 1])

            y_pred_label.append(sorted_predict)
            y_pred_prob.append(np.mean(output_logits[:, :-2], axis=0))

            y_pred_tmp = np.zeros(voc_size[2])
            y_pred_tmp[out_list] = 1
            y_pred.append(y_pred_tmp)
            visit_cnt += 1
            med_cnt += len(sorted_predict)  # count predicted meds, not the vocab size
        records.append(y_pred_label)

        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = sequence_metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob),
            np.array(y_pred_label))
        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)

        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    # ddi rate
    ddi_rate = ddi_rate_score(records)
    llprint(
        '\tDDI Rate: %.4f, Jaccard: %.4f,  PRAUC: %.4f, AVG_PRC: %.4f, AVG_RECALL: %.4f, AVG_F1: %.4f, AVG_Med: %.4f\n'
        % (ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p),
           np.mean(avg_r), np.mean(avg_f1), med_cnt / visit_cnt))
    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(
        avg_r), np.mean(avg_f1)
Example #7
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()
    smm_record = []
    auc, p_1, p_3, p_5, f1, prauc = [[] for _ in range(6)]
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []
        input1_hidden, input2_hidden, target_hidden = None, None, None
        for adm in input:
            y_pred_label_tmp = []
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            target_output1, output_logits, output_labels, [
                input1_hidden, input2_hidden, target_hidden
            ] = model(adm, [input1_hidden, input2_hidden, target_hidden])

            target_output1 = torch.sigmoid(
                target_output1).detach().cpu().numpy()[0]
            a = np.argsort(target_output1)[::-1]  # unused; leftover debug value
            b = np.max(output_logits, axis=-1)  # unused; leftover debug value
            y_pred_prob.append(target_output1)
            y_pred_tmp = target_output1.copy()
            y_pred_tmp[y_pred_tmp >= 0.5] = 1
            y_pred_tmp[y_pred_tmp < 0.5] = 0
            y_pred.append(y_pred_tmp)
            for idx, value in enumerate(y_pred_tmp):
                if value == 1:
                    y_pred_label_tmp.append(idx)
            y_pred_label.append(y_pred_label_tmp)
        smm_record.append(y_pred_label)
        adm_auc, adm_p_1, adm_p_3, adm_p_5, adm_f1, adm_prauc = multi_label_metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))
        auc.append(adm_auc)
        p_1.append(adm_p_1)
        p_3.append(adm_p_3)
        p_5.append(adm_p_5)
        f1.append(adm_f1)
        prauc.append(adm_prauc)
        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    llprint(
        '\tAUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f, PRAUC: %.4f\n' %
        (np.mean(auc), np.mean(p_1), np.mean(p_3), np.mean(p_5), np.mean(f1),
         np.mean(prauc)))
    dill.dump(obj=smm_record, file=open('../data/smm_records.pkl', 'wb'))
Example #8
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()

    auc, p_1, p_3, p_5, f1 = [[] for _ in range(5)]
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        i1_state, i2_state, i3_state = None, None, None
        for adm in input:
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            output_logits, i1_state, i2_state, i3_state = model(
                adm, i1_state, i2_state, i3_state)
            a = torch.argmax(output_logits, dim=-1)  # unused; leftover debug value
            y_pred_prob_tmp = torch.mean(output_logits,
                                         dim=0).detach().cpu().numpy()[:-2]
            y_pred_prob.append(y_pred_prob_tmp)  # remove start and end token

            # target_output1 = F.sigmoid(target_output1).detach().cpu().numpy()[0]
            # a = np.argsort(target_output1)[::-1]
            # b = np.max(output_logits,axis=-1)
            # y_pred_prob.append(target_output1)
            y_pred_tmp = y_pred_prob_tmp.copy()
            y_pred_tmp[y_pred_tmp >= 0.3] = 1
            y_pred_tmp[y_pred_tmp < 0.3] = 0
            y_pred.append(y_pred_tmp)

        adm_auc, adm_p_1, adm_p_3, adm_p_5, adm_f1 = metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))
        auc.append(adm_auc)
        p_1.append(adm_p_1)
        p_3.append(adm_p_3)
        p_5.append(adm_p_5)
        f1.append(adm_f1)
        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    llprint(
        '\tAUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f\n' %
        (np.mean(auc), np.mean(p_1), np.mean(p_3), np.mean(p_5), np.mean(f1)))
Example #9
def eval(model, data_eval, voc_size, epoch):
    eval_len = len(data_eval)
    # evaluate
    print('')
    model.eval()
    y_pred_prob = np.zeros((eval_len, voc_size[-1]))
    y_gt = y_pred_prob.copy()
    y_pred = y_pred_prob.copy()

    for step, input in enumerate(data_eval):
        pre_outputs, pre_labels, last_outputs, last_labels = model(input)
        last_outputs = F.softmax(last_outputs, dim=-1)
        last_v, last_arg = torch.max(last_outputs, dim=-1)
        last_v = last_v.detach().cpu().numpy()
        last_arg = last_arg.detach().cpu().numpy()

        def filter_other_token(x):
            # keep only ids inside the medication vocabulary (drop special tokens)
            return x[1] < voc_size[-1]

        try:
            last_v, last_arg = zip(
                *filter(filter_other_token, zip(last_v, last_arg)))
        except ValueError:  # nothing survived the filter; predict no meds
            last_v, last_arg = [], []

        last_v, last_arg = list(last_v), list(last_arg)
        target = last_labels.detach().cpu().numpy()[:-1]  # remove end token

        pred_prob = np.zeros(voc_size[-1])
        pred_prob[last_arg] = last_v
        pred = pred_prob.copy()
        pred[last_arg] = 1
        y_pred[step, :] = pred
        y_pred_prob[step, :] = pred_prob
        y_gt[step, target] = 1

        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    js, auc, p_1, p_3, p_5, f1, auprc = metric(y_gt, y_pred, y_pred_prob)
    llprint(
        '\tJS: %.4f, AUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f, AUPRC: %.4f\n'
        % (js, auc, p_1, p_3, p_5, f1, auprc))
Example #10
def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()

    auc, p_1, p_3, p_5, f1 = [[] for _ in range(5)]
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        input1_hidden, input2_hidden, target_hidden = None, None, None
        for adm in input:
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            output2_logits, output2_labels = model.seq_evaluate(target_hidden,
                                                                max_len=15)
            target_output1, _, [
                input1_hidden, input2_hidden, target_hidden
            ] = model(adm, [input1_hidden, input2_hidden, target_hidden])

            target_output1 = torch.sigmoid(
                target_output1).detach().cpu().numpy()[0][0]
            y_pred_prob.append(target_output1)
            y_pred_tmp = target_output1.copy()
            y_pred_tmp[y_pred_tmp >= 0.5] = 1
            y_pred_tmp[y_pred_tmp < 0.5] = 0
            y_pred.append(y_pred_tmp)
            print('adm')
        adm_auc, adm_p_1, adm_p_3, adm_p_5, adm_f1 = metric(
            y_gt, y_pred, y_pred_prob)
        auc.append(adm_auc)
        p_1.append(adm_p_1)
        p_3.append(adm_p_3)
        p_5.append(adm_p_5)
        f1.append(adm_f1)
        llprint('\rEval--Epoch: %d, Step: %d/%d' %
                (epoch, step, len(data_eval)))

    llprint(
        '\tAUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f\n' %
        (np.mean(auc), np.mean(p_1), np.mean(p_3), np.mean(p_5), np.mean(f1)))
Example #11
def predictSuffixAndTimeForPrefix(args, model, preprocess_manager, current,
                                  predict, ground_truth):
    in_time = 1

    start = len(ground_truth["prefix_event"])

    for i in range(start, predict["size"]):

        input_vec, num_features_all, num_features_activities = preprocess_manager.encode_test_set_add(
            args, current["line"], current["times"], current["times3"],
            current["line_add"], args.batch_size_test)

        Y = model.predict(input_vec, verbose=0)
        y_event = Y[0][0]
        y_time = Y[1][0][0]
        prediction = preprocess_manager.getSymbol(y_event)

        # update of prefix (event)
        current["line"] += prediction
        predict["predicted"] += prediction

        # update of prefix for suffix prediction (time + context)
        if prediction == '!':
            y_time = 0
            # note times2 will be automatically calculated based on times in the method "encode_test_set_add()"
            current["times"].append(y_time)
            current["times3"].append(current["times3"][-1] +
                                     timedelta(seconds=y_time))
            predict["suffix_time"] = predict["suffix_time"] + y_time
            current["line_add"].append(current["line_add"][0])
        else:
            # update of prefix for suffix prediction (time + context)
            if y_time < 0:
                y_time = 0
            y_time = y_time * preprocess_manager.divisor3

            current["times"].append(y_time)
            current["times3"].append(current["times3"][-1] +
                                     timedelta(seconds=y_time))
            predict["suffix_time"] = predict["suffix_time"] + y_time
            current["line_add"].append(current["line_add"][0])

        # termination; predict size = max sequence length - 1
        if prediction == '!' or len(current["line"]) == predict["size"]:
            util.llprint('\n! termination suffix prediction ... \n')
            break

        util.llprint("Prefix+Suffix-Time-%s: %f" %
                     (i, ground_truth["prefix_time"] + predict["suffix_time"]))
        util.llprint("Baseline-%s: %f" %
                     (i, preprocess_manager.avg_time_training_cases))

    if ground_truth["prefix_time"] + predict[
            "suffix_time"] > preprocess_manager.avg_time_training_cases:
        in_time = 0
    deviation_in_time = (ground_truth["prefix_time"] + predict["suffix_time"]
                         ) / preprocess_manager.avg_time_training_cases

    return predict, in_time, deviation_in_time
Example #12
def roc_auc_dnc(y_gt, y_prob):
    # assumes `from sklearn.metrics import roc_auc_score`
    all_micro = []
    for b in range(len(y_gt)):
        all_micro.append(roc_auc_score(y_gt[b], y_prob[b], average='micro'))
    return np.mean(all_micro)

def precision_at_k_v2(y_gt, y_prob, k=3):
    precision = 0
    sort_index = np.argsort(y_prob, axis=-1)[:, ::-1][:, :k]
    for i in range(len(y_gt)):
        TP = 0
        for j in range(len(sort_index[i])):
            if y_gt[i, sort_index[i, j]] == 1:
                TP += 1
        precision += TP / len(sort_index[i])
    return precision / len(y_gt)


def metric(y_eval, y_pred, y_pred_prob):

    auc = roc_auc_dnc(y_eval, y_pred_prob)
    p_1 = precision_at_k_v2(y_eval, y_pred_prob, k=1)
    p_3 = precision_at_k_v2(y_eval, y_pred_prob, k=3)
    p_5 = precision_at_k_v2(y_eval, y_pred_prob, k=5)
    f1 = f1_dnc(y_eval, y_pred)

    return auc, p_1, p_3, p_5, f1
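
A quick check of `precision_at_k_v2` on a toy batch (using the corrected helper above): it keeps the k highest-probability predictions per row and counts how many are true positives.

y_gt = np.array([[1, 0, 1, 0],
                 [0, 1, 0, 0]])
y_prob = np.array([[0.9, 0.2, 0.6, 0.1],
                   [0.3, 0.8, 0.1, 0.4]])
print(precision_at_k_v2(y_gt, y_prob, k=2))
# row 1: both top-2 hits (1.0); row 2: one of two (0.5) -> mean 0.75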

def eval(model, data_eval, voc_size, epoch):
    # evaluate
    print('')
    model.eval()

    auc, p_1, p_3, p_5, f1 = [[] for _ in range(5)]
    for step, input in enumerate(data_eval):
        y_gt = []
        y_pred = []
        y_pred_prob = []
        input1_hidden, input2_hidden, target_hidden = None, None, None
        for adm in input:
            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            output2_logits, output2_labels = model.seq_evaluate(target_hidden, max_len=15)
            target_output1, _, [input1_hidden, input2_hidden, target_hidden] = model(adm, [input1_hidden, input2_hidden, target_hidden])

            target_output1 = torch.sigmoid(target_output1).detach().cpu().numpy()[0][0]
            y_pred_prob.append(target_output1)
            y_pred_tmp = target_output1.copy()
            y_pred_tmp[y_pred_tmp >= 0.5] = 1
            y_pred_tmp[y_pred_tmp < 0.5] = 0
            y_pred.append(y_pred_tmp)
            print('adm')
        adm_auc, adm_p_1, adm_p_3, adm_p_5, adm_f1 = metric(y_gt, y_pred, y_pred_prob)
        auc.append(adm_auc)
        p_1.append(adm_p_1)
        p_3.append(adm_p_3)
        p_5.append(adm_p_5)
        f1.append(adm_f1)
        llprint('\rEval--Epoch: %d, Step: %d/%d' % (epoch, step, len(data_eval)))

    llprint('\tAUC: %.4f, P1: %.4f, P3: %.4f, P5: %.4f, F1: %.4f\n' % (
        np.mean(auc), np.mean(p_1), np.mean(p_3), np.mean(p_5), np.mean(f1)
    ))


def main():
    if not os.path.exists(os.path.join("saved", model_name)):
        os.makedirs(os.path.join("saved", model_name))

    data_path = '../data/records.pkl'
    voc_path = '../data/voc.pkl'
    device = torch.device('cuda:0')

    data = dill.load(open(data_path, 'rb'))
    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc['med_voc']

    split_point = int(len(data) * 2 / 3)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_eval = data[split_point:split_point + eval_len]

    EPOCH = 30
    LR = 0.001
    EVAL = True

    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word), len(med_voc.idx2word))
    model = RNN_Two(voc_size, device=device)
    if EVAL:
        pass
        #model.load_state_dict(torch.load(open(os.path.join("saved", model_name, resume_name), 'rb')))
    model.to(device=device)

    criterion1 = nn.BCEWithLogitsLoss().to(device)
    criterion2 = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=LR)

    if EVAL:
        eval(model, data_eval, voc_size, 0)
    else:
        for epoch in range(EPOCH):
            loss_record1 = []
            loss_record2 = []
            start_time = time.time()
            model.train()
            for step, input in enumerate(data_train):
                input1_hidden, input2_hidden, target_hidden = None, None, None
                for adm in input:
                    loss1_target = np.zeros((1, voc_size[2]))
                    loss1_target[:, adm[2]] = 1
                    loss2_target = adm[2] + [voc_size[2]+1]

                    target_output1, target_output2, [input1_hidden, input2_hidden, target_hidden] = model(adm, [input1_hidden, input2_hidden, target_hidden])

                    # BCEWithLogitsLoss needs float targets; CrossEntropyLoss takes class indices
                    loss1 = criterion1(target_output1, torch.FloatTensor(loss1_target).to(device))
                    loss2 = criterion2(target_output2, torch.LongTensor(loss2_target).to(device))

                    optimizer.zero_grad()
                    # one backward pass for both heads; two separate backward()
                    # calls would need retain_graph on the shared encoder
                    (loss1 + loss2).backward()
                    optimizer.step()
                    loss_record1.append(loss1.item())
                    loss_record2.append(loss2.item())

                llprint('\rTrain--Epoch: %d, Step: %d/%d' % (epoch, step, len(data_train)))

            eval(model, data_eval, voc_size, epoch)

            end_time = time.time()
            elapsed_time = (end_time - start_time) / 60
            llprint('\tEpoch: %d, Loss1: %.4f, Loss2: %.4f One Epoch Time: %.2fm, Appro Left Time: %.2fh\n' % (epoch,
                                                                                                np.mean(loss_record1),
                                                                                                np.mean(loss_record2),
                                                                                                elapsed_time,
                                                                                                elapsed_time * (
                                                                                                            EPOCH - epoch - 1)/60))

            torch.save(model.state_dict(), open(os.path.join('saved', model_name, 'Epoch_%d_Loss1_%.4f.model' % (epoch, np.mean(loss_record1))), 'wb'))
            print('')

        # test
        torch.save(model.state_dict(), open(
            os.path.join('saved', model_name, 'final.model'), 'wb'))


if __name__ == '__main__':
    main()
Example #13
def calc_metrics(args):
    prefix = 0
    prefix_all_enabled = 1
    prediction = list()
    gt_label = list()

    result_dir = args.result_dir
    if not args.cross_validation:
        result_dir_fold = result_dir + args.data_set.split(
            ".csv")[0] + "__" + args.task + "_0.csv"
    else:
        result_dir_fold = result_dir + args.data_set.split(
            ".csv"
        )[0] + "__" + args.task + "_%d" % args.iteration_cross_validation + ".csv"

    with open(result_dir_fold, 'r') as csvfile:
        reader = csv.reader(csvfile,
                            delimiter=';',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)

        next(reader)

        for row in reader:
            if row == []:
                continue
            else:
                if int(row[1]) == prefix or prefix_all_enabled == 1:
                    gt_label.append(row[2])
                    prediction.append(row[3])

    util.llprint("\n\n")
    util.llprint("Metrics:\n")
    util.llprint("Accuracy: %f\n" %
                 sklearn.metrics.accuracy_score(gt_label, prediction))

    # calc metric for each label, and find their weighted mean
    util.llprint("Precision (weighted): %f\n" %
                 sklearn.metrics.precision_score(
                     gt_label, prediction, average='weighted'))
    util.llprint(
        "Recall (weighted): %f\n" %
        sklearn.metrics.recall_score(gt_label, prediction, average='weighted'))
    util.llprint(
        "F1-Score (weighted): %f\n" %
        sklearn.metrics.f1_score(gt_label, prediction, average='weighted'))

    # calc macro metric for each label, and find their unweighted mean
    util.llprint(
        "Precision (macro): %f\n" %
        sklearn.metrics.precision_score(gt_label, prediction, average='macro'))
    util.llprint(
        "Recall (macro): %f\n" %
        sklearn.metrics.recall_score(gt_label, prediction, average='macro'))
    util.llprint(
        "F1-Score (macro): %f\n" %
        sklearn.metrics.f1_score(gt_label, prediction, average='macro'))

    # calc micro metric over all examples
    util.llprint(
        "Precision (micro): %f\n" %
        sklearn.metrics.precision_score(gt_label, prediction, average='micro'))
    util.llprint(
        "Recall (micro): %f\n" %
        sklearn.metrics.recall_score(gt_label, prediction, average='micro'))
    util.llprint(
        "F1-Score (micro): %f\n\n" %
        sklearn.metrics.f1_score(gt_label, prediction, average='micro'))

    return sklearn.metrics.accuracy_score(
        gt_label, prediction), sklearn.metrics.precision_score(
            gt_label, prediction,
            average='weighted'), sklearn.metrics.recall_score(
                gt_label, prediction,
                average='weighted'), sklearn.metrics.f1_score(
                    gt_label, prediction, average='weighted')
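
The weighted, macro, and micro averages above differ only in how per-class scores are pooled. A small self-contained illustration with scikit-learn:

from sklearn.metrics import f1_score

gt = ['a', 'a', 'a', 'b']
pred = ['a', 'a', 'b', 'b']
print(f1_score(gt, pred, average='macro'))     # unweighted mean of per-class F1
print(f1_score(gt, pred, average='weighted'))  # per-class F1 weighted by support
print(f1_score(gt, pred, average='micro'))     # counts pooled over all examples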
Example #14
    criterion = CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=LR)

    for epoch in range(EPOCH):
        loss_record = []
        start_time = time.time()
        model.train()
        for step, input in enumerate(data_train):
            output, y_gt = model(input)
            loss = criterion(output, y_gt)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_record.append(loss.item())

            llprint('\rTrain--Epoch: %d, Step: %d/%d' %
                    (epoch, step, len(data_train)))

        # evaluate
        print('')
        model.eval()
        y_pred_prob = np.zeros((eval_len, voc_size[-1]))
        y_gt = y_pred_prob.copy()
        y_pred = y_pred_prob.copy()

        for step, input in enumerate(data_eval):
            pred, pred_prob, target = model(input)
            y_pred[step, :] = pred
            y_pred_prob[step, :] = pred_prob
            y_gt[step, :] = target
            llprint('\rEval--Epoch: %d, Step: %d/%d' %
                    (epoch, step, len(data_eval)))
Example #15
def test(args, preprocess_manager):
    batch_size = args.batch_size_test
    result_dir = args.result_dir
    task = args.task

    if preprocess_manager.num_features_additional > 0:
        lines, caseids, lines_add, sequence_max_length, num_features_all, num_features_activities = preprocess_manager.create_test_set(
        )
    else:
        lines, caseids, sequence_max_length, num_features_all, num_features_activities = preprocess_manager.create_test_set(
        )

    model = keras.models.load_model(
        '%smodel_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation))

    predict_size = 1
    data_set_name = args.data_set.split('.csv')[0]
    generic_result_dir = result_dir + data_set_name + "__" + task
    fold_result_dir = generic_result_dir + "_%d%s" % (
        preprocess_manager.iteration_cross_validation, ".csv")
    result_dir = fold_result_dir

    with open(result_dir, 'w') as csvfile:
        spamwriter = csv.writer(csvfile,
                                delimiter=';',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        spamwriter.writerow([
            "CaseID", "Prefix length", "Groud truth", "Predicted",
            "Levenshtein", "Damerau", "Jaccard"
        ])

        for prefix_size in range(2, sequence_max_length):
            util.llprint("\nPrefix size: %d\n" % prefix_size)

            # if additional attributes exists
            if preprocess_manager.num_features_additional > 0:

                for line, caseid, line_add in zip(lines, caseids, lines_add):

                    cropped_line = ''.join(line[:prefix_size])
                    cropped_line_add = line_add[:prefix_size]

                    if '!' in cropped_line:
                        continue

                    ground_truth = ''.join(line[prefix_size:prefix_size +
                                                predict_size])
                    predicted = ''

                    for i in range(predict_size):

                        if len(ground_truth) <= i:
                            continue

                        input_vec, num_features_all, num_features_activities = preprocess_manager.encode_test_set_add(
                            args, cropped_line, cropped_line_add, batch_size)
                        y = model.predict(input_vec, verbose=0)
                        y_char = y[0][:]
                        prediction = preprocess_manager.getSymbol(y_char)
                        cropped_line += prediction
                        predicted += prediction

                        if prediction == '!':
                            print('! predicted, end case')
                            break

                    output = []
                    if len(ground_truth) > 0:

                        output.append(caseid)
                        output.append(prefix_size)
                        output.append(str(ground_truth).encode("utf-8"))
                        output.append(str(predicted).encode("utf-8"))
                        output.append(
                            1 - distance.nlevenshtein(predicted, ground_truth))

                        dls = 1 - (damerau_levenshtein_distance(
                            str(predicted), str(ground_truth)) /
                                   max(len(predicted), len(ground_truth)))
                        if dls < 0:
                            dls = 0
                        output.append(dls)
                        output.append(
                            1 - distance.jaccard(predicted, ground_truth))
                        spamwriter.writerow(output)

            # if no additional attributes exists
            else:
                for line, caseid in zip(lines, caseids):

                    cropped_line = ''.join(line[:prefix_size])

                    if '!' in cropped_line:
                        continue

                    ground_truth = ''.join(line[prefix_size:prefix_size +
                                                predict_size])
                    predicted = ''

                    for i in range(predict_size):

                        if len(ground_truth) <= i:
                            continue

                        input_vec = preprocess_manager.encode_test_set(
                            cropped_line, batch_size)
                        y = model.predict(input_vec, verbose=0)
                        y_char = y[0][:]
                        prediction = preprocess_manager.getSymbol(y_char)
                        cropped_line += prediction
                        predicted += prediction

                        if prediction == '!':
                            print('! predicted, end case')
                            break

                    output = []
                    if len(ground_truth) > 0:

                        output.append(caseid)
                        output.append(prefix_size)
                        output.append(str(ground_truth).encode("utf-8"))
                        output.append(str(predicted).encode("utf-8"))
                        output.append(
                            1 - distance.nlevenshtein(predicted, ground_truth))

                        dls = 1 - (damerau_levenshtein_distance(
                            str(predicted), str(ground_truth)) /
                                   max(len(predicted), len(ground_truth)))
                        if dls < 0:
                            dls = 0
                        output.append(dls)
                        output.append(
                            1 - distance.jaccard(predicted, ground_truth))
                        spamwriter.writerow(output)
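
Each result row above carries three similarities normalized to [0, 1]. A minimal sketch of those computations, assuming the `distance` package and jellyfish's Damerau-Levenshtein implementation (the snippet's own imports are not shown):

import distance
from jellyfish import damerau_levenshtein_distance

predicted, ground_truth = 'abdc', 'abcd'
lev = 1 - distance.nlevenshtein(predicted, ground_truth)
dls = 1 - damerau_levenshtein_distance(predicted, ground_truth) \
        / max(len(predicted), len(ground_truth))
jac = 1 - distance.jaccard(predicted, ground_truth)
print(lev, max(dls, 0), jac)  # dls is clamped at 0, as in the loop above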
Example #16
def fine_tune(fine_tune_name=''):

    # load data
    data_path = '../data/output/records_final.pkl'
    voc_path = '../data/output/voc_final.pkl'
    device = torch.device('cpu:0')

    data = dill.load(open(data_path, 'rb'))
    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']
    ddi_A = dill.load(open('../data/output/ddi_A_final.pkl', 'rb'))

    split_point = int(len(data) * 2 / 3)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_test = data[split_point:split_point + eval_len]
    # data_eval = data[split_point+eval_len:]
    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))

    model = Leap(voc_size, device=device)
    model.load_state_dict(
        torch.load(
            open(os.path.join("saved", args.model_name, fine_tune_name),
                 'rb')))
    model.to(device)

    END_TOKEN = voc_size[2] + 1

    optimizer = Adam(model.parameters(), lr=args.lr)
    ddi_rate_record = []

    EPOCH = 100
    for epoch in range(EPOCH):
        loss_record = []
        start_time = time.time()
        random_train_set = [
            random.choice(data_train) for i in range(len(data_train))
        ]
        for step, input in enumerate(random_train_set):
            model.train()
            K_flag = False
            for adm in input:
                target = adm[2]
                output_logits = model(adm)
                out_list, sorted_predict = sequence_output_process(
                    output_logits.detach().cpu().numpy(),
                    [voc_size[2], voc_size[2] + 1])

                inter = set(out_list) & set(target)
                union = set(out_list) | set(target)
                jaccard = 0 if len(union) == 0 else len(inter) / len(union)
                K = 0
                for i in out_list:
                    if K == 1:
                        K_flag = True
                        break
                    for j in out_list:
                        if ddi_A[i][j] == 1:
                            K = 1
                            break

                loss = -jaccard * K * torch.mean(
                    F.log_softmax(output_logits, dim=-1))
                loss_record.append(loss.item())
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            llprint('\rtraining step: {} / {}'.format(step,
                                                      len(random_train_set)))

        if K_flag:
            print()
            ddi_rate, ja, prauc, avg_p, avg_r, avg_f1, avg_med = eval(
                model, data_test, voc_size, epoch)

    # test
    torch.save(
        model.state_dict(),
        open(os.path.join('saved', args.model_name, 'final.model'), 'wb'))
Example #17
def main():
    if not os.path.exists(os.path.join("saved", model_name)):
        os.makedirs(os.path.join("saved", model_name))

    data_path = '../data/records_final.pkl'
    voc_path = '../data/voc_final.pkl'

    ehr_adj_path = '../data/ehr_adj_final.pkl'
    ddi_adj_path = '../data/ddi_A_final.pkl'
    device = torch.device('cuda:0')

    ehr_adj = dill.load(open(ehr_adj_path, 'rb'))
    ddi_adj = dill.load(open(ddi_adj_path, 'rb'))
    data = dill.load(open(data_path, 'rb'))
    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']

    split_point = int(len(data) * 2 / 3)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_test = data[split_point:split_point + eval_len]
    data_eval = data[split_point + eval_len:]

    EPOCH = 40
    LR = 0.0002
    TEST = args.eval
    Neg_Loss = args.ddi
    DDI_IN_MEM = args.ddi
    TARGET_DDI = 0.05
    T = 0.5
    decay_weight = 0.85

    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))
    model = GAMENet(voc_size,
                    ehr_adj,
                    ddi_adj,
                    emb_dim=64,
                    device=device,
                    ddi_in_memory=DDI_IN_MEM)
    if TEST:
        model.load_state_dict(torch.load(open(resume_name, 'rb')))
    model.to(device=device)

    print('parameters', get_n_params(model))
    optimizer = Adam(list(model.parameters()), lr=LR)

    if TEST:
        eval(model, data_test, voc_size, 0)
    else:
        history = defaultdict(list)
        best_epoch = 0
        best_ja = 0
        for epoch in range(EPOCH):
            loss_record1 = []
            start_time = time.time()
            model.train()
            prediction_loss_cnt = 0
            neg_loss_cnt = 0
            for step, input in enumerate(data_train):
                for idx, adm in enumerate(input):
                    seq_input = input[:idx + 1]
                    loss1_target = np.zeros((1, voc_size[2]))
                    loss1_target[:, adm[2]] = 1
                    loss3_target = np.full((1, voc_size[2]), -1)
                    for i, item in enumerate(adm[2]):  # 'i', not 'idx': avoid shadowing the visit index
                        loss3_target[0][i] = item

                    target_output1, batch_neg_loss = model(seq_input)

                    loss1 = F.binary_cross_entropy_with_logits(
                        target_output1,
                        torch.FloatTensor(loss1_target).to(device))
                    loss3 = F.multilabel_margin_loss(
                        torch.sigmoid(target_output1),
                        torch.LongTensor(loss3_target).to(device))
                    if Neg_Loss:
                        target_output1 = torch.sigmoid(
                            target_output1).detach().cpu().numpy()[0]
                        target_output1[target_output1 >= 0.5] = 1
                        target_output1[target_output1 < 0.5] = 0
                        y_label = np.where(target_output1 == 1)[0]
                        current_ddi_rate = ddi_rate_score([[y_label]])
                        if current_ddi_rate <= TARGET_DDI:
                            loss = 0.9 * loss1 + 0.01 * loss3
                            prediction_loss_cnt += 1
                        else:
                            rnd = np.exp((TARGET_DDI - current_ddi_rate) / T)
                            if np.random.rand(1) < rnd:
                                loss = batch_neg_loss
                                neg_loss_cnt += 1
                            else:
                                loss = 0.9 * loss1 + 0.01 * loss3
                                prediction_loss_cnt += 1
                    else:
                        loss = 0.9 * loss1 + 0.01 * loss3

                    optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    optimizer.step()

                    loss_record1.append(loss.item())

                llprint(
                    '\rTrain--Epoch: %d, Step: %d/%d, L_p cnt: %d, L_neg cnt: %d'
                    % (epoch, step, len(data_train), prediction_loss_cnt,
                       neg_loss_cnt))
            # annealing
            T *= decay_weight

            ddi_rate, ja, prauc, avg_p, avg_r, avg_f1 = eval(
                model, data_eval, voc_size, epoch)

            history['ja'].append(ja)
            history['ddi_rate'].append(ddi_rate)
            history['avg_p'].append(avg_p)
            history['avg_r'].append(avg_r)
            history['avg_f1'].append(avg_f1)
            history['prauc'].append(prauc)

            end_time = time.time()
            elapsed_time = (end_time - start_time) / 60
            llprint(
                '\tEpoch: %d, Loss: %.4f, One Epoch Time: %.2fm, Appro Left Time: %.2fh\n'
                % (epoch, np.mean(loss_record1), elapsed_time, elapsed_time *
                   (EPOCH - epoch - 1) / 60))

            torch.save(
                model.state_dict(),
                open(
                    os.path.join(
                        'saved', model_name,
                        'Epoch_%d_JA_%.4f_DDI_%.4f.model' %
                        (epoch, ja, ddi_rate)), 'wb'))
            print('')
            if epoch != 0 and best_ja < ja:
                best_epoch = epoch
                best_ja = ja

        dill.dump(history,
                  open(os.path.join('saved', model_name, 'history.pkl'), 'wb'))

        # test
        torch.save(
            model.state_dict(),
            open(os.path.join('saved', model_name, 'final.model'), 'wb'))

        print('best_epoch:', best_epoch)
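
When the running DDI rate overshoots TARGET_DDI, the switch to the negative (DDI) loss is stochastic: it happens with probability exp((TARGET_DDI - current) / T), which shrinks as the overshoot grows and as T anneals. A small numeric sketch of that schedule:

import numpy as np

TARGET_DDI, T = 0.05, 0.5
for current in (0.06, 0.10, 0.20):
    p = np.exp((TARGET_DDI - current) / T)
    print('ddi=%.2f -> P(neg loss)=%.3f' % (current, p))
# larger overshoot (or a smaller T after annealing) -> lower switch probability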
Example #18
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    training_pairs = [tensorsFromPair(random.choice(train_pairs))
                      for i in range(n_iters)]
    criterion = nn.CrossEntropyLoss()
    history = defaultdict(list)
    for epoch in range(30):
        for iter in range(1, n_iters + 1):
            training_pair = training_pairs[iter - 1]
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = train(input_tensor, target_tensor, encoder,
                        decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            llprint('\rTrain--Epoch: %d, Step: %d/%d' % (epoch, iter, n_iters))

        print_loss_avg = print_loss_total / n_iters
        print_loss_total = 0

        #eval
        y_gt = []
        y_pred = []
        y_pred_prob = []
        y_pred_label = []
        for pair in eval_pairs:
            y_gt_tmp = np.zeros(len(med_voc.idx2word))
            y_gt_tmp[np.array(pair[1])[:-1]-2] = 1
            y_gt.append(y_gt_tmp)

            input_tensor, output_tensor = tensorsFromPair(pair)
            output_logits = evaluate(encoder, decoder, input_tensor)
            output_logits = F.softmax(output_logits, dim=-1)  # explicit dim; implicit dim is deprecated
            output_logits = output_logits.detach().cpu().numpy()
            out_list, sorted_predict = sequence_output_process(output_logits, [SOS_token, EOS_token])

            y_pred_label.append(np.array(sorted_predict)-2)
            y_pred_prob.append(np.mean(output_logits[:, 2:], axis=0))

            y_pred_tmp = np.zeros(len(med_voc.idx2word))
            if len(out_list) != 0:
                y_pred_tmp[np.array(out_list) - 2] = 1
            y_pred.append(y_pred_tmp)

        ja, prauc, avg_p, avg_r, avg_f1 = sequence_metric(np.array(y_gt), np.array(y_pred),
                                                        np.array(y_pred_prob),
                                                        np.array(y_pred_label))
        # ddi rate
        ddi_A = dill.load(open('../data/ddi_A_final.pkl', 'rb'))
        all_cnt = 0
        dd_cnt = 0
        for adm in y_pred_label:
            med_code_set = adm
            for i, med_i in enumerate(med_code_set):
                for j, med_j in enumerate(med_code_set):
                    if j <= i:
                        continue
                    all_cnt += 1
                    if ddi_A[med_i, med_j] == 1 or ddi_A[med_j, med_i] == 1:
                        dd_cnt += 1
        ddi_rate = dd_cnt / all_cnt

        history['ja'].append(ja)
        history['ddi_rate'].append(ddi_rate)
        history['avg_p'].append(avg_p)
        history['avg_r'].append(avg_r)
        history['avg_f1'].append(avg_f1)
        history['prauc'].append(prauc)
        llprint('\n\tDDI Rate: %.4f, Jaccard: %.4f,  PRAUC: %.4f, AVG_PRC: %.4f, AVG_RECALL: %.4f, AVG_F1: %.4f\n' % (
            ddi_rate, ja, prauc, avg_p, avg_r, avg_f1
        ))

        dill.dump(history, open(os.path.join('saved', model_name, 'history.pkl'), 'wb'))

        torch.save(encoder.state_dict(),
                   open(
                       os.path.join('saved', model_name, 'encoder_Epoch_%d_JA_%.4f_DDI_%.4f.model' % (epoch, ja, dd_cnt/all_cnt)),
                       'wb'))
        torch.save(decoder.state_dict(),
                   open(
                       os.path.join('saved', model_name, 'decoder_Epoch_%d_JA_%.4f_DDI_%.4f.model' % (epoch, ja, dd_cnt/all_cnt)),
                       'wb'))
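
The inline double loop above is the same pairwise computation that `ddi_rate_score` performs elsewhere. Factored out, assuming a square 0/1 interaction matrix:

import numpy as np

def ddi_rate(record, ddi_A):
    """Fraction of predicted medication pairs that interact, per the DDI matrix."""
    all_cnt, dd_cnt = 0, 0
    for med_code_set in record:
        for i, med_i in enumerate(med_code_set):
            for med_j in med_code_set[i + 1:]:
                all_cnt += 1
                if ddi_A[med_i, med_j] == 1 or ddi_A[med_j, med_i] == 1:
                    dd_cnt += 1
    return dd_cnt / all_cnt if all_cnt else 0.0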
Example #19
def main():

    # load data
    data_path = '../data/output/records_final.pkl'
    voc_path = '../data/output/voc_final.pkl'
    device = torch.device('cuda:{}'.format(args.cuda))

    data = dill.load(open(data_path, 'rb'))
    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']

    split_point = int(len(data) * 2 / 3)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_test = data[split_point:split_point + eval_len]
    data_eval = data[split_point + eval_len:]
    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))

    END_TOKEN = voc_size[2] + 1

    model = Leap(voc_size, device=device)
    # model.load_state_dict(torch.load(open(args.resume_path, 'rb')))

    if args.Test:
        model.load_state_dict(torch.load(open(args.resume_path, 'rb')))
        model.to(device=device)
        tic = time.time()
        result = []
        for _ in range(10):
            test_sample = np.random.choice(data_test,
                                           round(len(data_test) * 0.8),
                                           replace=True)
            ddi_rate, ja, prauc, avg_p, avg_r, avg_f1, avg_med = eval(
                model, test_sample, voc_size, 0)
            result.append([ddi_rate, ja, avg_f1, prauc, avg_med])

        result = np.array(result)
        mean = result.mean(axis=0)
        std = result.std(axis=0)

        outstring = ""
        for m, s in zip(mean, std):
            outstring += "{:.4f} $\pm$ {:.4f} & ".format(m, s)

        print(outstring)
        print('test time: {}'.format(time.time() - tic))
        return

    model.to(device=device)
    print('parameters', get_n_params(model))
    optimizer = Adam(model.parameters(), lr=args.lr)

    history = defaultdict(list)
    best_epoch, best_ja = 0, 0

    EPOCH = 50
    for epoch in range(EPOCH):
        tic = time.time()
        print('\nepoch {} --------------------------'.format(epoch + 1))

        model.train()
        for step, input in enumerate(data_train):
            for adm in input:

                loss_target = adm[2] + [END_TOKEN]
                output_logits = model(adm)
                loss = F.cross_entropy(
                    output_logits,
                    torch.LongTensor(loss_target).to(device))
                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            llprint('\rtraining step: {} / {}'.format(step, len(data_train)))

        print()
        tic2 = time.time()
        ddi_rate, ja, prauc, avg_p, avg_r, avg_f1, avg_med = eval(
            model, data_eval, voc_size, epoch)
        print('training time: {}, test time: {}'.format(
            time.time() - tic,
            time.time() - tic2))

        history['ja'].append(ja)
        history['ddi_rate'].append(ddi_rate)
        history['avg_p'].append(avg_p)
        history['avg_r'].append(avg_r)
        history['avg_f1'].append(avg_f1)
        history['prauc'].append(prauc)
        history['med'].append(avg_med)

        if epoch >= 5:
            print('ddi: {}, Med: {}, Ja: {}, F1: {}, PRAUC: {}'.format(
                np.mean(history['ddi_rate'][-5:]),
                np.mean(history['med'][-5:]), np.mean(history['ja'][-5:]),
                np.mean(history['avg_f1'][-5:]),
                np.mean(history['prauc'][-5:])))

        torch.save(model.state_dict(), open(os.path.join('saved', args.model_name, \
            'Epoch_{}_JA_{:.4}_DDI_{:.4}.model'.format(epoch, ja, ddi_rate)), 'wb'))

        if epoch != 0 and best_ja < ja:
            best_epoch = epoch
            best_ja = ja

        print('best_epoch: {}'.format(best_epoch))

        dill.dump(
            history,
            open(
                os.path.join('saved', args.model_name,
                             'history_{}.pkl'.format(args.model_name)), 'wb'))
Example #20
    if EVAL:
        eval(model, data_eval, voc_size, 0)
    else:
        for epoch in range(EPOCH):
            loss_record = []
            start_time = time.time()
            model.train()
            for step, input in enumerate(data_train):
                output, y_gt = model(input)
                loss = criterion(output, y_gt)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                loss_record.append(loss.item())

                llprint('\rTrain--Epoch: %d, Step: %d/%d' %
                        (epoch, step, len(data_train)))

            eval(model, data_eval, voc_size, epoch)

            end_time = time.time()
            elapsed_time = (end_time - start_time) / 60
            llprint(
                '\n\tEpoch: %d, Loss: %.4f, One Epoch Time: %.2fm, Appro Left Time: %.2fh\n'
                % (epoch, np.mean(loss_record), elapsed_time, elapsed_time *
                   (EPOCH - epoch - 1) / 60))

            torch.save(
                model.state_dict(),
                open(
                    os.path.join(
                        'saved', model_name, 'Epoch_%d_Loss_%.4f.model' %
Example #21
    def create_and_encode_training_set(self, args):

        lines = list()
        lines_add = list()

        for index, value in enumerate(
                self.train_index_per_fold[self.iteration_cross_validation]):
            lines.append(self.lines[value])
            if self.num_features_additional > 0:
                lines_add.append(self.features_additional_sequences[value])

        step = 1
        sentences = []
        softness = 0
        next_chars = []
        if self.num_features_additional > 0:
            sentences_add = []

        if self.num_features_additional > 0:
            for line, line_add in zip(lines, lines_add):
                for i in range(0, len(line), step):
                    if i == 0:
                        continue
                    sentences.append(line[0:i])
                    sentences_add.append(line_add[0:i])
                    next_chars.append(line[i])

            util.llprint("\nnb sequences: %d" % len(sentences))
            util.llprint("\nadd sequences: %d" % len(sentences_add))
        else:
            for t, line in enumerate(lines):
                for i in range(0, len(line), step):
                    if i == 0:
                        continue
                    sentences.append(line[0:i])
                    next_chars.append(line[i])

            util.llprint("\nnb sequences: %d" % len(sentences))

        util.llprint("\nnb Vectorization ...")

        X = np.zeros(
            (len(sentences), self.max_sequence_length, self.num_features_all),
            dtype=np.float64)
        Y = np.zeros((len(sentences), len(self.target_chars)),
                     dtype=np.float64)

        for i, sentence in enumerate(sentences):

            leftpad = self.max_sequence_length - len(sentence)
            if self.num_features_additional > 0:
                sentence_add = sentences_add[i]

            # set training set data
            for t, char in enumerate(sentence):
                for c in self.chars:
                    if c == char:
                        X[i, t + leftpad, self.char_indices[c]] = 1.0

                # add additional attributes
                if self.num_features_additional > 0:
                    for x in range(0, self.num_features_additional):
                        X[i, t + leftpad,
                          len(self.chars) + x] = sentence_add[t][x]

            # set training set label
            for c in self.target_chars:
                if c == next_chars[i]:
                    Y[i, self.target_char_indices[c]] = 1 - softness
                else:
                    Y[i, self.target_char_indices[c]] = softness / (
                        len(self.target_chars) - 1)

        num_features_all = self.num_features_all
        num_features_activities = self.num_features_activities

        return X, Y, self.max_sequence_length, num_features_all, num_features_activities
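To make the left-padded one-hot layout above concrete, here is a toy walk-through (the vocabulary and prefix are invented for this sketch):

import numpy as np

chars = ['a', 'b', 'c']
char_indices = {c: i for i, c in enumerate(chars)}
max_sequence_length = 5
sentence = 'ab'                       # one prefix of a case
leftpad = max_sequence_length - len(sentence)

x = np.zeros((max_sequence_length, len(chars)), dtype=np.float64)
for t, char in enumerate(sentence):
    x[t + leftpad, char_indices[char]] = 1.0
# rows 0-2 remain zero padding; rows 3 and 4 one-hot encode 'a' and 'b'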
Example No. 22
def main():

    # load data
    data_path = '../data/output/records_final.pkl'
    voc_path = '../data/output/voc_final.pkl'

    ddi_adj_path = '../data/output/ddi_A_final.pkl'
    device = torch.device('cuda:{}'.format(args.cuda))

    ddi_adj = dill.load(open(ddi_adj_path, 'rb'))
    data = dill.load(open(data_path, 'rb'))

    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']

    np.random.seed(1203)
    np.random.shuffle(data)

    split_point = int(len(data) * 3 / 5)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_test = data[split_point:split_point + eval_len]
    data_eval = data[split_point + eval_len:]

    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))
    print(voc_size)
    model = DualNN(voc_size, ddi_adj, emb_dim=args.dim, device=device)
    # model.load_state_dict(torch.load(open(args.resume_path, 'rb')))

    if args.Test:
        model.load_state_dict(torch.load(open(args.resume_path, 'rb')))
        model.to(device=device)
        tic = time.time()
        label_list, prob_add, prob_delete = eval(model, data_eval, voc_size, 0,
                                                 1)

        threshold1, threshold2 = [], []
        for i in range(label_list.shape[1]):
            _, _, boundary_add = roc_curve(label_list[:, i],
                                           prob_add[:, i],
                                           pos_label=1)
            _, _, boundary_delete = roc_curve(label_list[:, i],
                                              prob_delete[:, i],
                                              pos_label=0)
            threshold1.append(boundary_add[min(round(len(boundary_add) * 0.05),
                                               len(boundary_add) - 1)])
            threshold2.append(boundary_delete[min(
                round(len(boundary_delete) * 0.05),
                len(boundary_delete) - 1)])
        # threshold1 = np.ones(voc_size[2]) * np.mean(threshold1)
        # threshold2 = np.ones(voc_size[2]) * np.mean(threshold2)
        print(np.mean(threshold1), np.mean(threshold2))
        eval(model, data_test, voc_size, 0, 0, threshold1, threshold2)
        print('test time: {}'.format(time.time() - tic))

        return

    model.to(device=device)
    print('parameters', get_n_params(model))
    # exit()
    optimizer = RMSprop(list(model.parameters()),
                        lr=args.lr,
                        weight_decay=args.weight_decay)

    # start iterations
    history = defaultdict(list)
    best_epoch, best_ja = 0, 0

    EPOCH = 40
    for epoch in range(EPOCH):
        tic = time.time()
        print('\nepoch {} --------------------------'.format(epoch + 1))

        model.train()
        for step, input in enumerate(data_train):
            if len(input) < 2: continue
            loss = 0
            for adm_idx, adm in enumerate(input):
                if adm_idx == 0: continue

                seq_input = input[:adm_idx + 1]

                loss_bce_target = np.zeros((1, voc_size[2]))
                loss_bce_target[:, adm[2]] = 1

                loss_bce_target_last = np.zeros((1, voc_size[2]))
                loss_bce_target_last[:, input[adm_idx - 1][2]] = 1

                add_target = np.zeros((1, voc_size[2]))
                add_target[:, np.where(loss_bce_target == 1)[1]] = 1
                delete_target = np.zeros((1, voc_size[2]))
                delete_target[:, np.where(loss_bce_target == 0)[1]] = 1

                # np.where on the (1, voc_size) targets: index [1] selects the
                # column (medication) indices, not the all-zero row indices
                loss_multi_add_target = np.full((1, voc_size[2]), -1)
                for i, item in enumerate(np.where(add_target == 1)[1]):
                    loss_multi_add_target[0][i] = item

                loss_multi_delete_target = np.full((1, voc_size[2]), -1)
                for i, item in enumerate(np.where(delete_target == 1)[1]):
                    loss_multi_delete_target[0][i] = item

                add_result, delete_result = model(seq_input)

                loss_bce = F.binary_cross_entropy_with_logits(add_result, torch.FloatTensor(add_target).to(device)) + \
                    F.binary_cross_entropy_with_logits(delete_result, torch.FloatTensor(delete_target).to(device))
                loss_multi = F.multilabel_margin_loss(torch.sigmoid(add_result), torch.LongTensor(loss_multi_add_target).to(device)) + \
                    F.multilabel_margin_loss(torch.sigmoid(delete_result), torch.LongTensor(loss_multi_delete_target).to(device))

                # l2 = 0
                # for p in model.parameters():
                #     l2 = l2 + (p ** 2).sum()

                loss += 0.95 * loss_bce + 0.05 * loss_multi

            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()

            llprint('\rtraining step: {} / {}'.format(step, len(data_train)))

        print()
        tic2 = time.time()
        ddi_rate, ja, prauc, avg_p, avg_r, avg_f1, add, delete, avg_med = eval(
            model, data_eval, voc_size, epoch)
        print('training time: {}, test time: {}'.format(
            time.time() - tic,
            time.time() - tic2))

        history['ja'].append(ja)
        history['ddi_rate'].append(ddi_rate)
        history['avg_p'].append(avg_p)
        history['avg_r'].append(avg_r)
        history['avg_f1'].append(avg_f1)
        history['prauc'].append(prauc)
        history['add'].append(add)
        history['delete'].append(delete)
        history['med'].append(avg_med)

        if epoch >= 5:
            print(
                'ddi: {}, Med: {}, Ja: {}, F1: {}, Add: {}, Delete: {}'.format(
                    np.mean(history['ddi_rate'][-5:]),
                    np.mean(history['med'][-5:]), np.mean(history['ja'][-5:]),
                    np.mean(history['avg_f1'][-5:]),
                    np.mean(history['add'][-5:]),
                    np.mean(history['delete'][-5:])))

        torch.save(model.state_dict(), open(os.path.join('saved', args.model_name, \
            'Epoch_{}_JA_{:.4}_DDI_{:.4}.model'.format(epoch, ja, ddi_rate)), 'wb'))

        if epoch != 0 and best_ja < ja:
            best_epoch = epoch
            best_ja = ja

        print('best_epoch: {}'.format(best_epoch))

    dill.dump(
        history,
        open(
            os.path.join('saved', args.model_name,
                         'history_{}.pkl'.format(args.model_name)), 'wb'))
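get_n_params is called above but never shown in these excerpts; one common implementation (an assumption, not necessarily this project's exact helper) simply counts scalar parameters:

def get_n_params(model):
    # total number of scalar entries across all parameter tensors
    return sum(p.numel() for p in model.parameters())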
Example No. 23
def eval(model,
         data_eval,
         voc_size,
         epoch,
         val=0,
         threshold1=0.8,
         threshold2=0.2):
    model.eval()

    smm_record = []
    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    med_cnt, visit_cnt = 0, 0
    add_list, delete_list = [], []

    for step, input in enumerate(data_eval):
        y_gt, y_pred, y_pred_prob, y_pred_label = [], [], [], []
        add_temp_list, delete_temp_list = [], []
        if len(input) < 2: continue
        for adm_idx, adm in enumerate(input):
            if adm_idx == 0:
                y_old = np.zeros(voc_size[2])
                y_old[adm[2]] = 1
                continue

            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)

            result_out = model(input[:adm_idx + 1])
            # prediction prob
            y_pred_tmp = torch.sigmoid(
                result_out[:, 0]).detach().cpu().numpy().tolist()
            y_pred_prob.append(y_pred_tmp)

            previous_set = np.where(y_old == 1)[0]

            # prediction med set
            # result = F.sigmoid(result).detach().cpu().numpy()[0]
            assignment = torch.max(result_out, dim=1)[1].cpu().numpy()
            y_old[assignment == 1] = 1   # class 1: medication added
            y_old[assignment == 2] = 0   # class 2: medication deleted
            y_pred.append(y_old.copy())  # copy: y_old keeps mutating on later visits

            # prediction label
            y_pred_label_tmp = np.where(y_old == 1)[0]
            y_pred_label.append(sorted(y_pred_label_tmp))
            visit_cnt += 1
            med_cnt += len(y_pred_label_tmp)

            #### add or delete
            add_gt = set(np.where(y_gt_tmp == 1)[0]) - set(previous_set)
            delete_gt = set(previous_set) - set(np.where(y_gt_tmp == 1)[0])

            add_pre = set(np.where(y_old == 1)[0]) - set(previous_set)
            delete_pre = set(previous_set) - set(np.where(y_old == 1)[0])

            add_distance = len(set(add_pre) -
                               set(add_gt)) + len(set(add_gt) - set(add_pre))
            delete_distance = len(set(delete_pre) - set(delete_gt)) + len(
                set(delete_gt) - set(delete_pre))
            ####

            add_temp_list.append(add_distance)
            delete_temp_list.append(delete_distance)

        if len(add_temp_list) > 1:
            add_list.append(np.mean(add_temp_list))
            delete_list.append(np.mean(delete_temp_list))
        elif len(add_temp_list) == 1:
            add_list.append(add_temp_list[0])
            delete_list.append(delete_temp_list[0])

        smm_record.append(y_pred_label)
        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = multi_label_metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))

        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rtest step: {} / {}'.format(step, len(data_eval)))

    # ddi rate
    ddi_rate = ddi_rate_score(smm_record,
                              path='../data/output/ddi_A_final.pkl')

    llprint(
        '\nDDI Rate: {:.4}, Jaccard: {:.4},  AVG_F1: {:.4}, Add: {:.4}, Delete: {:.4}, AVG_MED: {:.4}\n'
        .format(ddi_rate, np.mean(ja), np.mean(avg_f1), np.mean(add_list),
                np.mean(delete_list), med_cnt / visit_cnt))

    return ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(
        avg_r), np.mean(avg_f1), np.mean(add_list), np.mean(
            delete_list), med_cnt / visit_cnt
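ddi_rate_score is also not reproduced here, but Example No. 30 at the end of this collection spells out the same pairwise computation inline; a sketch consistent with that logic (the default path is an assumption) is:

import dill

def ddi_rate_score(record, path='../data/output/ddi_A_final.pkl'):
    # record: one entry per patient, each a list of visits,
    # each visit a list of predicted medication indices
    ddi_A = dill.load(open(path, 'rb'))
    all_cnt, dd_cnt = 0, 0
    for patient in record:
        for visit in patient:
            for i, med_i in enumerate(visit):
                for j, med_j in enumerate(visit):
                    if j <= i:
                        continue
                    all_cnt += 1
                    if ddi_A[med_i, med_j] == 1 or ddi_A[med_j, med_i] == 1:
                        dd_cnt += 1
    return dd_cnt / all_cnt if all_cnt > 0 else 0.0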
Example No. 24
def main():

    # load data
    data_path = '../data/output/records_final.pkl'
    voc_path = '../data/output/voc_final.pkl'

    ddi_adj_path = '../data/output/ddi_A_final.pkl'
    device = torch.device('cuda:{}'.format(args.cuda))

    ddi_adj = dill.load(open(ddi_adj_path, 'rb'))
    data = dill.load(open(data_path, 'rb'))

    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']

    np.random.seed(1203)
    np.random.shuffle(data)

    split_point = int(len(data) * 3 / 5)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    data_test = data[split_point:split_point + eval_len]
    data_eval = data[split_point + eval_len:]

    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))
    print(voc_size)
    model = SimNN(voc_size, ddi_adj, emb_dim=args.dim, device=device)
    # model.load_state_dict(torch.load(open(args.resume_path, 'rb')))

    if args.Test:
        model.load_state_dict(torch.load(open(args.resume_path, 'rb')))
        model.to(device=device)
        tic = time.time()
        eval(model, data_test, voc_size, 0)
        print('test time: {}'.format(time.time() - tic))
        return

    model.to(device=device)
    print('parameters', get_n_params(model))
    # exit()
    optimizer = RMSprop(list(model.parameters()),
                        lr=args.lr,
                        weight_decay=args.weight_decay)

    # start iterations
    history = defaultdict(list)
    best_epoch, best_ja = 0, 0

    criterion = torch.nn.CrossEntropyLoss()
    EPOCH = 40
    for epoch in range(EPOCH):
        tic = time.time()
        print('\nepoch {} --------------------------'.format(epoch + 1))
        model.train()
        for step, input in enumerate(data_train):
            if len(input) < 2: continue
            loss = 0
            for adm_idx, adm in enumerate(input):
                if adm_idx == 0: continue

                seq_input = input[:adm_idx + 1]

                loss_bce_target = np.zeros((1, voc_size[2]))
                loss_bce_target[:, adm[2]] = 1
                loss_bce_target_last = np.zeros((1, voc_size[2]))
                loss_bce_target_last[:, input[adm_idx - 1][2]] = 1

                # per-medication change label: 0 = unchanged, 1 = newly added,
                # and the -1 (deleted) entries are remapped to class 2
                target_list = loss_bce_target - loss_bce_target_last
                target_list[target_list == -1] = 2
                result = model(seq_input)

                loss += criterion(result,
                                  torch.LongTensor(target_list[0]).to(device))

                # l2 = 0
                # for p in model.parameters():
                #     l2 = l2 + (p ** 2).sum()

            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()

            llprint('\rtraining step: {} / {}'.format(step, len(data_train)))

        print()
        tic2 = time.time()
        ddi_rate, ja, prauc, avg_p, avg_r, avg_f1, add, delete, avg_med = eval(
            model, data_eval, voc_size, epoch)
        print('training time: {}, test time: {}'.format(
            time.time() - tic,
            time.time() - tic2))

        history['ja'].append(ja)
        history['ddi_rate'].append(ddi_rate)
        history['avg_p'].append(avg_p)
        history['avg_r'].append(avg_r)
        history['avg_f1'].append(avg_f1)
        history['prauc'].append(prauc)
        history['add'].append(add)
        history['delete'].append(delete)
        history['med'].append(avg_med)

        if epoch >= 5:
            print(
                'ddi: {}, Med: {}, Ja: {}, F1: {}, Add: {}, Delete: {}'.format(
                    np.mean(history['ddi_rate'][-5:]),
                    np.mean(history['med'][-5:]), np.mean(history['ja'][-5:]),
                    np.mean(history['avg_f1'][-5:]),
                    np.mean(history['add'][-5:]),
                    np.mean(history['delete'][-5:])))

        torch.save(model.state_dict(), open(os.path.join('saved', args.model_name, \
            'Epoch_{}_JA_{:.4}_DDI_{:.4}.model'.format(epoch, ja, ddi_rate)), 'wb'))

        if epoch != 0 and best_ja < ja:
            best_epoch = epoch
            best_ja = ja

        print('best_epoch: {}'.format(best_epoch))

    dill.dump(
        history,
        open(
            os.path.join('saved', args.model_name,
                         'history_{}.pkl'.format(args.model_name)), 'wb'))
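multi_label_metric is imported from the project's utilities and not reproduced in these excerpts. A rough sketch matching how it is called, per-visit Jaccard, precision, recall, and F1 plus PRAUC via scikit-learn, is given below; treat it as an approximation of the real helper, and note it assumes every ground-truth visit contains at least one medication:

import numpy as np
from sklearn.metrics import average_precision_score

def multi_label_metric(y_gt, y_pred, y_prob):
    # y_gt, y_pred: (n_visits, n_meds) binary arrays; y_prob: matching scores
    ja, prc, rec, f1 = [], [], [], []
    for g, p in zip(y_gt, y_pred):
        gs, ps = set(np.where(g == 1)[0]), set(np.where(p == 1)[0])
        inter, union = len(gs & ps), len(gs | ps)
        ja.append(inter / union if union else 0.0)
        prc.append(inter / len(ps) if ps else 0.0)
        rec.append(inter / len(gs) if gs else 0.0)
        f1.append(2 * prc[-1] * rec[-1] / (prc[-1] + rec[-1])
                  if prc[-1] + rec[-1] > 0 else 0.0)
    prauc = np.mean([average_precision_score(g, s)
                     for g, s in zip(y_gt, y_prob)])
    return np.mean(ja), prauc, np.mean(prc), np.mean(rec), np.mean(f1)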
Example No. 25
def main():
    if not os.path.exists(os.path.join("saved", model_name)):
        os.makedirs(os.path.join("saved", model_name))

    data_path = '../data/records.pkl'
    voc_path = '../data/voc.pkl'
    ehr_adj_path = '../data/ehr_adj.pkl'
    ddi_adj_path = '../data/ddi_A.pkl'
    device = torch.device('cuda:0')

    ehr_adj = dill.load(open(ehr_adj_path, 'rb'))
    ddi_adj = dill.load(open(ddi_adj_path, 'rb'))
    data = dill.load(open(data_path, 'rb'))
    voc = dill.load(open(voc_path, 'rb'))
    diag_voc, pro_voc, med_voc = voc['diag_voc'], voc['pro_voc'], voc[
        'med_voc']

    split_point = int(len(data) * 2 / 3)
    data_train = data[:split_point]
    eval_len = int(len(data[split_point:]) / 2)
    # data_eval = data[split_point:split_point + eval_len]
    data_eval = data[split_point + eval_len:]

    EPOCH = 30
    LR = 0.001
    EVAL = True

    voc_size = (len(diag_voc.idx2word), len(pro_voc.idx2word),
                len(med_voc.idx2word))
    model = GMNN(voc_size, ehr_adj, ddi_adj, emb_dim=64, device=device)
    if EVAL:
        model.load_state_dict(
            torch.load(
                open(os.path.join("saved", model_name, resume_name), 'rb')))
    model.to(device=device)

    optimizer = Adam(list(model.parameters()), lr=LR)

    if EVAL:
        eval(model, data_eval, voc_size, 0)
    else:
        for epoch in range(EPOCH):
            loss_record1 = []
            loss_record2 = []
            start_time = time.time()
            model.train()
            for step, input in enumerate(data_train):
                input1_hidden, input2_hidden, target_hidden = None, None, None
                loss = 0
                for adm in input:
                    loss1_target = np.zeros((1, voc_size[2]))
                    loss1_target[:, adm[2]] = 1

                    loss2_target = adm[2] + [adm[2][0]]

                    loss3_target = np.full((1, voc_size[2]), -1)
                    for idx, item in enumerate(adm[2]):
                        loss3_target[0][idx] = item

                    target_output1, target_output2, [
                        input1_hidden, input2_hidden, target_hidden
                    ], batch_pos_loss, batch_neg_loss = model(
                        adm, [input1_hidden, input2_hidden, target_hidden])

                    loss1 = F.binary_cross_entropy_with_logits(
                        target_output1,
                        torch.FloatTensor(loss1_target).to(device))
                    loss2 = F.cross_entropy(
                        target_output2,
                        torch.LongTensor(loss2_target).to(device))

                    # loss = 9*loss1/10 + loss2/10
                    loss3 = F.multilabel_margin_loss(
                        torch.sigmoid(target_output1),
                        torch.LongTensor(loss3_target).to(device))
                    loss += loss1 + 0.1 * loss3 + 0.01 * batch_neg_loss

                    loss_record1.append(loss.item())
                    loss_record2.append(loss3.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                llprint('\rTrain--Epoch: %d, Step: %d/%d' %
                        (epoch, step, len(data_train)))

            eval(model, data_eval, voc_size, epoch)

            end_time = time.time()
            elapsed_time = (end_time - start_time) / 60
            llprint(
                '\tEpoch: %d, Loss1: %.4f, Loss2: %.4f, One Epoch Time: %.2fm, Appro Left Time: %.2fh\n'
                % (epoch, np.mean(loss_record1), np.mean(loss_record2),
                   elapsed_time, elapsed_time * (EPOCH - epoch - 1) / 60))

            torch.save(
                model.state_dict(),
                open(
                    os.path.join(
                        'saved', model_name, 'Epoch_%d_Loss1_%.4f.model' %
                        (epoch, np.mean(loss_record1))), 'wb'))
            print('')

        # test
        torch.save(
            model.state_dict(),
            open(os.path.join('saved', model_name, 'final.model'), 'wb'))
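This main() also reads module-level model_name and resume_name values that the excerpt never defines; a hypothetical setup (both values illustrative only) would be:

model_name = 'GMNN'            # hypothetical run identifier for the saved/ subfolder
resume_name = 'final.model'    # hypothetical checkpoint file loaded when EVAL is True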
Example No. 26
    def __init__(self, args):
        self.num_folds = args.num_folds
        self.data_dir = args.data_dir + args.data_set

        util.llprint("Create structures...")
        lines = []
        caseids = []
        lastcase = ''
        line = ''
        firstLine = True
        numlines = 0
        check_additional_features = True

        features_additional_attributes = []
        features_additional_events = []
        features_additional_sequences = []

        csvfile = open(self.data_dir, 'r')
        spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')

        next(spamreader, None)
        for row in spamreader:

            # initial setting of additional features
            if check_additional_features:
                if len(row) == self.num_attributes_standard:
                    # record the absence explicitly so later checks cannot fail
                    self.num_features_additional = 0
                    util.llprint("No additional attributes.\n")
                else:
                    self.num_features_additional = len(
                        row) - self.num_attributes_standard
                    util.llprint("Number of additional attributes: %d\n" %
                                 self.num_features_additional)
                check_additional_features = False

            if row[0] != lastcase:
                caseids.append(row[0])
                lastcase = row[0]
                if not firstLine:
                    lines.append(line)
                    if self.num_features_additional > 0:
                        features_additional_sequences.append(
                            features_additional_events)
                line = ''
                if self.num_features_additional > 0:
                    features_additional_events = []
                numlines += 1

            # get values of additional attributes
            if self.num_features_additional > 0:
                for index in range(
                        self.num_attributes_standard,
                        self.num_attributes_standard +
                        self.num_features_additional):
                    features_additional_attributes.append(row[index])
                features_additional_events.append(
                    features_additional_attributes)
                features_additional_attributes = []

            # add activity to a case
            line += chr(int(row[1]) + self.ascii_offset)
            firstLine = False

        csvfile.close()

        lines.append(line)
        if self.num_features_additional > 0:
            features_additional_sequences.append(features_additional_events)
        numlines += 1

        # get elements per fold in case of split evaluation
        util.llprint("Loading Data starts... \n")
        self.elems_per_fold = int(round(numlines / self.num_folds))

        util.llprint("Calc max length of sequence\n")
        lines = list(map(lambda x: x + '!', lines))
        self.max_sequence_length = max(map(lambda x: len(x), lines))
        util.llprint("Max length of sequence: %d\n" % self.max_sequence_length)

        util.llprint(
            "Begin calculation of total chars and total target chars... \n")
        self.chars = list(map(lambda x: set(x), lines))
        self.chars = list(set().union(*self.chars))
        self.chars.sort()
        self.target_chars = copy.copy(self.chars)
        self.chars.remove('!')
        util.llprint("Total chars: %d, target chars: %d\n" %
                     (len(self.chars), len(self.target_chars)))

        util.llprint("Beginn creation of dicts for char handling... \n")
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        self.target_char_indices = dict(
            (c, i) for i, c in enumerate(self.target_chars))
        self.target_indices_char = dict(
            (i, c) for i, c in enumerate(self.target_chars))
        util.llprint("Dics for char handling created\n")

        # set feature variables
        self.num_features_activities = len(self.chars)
        self.num_features_all = self.num_features_activities + self.num_features_additional

        # set structure variables
        self.lines = lines
        self.caseids = caseids
        if self.num_features_additional > 0:
            self.features_additional_sequences = features_additional_sequences

        # init validation
        self.train_index_per_fold = []
        self.test_index_per_fold = []
        kFold = KFold(n_splits=self.num_folds, random_state=0, shuffle=True)

        for train_index, test_index in kFold.split(lines):
            self.train_index_per_fold.append(train_index)
            self.test_index_per_fold.append(test_index)
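Note how the '!' end-of-case sentinel stays in target_chars but is removed from chars, so case termination can be predicted without ever appearing as an input feature; a toy illustration:

import copy

lines = ['ab!', 'ba!']                            # two encoded cases, sentinel appended
chars = sorted(set().union(*map(set, lines)))     # ['!', 'a', 'b']
target_chars = copy.copy(chars)
chars.remove('!')

char_indices = {c: i for i, c in enumerate(chars)}                 # inputs: 'a', 'b'
target_char_indices = {c: i for i, c in enumerate(target_chars)}   # outputs also cover '!'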
Example No. 27
def eval(model,
         data_eval,
         voc_size,
         epoch,
         val=0,
         threshold1=0.3,
         threshold2=0.3):
    model.eval()

    smm_record = []
    ja, prauc, avg_p, avg_r, avg_f1 = [[] for _ in range(5)]
    med_cnt, visit_cnt = 0, 0
    label_list, prob_add, prob_delete = [], [], []
    add_list, delete_list = [], []

    for step, input in enumerate(data_eval):
        y_gt, y_pred, y_pred_prob, y_pred_label = [], [], [], []
        add_temp_list, delete_temp_list = [], []

        if len(input) < 2: continue
        for adm_idx, adm in enumerate(input):
            if adm_idx == 0:
                y_old = np.zeros(voc_size[2])
                y_old[adm[2]] = 1
                continue

            y_gt_tmp = np.zeros(voc_size[2])
            y_gt_tmp[adm[2]] = 1
            y_gt.append(y_gt_tmp)
            label_list.append(y_gt_tmp)

            add_result, delete_result = model(input[:adm_idx + 1])
            # prediction prob
            y_pred_tmp_add = torch.sigmoid(add_result).detach().cpu().numpy()[0]
            y_pred_tmp_delete = torch.sigmoid(
                delete_result).detach().cpu().numpy()[0]
            y_pred_prob.append(y_pred_tmp_add)
            prob_add.append(y_pred_tmp_add)
            prob_delete.append(y_pred_tmp_delete)

            previous_set = np.where(y_old == 1)[0]

            # prediction med set (threshold2 gates additions, threshold1 gates deletions)
            y_old[y_pred_tmp_add >= threshold2] = 1
            y_old[y_pred_tmp_delete >= threshold1] = 0
            y_pred.append(y_old.copy())  # copy: y_old keeps mutating on later visits

            # prediction label
            y_pred_label_tmp = np.where(y_old == 1)[0]
            y_pred_label.append(sorted(y_pred_label_tmp))
            visit_cnt += 1
            med_cnt += len(y_pred_label_tmp)

            #### add or delete
            add_gt = set(np.where(y_gt_tmp == 1)[0]) - set(previous_set)
            delete_gt = set(previous_set) - set(np.where(y_gt_tmp == 1)[0])

            add_pre = set(np.where(y_old == 1)[0]) - set(previous_set)
            delete_pre = set(previous_set) - set(np.where(y_old == 1)[0])

            add_distance = len(set(add_pre) -
                               set(add_gt)) + len(set(add_gt) - set(add_pre))
            delete_distance = len(set(delete_pre) - set(delete_gt)) + len(
                set(delete_gt) - set(delete_pre))
            ####

            add_temp_list.append(add_distance)
            delete_temp_list.append(delete_distance)

        if len(add_temp_list) > 1:
            add_list.append(np.mean(add_temp_list))
            delete_list.append(np.mean(delete_temp_list))
        elif len(add_temp_list) == 1:
            add_list.append(add_temp_list[0])
            delete_list.append(delete_temp_list[0])

        smm_record.append(y_pred_label)
        adm_ja, adm_prauc, adm_avg_p, adm_avg_r, adm_avg_f1 = multi_label_metric(
            np.array(y_gt), np.array(y_pred), np.array(y_pred_prob))

        ja.append(adm_ja)
        prauc.append(adm_prauc)
        avg_p.append(adm_avg_p)
        avg_r.append(adm_avg_r)
        avg_f1.append(adm_avg_f1)
        llprint('\rtest step: {} / {}'.format(step, len(data_eval)))

    # ddi rate
    ddi_rate = ddi_rate_score(smm_record,
                              path='../data/output/ddi_A_final.pkl')

    # llprint('\nDDI Rate: {:.4}, Jaccard: {:.4},  PRAUC: {:.4}, AVG_PRC: {:.4}, AVG_RECALL: {:.4}, AVG_F1: {:.4}, Add: {:.4}, Delete; {:.4}, AVG_MED: {:.4}\n'.format(
    #     ddi_rate, np.mean(ja), np.mean(prauc), np.mean(avg_p), np.mean(avg_r), np.mean(avg_f1), np.mean(add_list), np.mean(delete_list), med_cnt / visit_cnt
    # ))
    # print ('-1-', ddi_rate, '-2-',  np.mean(ja), '-3-', np.mean(prauc), '-4-', np.mean(avg_f1), '-5-', np.mean(add_list), '-6-', np.mean(delete_list), '-7-', med_cnt / visit_cnt)
    llprint(
        '\nDDI Rate: {:.4}, Jaccard: {:.4},  AVG_F1: {:.4}, Add: {:.4}, Delete: {:.4}, AVG_MED: {:.4}\n'
        .format(float(ddi_rate), np.mean(ja), np.mean(avg_f1),
                np.mean(add_list), np.mean(delete_list), med_cnt / visit_cnt))
    if val == 0:
        return float(ddi_rate), np.mean(ja), np.mean(prauc), np.mean(
            avg_p), np.mean(avg_r), np.mean(avg_f1), np.mean(
                add_list), np.mean(delete_list), med_cnt / visit_cnt
    else:
        return np.array(label_list), np.array(prob_add), np.array(prob_delete)
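The add/delete distance computed above is just the size of the symmetric difference between the predicted and ground-truth change sets; a toy check with invented medication indices:

previous_set = {0, 3}
current_gt = {0, 5}           # drug 5 was added, drug 3 was deleted
current_pred = {0, 3, 5}      # the model adds 5 but misses the deletion of 3

add_gt, delete_gt = current_gt - previous_set, previous_set - current_gt
add_pre, delete_pre = current_pred - previous_set, previous_set - current_pred

add_distance = len(add_pre - add_gt) + len(add_gt - add_pre)                  # 0
delete_distance = len(delete_pre - delete_gt) + len(delete_gt - delete_pre)   # 1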
Example No. 28
def train(args, preprocess_manager):
    util.llprint("Loading Data starts... \n")
    X, y, sequence_max_length, num_features_all, num_features_activities = preprocess_manager.create_and_encode_training_set(
        args)
    util.llprint("Loading Data done!\n")

    print('Build model...')

    # LSTM
    if args.dnn_architecture == 0:
        main_input = keras.layers.Input(shape=(sequence_max_length,
                                               num_features_all),
                                        name='main_input')
        l1 = keras.layers.recurrent.LSTM(100,
                                         implementation=2,
                                         activation="tanh",
                                         kernel_initializer='glorot_uniform',
                                         return_sequences=False,
                                         dropout=0.2)(main_input)
        b1 = keras.layers.normalization.BatchNormalization()(l1)

    # GRU
    elif args.dnn_architecture == 1:
        main_input = keras.layers.Input(shape=(sequence_max_length,
                                               num_features_all),
                                        name='main_input')
        l1 = keras.layers.recurrent.GRU(100,
                                        implementation=2,
                                        activation="tanh",
                                        kernel_initializer='glorot_uniform',
                                        return_sequences=False,
                                        dropout=0.2)(main_input)
        b1 = keras.layers.normalization.BatchNormalization()(l1)

    # RNN
    elif args.dnn_architecture == 2:
        main_input = keras.layers.Input(shape=(sequence_max_length,
                                               num_features_all),
                                        name='main_input')
        l1 = keras.layers.recurrent.SimpleRNN(
            100,
            implementation=2,
            activation="tanh",
            kernel_initializer='glorot_uniform',
            return_sequences=False,
            dropout=0.2)(main_input)
        b1 = keras.layers.normalization.BatchNormalization()(l1)

    activity_output = keras.layers.Dense(
        num_features_activities + 1,
        activation='softmax',
        name='activity_output',
        kernel_initializer='glorot_uniform')(b1)
    model = keras.models.Model(inputs=[main_input], outputs=[activity_output])

    optimizer = keras.optimizers.Nadam(lr=args.learning_rate,
                                       beta_1=0.9,
                                       beta_2=0.999,
                                       epsilon=1e-8,
                                       schedule_decay=0.004,
                                       clipvalue=3)
    model.compile(loss={'activity_output': 'categorical_crossentropy'},
                  optimizer=optimizer)
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   patience=10)
    model_checkpoint = keras.callbacks.ModelCheckpoint(
        '%smodel_%s.h5' %
        (args.checkpoint_dir, preprocess_manager.iteration_cross_validation),
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto')
    lr_reducer = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                   factor=0.5,
                                                   patience=10,
                                                   verbose=0,
                                                   mode='auto',
                                                   min_delta=0.0001,
                                                   cooldown=0,
                                                   min_lr=0)
    model.summary()

    start_training_time = datetime.now()

    model.fit(X, {'activity_output': y},
              validation_split=1 / args.num_folds,
              callbacks=[early_stopping, model_checkpoint, lr_reducer],
              verbose=1,
              batch_size=args.batch_size_train,
              epochs=args.dnn_num_epochs)

    training_time = datetime.now() - start_training_time

    return training_time.total_seconds()
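train() pulls several hyperparameters off args without the excerpt ever building them; a hypothetical argparse setup covering exactly the attributes it touches (all defaults are assumptions) might be:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dnn_architecture', type=int, default=0)      # 0=LSTM, 1=GRU, 2=SimpleRNN
parser.add_argument('--learning_rate', type=float, default=0.002)
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints/')
parser.add_argument('--num_folds', type=int, default=10)
parser.add_argument('--batch_size_train', type=int, default=128)
parser.add_argument('--dnn_num_epochs', type=int, default=100)
args = parser.parse_args()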
Example No. 29
        # iterate folds
        for iteration_cross_validation in range(0, args.num_folds):
            preprocess_manager.iteration_cross_validation = iteration_cross_validation
            training_time_seconds.append(train.train(args, preprocess_manager))
            args.iteration_cross_validation = iteration_cross_validation
            test.test(args, preprocess_manager)
            accuracy_value, precision_value, recall_value, f1_value = metric.calc_metrics(args)
            accuracy_values.append(accuracy_value)
            precision_values.append(precision_value)
            recall_values.append(recall_value)
            f1_values.append(f1_value)

        # final output
        for index in range(0, len(accuracy_values)):
            util.llprint("Accuracy of fold %i: %f\n" % (index + 1, accuracy_values[index]))
            util.llprint("Precision of fold %i: %f\n" % (index + 1, precision_values[index]))
            util.llprint("Recall of fold %i: %f\n" % (index + 1, recall_values[index]))
            util.llprint("F1-Score of fold %i: %f\n" % (index + 1, f1_values[index]))
            util.llprint("Training time of fold %i: %f seconds\n" % (index + 1, training_time_seconds[index]))

        util.llprint(
            "Average accuracy %i-fold cross-validation: %f\n" % (args.num_folds, sum(accuracy_values) / args.num_folds))
        util.llprint("Average precision precision %i-fold cross-validation: %f\n" % (
            args.num_folds, sum(precision_values) / args.num_folds))
        util.llprint(
            "Average recall %i-fold cross-validation: %f\n" % (args.num_folds, sum(recall_values) / args.num_folds))
        util.llprint(
            "Average f1-score %i-fold cross-validation: %f\n" % (args.num_folds, sum(f1_values) / args.num_folds))
        util.llprint("Average training time in seconds %i-fold cross-validation: %f\n" % (
            args.num_folds, sum(training_time_seconds) / args.num_folds))
Example No. 30
            y_pred_tmp[np.array(out_list) - 2] = 1
        y_pred.append(y_pred_tmp)

    ja, prauc, avg_p, avg_r, avg_f1 = sequence_metric(np.array(y_gt), np.array(y_pred),
                                                    np.array(y_pred_prob),
                                                    np.array(y_pred_label))
    # ddi rate
    ddi_A = dill.load(open('../data/ddi_A_final.pkl', 'rb'))
    all_cnt = 0
    dd_cnt = 0
    for adm in y_pred_label:
        med_code_set = adm
        for i, med_i in enumerate(med_code_set):
            for j, med_j in enumerate(med_code_set):
                if j <= i:
                    continue
                all_cnt += 1
                if ddi_A[med_i, med_j] == 1 or ddi_A[med_j, med_i] == 1:
                    dd_cnt += 1
    ddi_rate = dd_cnt / all_cnt if all_cnt > 0 else 0.0
    llprint('\n\tDDI Rate: %.4f, Jaccard: %.4f,  PRAUC: %.4f, AVG_PRC: %.4f, AVG_RECALL: %.4f, AVG_F1: %.4f\n' % (
        ddi_rate, ja, prauc, avg_p, avg_r, avg_f1
    ))