Example 1
def train(**kwargs):
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)

    # init model
    model = GradientBoostingClassifier(n_estimators=100,
                                       learning_rate=0.1,
                                       verbose=1,
                                       n_iter_no_change=10,
                                       random_state=10)
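    # n_iter_no_change enables sklearn's built-in early stopping, which holds out
    # an internal validation_fraction of the training data, so the separately
    # loaded val_data is not used by this baseline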

    train_feat, train_label = train_data.get_feat_data()

    print("Start Training.")
    model.fit(train_feat, train_label)

    print("Training Finished.")

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_feat, test_label = test_data.get_feat_data()

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_feat,
                                                      test_label,
                                                      top_k_list=[3, 5, 10])

    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 2
def train(**kwargs):
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    model = TextCNN(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()

    print("Model Inited.")
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # disease labels are 1-indexed (1, 2, ..., 27), so shift them to
            # 0-based class ids for the multi-class cross-entropy loss
            loss = F.cross_entropy(pred, label - 1)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "textcnn")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 3
def train(**kwargs):
    w2v_model_name = "./ckpt/w2v"

    if os.path.exists(w2v_model_name):
        print("load word2vec model from", w2v_model_name)
        # load model directly
        w2v_model = Word2Vec.load(w2v_model_name)

    else:
        # load the training corpus: each line is "user_id disease_id symp_1 symp_2 ..."
        filename = "./dataset/EHR/train/data.txt"
        corpus = []
        with open(filename, "r") as fin:
            for line in fin:
                corpus.append(line.strip().split()[2:])

        # learn word2vec model
        start_time = time.time()
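        # note: this is the gensim 3.x API; gensim >= 4.0 renamed `size` to `vector_size`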
        w2v_model = Word2Vec(corpus,
                             size=64,
                             window=3,
                             min_count=1,
                             workers=4,
                             sg=1)
        w2v_model.save(w2v_model_name)
        print("training done, costs {} secs.".format(time.time() - start_time))

    # start training and testing the MLP model
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # let's build a MLP for prediction
    model_param["w2v_model"] = w2v_model
    model = MLP(**model_param)

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()

    print("Model Inited.")
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=model_param["weight_decay"])

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # disease labels are 1-indexed (1, 2, ..., 27), so shift them to
            # 0-based class ids for the multi-class cross-entropy loss
            loss = F.cross_entropy(pred, label - 1)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "med2vec")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
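
How the MLP consumes the gensim model handed over via model_param["w2v_model"] is not shown. A hedged sketch of the usual pattern of copying word2vec vectors into a torch embedding table (the helper name and padding convention are assumptions):

import torch
import torch.nn as nn

def build_symp_embedding(w2v_model, num_symp, dim=64):
    # row 0 is reserved for padding; rows 1..num_symp hold the learned vectors
    weight = torch.zeros(num_symp + 1, dim)
    for idx in range(1, num_symp + 1):
        key = str(idx)
        if key in w2v_model.wv:
            weight[idx] = torch.tensor(w2v_model.wv[key])
    return nn.Embedding.from_pretrained(weight, freeze=False, padding_idx=0)
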
Example 4
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # init model
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    gnn = HGNN_SDS(**model_param)
    if model_param["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(model_param["w2v"])

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    sds_sampler = SDS_sampler("dataset/EHR")

    # load the symptom-symptom PMI matrix and zero its diagonal so a symptom
    # is never picked as its own positive / hard negative
    symp2symp_mat = sp.load_npz(os.path.join("dataset/EHR", "pmi_ss_mat.npz"))
    symp2symp_mat.setdiag(0)

    # number of batches per epoch over all symptom nodes (symptom ids are 1-based)
    num_total_batch = gnn.num_symp // model_param["batch_size"]
    all_symp_index = np.arange(1, gnn.num_symp + 1)

    # the share of hard negatives ramps up linearly from 0 to hard_ratio
    # over the course of training
    def lambda_hard_r(epoch):
        return epoch * model_param["hard_ratio"] / model_param["num_epoch"]

    # build positive / hard-negative symptom maps; index 0 is a placeholder
    # because symptom ids start from 1
    symp2symp_hard_map = [0]
    symp2symp_pos_map = [0]
    for k in all_symp_index:
        symp2symp_b_ar = symp2symp_mat[k].toarray().flatten()
        max_index = np.argmax(symp2symp_b_ar)
        if max_index == 0:
            symp2symp_pos_map.append(np.random.randint(1, k))
            symp2symp_hard_map.append(np.random.randint(1, k))

        else:
            symp2symp_pos_map.append(max_index)
            symp2symp_b_ar[max_index] = -1
            max_2nd_index = np.argmax(symp2symp_b_ar)
            if max_2nd_index == 0:
                symp2symp_hard_map.append(np.random.randint(1, k))
            else:
                symp2symp_hard_map.append(max_2nd_index)

    symp2symp_hard_map = np.array(symp2symp_hard_map)
    symp2symp_pos_map = np.array(symp2symp_pos_map)
    print("Pos / Hard symptom map Inited.")

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=model_param["weight_decay"])
    last_total_loss = 1e10

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        np.random.shuffle(all_symp_index)

        hard_ratio = lambda_hard_r(epoch)

        for idx in range(num_total_batch):
            start = idx * model_param["batch_size"]
            batch_symp = all_symp_index[start:start + model_param["batch_size"]]

            # get pos symp and neg symp
            pos_symp = symp2symp_pos_map[batch_symp]

            # sample neg
            neg_symp = np.random.randint(1, gnn.num_symp,
                                         model_param["batch_size"])

            # cope with overlapping in pos and neg symps
            overlap_index = (neg_symp == pos_symp)
            overlap_symp = neg_symp[overlap_index]
            neg_symp[overlap_index] = symp2symp_hard_map[overlap_symp]

            if hard_ratio > 0:
                num_hard = int(hard_ratio * model_param["batch_size"])
                neg_symp[:num_hard] = symp2symp_hard_map[neg_symp[:num_hard]]

            batch_symp_ts = torch.LongTensor(batch_symp)
            pos_symp_ts = torch.LongTensor(pos_symp)
            neg_symp_ts = torch.LongTensor(neg_symp)

            if model_param["use_gpu"]:
                batch_symp_ts = batch_symp_ts.cuda()
                pos_symp_ts = pos_symp_ts.cuda()
                neg_symp_ts = neg_symp_ts.cuda()

            # forward batch symp
            batch_symp_data = sds_sampler(batch_symp, 1, 20)
            symp_emb = gnn.forward(batch_symp_ts, batch_symp_data)

            pos_symp_data = sds_sampler(pos_symp, 1, 20)
            pos_emb = gnn.forward(pos_symp_ts, pos_symp_data)

            neg_symp_data = sds_sampler(neg_symp, 1, 20)
            neg_emb = gnn.forward(neg_symp_ts, neg_symp_data)

            # margin ranking (hinge) loss with margin 1.0: push the PMI-positive
            # pair to score higher than the sampled negative
            scores = 1.0 + symp_emb.mul(neg_emb).sum(1) - symp_emb.mul(pos_emb).sum(1)
            scores[scores < 0] = 0
            loss = scores.mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        if total_loss - last_total_loss > 0:
            print("Loss stopped decreasing; treating the model as converged.")
            break

        last_total_loss = total_loss

    # save model
    torch.save(gnn.state_dict(), "./ckpt/sds_gnn.pt")
    print("Model saved.")
Example 5
def main(**kwargs):
    # parse parameters
    param = default_config()
    param.update({
        "mode": "sds",
        "top_k": 10,
        "ckpt": "ckpt/gnn.pt",
        "use_gpu": False
    })

    param.update(kwargs)

    # read maps
    symp2id, id2symp = read_symp2id()
    dise2id, id2dise = read_dise2id()

    # read data
    datapath = os.path.join("dataset/EHR/test/data.txt")
    fin = open(datapath, "r", encoding="utf-8")
    lines = fin.readlines()

    data_model = ehr.EHR("dataset/EHR", "train")

    # init retrieval system
    ehr_ret = EHR_retrieval(mode=param["mode"])

    # init and load model
    data_model_param = parse_data_model(data_model)
    param.update(data_model_param)
    param = parse_kwargs(param, kwargs)
    gnn = HGNN(**param)

    if param["use_gpu"]:
        gnn.cuda()

    ckpt_path = param.get("ckpt")
    if ckpt_path is None:
        print("[Warning] Do not set ckpt path, load from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()

    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    # init result list
    before_list = []
    after_list = []
    real_dise_list = []
    init_symp_list = []
    after_symp_list = []

    result_map_bfo = defaultdict(list)
    result_map_aft = defaultdict(list)
    # this is top_k for evaluation p@N, Rec@N, ...
    top_k_list = [1, 5]

    for i, line in enumerate(lines):
        line_data = line.strip().split()
        uid = line_data[0]
        did = line_data[1]
        real_dise_list.append(did)
        symps = line_data[2:]

        # select the first symptom and do inference
        init_symp = symps[0]
        init_symp_list.append(id2symp[init_symp])

        symp_ar = np.array([[init_symp]])

        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)

        # calculate statistics
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_bfo)

        # print("true did:", did)
        # print("before:", pred_rank)
        before_list.append(pred_rank[0])

        rank_symp = ehr_ret(symp_idx=init_symp, top_k=param["top_k"])
        after_symp_list.append([id2symp[str(t)] for t in rank_symp])
        symp_ar = [np.concatenate([[init_symp], rank_symp], 0)]

        # symp_ar = np.array([symps])
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_aft)

        # print("after:", pred_rank)
        after_list.append(pred_rank[0])

        ret_symps = ehr_ret(init_symp, param["top_k"])
        ret_symp_list = []
        for sid in ret_symps:
            ret_symp_list.append(id2symp[str(sid)])

        if i % 100 == 0:
            print("[line]:", i)

    # summary
    bf_log = build_result_log(result_map_bfo, top_k_list)
    af_log = build_result_log(result_map_aft, top_k_list)

    print("[before]: {}".format(bf_log))
    print("[after]: {}".format(af_log))

    # to result csv
    fout = open("retrieval_result_{}.txt".format(param["mode"]),
                "w",
                encoding="utf-8")
    fout.write("did\tbefore_pred\tafter_pred\tinit_symp\taftersymp\n")
    for i in range(len(init_symp_list)):
        wrtline = "\t".join([
            id2dise[int(real_dise_list[i])],
            id2dise[int(before_list[i][0])],
            id2dise[int(after_list[i][0])],
            init_symp_list[i],
            "#".join(after_symp_list[i]),
        ]) + "\n"
        fout.write(wrtline)

    fin.close()
    fout.close()

    df_res = pd.read_table("retrieval_result_{}.txt".format(param["mode"]))
    df_res.to_excel("retrieval_result_{}.xlsx".format(param["mode"]),
                    encoding="utf-8")
    print("Done")
Example 6
def train(**kwargs):

    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    dataset_name = model_param["dataset"]

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/{}".format(dataset_name), "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/{}".format(dataset_name), "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN(**model_param)
    if kwargs["w2v"] is not None:
        if os.path.exists(kwargs["w2v"]):
            # load w2v data
            gnn.load_symp_embed(kwargs["w2v"])
        else:
            from gensim.models import Word2Vec

            # build word2vec embeddings from the training corpus
            filename = "./dataset/EHR/train/data.txt"
            corpus = []
            with open(filename, "r") as fin:
                for line in fin:
                    corpus.append(line.strip().split()[2:])

            # learn the word2vec model (gensim 3.x API)
            start_time = time.time()
            w2v_model = Word2Vec(corpus,
                                 size=64,
                                 window=3,
                                 min_count=1,
                                 workers=4,
                                 sg=1)
            w2v_model.save("./ckpt/w2v")
            print("word2vec training done, costs {} secs.".format(time.time() -
                                                                  start_time))
            # load the freshly trained symptom embeddings into the GNN
            gnn.load_symp_embed("./ckpt/w2v")

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(),lr=model_param["lr"],weight_decay=model_param["weight_decay"])

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training.
    dsd_sampler = DSD_sampler("dataset/{}".format(dataset_name))
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)

            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            total_loss += bpr_loss.item()
            # print(idx,total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg

        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/{}".format(dataset_name), "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 7
def train(**kwargs):

    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN_DSD(**model_param)
    if kwargs["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(kwargs["w2v"])
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(),lr=model_param["lr"],weight_decay=model_param["weight_decay"])

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training.
    dsd_sampler = DSD_sampler("dataset/EHR")
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)

            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            total_loss += bpr_loss.item()
            # print(idx,total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg

        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn_dsd")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 8
def main(**kwargs):
    model_param = default_config()
    model_param.update({"top_k": 3})

    model_param = parse_kwargs(model_param, kwargs)

    print("Start evaluating on top {} predictions.".format(model_param["top_k"]))

    # load map
    dise2id, id2dise = read_dise2id("dataset/EHR")

    # load train data model
    data_model = ehr.EHR("dataset/EHR", "train")

    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    data_model_param = parse_data_model(data_model)
    model_param.update(data_model_param)

    gnn = HGNN(**model_param)
    if model_param["use_gpu"]:
        gnn.cuda()

    ckpt_path = kwargs.get("ckpt")

    if ckpt_path is None:
        print("[Warning] Do not set ckpt path, load from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, model_param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, model_param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()

    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    rank_list = None
    dise_list = None

    for idx, (feat, dise) in enumerate(test_data_loader):

        this_dise_list = parse_rank(dise, id2dise)

        if dise_list is None:
            dise_list = this_dise_list
        else:
            dise_list = np.r_[dise_list, this_dise_list]

        # get symps
        symp_list = []
        for x in feat:
            symp_list.append(x["symp"])

        symp_ar = np.array(symp_list)

        # re-sampling users embeddings by their symptoms
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=model_param["top_k"])

        # parse rank for print
        pred_list = parse_rank(pred_rank, id2dise)

        if rank_list is None:
            rank_list = pred_list
        else:
            rank_list = np.r_[rank_list, pred_list]

    # save results
    res_ar = np.c_[dise_list, rank_list]
    df_res = pd.DataFrame(res_ar)
    col_name = ["GroundTruth"] + ["Pred_"+str(i+1) for i in range(rank_list.shape[1])]
    df_res.columns = col_name
    df_res.to_csv("Test_Results.csv",encoding="utf-8")

    print("Test done, save results in", "Test_Results.csv")