def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)

    # init model
    model = GradientBoostingClassifier(n_estimators=100,
                                       learning_rate=0.1,
                                       verbose=1,
                                       n_iter_no_change=10,
                                       random_state=10)

    train_feat, train_label = train_data.get_feat_data()

    print("Start Training.")
    model.fit(train_feat, train_label)
    print("Training Finished.")

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_feat, test_label = test_data.get_feat_data()
    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_feat,
                                                      test_label,
                                                      top_k_list=[3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
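# evaluate_clf is imported from the repo's evaluation utilities and is not shown
# here. The sketch below is an assumption for illustration only (the function
# name, signature, and metric keys such as "recall_5"/"ndcg_5" are made up): it
# shows one way recall@k and NDCG@k could be computed from a sklearn
# classifier's predict_proba output when each sample has exactly one relevant
# disease.
import numpy as np


def evaluate_clf_sketch(model, feat, label, top_k_list=(3, 5, 10)):
    proba = model.predict_proba(feat)          # (n_samples, n_classes)
    classes = model.classes_                   # class id of each column
    order = np.argsort(-proba, axis=1)         # columns sorted by descending prob
    label = np.asarray(label).reshape(-1, 1)

    metrics = {}
    for k in top_k_list:
        top_k_classes = classes[order[:, :k]]  # (n_samples, k)
        hits = (top_k_classes == label)        # True where the true class appears
        recall = hits.any(axis=1).mean()

        # single relevant item: gain is 1 / log2(rank + 2) at the hit position
        gains = np.zeros(len(label))
        hit_rows = hits.any(axis=1)
        gains[hit_rows] = 1.0 / np.log2(hits[hit_rows].argmax(axis=1) + 2)

        metrics["recall_{}".format(k)] = recall
        metrics["ndcg_{}".format(k)] = gains.mean()

    log = ", ".join("{}: {:.4f}".format(name, val) for name, val in metrics.items())
    return metrics, log, None   # the repo's version also returns per-sample results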
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    model = TextCNN(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()
    print("Model Inited.")

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # labels are [1, 2, 3, ..., 27]; shift to 0-based class indices
            loss = F.cross_entropy(pred, label - 1)  # multi-class xent loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])

        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "textcnn")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
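# EarlyStopping is imported from a utility module in this repo; below is a
# minimal sketch (an assumption, not the repo's code) of the behaviour the
# training loops rely on: track the best validation score, save a checkpoint
# when it improves, and set .early_stop after `patience` epochs without
# improvement. larger_better=True means a higher metric (e.g. NDCG@5) is better.
import os
import torch


class EarlyStoppingSketch:
    def __init__(self, patience=10, larger_better=True, ckpt_dir="./ckpt"):
        self.patience = patience
        self.larger_better = larger_better
        self.ckpt_dir = ckpt_dir
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model, name):
        # flip the sign so "larger is better" reduces to a max comparison
        score = score if self.larger_better else -score
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.counter = 0
            os.makedirs(self.ckpt_dir, exist_ok=True)
            torch.save(model.state_dict(),
                       os.path.join(self.ckpt_dir, "{}.pt".format(name)))
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True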
def train(**kwargs):
    w2v_model_name = "./ckpt/w2v"
    if os.path.exists(w2v_model_name):
        print("load word2vec model from", w2v_model_name)
        # load model directly
        w2v_model = Word2Vec.load(w2v_model_name)
    else:
        # load data
        filename = "./dataset/EHR/train/data.txt"
        fin = open(filename, "r")
        corpus = []
        for line in fin.readlines():
            corpus.append(line.strip().split()[2:])

        # learn word2vec model
        start_time = time.time()
        w2v_model = Word2Vec(corpus,
                             size=64,
                             window=3,
                             min_count=1,
                             workers=4,
                             sg=1)
        w2v_model.save("./ckpt/w2v")
        print("training done, costs {} secs.".format(time.time() - start_time))

    # start training and testing the MLP model
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # build an MLP for prediction on top of the word2vec features
    model_param["w2v_model"] = w2v_model
    model = MLP(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()
    print("Model Inited.")

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=kwargs["weight_decay"])

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # labels are [1, 2, 3, ..., 27]; shift to 0-based class indices
            loss = F.cross_entropy(pred, label - 1)  # multi-class xent loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])

        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "med2vec")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
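# The MLP above receives the trained gensim Word2Vec model via
# model_param["w2v_model"]; how it consumes it is defined inside the MLP class.
# The sketch below is an assumption for illustration (the helper name and the
# convention that tokens are symptom-id strings like "12" with index 0 reserved
# for padding are not taken from the repo): it shows one common way to
# initialise a torch nn.Embedding from word2vec vectors.
import numpy as np
import torch
import torch.nn as nn


def build_embedding_from_w2v(w2v_model, num_symp, emb_dim=64, freeze=False):
    # row 0 is the padding vector; rows 1..num_symp hold the symptom embeddings
    weight = np.zeros((num_symp + 1, emb_dim), dtype=np.float32)
    for idx in range(1, num_symp + 1):
        token = str(idx)
        if token in w2v_model.wv:
            weight[idx] = w2v_model.wv[token]
    return nn.Embedding.from_pretrained(torch.from_numpy(weight),
                                        freeze=freeze,
                                        padding_idx=0)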
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # init model
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    gnn = HGNN_SDS(**model_param)

    if model_param["w2v"] is not None:
        # load pretrained w2v symptom embeddings
        gnn.load_symp_embed(model_param["w2v"])

    if use_gpu:
        gnn.cuda()
    print("Model Inited.")

    sds_sampler = SDS_sampler("dataset/EHR")

    # load the PMI symptom-symptom matrix
    symp2symp_mat = sp.load_npz(os.path.join("dataset/EHR", "pmi_ss_mat.npz"))
    symp2symp_mat.setdiag(0)

    # number of batches needed to cover all symptoms
    num_total_batch = gnn.num_symp // model_param["batch_size"]
    all_symp_index = np.arange(1, gnn.num_symp + 1)

    # ramp the hard-negative ratio up linearly over the epochs
    lambda_hard_r = lambda epoch: epoch * model_param["hard_ratio"] / model_param["num_epoch"]

    # build hard map and pos map
    symp2symp_hard_map = [0]
    symp2symp_pos_map = [0]
    for k in all_symp_index:
        symp2symp_b_ar = symp2symp_mat[k].toarray().flatten()
        max_index = np.argmax(symp2symp_b_ar)
        if max_index == 0:
            symp2symp_pos_map.append(np.random.randint(1, k))
            symp2symp_hard_map.append(np.random.randint(1, k))
        else:
            symp2symp_pos_map.append(max_index)
            symp2symp_b_ar[max_index] = -1
            max_2nd_index = np.argmax(symp2symp_b_ar)
            if max_2nd_index == 0:
                symp2symp_hard_map.append(np.random.randint(1, k))
            else:
                symp2symp_hard_map.append(max_2nd_index)

    symp2symp_hard_map = np.array(symp2symp_hard_map)
    symp2symp_pos_map = np.array(symp2symp_pos_map)
    print("Pos / Hard symptom map Inited.")

    # note: weight_decay is tied to the learning rate here
    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=model_param["lr"])

    last_total_loss = 1e10
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        np.random.shuffle(all_symp_index)
        hard_ratio = lambda_hard_r(epoch)

        for idx in range(num_total_batch):
            batch_symp = all_symp_index[idx * model_param["batch_size"]:(idx + 1) *
                                        model_param["batch_size"]]

            # get pos symp and neg symp
            pos_symp = symp2symp_pos_map[batch_symp]

            # sample negatives uniformly
            neg_symp = np.random.randint(1, gnn.num_symp, model_param["batch_size"])

            # handle overlap between the pos and neg symptoms
            overlap_index = (neg_symp == pos_symp)
            overlap_symp = neg_symp[overlap_index]
            neg_symp[overlap_index] = symp2symp_hard_map[overlap_symp]

            if hard_ratio > 0:
                num_hard = int(hard_ratio * model_param["batch_size"])
                neg_symp[:num_hard] = symp2symp_hard_map[neg_symp[:num_hard]]

            batch_symp_ts = torch.LongTensor(batch_symp)
            pos_symp_ts = torch.LongTensor(pos_symp)
            neg_symp_ts = torch.LongTensor(neg_symp)

            if model_param["use_gpu"]:
                batch_symp_ts = batch_symp_ts.cuda()
                pos_symp_ts = pos_symp_ts.cuda()
                neg_symp_ts = neg_symp_ts.cuda()

            # forward the batch, positive, and negative symptoms
            batch_symp_data = sds_sampler(batch_symp, 1, 20)
            symp_emb = gnn.forward(batch_symp_ts, batch_symp_data)

            pos_symp_data = sds_sampler(pos_symp, 1, 20)
            pos_emb = gnn.forward(pos_symp_ts, pos_symp_data)

            neg_symp_data = sds_sampler(neg_symp, 1, 20)
            neg_emb = gnn.forward(neg_symp_ts, neg_symp_data)

            # create margin-based pairwise loss
            scores = symp_emb.mul(pos_emb).sum(1) - symp_emb.mul(neg_emb).sum(1) + 1.0
            scores[scores < 0] = 0
            loss = scores.mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        if total_loss - last_total_loss > 0:
            print("Loss stopped decreasing; training has converged.")
            break

        last_total_loss = total_loss

    # save model
    torch.save(gnn.state_dict(), "./ckpt/sds_gnn.pt")
    print("Model saved.")
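# The pairwise loss inside the loop above can be written as a small helper using
# torch.nn.functional.relu. This sketch simply mirrors the in-loop computation
# (dot-product scores with a margin of 1.0) as a reusable function; the helper
# name is ours, not the repo's.
import torch
import torch.nn.functional as F


def margin_pair_loss(anchor_emb, pos_emb, neg_emb, margin=1.0):
    pos_score = (anchor_emb * pos_emb).sum(dim=1)
    neg_score = (anchor_emb * neg_emb).sum(dim=1)
    # identical to: scores = pos - neg + margin; scores[scores < 0] = 0; scores.mean()
    return F.relu(pos_score - neg_score + margin).mean()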
def main(**kwargs):
    # parse parameters
    param = default_config()
    param.update({
        "mode": "sds",
        "top_k": 10,
        "ckpt": "ckpt/gnn.pt",
        "use_gpu": False
    })
    param.update(kwargs)

    # read maps
    symp2id, id2symp = read_symp2id()
    dise2id, id2dise = read_dise2id()

    # read data
    datapath = os.path.join("dataset/EHR/test/data.txt")
    fin = open(datapath, "r", encoding="utf-8")
    lines = fin.readlines()

    data_model = ehr.EHR("dataset/EHR", "train")

    # init retrieval system
    ehr_ret = EHR_retrieval(mode=param["mode"])

    # init and load model
    data_model_param = parse_data_model(data_model)
    param.update(data_model_param)
    param = parse_kwargs(param, kwargs)

    gnn = HGNN(**param)
    if param["use_gpu"]:
        gnn.cuda()

    ckpt_path = param.get("ckpt")
    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()
    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    # init result lists
    before_list = []
    after_list = []
    real_dise_list = []
    init_symp_list = []
    after_symp_list = []

    result_map_bfo = defaultdict(list)
    result_map_aft = defaultdict(list)

    # top_k values for evaluation: P@N, Rec@N, ...
    top_k_list = [1, 5]

    for i, line in enumerate(lines):
        line_data = line.strip().split()
        uid = line_data[0]
        did = line_data[1]
        real_dise_list.append(did)
        symps = line_data[2:]

        # select the first symptom and do inference
        init_symp = symps[0]
        init_symp_list.append(id2symp[init_symp])

        symp_ar = np.array([[init_symp]])
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)

        # calculate statistics before symptom expansion
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_bfo)

        # print("true did:", did)
        # print("before:", pred_rank)
        before_list.append(pred_rank[0])

        # retrieve additional symptoms and query again
        rank_symp = ehr_ret(symp_idx=init_symp, top_k=param["top_k"])
        after_symp_list.append([id2symp[str(t)] for t in rank_symp])

        symp_ar = [np.concatenate([[init_symp], rank_symp], 0)]
        # symp_ar = np.array([symps])
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)

        # calculate statistics after symptom expansion
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_aft)

        # print("after:", pred_rank)
        after_list.append(pred_rank[0])

        ret_symps = ehr_ret(init_symp, param["top_k"])
        ret_symp_list = []
        for sid in ret_symps:
            ret_symp_list.append(id2symp[str(sid)])

        if i % 100 == 0:
            print("[line]:", i)

    # summary
    bf_log = build_result_log(result_map_bfo, top_k_list)
    af_log = build_result_log(result_map_aft, top_k_list)
    print("[before]: {}".format(bf_log))
    print("[after]: {}".format(af_log))

    # write per-query results to a tab-separated file
    fout = open("retrieval_result_{}.txt".format(param["mode"]),
                "w",
                encoding="utf-8")
    fout.write("did\tbefore_pred\tafter_pred\tinit_symp\taftersymp\n")
    for i in range(len(init_symp_list)):
        wrtline = id2dise[int(real_dise_list[i])] + "\t" + \
            id2dise[int(before_list[i][0])] + "\t" + \
            id2dise[int(after_list[i][0])] + "\t" + \
            init_symp_list[i] + "\t" + \
            "#".join(after_symp_list[i]) + "\n"
        fout.write(wrtline)

    fin.close()
    fout.close()

    # convert the result table to an excel sheet
    df_res = pd.read_table("retrieval_result_{}.txt".format(param["mode"]))
    df_res.to_excel("retrieval_result_{}.xlsx".format(param["mode"]),
                    encoding="utf-8")
    print("Done")
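# calculate_rec_ndcg and build_result_log come from the evaluation utilities and
# are not shown here; the sketch below is an assumption of what they do (names
# and metric keys are ours): per query, record whether the true disease id is in
# the top-k prediction (recall@k with one relevant item) and its 1/log2(rank+2)
# gain (NDCG@k), then average each metric for the log line. result_map is the
# defaultdict(list) used in main() above.
import numpy as np


def calculate_rec_ndcg_sketch(pred_top_k, true_dise, top_k, result_map):
    pred_top_k = [int(p) for p in pred_top_k]
    if true_dise in pred_top_k:
        rank = pred_top_k.index(true_dise)      # 0-based rank of the hit
        result_map["recall_{}".format(top_k)].append(1.0)
        result_map["ndcg_{}".format(top_k)].append(1.0 / np.log2(rank + 2))
    else:
        result_map["recall_{}".format(top_k)].append(0.0)
        result_map["ndcg_{}".format(top_k)].append(0.0)


def build_result_log_sketch(result_map, top_k_list):
    parts = []
    for k in top_k_list:
        for name in ("recall", "ndcg"):
            key = "{}_{}".format(name, k)
            parts.append("{}: {:.4f}".format(key, float(np.mean(result_map[key]))))
    return ", ".join(parts)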
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    dataset_name = model_param["dataset"]

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/{}".format(dataset_name), "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/{}".format(dataset_name), "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN(**model_param)

    if kwargs["w2v"] is not None:
        if os.path.exists(kwargs["w2v"]):
            # load pretrained w2v symptom embeddings
            gnn.load_symp_embed(kwargs["w2v"])
        else:
            from gensim.models import Word2Vec

            # build word2vec embeddings from the raw training corpus
            filename = "./dataset/EHR/train/data.txt"
            fin = open(filename, "r")
            corpus = []
            for line in fin.readlines():
                corpus.append(line.strip().split()[2:])

            # learn word2vec model
            start_time = time.time()
            w2v_model = Word2Vec(corpus,
                                 size=64,
                                 window=3,
                                 min_count=1,
                                 workers=4,
                                 sg=1)
            w2v_model.save("./ckpt/w2v")
            print("word2vec training done, costs {} secs.".format(time.time() -
                                                                  start_time))

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()
    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"], weight_decay=model_param["weight_decay"])
    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training
    dsd_sampler = DSD_sampler("dataset/{}".format(dataset_name))
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)
            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += bpr_loss.item()
            # print(idx, total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])

        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/{}".format(dataset_name), "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
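# create_bpr_loss and create_l2_loss are defined in the repo's loss utilities; a
# minimal sketch of the usual formulations (an assumption, not necessarily the
# repo's exact code) is given below. BPR pushes the score of the observed
# disease above that of a sampled negative, and the L2 term regularises the
# user/disease embeddings involved in the batch (scaled by "weight_decay" in the
# training loop above).
import torch
import torch.nn.functional as F


def create_bpr_loss_sketch(pred_pos, pred_neg):
    # -log sigmoid(pos - neg), averaged over the batch
    return -F.logsigmoid(pred_pos - pred_neg).mean()


def create_l2_loss_sketch(emb_user, emb_dise, neg_emb_dise):
    batch_size = emb_user.shape[0]
    l2 = emb_user.pow(2).sum() + emb_dise.pow(2).sum() + neg_emb_dise.pow(2).sum()
    return l2 / (2 * batch_size)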
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN_DSD(**model_param)

    if kwargs["w2v"] is not None:
        # load pretrained w2v symptom embeddings
        gnn.load_symp_embed(kwargs["w2v"])

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()
    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"], weight_decay=model_param["weight_decay"])
    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training
    dsd_sampler = DSD_sampler("dataset/EHR")
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)
            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += bpr_loss.item()
            # print(idx, total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])

        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn_dsd")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
def main(**kwargs):
    model_param = default_config()
    model_param.update({"top_k": 3})
    model_param = parse_kwargs(model_param, kwargs)
    print("Start evaluating on top {} predictions.".format(model_param["top_k"]))

    # load map
    dise2id, id2dise = read_dise2id("dataset/EHR")

    # load train data model
    data_model = ehr.EHR("dataset/EHR", "train")

    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    data_model_param = parse_data_model(data_model)
    model_param.update(data_model_param)

    gnn = HGNN(**model_param)
    if model_param["use_gpu"]:
        gnn.cuda()

    ckpt_path = kwargs.get("ckpt")
    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, model_param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, model_param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()
    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    rank_list = None
    dise_list = None

    for idx, (feat, dise) in enumerate(test_data_loader):
        this_dise_list = parse_rank(dise, id2dise)
        if dise_list is None:
            dise_list = this_dise_list
        else:
            dise_list = np.r_[dise_list, this_dise_list]

        # get symptoms for each user in the batch
        symp_list = []
        for x in feat:
            symp_list.append(x["symp"])
        symp_ar = np.array(symp_list)

        # re-sample user embeddings from their symptoms and rank all diseases
        pred_rank = gnn.rank_query(symp_ar,
                                   emb_dise,
                                   usu_sampler,
                                   top_k=model_param["top_k"])

        # parse rank for printing
        pred_list = parse_rank(pred_rank, id2dise)
        if rank_list is None:
            rank_list = pred_list
        else:
            rank_list = np.r_[rank_list, pred_list]

    # save results
    res_ar = np.c_[dise_list, rank_list]
    df_res = pd.DataFrame(res_ar)
    col_name = ["GroundTruth"] + ["Pred_" + str(i + 1) for i in range(rank_list.shape[1])]
    df_res.columns = col_name
    df_res.to_csv("Test_Results.csv", encoding="utf-8")
    print("Test done, results saved to", "Test_Results.csv")
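# parse_rank maps integer disease ids to their readable names via id2dise; the
# sketch below is an assumption of its behaviour (the helper name is ours),
# covering only what main() above relies on: it preserves the input shape, so a
# 1-D label vector stays 1-D and a 2-D (batch, top_k) rank matrix stays 2-D,
# which is what the np.r_ / np.c_ stacking expects.
import numpy as np


def parse_rank_sketch(ids, id2dise):
    arr = np.asarray(ids)
    names = np.array([id2dise[int(x)] for x in arr.ravel()], dtype=object)
    return names.reshape(arr.shape)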