def test_InferDataset():
    config_file = "../../../config.yaml"
    ip_list_file = "../../../ip_list.txt"
    config = prepare_config(config_file)
    ds = InferDataset(config, ip_list_file)
    loader = Dataloader(ds, batch_size=1, num_workers=1)
    for data in loader:
        print(data[0])
        break
def test_PairDataset():
    config_file = "../../../config.yaml"
    ip_list_file = "../../../ip_list.txt"
    config = prepare_config(config_file)
    ds = TrainPairDataset(config, ip_list_file)
    # stream_shuffle_size buffers examples from the streaming dataset for shuffling
    loader = Dataloader(ds,
                        batch_size=4,
                        num_workers=1,
                        stream_shuffle_size=100,
                        collate_fn=CollateFn())

    pairs = []
    start = time.time()
    for batch_data in loader:
        pairs.extend(batch_data)
        print(batch_data)
        time.sleep(10)
    print("total time: %s" % (time.time() - start))
        num_workers=1,
        collate_fn=DS.CollateFn(config))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    # ---------------- test ----------------------- #
    log.info("testing ...")
    pred_dict = evaluate(model, test_loader)

    test_output_path = os.path.join(config.output_dir, config.task_name)
    make_dir(test_output_path)
    test_output_file = os.path.join(test_output_path, "test_pred.npz")
    log.info("saving test result to %s" % test_output_file)
    np.savez_compressed(test_output_file,
                        pred_dict['y_pred'].astype(np.float32))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='gnn')
    parser.add_argument("--config", type=str, default="./config.yaml")
    parser.add_argument("--task_name", type=str, default="task_name")
    parser.add_argument("--mode", type=str, default="train")
    parser.add_argument("--output_path", type=str, default="./")
    args = parser.parse_args()

    config = prepare_config(args.config, isCreate=False, isSave=False)
    make_dir(args.output_path)
    infer(config, args.output_path)
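# A minimal alternative sketch for exporting the predictions, assuming the
# installed ogb release provides PCQM4MEvaluator.save_test_submission (it
# writes y_pred in the official OGB-LSC submission layout):
#
#     evaluator.save_test_submission({'y_pred': pred_dict['y_pred']}, test_output_path)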
def test_dataset(config):
    ds = MolDataset(config)
    split_idx = ds.get_idx_split()
    train_ds = Subset(ds, split_idx['train'], mode='train')
    valid_ds = Subset(ds, split_idx['valid'], mode='valid')
    test_ds = Subset(ds, split_idx['test'], mode='test')

    print("Train examples: %s" % len(train_ds))
    print("Valid examples: %s" % len(valid_ds))
    print("Test examples: %s" % len(test_ds))

    for i in range(len(train_ds)):
        gdata = train_ds[i]
        print("nfeat: ", np.sum(gdata['node_feat']))
        print("edges: ", np.sum(gdata['edge_index']))
        print("label: ", gdata['label'])
        if i == 10:
            break

    print("valid data")
    for i in range(len(valid_ds)):
        gdata = valid_ds[i]
        print("nfeat: ", np.sum(gdata['node_feat']))
        print("edges: ", np.sum(gdata['edge_index']))
        print("label: ", gdata['label'])
        if i == 10:
            break


if __name__ == "__main__":
    config = prepare_config("./config.yaml", isCreate=False, isSave=False)
    test_dataset(config)
import os

import numpy as np
import paddle
import paddle.nn.functional as F
import paddle.distributed as dist
import pgl
from pgl.utils.data import Dataloader
from pgl.utils.logger import log
from ogb.lsc import PCQM4MEvaluator
from ogb.utils import smiles2graph

from utils.config import prepare_config, make_dir
from utils.logger import prepare_logger, log_to_file
import model as M
import dataset as DS

config = prepare_config("./config.yaml", isCreate=False, isSave=False)
env = dist.ParallelEnv()
rank = env.rank
ip_address = config.ip_address.split(',')
os.environ['PADDLE_CURRENT_ENDPOINT'] = ip_address[rank]
os.environ['PADDLE_TRAINER_ENDPOINTS'] = config.ip_address

reg_criterion = paddle.nn.loss.L1Loss()


def data2tensor(batch_dict):
    # convert a collated batch to tensors for feeding the model
    feed_dict = {}
    for key, value in batch_dict.items():
        if isinstance(value, pgl.Graph):
            feed_dict[key] = value.tensor()
        elif isinstance(value, np.ndarray):
            # assumed completion: the excerpt truncates here; converting the
            # ndarray to a Tensor and returning the dict is the natural remainder
            feed_dict[key] = paddle.to_tensor(value)
    return feed_dict
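# Usage sketch for data2tensor (the "graph"/"label" keys are illustrative, not
# necessarily the repo's actual collate output):
#
#     g = pgl.Graph(edges=[(0, 1), (1, 2)], num_nodes=3)
#     batch = {"graph": g, "label": np.array([1.5], dtype="float32")}
#     feed = data2tensor(batch)  # pgl.Graph -> .tensor(), ndarray -> paddle.Tensor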
            writer.write("\t".join(item) + "\n")

    p2c_edges_file = os.path.join(config.processed_path, 'paper2conf_edges.txt')
    log.info("saving paper2conf edges to %s" % p2c_edges_file)
    with open(p2c_edges_file, 'w') as writer:
        for item in tqdm.tqdm(paper2conf_edges):
            writer.write("\t".join(item) + "\n")

    author_label_file = os.path.join(config.processed_path, 'author_label.txt')
    log.info("saving author label to %s" % author_label_file)
    with open(author_label_file, 'w') as writer:
        for item in tqdm.tqdm(author_label):
            writer.write("\t".join(item) + "\n")

    conf_label_file = os.path.join(config.processed_path, 'conf_label.txt')
    log.info("saving conf label to %s" % conf_label_file)
    with open(conf_label_file, 'w') as writer:
        for item in tqdm.tqdm(conf_label):
            writer.write("\t".join(item) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='metapath2vec')
    parser.add_argument('--config', default="./config.yaml", type=str)
    args = parser.parse_args()
    config = prepare_config(args.config)
    main(config)
def main(data_path, data, out_path, config_path, eval_every):
    start_time = time.time()

    # read paths
    trn_data = os.path.join(data_path, f'MIND{data}_train')
    vld_data = os.path.join(data_path, f'MIND{data}_dev')
    util_data = os.path.join(data_path, 'utils')

    trn_paths = set_data_paths(trn_data)
    vld_paths = set_data_paths(vld_data)
    util_paths = set_util_paths(util_data)
    trn_pickle_path = os.path.join(trn_data, 'dataset.pickle')
    vld_pickle_path = os.path.join(vld_data, 'dataset.pickle')

    # read configuration file
    config = prepare_config(config_path,
                            wordEmb_file=util_paths['embedding'],
                            wordDict_file=util_paths['word_dict'],
                            userDict_file=util_paths['uid2index'])

    # out path
    num_global = config['pop']    # 7
    num_fresh = config['fresh']   # 1
    out_path = os.path.join(out_path,
                            f'MIND{data}_dev_pop{num_global}_fresh{num_fresh}')
    os.makedirs(out_path, exist_ok=True)

    # set seed
    seed = config['seed']
    set_seed(seed)

    epochs = config['epochs']
    metrics = {metric: 0. for metric in config['metrics']}

    # load dictionaries
    word2idx = load_dict(config['wordDict_file'])
    uid2idx = load_dict(config['userDict_file'])

    # load datasets and define dataloaders; note the pickled datasets are loaded
    # unconditionally when present, so delete dataset.pickle after changing the
    # dictionaries, selector settings, or config, or stale data will be used
    if os.path.exists(trn_pickle_path):
        with open(trn_pickle_path, 'rb') as f:
            trn_set = pickle.load(f)
    else:
        trn_selector = NewsSelector(data_type1=data, data_type2='train',
                                    num_pop=20, num_fresh=20)
        trn_set = DataSetTrn(trn_paths['news'], trn_paths['behaviors'],
                             word2idx=word2idx, uid2idx=uid2idx,
                             selector=trn_selector, config=config)
        with open(trn_pickle_path, 'wb') as f:
            pickle.dump(trn_set, f)

    if os.path.exists(vld_pickle_path):
        with open(vld_pickle_path, 'rb') as f:
            vld_set = pickle.load(f)
    else:
        vld_selector = NewsSelector(data_type1=data, data_type2='dev',
                                    num_pop=20, num_fresh=20)
        vld_set = DataSetTest(vld_paths['news'], vld_paths['behaviors'],
                              word2idx=word2idx, uid2idx=uid2idx,
                              selector=vld_selector, config=config,
                              label_known=True)
        with open(vld_pickle_path, 'wb') as f:
            pickle.dump(vld_set, f)

    trn_loader = DataLoader(trn_set, batch_size=config['batch_size'],
                            shuffle=True, num_workers=8)
    vld_impr_idx, vld_his, vld_impr, vld_label, vld_pop, vld_fresh = \
        vld_set.raw_impr_idxs, vld_set.histories_words, vld_set.imprs_words, \
        vld_set.labels, vld_set.pops_words, vld_set.freshs_words

    # define models, optimizer, loss
    # TODO: w2v --> BERT model
    word2vec_emb = np.load(config['wordEmb_file'])
    model = NRMS(config, word2vec_emb).to(DEVICE)
    optimizer = optim.Adam(model.parameters(),
                           lr=float(config['learning_rate']),
                           weight_decay=float(config['weight_decay']))
    criterion = nn.CrossEntropyLoss()
    print(f'[{time.time()-start_time:5.2f} Sec] Ready for training...')

    # train and evaluate
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        batch_loss = 0.
        ''' training '''
        for i, (trn_his, trn_pos, trn_neg, trn_pop, trn_fresh) \
                in tqdm(enumerate(trn_loader), desc='Training', total=len(trn_loader)):
            # ready for training
            model.train()
            optimizer.zero_grad()

            # prepare data
            trn_his, trn_pos, trn_neg, trn_pop, trn_fresh = \
                trn_his.to(DEVICE), trn_pos.to(DEVICE), trn_neg.to(DEVICE), \
                trn_pop.to(DEVICE), trn_fresh.to(DEVICE)
            trn_pop = trn_pop[:, :config['pop'], :]
            trn_fresh = trn_fresh[:, :config['fresh'], :]
            trn_cand = torch.cat((trn_pos, trn_neg), dim=1)
            trn_global = torch.cat((trn_pop, trn_fresh), dim=1)
            trn_gt = torch.zeros(size=(trn_cand.shape[0],)).long().to(DEVICE)

            # inference
            if config['global']:
                trn_user_out = model((trn_his, trn_global), source='pgt')
            else:
                trn_user_out = model(trn_his, source='history')
            trn_cand_out = model(trn_cand, source='candidate')
            prob = torch.matmul(trn_cand_out, trn_user_out.unsqueeze(2)).squeeze()

            # training
            loss = criterion(prob, trn_gt)
            loss.backward()
            optimizer.step()
            batch_loss += loss.item()

        inter_time = time.time()
        epoch_loss = batch_loss / (i + 1)

        if epoch % eval_every != 0:
            result = f'Epoch {epoch:3d} [{inter_time - start_time:5.2f}Sec]' \
                     f', TrnLoss:{epoch_loss:.4f}'
            print(result)
            continue

        ''' evaluation '''
        with open(os.path.join(out_path, f'prediction-{epoch}.txt'), 'w') as f:
            for j in tqdm(range(len(vld_impr)), desc='Evaluation', total=len(vld_impr)):
                impr_idx_j = vld_impr_idx[j]
                vld_his_j = torch.tensor(vld_his[j]).long().to(DEVICE).unsqueeze(0)
                vld_pop_j = torch.tensor(vld_pop[j]).long().to(DEVICE).unsqueeze(0)
                vld_fresh_j = torch.tensor(vld_fresh[j]).long().to(DEVICE).unsqueeze(0)
                vld_pop_j = vld_pop_j[:, :config['pop'], :]
                vld_fresh_j = vld_fresh_j[:, :config['fresh'], :]
                vld_global_j = torch.cat((vld_pop_j, vld_fresh_j), dim=1)
                if config['global']:
                    vld_user_out_j = model((vld_his_j, vld_global_j), source='pgt')
                else:
                    vld_user_out_j = model(vld_his_j, source='history')

                vld_cand_j = torch.tensor(vld_impr[j]).long().to(DEVICE).unsqueeze(0)
                vld_cand_out_j = model(vld_cand_j, source='candidate')

                scores_j = torch.matmul(vld_cand_out_j, vld_user_out_j.unsqueeze(2)).squeeze()
                scores_j = scores_j.detach().cpu().numpy()
                argmax_idx = (-scores_j).argsort()
                ranks = np.empty_like(argmax_idx)
                ranks[argmax_idx] = np.arange(1, scores_j.shape[0] + 1)
                ranks_str = ','.join([str(r) for r in list(ranks)])
                f.write(f'{impr_idx_j} [{ranks_str}]\n')

                vld_gt_j = np.array(vld_label[j])
                for metric, _ in metrics.items():
                    if metric == 'auc':
                        score = roc_auc_score(vld_gt_j, scores_j)
                        metrics[metric] += score
                    elif metric == 'mrr':
                        score = mrr_score(vld_gt_j, scores_j)
                        metrics[metric] += score
                    elif metric.startswith('ndcg'):  # format like: ndcg@5;10
                        k = int(metric.split('@')[1])
                        score = ndcg_score(vld_gt_j, scores_j, k=k)
                        metrics[metric] += score

        for metric, _ in metrics.items():
            metrics[metric] /= len(vld_impr)

        end_time = time.time()
        result = f'Epoch {epoch:3d} [{inter_time - start_time:5.2f} / {end_time - inter_time:5.2f} Sec]' \
                 f', TrnLoss:{epoch_loss:.4f}, '
        for enum, (metric, _) in enumerate(metrics.items(), start=1):
            result += f'{metric}:{metrics[metric]:.4f}'
            if enum < len(metrics):
                result += ', '
        print(result)
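# Worked example of the rank computation in the evaluation loop (standalone numpy):
#
#     scores = np.array([0.1, 0.9, 0.5])
#     order = (-scores).argsort()       # [1, 2, 0]: candidate indices, best to worst
#     ranks = np.empty_like(order)
#     ranks[order] = np.arange(1, 4)    # ranks = [3, 1, 2]: 1-based rank per candidate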
log.info("saving features") np.save( "dataset/%s/soft_mgf_feat.npy" % (args.dataset_name.replace("-", "_")), mgf) if __name__ == "__main__": parser = argparse.ArgumentParser(description='gnn') parser.add_argument("--config", type=str, default="./config.yaml") parser.add_argument("--task_name", type=str, default="task_name") parser.add_argument("--infer_model", type=str, default=None) parser.add_argument("--log_id", type=str, default=None) args = parser.parse_args() if args.infer_model is not None: config = prepare_config(args.config, isCreate=False, isSave=False) config.model_path_for_infer = args.infer_model infer(config) else: config = prepare_config(args.config, isCreate=True, isSave=True) log_to_file(log, config.log_dir, config.log_filename) if config.warm_start_from is not None: log.info("loading model config from %s" % config.pretrained_config_file) pretrained_config = prepare_config(config.pretrained_config_file) pretrained_model_config = pretrained_config.pretrained_model_config else: pretrained_model_config = config.model_config