def main(cfg):
    """Launch multi-GPU DFN training.

    Loads the sharded dev set and the news-title features once in the parent,
    then spawns one worker process per GPU via ``init_processes``.

    :param cfg: run configuration; must provide ``filenum`` and ``gpus``.
        Augmented in place with ``mc``, ``result_path`` and
        ``checkpoint_path`` before workers are forked.
    :raises ValueError: if ``cfg.gpus`` is not greater than 1.
    """
    set_seed(7)

    print('load dev')
    # Concatenate the sharded dev files into a single validation array.
    dev_list = [np.load("data/raw/dev-{}.npy".format(i))
                for i in range(cfg.filenum)]
    validate_dataset = np.concatenate(dev_list, axis=0)

    print('load config')
    model_cfg = ModelConfig()

    print('load news info')
    news_title = np.load('data/news_info.npy')

    cfg.mc = model_cfg
    cfg.result_path = './result/'
    cfg.checkpoint_path = './checkpoint/'

    # Shared counter the workers use to signal per-epoch completion.
    finished = mp.Value('i', 0)

    # `assert` is stripped under `python -O`; validate explicitly instead.
    if cfg.gpus <= 1:
        raise ValueError(
            "multi-GPU launcher requires cfg.gpus > 1, got {}".format(cfg.gpus))

    valid_dataset_list = split_valid_dataset(validate_dataset, cfg.gpus)

    processes = []
    for rank in range(cfg.gpus):
        # Each worker receives its own training-shard path and dev slice.
        p = mp.Process(
            target=init_processes,
            args=(cfg, rank, None, "data/raw/train-{}-new.npy".format(rank),
                  valid_dataset_list[rank], news_title, finished, run, "nccl"))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
def main(cfg):
    """Evaluate a saved DSSM/GRU checkpoint on every test shard.

    Restores the model on CPU, then, shard by shard, splits the data across
    ``cfg.gpus`` worker processes and merges per-rank outputs with
    ``gather``/``gather_all``.

    :param cfg: run configuration; must provide ``filenum``, ``root``,
        ``model``, ``epoch``, ``gpus``. ``news_num``/``word_num`` are set here.
    :return: ``[]`` when the checkpoint file does not exist (early exit).
    :raises ValueError: if ``cfg.model`` is not ``'dssm'`` or ``'gru'``.
    """
    set_seed(7)
    file_num = cfg.filenum
    cfg.result_path = './result/'

    # Vocabulary sizes come from the dict files; close the handles promptly.
    print('load dict')
    with open('./{}/news.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        news_dict = json.load(f)
    cfg.news_num = len(news_dict)

    print('load words dict')
    with open('./{}/word.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        word_dict = json.load(f)
    cfg.word_num = len(word_dict)

    if cfg.model == 'dssm':
        model = DSSM(cfg)
    elif cfg.model == 'gru':
        model = GRURec(cfg)
    else:
        # Previously an unknown model name left `model` unbound and crashed
        # later with NameError; fail fast with a clear message instead.
        raise ValueError("unknown model type: {!r}".format(cfg.model))

    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []

    model.cpu()
    pretrained_model = torch.load(saved_model_path, map_location='cpu')
    # strict=False: tolerate missing/unexpected keys; print the report.
    print(model.load_state_dict(pretrained_model, strict=False))

    for point_num in range(file_num):
        print("processing {}/raw/test-{}.npy".format(cfg.root, point_num))
        valid_dataset = FMData(
            np.load("{}/raw/test-{}.npy".format(cfg.root, point_num)))
        dataset_list = split_dataset(valid_dataset, cfg.gpus)

        processes = []
        for rank in range(cfg.gpus):
            cur_device = torch.device("cuda:{}".format(rank))
            p = mp.Process(target=run,
                           args=(cfg, rank, dataset_list[rank], cur_device,
                                 model))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
        # Merge this shard's per-rank result files.
        gather(cfg, point_num)

    gather_all(cfg.result_path, file_num, validate=True, save=True)
def main(cfg):
    """Run DFN inference over the dev/test shards and merge the outputs.

    Restores the checkpoint for ``cfg.epoch`` on CPU, then, shard by shard
    (starting at ``cfg.start_dev``), splits the data across ``cfg.gpus``
    worker processes and gathers the per-rank results.

    :param cfg: run configuration; must provide ``filenum``, ``root``,
        ``type``, ``epoch``, ``gpus``, ``start_dev``.
    :return: ``[]`` when the checkpoint file does not exist (early exit).
    """
    set_seed(7)
    file_num = cfg.filenum
    cfg.result_path = './result/'

    print('load config')
    cfg.mc = ModelConfig(cfg.root)

    print('load news info')
    news_title = np.load('{}/news_info.npy'.format(cfg.root))

    model = DFN(cfg.mc)

    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []

    model.cpu()
    state = torch.load(saved_model_path, map_location='cpu')
    # strict=False tolerates missing/unexpected keys; the report is printed.
    print(model.load_state_dict(state, strict=False))

    for shard in range(cfg.start_dev, file_num):
        print("processing {}/raw/{}-{}.npy".format(cfg.root, cfg.type, shard))
        shard_data = np.load("{}/raw/{}-{}.npy".format(cfg.root, cfg.type,
                                                       shard))
        parts = split_dataset(shard_data, cfg.gpus)

        workers = []
        for rank in range(cfg.gpus):
            device = torch.device("cuda:{}".format(rank))
            worker = mp.Process(target=run,
                                args=(cfg, rank, parts[rank], device, model,
                                      news_title))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
        # Merge this shard's per-rank result files.
        gather(cfg, shard)

    gather_all(cfg.result_path, file_num, start_file=cfg.start_dev,
               validate=True, save=True)
def run(cfg, rank, device, finished, train_dataset_path, valid_dataset_file,
        news_title):
    """Train and evaluate the model inside one worker process.

    :param cfg: run configuration (expects ``mc``, ``batch_size``, ``epoch``,
        ``lr``, ``gpus``, ``result_path``, ``checkpoint_path``)
    :param rank: process id of this worker
    :param device: device this worker trains on
    :param finished: shared ``mp.Value('i')`` used as a per-epoch barrier
    :param train_dataset_path: path to this worker's training shard (.npy)
    :param valid_dataset_file: in-memory validation split for this worker
    :param news_title: news-title feature array shared by all datasets
    """
    set_seed(7)
    print("Worker %d is setting dataset ... " % rank)

    # Build dataloaders: training shuffles and drops the ragged last batch;
    # validation keeps order so per-rank results can be gathered later.
    train_dataset = RecoData(cfg.mc, np.load(train_dataset_path), news_title)
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=cfg.batch_size,
                                   shuffle=True,
                                   drop_last=True)
    valid_dataset = RecoData(cfg.mc, valid_dataset_file, news_title)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=cfg.batch_size,
                                   shuffle=False)

    # Build model.
    model = DFN(cfg.mc)
    model.to(device)

    # Build optimizer.
    steps_one_epoch = len(train_data_loader)
    train_steps = cfg.epoch * steps_one_epoch
    print("Total train steps: ", train_steps)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=cfg.lr)
    print("Worker %d is working ... " % rank)

    # Chief worker: rank 0 in multi-GPU runs, or the only worker otherwise.
    is_chief = cfg.gpus < 2 or rank == 0

    # Fast check of the validation pipeline before spending time on training.
    if is_chief:
        validate(cfg, -1, model, device, rank, valid_data_loader,
                 fast_dev=True)
        logging.warning(model)
        gather_all(cfg.result_path, 1, validate=True, save=False)

    # Training and validation.
    for epoch in range(cfg.epoch):
        train(cfg, epoch, rank, model, train_data_loader, optimizer,
              steps_one_epoch, device)
        validate(cfg, epoch, model, device, rank, valid_data_loader)

        # `+= 1` on a shared Value is a non-atomic read-modify-write across
        # processes; guard it with the Value's lock so no increment is lost.
        with finished.get_lock():
            finished.value += 1

        if is_chief:
            save_checkpoint_by_epoch(model.state_dict(), epoch,
                                     cfg.checkpoint_path)
            # Wait for every worker to finish this epoch, merge the per-rank
            # validation outputs, then reset the barrier counter.
            # NOTE(review): a non-chief worker could start and finish the
            # next epoch before the reset below runs — confirm epoch lengths
            # make this window negligible in practice.
            while finished.value < cfg.gpus:
                time.sleep(1)
            gather_all(cfg.result_path, cfg.gpus, validate=True, save=False)
            finished.value = 0
def run(cfg, rank, dev_dataset_file, device, model, news_title):
    """Score one validation split on a single device and dump results to JSON.

    Writes ``tmp_small_<rank>.json`` under ``cfg.result_path`` containing the
    impression ids, labels and predictions, so the parent process can gather
    the per-rank files afterwards.

    :param cfg: run configuration (expects ``mc``, ``batch_size``, ``gpus``,
        ``result_path``)
    :param rank: process id / device index of this worker
    :param dev_dataset_file: in-memory validation split for this worker
    :param device: device to run inference on
    :param model: model already restored (on CPU) by the parent process
    :param news_title: news-title feature array
    """
    set_seed(7)
    model.to(device)
    model.eval()

    dev_dataset = RecoData(cfg.mc, dev_dataset_file, news_title)
    valid_data_loader = DataLoader(dev_dataset,
                                   batch_size=cfg.batch_size,
                                   shuffle=False)

    # Only the chief worker renders a progress bar.
    if (cfg.gpus < 2) or (cfg.gpus > 1 and rank == 0):
        data_iter = tqdm(enumerate(valid_data_loader),
                         desc="EP_dev:%d" % 1,
                         total=len(valid_data_loader),
                         bar_format="{l_bar}{r_bar}")
    else:
        data_iter = enumerate(valid_data_loader)

    with torch.no_grad():
        preds, truths, imp_ids = list(), list(), list()
        for i, data in data_iter:
            # Column 0: impression id, column 1: label, columns 2+: features.
            imp_ids += data[:, 0].cpu().numpy().tolist()
            data = data.to(device)

            # 1. Forward
            pred = model(data[:, 2:])
            if pred.dim() > 1:
                pred = pred.squeeze()
            try:
                preds += pred.cpu().numpy().tolist()
            except TypeError:
                # A batch of size 1 squeezes to a 0-d tensor whose .tolist()
                # yields a bare scalar, which cannot extend a list. (Was a
                # bare `except:`, which also swallowed unrelated errors.)
                print(data.size())
                preds.append(int(pred.cpu().numpy()))
            truths += data[:, 1].long().cpu().numpy().tolist()

        tmp_dict = {}
        tmp_dict['imp'] = imp_ids
        tmp_dict['labels'] = truths
        tmp_dict['preds'] = preds

        with open(cfg.result_path + 'tmp_small_{}.json'.format(rank),
                  'w',
                  encoding='utf-8') as f:
            json.dump(tmp_dict, f)
def main(cfg):
    """Launch multi-GPU FM training.

    Builds vocabulary sizes from the news/word dicts, loads the sharded dev
    split once in the parent, and spawns one worker process per GPU via
    ``init_processes``; each worker loads its own training shard by path.

    :param cfg: run configuration; must provide ``filenum``, ``root``,
        ``vtype`` and ``gpus``. Augmented in place with ``news_num``,
        ``word_num``, ``result_path`` and ``checkpoint_path``.
    :raises ValueError: if ``cfg.gpus`` is not greater than 1.
    """
    set_seed(7)

    print('load dev')
    dev_list = [
        np.load("{}/raw/{}-{}.npy".format(cfg.root, cfg.vtype, i))
        for i in range(cfg.filenum)
    ]
    validate_dataset = FMData(np.concatenate(dev_list, axis=0))

    # Vocabulary sizes come from the dict files; close the handles promptly.
    print('load news dict')
    with open('./{}/news.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        news_dict = json.load(f)
    print('load words dict')
    with open('./{}/word.json'.format(cfg.root), 'r', encoding='utf-8') as f:
        word_dict = json.load(f)
    cfg.news_num = len(news_dict)
    cfg.word_num = len(word_dict)

    cfg.result_path = './result/'
    cfg.checkpoint_path = './checkpoint/'

    # Shared counter the workers use to signal per-epoch completion.
    finished = mp.Value('i', 0)

    # `assert` is stripped under `python -O`; validate explicitly instead.
    if cfg.gpus <= 1:
        raise ValueError(
            "multi-GPU launcher requires cfg.gpus > 1, got {}".format(cfg.gpus))

    valid_dataset_list = split_valid_dataset(validate_dataset, cfg.gpus)

    processes = []
    for rank in range(cfg.gpus):
        # Each worker receives its own training-shard path and dev slice.
        p = mp.Process(target=init_processes,
                       args=(cfg, rank, None,
                             "{}/raw/train-{}-new.npy".format(cfg.root, rank),
                             valid_dataset_list[rank], finished, run, "nccl"))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()