def main(cfg):
    set_seed(7)
    file_num = cfg.filenum
    cfg.result_path = './result/'

    print('load dict')
    news_dict = json.load(
        open('./{}/news.json'.format(cfg.root), 'r', encoding='utf-8'))
    cfg.news_num = len(news_dict)

    print('load words dict')
    word_dict = json.load(
        open('./{}/word.json'.format(cfg.root), 'r', encoding='utf-8'))
    cfg.word_num = len(word_dict)

    # Select the model architecture.
    if cfg.model == 'dssm':
        model = DSSM(cfg)
    elif cfg.model == 'gru':
        model = GRURec(cfg)
    else:
        raise ValueError('Unknown model: {}'.format(cfg.model))

    # Restore the checkpoint for the requested epoch on CPU before handing the
    # model to the worker processes.
    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []
    model.cpu()
    pretrained_model = torch.load(saved_model_path, map_location='cpu')
    print(model.load_state_dict(pretrained_model, strict=False))

    # Score each test shard with one worker process per GPU, then merge results.
    for point_num in range(file_num):
        print("processing {}/raw/test-{}.npy".format(cfg.root, point_num))
        valid_dataset = FMData(
            np.load("{}/raw/test-{}.npy".format(cfg.root, point_num)))
        dataset_list = split_dataset(valid_dataset, cfg.gpus)

        processes = []
        for rank in range(cfg.gpus):
            cur_device = torch.device("cuda:{}".format(rank))
            p = mp.Process(target=run, args=(
                cfg, rank, dataset_list[rank], cur_device, model))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        gather(cfg, point_num)
    gather_all(cfg.result_path, file_num, validate=True, save=True)
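
# `split_dataset`, `gather`, and `gather_all` are helpers defined elsewhere in
# the repo. A minimal sketch of what `split_dataset` is assumed to do follows:
# partition the evaluation data into `gpus` roughly equal shards, one per worker
# process. The function below is a hypothetical illustration, not the repo's
# own implementation.
def split_dataset_sketch(dataset, gpus):
    import numpy as np
    from torch.utils.data import Subset

    index_shards = np.array_split(np.arange(len(dataset)), gpus)
    if isinstance(dataset, np.ndarray):
        # Raw numpy shards, as used by the DFN prediction script below.
        return [dataset[idx] for idx in index_shards]
    # torch Dataset shards, as used with FMData above.
    return [Subset(dataset, idx.tolist()) for idx in index_shards]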
def main(cfg):
    set_seed(7)
    file_num = cfg.filenum
    cfg.result_path = './result/'

    print('load config')
    model_cfg = ModelConfig(cfg.root)
    cfg.mc = model_cfg

    print('load news info')
    news_title = np.load('{}/news_info.npy'.format(cfg.root))

    model = DFN(model_cfg)

    # Restore the checkpoint for the requested epoch on CPU before handing the
    # model to the worker processes.
    saved_model_path = os.path.join('./checkpoint/',
                                    'model.ep{0}'.format(cfg.epoch))
    print("Load from:", saved_model_path)
    if not os.path.exists(saved_model_path):
        print("Not Exist: {}".format(saved_model_path))
        return []
    model.cpu()
    pretrained_model = torch.load(saved_model_path, map_location='cpu')
    print(model.load_state_dict(pretrained_model, strict=False))

    # Score each shard from cfg.start_dev onwards with one worker per GPU,
    # then merge the per-shard results.
    for point_num in range(cfg.start_dev, file_num):
        print("processing {}/raw/{}-{}.npy".format(cfg.root, cfg.type, point_num))
        valid_dataset = np.load("{}/raw/{}-{}.npy".format(
            cfg.root, cfg.type, point_num))
        dataset_list = split_dataset(valid_dataset, cfg.gpus)

        processes = []
        for rank in range(cfg.gpus):
            cur_device = torch.device("cuda:{}".format(rank))
            p = mp.Process(target=run, args=(
                cfg, rank, dataset_list[rank], cur_device, model, news_title))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        gather(cfg, point_num)
    gather_all(cfg.result_path, file_num, start_file=cfg.start_dev,
               validate=True, save=True)
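
# Both prediction scripts above spawn one `mp.Process` per GPU while the parent
# holds the CPU copy of the model. Because the children use CUDA, the 'spawn'
# start method is the safe choice, so the entry point is assumed to look roughly
# like the sketch below. The argument names (`--root`, `--filenum`, `--gpus`,
# `--epoch`) are illustrative, not necessarily the repo's actual flags.
if __name__ == '__main__':
    import argparse
    import torch.multiprocessing as mp

    # Must be set before any CUDA worker process is created.
    mp.set_start_method('spawn', force=True)

    parser = argparse.ArgumentParser()
    parser.add_argument('--root', type=str, default='data')
    parser.add_argument('--filenum', type=int, default=1)
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--epoch', type=int, default=0)
    args = parser.parse_args()

    main(args)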
def run(cfg, rank, device, finished, train_dataset_path, valid_dataset_file, news_title):
    """
    Train and evaluate the DFN model in one worker process.

    :param cfg: config
    :param rank: process id
    :param device: device assigned to this worker
    :param finished: shared counter of workers that finished the current epoch
    :param train_dataset_path: path to this worker's training shard
    :param valid_dataset_file: validation data for this worker
    :param news_title: news title features shared by all workers
    """
    set_seed(7)
    print("Worker %d is setting dataset ... " % rank)

    # Build dataloaders.
    train_dataset = RecoData(cfg.mc, np.load(train_dataset_path), news_title)
    train_data_loader = DataLoader(train_dataset, batch_size=cfg.batch_size,
                                   shuffle=True, drop_last=True)
    valid_dataset = RecoData(cfg.mc, valid_dataset_file, news_title)
    valid_data_loader = DataLoader(valid_dataset, batch_size=cfg.batch_size,
                                   shuffle=False)

    # Build model.
    model = DFN(cfg.mc)
    model.to(device)

    # Build optimizer.
    steps_one_epoch = len(train_data_loader)
    train_steps = cfg.epoch * steps_one_epoch
    print("Total train steps: ", train_steps)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=cfg.lr)
    print("Worker %d is working ... " % rank)

    # Fast check of the validation pipeline before training starts.
    if (cfg.gpus < 2) or (cfg.gpus > 1 and rank == 0):
        validate(cfg, -1, model, device, rank, valid_data_loader, fast_dev=True)
        logging.warning(model)
        gather_all(cfg.result_path, 1, validate=True, save=False)

    # Training and validation.
    for epoch in range(cfg.epoch):
        train(cfg, epoch, rank, model, train_data_loader, optimizer,
              steps_one_epoch, device)
        validate(cfg, epoch, model, device, rank, valid_data_loader)

        # Mark this worker as done with the current epoch.
        finished.value += 1

        if (cfg.gpus < 2) or (cfg.gpus > 1 and rank == 0):
            save_checkpoint_by_epoch(model.state_dict(), epoch,
                                     cfg.checkpoint_path)
            # Wait until every worker has finished the epoch, then merge the
            # per-worker validation results and reset the counter.
            while finished.value < cfg.gpus:
                time.sleep(1)
            gather_all(cfg.result_path, cfg.gpus, validate=True, save=False)
            finished.value = 0
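
# The `finished` argument above is a counter shared across worker processes.
# A minimal sketch of the training driver that creates it and launches `run`
# is shown below (hypothetical, for illustration only): `mp.Value('i', 0)` gives
# each worker a way to signal end-of-epoch, and rank 0 waits on it before
# gathering the per-worker validation results.
def launch_training_sketch(cfg, train_shard_paths, valid_dataset_file, news_title):
    import torch.multiprocessing as mp

    finished = mp.Value('i', 0)  # shared integer, starts at 0
    processes = []
    for rank in range(cfg.gpus):
        device = torch.device("cuda:{}".format(rank))
        p = mp.Process(target=run, args=(cfg, rank, device, finished,
                                         train_shard_paths[rank],
                                         valid_dataset_file, news_title))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()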