def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set with a CPU-only PaddlePaddle build
    model_check.check_cuda(args.use_gpu)
    # check whether the installed PaddlePaddle version satisfies the requirement
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 37484
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 4
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 2
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "gru4rec":
        num_layers = 1
        batch_size = 500
        hidden_size = 100
        num_steps = 10
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 10
        max_epoch = 5
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 0.05
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not supported")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1

        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise ValueError("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); the params
            # must then be set on the model explicitly
            param_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(param_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished loading data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        print("total_batch_size:", total_batch_size)
        log_interval = total_batch_size // 20

        # piecewise learning-rate decay: keep base_learning_rate until
        # epoch_start_decay, then multiply by lr_decay once per epoch
        bd = []
        lr_arr = [base_learning_rate]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = AdagradOptimizer(parameter_list=ptb_model.parameters(),
                               learning_rate=fluid.layers.piecewise_decay(
                                   boundaries=bd, values=lr_arr))

        print("parameters:--------------------------------")
        for para in ptb_model.parameters():
            print(para.name)
        print("parameters:--------------------------------")

        def evaluate(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            model.eval()
            eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            init_hidden = to_variable(init_hidden_data)
            accum_num_recall = 0.0
            for batch_id, batch in enumerate(eval_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                accum_num_recall += acc_
                if batch_id % 1 == 0:  # print every batch
                    print("batch_id:%d recall@20:%.4f" %
                          (batch_id, accum_num_recall / (batch_id + 1)))

                # carry the hidden state across batches
                init_hidden = last_hidden
                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("recall@20 ", accum_num_recall / (batch_id + 1))
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)

                out_loss = dy_loss.numpy()
                acc_ = acc.numpy()[0]
                # carry the hidden state across batches within an epoch
                init_hidden = last_hidden
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)
                ptb_model.clear_gradients()

                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % 100 == 1:
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, acc: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, ppl[0], acc_,
                           sgd._global_learning_rate().numpy()))

            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

            save_model_dir = os.path.join(args.save_model_dir, str(epoch_id),
                                          'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            evaluate(ptb_model, test_data)
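
# The piecewise schedule built above keeps base_learning_rate flat until
# epoch_start_decay and then multiplies it by lr_decay once per additional
# epoch. A minimal standalone sketch of that mapping (not part of the
# original script; defaults mirror the "small" config, with max_epoch
# enlarged to 6 just to make the decay visible):
def _sketch_lr_schedule(base_learning_rate=1.0, lr_decay=0.5,
                        epoch_start_decay=4, max_epoch=6):
    lr_arr = [base_learning_rate]
    for i in range(1, max_epoch):
        lr_arr.append(base_learning_rate *
                      lr_decay**max(i + 1 - epoch_start_decay, 0.0))
    return lr_arr

# _sketch_lr_schedule() -> [1.0, 1.0, 1.0, 1.0, 0.5, 0.25]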
def train_model_2_dnn(
        click_seq0_1,
        embedding_weight,
        save_model_path=f'./t2_dnngru/load_emb_save_model_{cfg.now_phase}'):
    model = Model_2_dnngru(click_seq0_1,
                           embedding_weight,
                           gru_steps=cfg.gru_steps,
                           gru_num_layers=1)

    if not os.path.exists('./t2_dnngru'):
        # create the folder that stores intermediate data and model checkpoints
        os.makedirs('./t2_dnngru')
    data_path = f'./t2_dnngru/list6_0-{cfg.now_phase}.pkl'
    if not os.path.exists(data_path):
        # preprocess the data and cache it
        user_data, click_info, click_id, click_txt, click_img, y_click_id = data_pre_dnngru(
            click_seq0_1)
        pickle.dump((user_data, click_info, click_id, click_txt, click_img,
                     y_click_id), open(data_path, 'wb'))
    else:
        user_data, click_info, click_id, click_txt, click_img, y_click_id = pickle.load(
            open(data_path, 'rb'))

    batch_len = len(user_data) // cfg.batch_size
    total_batch_size = (batch_len - 1) // cfg.gru_steps
    print("total_batch_size:", total_batch_size)

    # opt = fluid.optimizer.Adam(learning_rate=0.05, parameter_list=model.parameters())
    bd = []
    lr_arr = [cfg.base_learning_rate]
    for i in range(1, cfg.max_epoch):
        bd.append(total_batch_size * i)
        new_lr = cfg.base_learning_rate * (cfg.lr_decay**max(
            i + 1 - cfg.epoch_start_decay, 0.0))
        lr_arr.append(new_lr)

    # define the clipping range for gradients
    grad_clip = fluid.clip.GradientClipByGlobalNorm(cfg.max_grad_norm)
    # Adam lowers training accuracy here and plain SGD overfits, so Adagrad is used
    sgd = AdagradOptimizer(parameter_list=model.parameters(),
                           learning_rate=fluid.layers.piecewise_decay(
                               boundaries=bd, values=lr_arr),
                           grad_clip=grad_clip)

    model.train()
    for epoch in range(cfg.max_epoch):
        start_time = time.time()
        train_loader = data_loader_dnngru(user_data, click_info, click_id,
                                          click_txt, click_img, y_click_id,
                                          cfg.batch_size, cfg.gru_steps)
        init_hidden_data = np.zeros(
            (model.gru_num_layers, cfg.batch_size, model.gru_hidden_size),
            dtype='float32')
        init_hidden = to_variable(init_hidden_data)
        for batch_id, data in enumerate(train_loader):
            # split the packed feature tensor back into its fields
            (user_data_pp, click_info_pp, click_id_pp, click_txt_pp,
             click_img_pp, y_click_id_pp) = (data[..., :4], data[..., 4:6],
                                             data[..., 6:7],
                                             data[..., 7:128 + 7],
                                             data[..., 128 + 7:256 + 7],
                                             data[..., 256 + 7:256 + 8])
            user_data_pp = user_data_pp.astype(int)
            (user_id_pp, user_age_level_pp, user_gender_pp,
             user_city_level_pp) = (user_data_pp[:, :, 0],
                                    user_data_pp[:, :, 1],
                                    user_data_pp[:, :, 2],
                                    user_data_pp[:, :, 3])
            user_id_pp = to_variable(user_id_pp)
            user_age_level_pp = to_variable(user_age_level_pp)
            user_gender_pp = to_variable(user_gender_pp)
            user_city_level_pp = to_variable(user_city_level_pp)
            stay_data_pp = to_variable(
                click_info_pp[..., 0:1].astype('float32'))
            click_id_pp = to_variable(click_id_pp[..., 0].astype(int))
            click_txt_pp = to_variable(click_txt_pp.astype('float32'))
            click_img_pp = to_variable(click_img_pp.astype('float32'))
            y_click_id_pp = to_variable(y_click_id_pp.astype(int))

            pred_out, last_hidden = model([
                user_id_pp, user_age_level_pp, user_gender_pp,
                user_city_level_pp
            ], stay_data_pp, [click_id_pp, click_txt_pp, click_img_pp],
                                          init_hidden)
            # detach so the carried hidden state does not keep the old graph alive
            init_hidden = last_hidden.detach()

            # cross-entropy loss
            loss = fluid.layers.softmax_with_cross_entropy(
                logits=pred_out, label=y_click_id_pp, soft_label=False, axis=2)

            # compute the recall@50 metric
            pre_2d = fluid.layers.reshape(pred_out,
                                          shape=[-1, cfg.vocab_size])
            label_2d = fluid.layers.reshape(y_click_id_pp, shape=[-1, 1])
            acc = fluid.layers.accuracy(input=pre_2d, label=label_2d, k=50)
            acc_ = acc.numpy()[0]

            # average the loss over all batches and sequence steps (unlike 5.2)
            loss = fluid.layers.reduce_mean(loss)
            loss.backward()

            sgd.minimize(loss)
            model.clear_gradients()
            out_loss = loss.numpy()

            # periodically print training info
            if batch_id > 0 and batch_id % 100 == 1:
                print("-- Epoch:[%d]; Batch:[%d]; loss: %.5f, acc: %.5f" %
                      (epoch, batch_id, out_loss, acc_))

        print("one epoch finished", epoch)
        print("time cost ", time.time() - start_time)
        print("loss: %.5f, acc: %.5f" % (out_loss, acc_))

        fluid.save_dygraph(model.state_dict(), save_model_path)
        print("Saved model to: %s.\n" % save_model_path)