# Distributed training entry: builds pserver/trainer programs with fluid.DistributeTranspiler.
def train():
    args = parse_args()
    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = True if args.use_cuda else False
    use_parallel = True if args.parallel else False

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    #data_reader, max_len = reader.prepare_reader(train_path, args.batch_size)
    logger.info("reading data completes")

    avg_cost, pred = network.network(item_count, cat_count, 433)
    #fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))

    # Piecewise learning-rate schedule: base_lr until step 410000, then 0.2.
    base_lr = args.base_lr
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=boundaries, values=values))
    sgd_optimizer.minimize(avg_cost)

    def train_loop(main_program):
        data_reader, max_len = reader.prepare_reader(train_path,
                                                     args.batch_size)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        feeder = fluid.DataFeeder(
            feed_list=[
                "hist_item_seq", "hist_cat_seq", "target_item", "target_cat",
                "label", "mask", "target_item_seq", "target_cat_seq"
            ],
            place=place)
        if use_parallel:
            train_exe = fluid.ParallelExecutor(
                use_cuda=use_cuda,
                loss_name=avg_cost.name,
                main_program=main_program)
        else:
            train_exe = exe

        logger.info("train begins")
        global_step = 0
        PRINT_STEP = 1000

        start_time = time.time()
        loss_sum = 0.0
        for id in range(epoch_num):
            epoch = id + 1
            for data in data_reader():
                global_step += 1
                results = train_exe.run(
                    main_program,
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost.name, pred.name],
                    return_numpy=True)
                loss_sum += results[0].mean()

                if global_step % PRINT_STEP == 0:
                    logger.info(
                        "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                        % (epoch, global_step, loss_sum / PRINT_STEP,
                           time.time() - start_time))
                    start_time = time.time()
                    loss_sum = 0.0

                # Save an inference model every 50k steps before 400k, then every 1k steps.
                if (global_step > 400000 and global_step % PRINT_STEP == 0) or (
                        global_step < 400000 and global_step % 50000 == 0):
                    save_dir = args.model_dir + "/global_step_" + str(
                        global_step)
                    feed_var_name = [
                        "hist_item_seq", "hist_cat_seq", "target_item",
                        "target_cat", "label", "mask", "target_item_seq",
                        "target_cat_seq"
                    ]
                    fetch_vars = [avg_cost, pred]
                    fluid.io.save_inference_model(save_dir, feed_var_name,
                                                  fetch_vars, exe)
        train_exe.close()

    # Split the program into parameter-server and trainer programs.
    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        logger.info("run pserver")
        prog, startup = t.get_pserver_programs(args.current_endpoint)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup)
        exe.run(prog)
    elif args.role == "trainer":
        logger.info("run trainer")
        train_loop(t.get_trainer_program())
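
# The command-line parser used above (parse_args) is defined elsewhere in the
# repo. The sketch below only illustrates the fields this distributed train()
# reads; the flag names mirror the attribute names used above, but the exact
# names, types, and defaults of the real parser are assumptions.
import argparse


def parse_args_sketch():
    parser = argparse.ArgumentParser("DIN distributed training (illustrative)")
    parser.add_argument("--config_path", type=str, default="")    # item/category vocab config
    parser.add_argument("--train_dir", type=str, default="")      # training data directory
    parser.add_argument("--model_dir", type=str, default="")      # checkpoint output directory
    parser.add_argument("--epoch_num", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--use_cuda", type=int, default=0)
    parser.add_argument("--parallel", type=int, default=0)
    parser.add_argument("--base_lr", type=float, default=0.85)
    # distributed role and cluster topology used by the transpiler
    parser.add_argument("--role", type=str, default="pserver")    # "pserver" or "trainer"
    parser.add_argument("--endpoints", type=str, default="")      # comma-separated pserver endpoints
    parser.add_argument("--current_endpoint", type=str, default="")
    parser.add_argument("--trainer_id", type=int, default=0)
    parser.add_argument("--trainers", type=int, default=1)
    return parser.parse_args()
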
# Single-process training entry: DataLoader feeding plus optional continuous-evaluation (CE) output.
def train():
    args = parse_args()

    if args.enable_ce:
        # Fix random seeds so CE runs are reproducible.
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = True if args.use_cuda else False
    use_parallel = True if args.parallel else False

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    data_reader, max_len = reader.prepare_reader(
        train_path, args.batch_size * args.num_devices)
    logger.info("reading data completes")

    avg_cost, pred, feed_list = network.network(item_count, cat_count)
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=5.0))

    # Piecewise learning-rate schedule: base_lr until step 410000, then 0.2.
    base_lr = args.base_lr
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=boundaries, values=values))
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    loader = fluid.io.DataLoader.from_generator(
        feed_list=feed_list, capacity=10000, iterable=True)
    loader.set_sample_list_generator(data_reader, places=place)

    if use_parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe

    logger.info("train begins")

    global_step = 0
    PRINT_STEP = 1000

    total_time = []
    ce_info = []
    start_time = time.time()
    loss_sum = 0.0
    for id in range(epoch_num):
        epoch = id + 1
        for data in loader():
            global_step += 1
            results = train_exe.run(
                feed=data,
                fetch_list=[avg_cost.name, pred.name],
                return_numpy=True)
            loss_sum += results[0].mean()

            if global_step % PRINT_STEP == 0:
                ce_info.append(loss_sum / PRINT_STEP)
                total_time.append(time.time() - start_time)
                logger.info(
                    "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                    % (epoch, global_step, loss_sum / PRINT_STEP,
                       time.time() - start_time))
                start_time = time.time()
                loss_sum = 0.0

            # Save an inference model every 50k steps up to 400k, then every 1k steps.
            if (global_step > 400000 and global_step % PRINT_STEP == 0) or (
                    global_step <= 400000 and global_step % 50000 == 0):
                save_dir = os.path.join(args.model_dir,
                                        "global_step_" + str(global_step))
                feed_var_name = [
                    "hist_item_seq", "hist_cat_seq", "target_item",
                    "target_cat", "label", "mask", "target_item_seq",
                    "target_cat_seq"
                ]
                fetch_vars = [avg_cost, pred]
                fluid.io.save_inference_model(save_dir, feed_var_name,
                                              fetch_vars, exe)
                logger.info("model saved in " + save_dir)
            if args.enable_ce and global_step >= args.batch_num:
                break

    # only for ce
    if args.enable_ce:
        # Emit the KPI lines consumed by the continuous-evaluation system.
        gpu_num = get_cards(args)
        ce_loss = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-1]
            ce_time = total_time[-1]
        except:
            print("ce info error")
        print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, ce_time))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, ce_loss))
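
# get_cards() is referenced by the CE branch above but defined elsewhere in the
# repo. A minimal sketch is given here, assuming the card count comes from
# CUDA_VISIBLE_DEVICES under CE and from args.num_devices otherwise; the real
# helper may differ.
import os


def get_cards(args):
    if args.enable_ce:
        # Count the GPUs visible to this process; fall back to one card.
        cards = os.environ.get("CUDA_VISIBLE_DEVICES", "")
        return len(cards.split(",")) if cards else 1
    return args.num_devices


if __name__ == "__main__":
    train()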