def infer():
    """Load a saved inference model and evaluate test-set loss and AUC.

    Reads ``model_path`` / ``test_path`` from the parsed CLI args, feeds
    batches through the loaded program with a ``DataFeeder``, and logs the
    mean loss and the AUC computed by ``calc_auc``.
    """
    args = parse_args()
    model_path = args.model_path
    use_cuda = bool(args.use_cuda)
    # 32 * 16 = 512: fixed evaluation batch size.
    data_reader, _ = reader.prepare_reader(args.test_path, 32 * 16)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    [inference_program, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(model_path, exe)
    feeder = fluid.DataFeeder(feed_list=feed_target_names,
                              place=place,
                              program=inference_program)
    loss_sum = 0.0
    score = []
    count = 0
    for data in data_reader():
        res = exe.run(inference_program,
                      feed=feeder.feed(data),
                      fetch_list=fetch_targets)
        loss_sum += res[0]
        # data[i][4] is the label field of each sample; accumulate
        # [neg_one_hot, pos_one_hot, prediction] rows for calc_auc.
        for i in range(len(data)):
            if data[i][4] > 0.5:
                score.append([0, 1, res[1][i]])
            else:
                score.append([1, 0, res[1][i]])
        count += 1
    auc = calc_auc(score)
    logger.info("TEST --> loss: {}, auc: {}".format(loss_sum / count, auc))
def train_loop(main_program):
    """Run the training loop over ``main_program``.

    Relies on enclosing-scope names (``train_path``, ``args``, ``use_cuda``,
    ``use_parallel``, ``epoch_num``, ``avg_cost``, ``pred``) — presumably
    defined in the surrounding ``train``-style function; confirm at call site.
    Periodically logs the mean loss and saves inference checkpoints.
    """
    data_reader, _ = reader.prepare_reader(train_path, args.batch_size)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    feeder = fluid.DataFeeder(feed_list=[
        "hist_item_seq", "hist_cat_seq", "target_item", "target_cat",
        "label", "mask", "target_item_seq", "target_cat_seq"
    ],
                              place=place)
    if use_parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name,
                                           main_program=main_program)
    else:
        train_exe = exe

    logger.info("train begins")
    global_step = 0
    PRINT_STEP = 1000
    start_time = time.time()
    loss_sum = 0.0
    for epoch_id in range(epoch_num):
        epoch = epoch_id + 1
        for data in data_reader():
            global_step += 1
            results = train_exe.run(main_program,
                                    feed=feeder.feed(data),
                                    fetch_list=[avg_cost.name, pred.name],
                                    return_numpy=True)
            loss_sum += results[0].mean()

            if global_step % PRINT_STEP == 0:
                logger.info(
                    "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                    % (epoch, global_step, loss_sum / PRINT_STEP,
                       time.time() - start_time))
                start_time = time.time()
                loss_sum = 0.0

            # Save every 50k steps up to (and including) step 400000, then
            # every PRINT_STEP afterwards.  ``<=`` matters: with ``<`` the
            # step-400000 checkpoint would never be written (matches the
            # sibling train() loop).
            if (global_step > 400000 and global_step % PRINT_STEP == 0) or (
                    global_step <= 400000 and global_step % 50000 == 0):
                save_dir = os.path.join(args.model_dir,
                                        "global_step_" + str(global_step))
                feed_var_name = [
                    "hist_item_seq", "hist_cat_seq", "target_item",
                    "target_cat", "label", "mask", "target_item_seq",
                    "target_cat_seq"
                ]
                fetch_vars = [avg_cost, pred]
                fluid.io.save_inference_model(save_dir, feed_var_name,
                                              fetch_vars, exe)
    train_exe.close()
def infer():
    """Load a saved inference model and evaluate test-set loss and AUC.

    DataLoader-based variant: builds a ``fluid.io.DataLoader`` from the
    loaded program's feed targets, iterates the test set, and logs the
    mean loss and the AUC computed by ``calc_auc``.
    """
    args = parse_args()
    model_path = args.model_path
    use_cuda = bool(args.use_cuda)
    # 32 * 16 = 512: fixed evaluation batch size.
    data_reader, _ = reader.prepare_reader(args.test_path, 32 * 16)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    [inference_program, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(model_path, exe)
    loader = fluid.io.DataLoader.from_generator(
        feed_list=[
            inference_program.block(0).var(e) for e in feed_target_names
        ],
        capacity=10000,
        iterable=True)
    loader.set_sample_list_generator(data_reader, places=place)
    loss_sum = 0.0
    score = []
    count = 0
    for data in loader():
        res = exe.run(inference_program, feed=data, fetch_list=fetch_targets)
        loss_sum += res[0]
        # Pull the batch's labels out of the fed dict; accumulate
        # [neg_one_hot, pos_one_hot, prediction] rows for calc_auc.
        label_data = list(np.array(data[0]["label"]))
        for i, label in enumerate(label_data):
            if label > 0.5:
                score.append([0, 1, res[1][i]])
            else:
                score.append([1, 0, res[1][i]])
        count += 1
    auc = calc_auc(score)
    logger.info("TEST --> loss: {}, auc: {}".format(loss_sum / count, auc))
def train():
    """Build the network, train it with SGD + piecewise-decay LR, and
    periodically save inference checkpoints.

    When ``--enable_ce`` is set, seeds the programs for reproducibility,
    stops after ``args.batch_num`` steps, and prints CE kpi lines.
    """
    args = parse_args()
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = bool(args.use_cuda)
    use_parallel = bool(args.parallel)

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    data_reader, max_len = reader.prepare_reader(
        train_path, args.batch_size * args.num_devices)
    logger.info("reading data completes")

    avg_cost, pred, feed_list = network.network(item_count, cat_count)
    # Clip gradients by global norm to stabilize training.
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=5.0))
    base_lr = args.base_lr
    # LR drops to 0.2 after step 410000.
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries,
                                                   values=values))
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    loader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                capacity=10000,
                                                iterable=True)
    loader.set_sample_list_generator(data_reader, places=place)

    if use_parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name)
    else:
        train_exe = exe

    logger.info("train begins")
    global_step = 0
    PRINT_STEP = 1000
    total_time = []
    ce_info = []
    start_time = time.time()
    loss_sum = 0.0
    for epoch_id in range(epoch_num):
        epoch = epoch_id + 1
        for data in loader():
            global_step += 1
            results = train_exe.run(feed=data,
                                    fetch_list=[avg_cost.name, pred.name],
                                    return_numpy=True)
            loss_sum += results[0].mean()

            if global_step % PRINT_STEP == 0:
                ce_info.append(loss_sum / PRINT_STEP)
                total_time.append(time.time() - start_time)
                logger.info(
                    "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                    % (epoch, global_step, loss_sum / PRINT_STEP,
                       time.time() - start_time))
                start_time = time.time()
                loss_sum = 0.0

            # Save every 50k steps up to (and including) step 400000, then
            # every PRINT_STEP afterwards.
            if (global_step > 400000 and global_step % PRINT_STEP == 0) or (
                    global_step <= 400000 and global_step % 50000 == 0):
                save_dir = os.path.join(args.model_dir,
                                        "global_step_" + str(global_step))
                feed_var_name = [
                    "hist_item_seq", "hist_cat_seq", "target_item",
                    "target_cat", "label", "mask", "target_item_seq",
                    "target_cat_seq"
                ]
                fetch_vars = [avg_cost, pred]
                fluid.io.save_inference_model(save_dir, feed_var_name,
                                              fetch_vars, exe)
                logger.info("model saved in " + save_dir)
            if args.enable_ce and global_step >= args.batch_num:
                break

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        ce_loss = 0
        ce_time = 0
        # IndexError is the only expected failure here: PRINT_STEP was
        # never reached, so the kpi lists are empty.
        try:
            ce_loss = ce_info[-1]
            ce_time = total_time[-1]
        except IndexError:
            print("ce info error")
        print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, ce_time))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, ce_loss))