def train():
    args = parse_args()
    # fix seeds when continuous evaluation (CE) is enabled, so runs are reproducible
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print('---------- Configuration Arguments ----------')
    for key, value in args.__dict__.items():
        print(key + ':' + str(value))

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    # build the DeepFM network and an SGD optimizer with L2 regularization
    loss, auc, data_list, auc_states = ctr_deepfm_model(
        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
        args.act, args.reg)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # feed training files through a pipe reader (criteo_reader.py) via the Dataset API
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ]
    print('---------------------------------------------')

    for epoch_id in range(args.num_epoch):
        start = time.time()
        dataset.set_filelist(train_filelist)
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
            print_period=1000,
            debug=False)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1))
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        # checkpoint the persistable variables after every epoch
        fluid.io.save_persistables(
            executor=exe,
            dirname=model_dir,
            main_program=fluid.default_main_program())
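
# train() above relies on a parse_args() helper that is not shown in this section.
# The sketch below is a minimal, hypothetical version built with argparse, using only
# the attributes the training code actually reads; flag names mirror those attributes
# and all defaults are illustrative, not the repo's real values.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="DeepFM CTR training (sketch)")
    parser.add_argument('--train_data_dir', type=str, default='data/train')
    parser.add_argument('--model_output_dir', type=str, default='models')
    parser.add_argument('--feat_dict', type=str, default='aid_data/feat_dict.pkl2')
    parser.add_argument('--num_epoch', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=4096)
    parser.add_argument('--num_thread', type=int, default=10)
    parser.add_argument('--embedding_size', type=int, default=10)
    parser.add_argument('--num_field', type=int, default=39)
    parser.add_argument('--num_feat', type=int, default=1000000)
    parser.add_argument('--layer_sizes', nargs='+', type=int, default=[400, 400, 400])
    parser.add_argument('--act', type=str, default='relu')
    parser.add_argument('--reg', type=float, default=1e-4)
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--enable_ce', action='store_true',
                        help='fix random seeds for continuous evaluation')
    return parser.parse_args()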
def infer():
    args = parse_args()
    place = fluid.CPUPlace()
    inference_scope = fluid.Scope()

    test_files = [
        os.path.join(args.test_data_dir, x)
        for x in os.listdir(args.test_data_dir)
    ]
    criteo_dataset = CriteoDataset()
    criteo_dataset.setup(args.feat_dict)
    test_reader = paddle.batch(
        criteo_dataset.test(test_files), batch_size=args.batch_size)

    startup_program = fluid.framework.Program()
    test_program = fluid.framework.Program()
    cur_model_path = os.path.join(args.model_output_dir,
                                  'epoch_' + args.test_epoch)

    with fluid.scope_guard(inference_scope):
        with fluid.framework.program_guard(test_program, startup_program):
            loss, auc, data_list, auc_states = ctr_deepfm_model(
                args.embedding_size, args.num_field, args.num_feat,
                args.layer_sizes, args.act, args.reg)

            exe = fluid.Executor(place)
            feeder = fluid.DataFeeder(feed_list=data_list, place=place)

            fluid.io.load_persistables(
                executor=exe,
                dirname=cur_model_path,
                main_program=fluid.default_main_program())

            for var in auc_states:  # reset auc states
                set_zero(var.name, scope=inference_scope, place=place)

            loss_all = 0
            num_ins = 0
            for batch_id, data_test in enumerate(test_reader()):
                loss_val, auc_val = exe.run(test_program,
                                            feed=feeder.feed(data_test),
                                            fetch_list=[loss.name, auc.name])
                num_ins += len(data_test)
                loss_all += loss_val
                logger.info('TEST --> batch: {} loss: {} auc_val: {}'.format(
                    batch_id, loss_all / num_ins, auc_val))

            print('The last log info is the total Logloss and AUC for all test data.')
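
# infer() calls a set_zero() helper that does not appear in this section; it zeroes
# the AUC state variables so the test-set AUC starts from scratch rather than carrying
# over training statistics. A minimal sketch follows, assuming the states are int64
# counter tensors living in the inference scope; the _get_dims() tensor accessor
# reflects the older fluid API this code targets and may differ in other releases.
import numpy as np
import paddle.fluid as fluid


def set_zero(var_name, scope=None, place=None, param_type="int64"):
    """Overwrite the named variable in `scope` with an all-zero tensor."""
    scope = scope if scope is not None else fluid.global_scope()
    place = place if place is not None else fluid.CPUPlace()
    param = scope.var(var_name).get_tensor()
    param_array = np.zeros(param._get_dims()).astype(param_type)
    param.set(param_array, place)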
def train(): """ do training """ args = parse_args() print(args) if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir): os.mkdir(args.model_output_dir) loss, auc, data_list, auc_states = ctr_deepfm_model( args.embedding_size, args.num_field, args.num_feat, args.layer_sizes, args.act, args.reg, args.is_sparse) optimizer = fluid.optimizer.SGD( learning_rate=args.lr, regularization=fluid.regularizer.L2DecayRegularizer(args.reg)) optimizer.minimize(loss) def train_loop(main_program): """ train network """ start_time = time.time() dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(data_list) pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict) dataset.set_pipe_command(pipe_command) dataset.set_batch_size(args.batch_size) dataset.set_thread(args.num_thread) train_filelist = [ os.path.join(args.train_data_dir, x) for x in os.listdir(args.train_data_dir) ] if args.use_gpu == 1: exe = fluid.Executor(fluid.CUDAPlace(0)) dataset.set_thread(1) else: exe = fluid.Executor(fluid.CPUPlace()) dataset.set_thread(args.num_thread) exe.run(fluid.default_startup_program()) for epoch_id in range(args.num_epoch): start = time.time() sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1)) dataset.set_filelist(train_filelist) exe.train_from_dataset( program=main_program, dataset=dataset, fetch_list=[loss, auc], fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"], print_period=5, debug=False) model_dir = os.path.join(args.model_output_dir, 'epoch_' + str(epoch_id + 1)) sys.stderr.write('epoch%d is finished and takes %f s\n' % ( (epoch_id + 1), time.time() - start)) if args.trainer_id == 0: # only trainer 0 save model print("save model in {}".format(model_dir)) fluid.save(main_program, model_dir) print("train time cost {:.4f}".format(time.time() - start_time)) print("finish training") if args.is_local: print("run local training") train_loop(fluid.default_main_program()) else: print("run distribute training") t = fluid.DistributeTranspiler() t.transpile( args.trainer_id, pservers=args.endpoints, trainers=args.trainers) if args.role == "pserver": print("run psever") pserver_prog, pserver_startup = t.get_pserver_programs( args.current_endpoint) exe = fluid.Executor(fluid.CPUPlace()) exe.run(pserver_startup) exe.run(pserver_prog) elif args.role == "trainer": print("run trainer") train_loop(t.get_trainer_program())