def test_save_load_same_result(self):
    # Verify a dygraph-to-static converted function gives the same result
    # after a fluid.save / fluid.load round trip as eager execution does.
    x = np.random.randn(30, 10, 32).astype('float32')
    weight = np.random.randn(32, 64).astype('float32')
    with fluid.dygraph.guard(place):
        dygraph_result = simple_func(x, weight)
    main_program, startup_program, inputs, outputs = decorated_simple_func(
        x, weight)
    exe = fluid.Executor(place)
    exe.run(startup_program)
    fluid.save(main_program, "./test_dy2stat_save_load")
    # set vars to zero so that we can test load in same file
    for var in main_program.list_vars():
        if isinstance(var, framework.Parameter) or var.persistable:
            tensor = fluid.global_scope().find_var(var.name).get_tensor()
            tensor.set(np.zeros_like(np.array(tensor)), place)
            # make sure all the parameter or optimizer vars have been set to zero
            tensor_np = np.array(fluid.global_scope().find_var(
                var.name).get_tensor())
            self.assertEqual(0, np.sum(np.abs(tensor_np)))
    # Reload the saved weights and re-run the static program.
    fluid.load(main_program, "./test_dy2stat_save_load")
    static_result = exe.run(main_program,
                            feed={inputs[0].name: x},
                            fetch_list=outputs)
    self.assertTrue(np.allclose(dygraph_result.numpy(), static_result))
def checkpoints(epoch, cfg, trainer, name):
    """Persist the trainer's program under ``<cfg.output>/checkpoints/<epoch>``."""
    ckpt_dir = os.path.join(cfg.output, 'checkpoints', str(epoch))
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    fluid.save(trainer.program, os.path.join(ckpt_dir, name))
    print('save checkpoints {} to {}'.format(name, ckpt_dir))
    sys.stdout.flush()
def save_model(self, save_dir):
    """Save the program, model metadata and (optionally) pruning shapes."""
    # Make sure save_dir is a directory, replacing any plain file there.
    if not osp.isdir(save_dir):
        if osp.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)
    # Prefer the training program; fall back to the test program.
    if self.train_prog is not None:
        fluid.save(self.train_prog, osp.join(save_dir, 'model'))
    else:
        fluid.save(self.test_prog, osp.join(save_dir, 'model'))
    model_info = self.get_model_info()
    model_info['status'] = self.status
    with open(osp.join(save_dir, 'model.yml'), encoding='utf-8',
              mode='w') as f:
        yaml.dump(model_info, f)
    # Save evaluation results (if a previous evaluation produced them)
    if hasattr(self, 'eval_details'):
        with open(osp.join(save_dir, 'eval_details.json'), 'w') as f:
            json.dump(self.eval_details, f)
    if self.status == 'Prune':
        # Save the pruned parameter shapes
        shapes = {}
        for block in self.train_prog.blocks:
            for param in block.all_parameters():
                pd_var = fluid.global_scope().find_var(param.name)
                pd_param = pd_var.get_tensor()
                shapes[param.name] = np.array(pd_param).shape
        with open(osp.join(save_dir, 'prune.yml'), encoding='utf-8',
                  mode='w') as f:
            yaml.dump(shapes, f)
    # Flag file marking a successful save
    open(osp.join(save_dir, '.success'), 'w').close()
    logging.info("Model saved in {}.".format(save_dir))
def train_loop(args, train_program, reader, data_loader, loss, trainer_id,
               weight):
    """Run CPU training passes, logging loss and checkpointing periodically."""
    # Feed batches converted on the fly from the python reader.
    data_loader.set_batch_generator(
        convert_python_to_tensor(weight, args.batch_size, reader.train()))
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
    train_exe = exe
    for pass_id in range(args.num_passes):
        data_loader.start()
        # NOTE(review): presumably gives the loader queue time to fill — confirm.
        time.sleep(10)
        epoch_start = time.time()
        batch_id = 0
        start = time.time()
        try:
            while True:
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)
                if batch_id % args.print_batch == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}"
                        .format(pass_id, batch_id, loss_val.mean(),
                                data_loader.queue.size()))
                if args.with_speed:
                    # Throughput report every 500 batches.
                    if batch_id % 500 == 0 and batch_id != 0:
                        elapsed = (time.time() - start)
                        start = time.time()
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))
                # Periodic checkpoint; only trainer 0 writes to disk.
                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        fluid.save(fluid.default_main_program(),
                                   model_path=model_dir)
                        print("model saved in %s" % model_dir)
                batch_id += 1
        except fluid.core.EOFException:
            # Reader exhausted: end of this pass; save the epoch checkpoint.
            data_loader.reset()
            epoch_end = time.time()
            logger.info("Epoch: {0}, Train total expend: {1} ".format(
                pass_id, epoch_end - epoch_start))
            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
                fluid.save(fluid.default_main_program(), model_path=model_dir)
                print("model saved in %s" % model_dir)
def save_model(exe, program, save_dir, model_name, postfix=''):
    """Save parameters and optimizer-related variables of ``program``.

    ``exe`` is accepted for interface compatibility but is not used by
    ``fluid.save``.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    target = os.path.join(save_dir, model_name + postfix)
    fluid.save(program, target)
def save_model(program, model_path, epoch_id, prefix='ppcls'):
    """Save ``program`` under ``<model_path>/<epoch_id>/<prefix>``."""
    epoch_dir = os.path.join(model_path, str(epoch_id))
    _mkdir_if_not_exist(epoch_dir)
    prefix_path = os.path.join(epoch_dir, prefix)
    fluid.save(program, prefix_path)
    logger.info("Already save model in {}".format(epoch_dir))
def train_loop(main_program, avg_cost, acc, train_input_data, place, args,
               train_reader):
    """Run the training epochs, checkpoint each epoch, and emit CE kpis.

    Fix: the continuous-evaluation block used a bare ``except:``, which also
    swallows ``KeyboardInterrupt``/``SystemExit``; it now catches only the
    ``IndexError`` raised when ``ce_info`` has fewer than two entries.
    """
    data_list = [var.name for var in train_input_data]
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    train_exe = exe
    total_time = 0.0
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(train_reader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=feeder.feed(data),
                fetch_list=[avg_cost.name, acc.name])
            # Track per-batch accuracy for continuous evaluation.
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id, np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.save(fluid.default_main_program(), save_dir)
        print("model saved in %s" % save_dir)

    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            # Second-to-last batch accuracy; may not exist for tiny runs.
            ce_acc = ce_info[-2]
        except IndexError:
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))
def save(exe, prog, path):
    """
    Save model to the given path.

    Fix: the docstring previously said "Load model", contradicting the
    function's behavior (it writes weights with ``fluid.save``).

    Args:
        exe (fluid.Executor): The fluid.Executor object.
        prog (fluid.Program): save weight from which Program object.
        path (string): the path to save model.
    """
    # Remove a stale directory at the target so the save starts clean.
    if os.path.isdir(path):
        shutil.rmtree(path)
    logger.info('Save model to {}.'.format(path))
    fluid.save(prog, path)
def save_checkpoint(program, ckpt_name):
    """Save a checkpoint of ``program`` for evaluation or resumed training.

    Returns the directory the checkpoint was written to.
    """
    ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name))
    print("Save model checkpoint to {}".format(ckpt_dir))
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)
    model_prefix = os.path.join(ckpt_dir, 'model')
    fluid.save(program, model_prefix)
    return ckpt_dir
def train():
    """Train the selected network on the dataset and checkpoint per epoch.

    Fixes: the model builder is now looked up with ``getattr`` instead of
    ``eval()`` on a command-line string (same result for valid names, no
    arbitrary-code risk), and ``os.makedirs`` replaces ``os.mkdir`` so
    missing parent directories do not abort the run.
    """
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print(args)

    if not os.path.isdir(args.model_output_dir):
        os.makedirs(args.model_output_dir)

    # Resolve the network builder by name; equivalent to the former
    # eval('network_conf.' + args.model_name) without executing user input.
    model_fn = getattr(network_conf, args.model_name)
    loss, auc, data_list, auc_states = model_fn(
        args.embedding_size, args.num_field, args.num_feat,
        args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    dataset.set_pipe_command('python criteo_reader.py')
    dataset.set_batch_size(args.batch_size)
    dataset.set_filelist([
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ])

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[loss, auc],
                               fetch_info=['loss', 'auc'],
                               debug=False,
                               print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.save(fluid.default_main_program(), model_dir)
def save_checkpoint(program, ckpt_name, epoch=0):
    """Save a checkpoint plus the epoch number it belongs to.

    Returns the checkpoint directory.
    """
    ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name))
    print("Save model checkpoint to {}".format(ckpt_dir))
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)
    # Record the epoch alongside the weights so training can resume from it.
    epoch_file = os.path.join(ckpt_dir, 'epoch.txt')
    with open(epoch_file, 'w') as f:
        f.write('{}'.format(epoch))
    fluid.save(program, os.path.join(ckpt_dir, 'model'))
    return ckpt_dir
def train_loop(main_program):
    """Train the DCN network with train_from_dataset, saving each epoch."""
    start_time = time.time()
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(dcn_model.data_list)
    # Batches are produced by an external reader process per file.
    pipe_command = 'python reader.py {}'.format(args.vocab_dir)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, fname)
        for fname in next(os.walk(args.train_data_dir))[2]
    ]
    dataset.set_filelist(train_filelist)

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(
            program=main_program,
            dataset=dataset,
            fetch_list=[
                dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
            ],
            fetch_info=['total_loss', 'avg_logloss', 'auc'],
            debug=False,
            print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        if args.trainer_id == 0:  # only trainer 0 save model
            print("save model in {}".format(model_dir))
            fluid.save(main_program, model_dir)
    print("train time cost {:.4f}".format(time.time() - start_time))
    print("finish training")
def save_model(exe, program, save_dir, model_name, postfix=None,
               save_type='.pdckpt'):
    """
    save_type: '.pdckpt' or '.pdparams', '.pdckpt' for all persistable
    variables, '.pdparams' for parameters only

    Fix: the default ``postfix=None`` made ``model_name + postfix`` raise
    ``TypeError``; ``None`` now behaves like an empty postfix. ``exe`` and
    ``save_type`` are kept for interface compatibility but are not used by
    ``fluid.save``.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + (postfix or '')
    fluid.save(program, os.path.join(save_dir, saved_model_name))
def save_model(exe, program, save_dir, model_name, postfix=None):
    """Save ``program`` into ``save_dir`` as ``model_name + postfix``.

    Fix: the default ``postfix=None`` made ``model_name + postfix`` raise
    ``TypeError``; ``None`` now behaves like an empty string. ``exe`` is
    unused but kept for interface compatibility. The superseded
    ``fluid.io.save_persistables``/``save_vars`` code that was left
    commented out has been removed.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + (postfix or '')
    fluid.save(program, os.path.join(save_dir, saved_model_name))
def local_train(args):
    """Train the CTR model locally on CPU, checkpointing each epoch."""
    # Build the model network
    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # Choose the optimizer for the backward update
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer.minimize(avg_cost)

    # Create the executor on CPU and initialize parameters
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # Set up the training dataset reader and file list
    dataset, file_list = get_dataset(inputs, args)

    logger.info("Training Begin")
    for epoch in range(args.epochs):
        # Shuffle at file granularity
        random.shuffle(file_list)
        dataset.set_filelist(file_list)

        # Use train_from_dataset for multi-threaded concurrent training
        start_time = time.time()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[auc_var],
                               fetch_info=["Epoch {} auc ".format(epoch)],
                               print_period=100,
                               debug=False)
        end_time = time.time()
        logger.info("epoch %d finished, use time=%d\n" %
                    ((epoch), end_time - start_time))

        if args.save_model:
            model_path = os.path.join(str(args.model_path),
                                      "epoch_" + str(epoch))
            if not os.path.isdir(model_path):
                os.mkdir(model_path)
            fluid.save(fluid.default_main_program(),
                       os.path.join(model_path, "checkpoint"))
    logger.info("Train Success!")
def train_loop(main_program):
    """Train the network with train_from_dataset; trainer 0 saves per epoch."""
    start_time = time.time()
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    # Batches come from an external reader process fed the feature dict.
    pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ]

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        dataset.set_filelist(train_filelist)
        exe.train_from_dataset(
            program=main_program,
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
            print_period=5,
            debug=False)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1))
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        if args.trainer_id == 0:  # only trainer 0 save model
            print("save model in {}".format(model_dir))
            fluid.save(main_program, model_dir)
    print("train time cost {:.4f}".format(time.time() - start_time))
    print("finish training")
def save_model(self, save_dir):
    """Dump weights, inference program and metadata into ``save_dir``."""
    # Make sure save_dir is a directory, replacing any plain file there.
    if not osp.isdir(save_dir):
        if osp.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)
    model_info = self.get_model_info()
    model_info['status'] = self.status
    if self.status == 'Normal':
        fluid.save(self.train_prog, osp.join(save_dir, 'model'))
        save_infer_program(self.test_prog, save_dir)
    yml_path = osp.join(save_dir, 'model.yml')
    with open(yml_path, encoding='utf-8', mode='w') as f:
        yaml.dump(model_info, f)
    # The flag of model for saving successfully
    open(osp.join(save_dir, '.success'), 'w').close()
    logging.info("Model saved in {}.".format(save_dir))
def train():
    """Train the CTR model with train_from_dataset, saving at intervals."""
    # NOTE(review): if cfg.train_model is not 'dnn', `model` is unbound and
    # the next line raises NameError — confirm cfg always sets 'dnn' here.
    if cfg.train_model == 'dnn':
        model = DNN()
    inputs = model.input_data()
    avg_cost, auc_var = model.net(inputs)
    optimizer = fluid.optimizer.Adam(cfg.learning_rate)
    optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if cfg.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    dataset, file_list = get_dataset(inputs)

    logger.info("Training Begin")
    for epoch in range(cfg.epoches):
        # Shuffle at file granularity between epochs.
        random.shuffle(file_list)
        dataset.set_filelist(file_list)
        start_time = time.time()
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[avg_cost, auc_var],
            fetch_info=['Epoch {} cost: '.format(epoch + 1), ' - auc: '],
            print_period=cfg.log_interval,
            debug=False)
        end_time = time.time()
        logger.info("epoch %d finished, use time = %ds \n" %
                    ((epoch + 1), end_time - start_time))

        # Checkpoint every cfg.save_interval epochs.
        if (epoch + 1) % cfg.save_interval == 0:
            model_path = os.path.join(str(cfg.save_path), model.name,
                                      model.name + "_epoch_" + str(epoch + 1))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            logger.info("saving model to %s \n" % (model_path))
            fluid.save(fluid.default_main_program(),
                       os.path.join(model_path, "checkpoint"))
    logger.info("Done.")
def save_model(self, save_dir):
    """Save the model and metadata; handles 'Normal' and 'Quant' status."""
    # Make sure save_dir is a directory, replacing any plain file there.
    if not osp.isdir(save_dir):
        if osp.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)
    model_info = self.get_model_info()
    if self.status == 'Normal':
        fluid.save(self.train_prog, osp.join(save_dir, 'model'))
    elif self.status == 'Quant':
        # Convert the quantized test program and export an inference model
        # with all weights packed into a single '__params__' file.
        float_prog, _ = slim.quant.convert(self.test_prog,
                                           self.exe.place,
                                           save_int8=True)
        test_input_names = [
            var.name for var in list(self.test_inputs.values())
        ]
        test_outputs = list(self.test_outputs.values())
        fluid.io.save_inference_model(dirname=save_dir,
                                      executor=self.exe,
                                      params_filename='__params__',
                                      feeded_var_names=test_input_names,
                                      target_vars=test_outputs,
                                      main_program=float_prog)
        # Record input/output variable names so the exported model can be
        # re-wired to the original named tensors at load time.
        model_info['_ModelInputsOutputs'] = dict()
        model_info['_ModelInputsOutputs']['test_inputs'] = [
            [k, v.name] for k, v in self.test_inputs.items()
        ]
        model_info['_ModelInputsOutputs']['test_outputs'] = [
            [k, v.name] for k, v in self.test_outputs.items()
        ]
    model_info['status'] = self.status
    with open(osp.join(save_dir, 'model.yml'), encoding='utf-8',
              mode='w') as f:
        yaml.dump(model_info, f)
    # The flag of model for saving successfully
    open(osp.join(save_dir, '.success'), 'w').close()
    logging.info("Model saved in {}.".format(save_dir))
def test_ptb_rnn_cpu_bfloat16(self):
    # Round-trip test: train a PTB model under pure-bf16 AMP, save with
    # fluid.save, zero all persistables, reload, and check values restored.
    seed = 90
    hidden_size = 10
    vocab_size = 500
    num_layers = 1
    num_steps = 3
    init_scale = 0.1
    batch_size = 4
    batch_num = 100

    with new_program_scope():
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale)

        place = self.set_place()
        exe = fluid.Executor(place)
        sgd = SGDOptimizer(learning_rate=1e-3)
        x = fluid.layers.data(name="x", shape=[-1, num_steps], dtype='int64')
        y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
        init_hidden = fluid.layers.data(name="init_hidden",
                                        shape=[1],
                                        dtype='float32')
        init_cell = fluid.layers.data(name="init_cell",
                                      shape=[1],
                                      dtype='float32')

        static_loss, static_last_hidden, static_last_cell = ptb_model(
            x, y, init_hidden, init_cell)

        # Wrap the optimizer for pure-bf16 training, keeping transpose2 and
        # concat in fp32.
        sgd = paddle.static.amp.bf16.decorate_bf16(
            sgd,
            amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
                custom_fp32_list={'transpose2', 'concat'}),
            use_bf16_guard=False,
            use_pure_bf16=True)

        sgd.minimize(static_loss, framework.default_startup_program())
        out = exe.run(framework.default_startup_program())

        for i in range(batch_num):
            x_data = np.arange(12).reshape(4, 3).astype('int64')
            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
            x_data = x_data.reshape((-1, num_steps, 1))
            y_data = y_data.reshape((-1, 1))
            #TODO investigate initializing model with "float32" instead of "uint16" as it was before
            # slice_op PR(datatypes in model graph are different than datatypes during runtime because of that)
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='uint16')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='uint16')
            fetch_list = [static_loss, static_last_hidden, static_last_cell]
            out = exe.run(fluid.default_main_program(),
                          feed={
                              "x": x_data,
                              "y": y_data,
                              "init_hidden": init_hidden_data,
                              "init_cell": init_cell_data
                          },
                          fetch_list=fetch_list)

        # get value before save
        main_program = framework.default_main_program()
        base_map = {}
        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                # make sure all the parameter or optimizer vars have been updated
                self.assertTrue(np.sum(np.abs(t)) != 0)
                base_map[var.name] = t

        fluid.save(main_program, "./test_1")

        # set var to zero
        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                ten = fluid.global_scope().find_var(var.name).get_tensor()
                ten.set(np.zeros_like(np.array(ten)), place)

                new_t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                # make sure all the parameter or optimizer vars have been set to zero
                self.assertTrue(np.sum(np.abs(new_t)) == 0)

        # NOTE(review): load uses the '.pdparams' suffix while the save above
        # used the bare prefix "./test_1" — presumably fluid.load accepts
        # both forms; confirm against the fluid.load API.
        fluid.load(main_program, "./test_1.pdparams", exe)

        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                new_t = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                base_t = base_map[var.name]
                self.assertTrue(np.array_equal(new_t, base_t))
def save_model(program, model_path):
    """Persist ``program`` at ``model_path`` and log the location."""
    fluid.save(program, model_path)
    logger.info("Already save model in {}".format(model_path))
def main(args):
    """
    Main Function: build train/eval/infer programs per the ``do_*`` flags,
    run training with periodic checkpoints and validation, then final
    evaluation/inference.

    Fix: the continuous-evaluation block used a bare ``except:``, which also
    swallows ``KeyboardInterrupt``/``SystemExit``; it now catches only the
    ``IndexError`` raised when ``ce_info`` has fewer than two entries.
    """
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processor = reader.EmoTectProcessor(data_dir=args.data_dir,
                                        vocab_path=args.vocab_path,
                                        random_seed=args.random_seed)
    num_labels = args.num_labels

    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size, phase='train', epoch=args.epoch)
        num_train_examples = processor.get_num_examples(phase="train")
        max_train_steps = args.epoch * num_train_examples // args.batch_size + 1

        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_loader, loss, accuracy, num_seqs = create_model(
                    args, num_labels=num_labels, is_prediction=False)
                sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
                sgd_optimizer.minimize(loss)

        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val:
        # Validate on 'dev' while training, otherwise evaluate on 'test'.
        if args.do_train:
            test_data_generator = processor.data_generator(
                batch_size=args.batch_size, phase='dev', epoch=1)
        else:
            test_data_generator = processor.data_generator(
                batch_size=args.batch_size, phase='test', epoch=1)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_loader, loss, accuracy, num_seqs = create_model(
                    args, num_labels=num_labels, is_prediction=False)
        test_prog = test_prog.clone(for_test=True)

    if args.do_infer:
        infer_data_generator = processor.data_generator(
            batch_size=args.batch_size, phase='infer', epoch=1)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_loader, probs, _ = create_model(args,
                                                      num_labels=num_labels,
                                                      is_prediction=True)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint:
            utils.init_checkpoint(exe,
                                  args.init_checkpoint,
                                  main_program=startup_prog)
    elif args.do_val or args.do_infer:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or infer!")
        utils.init_checkpoint(exe, args.init_checkpoint,
                              main_program=test_prog)

    if args.do_train:
        train_exe = exe
        train_loader.set_sample_list_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val:
        test_exe = exe
        test_loader.set_sample_list_generator(test_data_generator)
    if args.do_infer:
        test_exe = exe
        infer_loader.set_sample_list_generator(infer_data_generator)

    if args.do_train:
        train_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try:
                steps += 1
                # Fetch metrics only on logging steps to reduce overhead.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_list = []
                outputs = train_exe.run(program=train_program,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                if steps % args.skip_steps == 0:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if args.verbose:
                        verbose = "train loader queue size: %d, " % \
                            train_loader.queue.size()
                        print(verbose)
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, avg loss: %f, "
                          "avg acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.save_checkpoint_dir,
                                             "step_" + str(steps))
                    fluid.save(train_program, save_path)
                if steps % args.validation_steps == 0:
                    # evaluate on dev set
                    if args.do_val:
                        evaluate(test_exe, test_prog, test_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
            except fluid.core.EOFException:
                # Data exhausted: final evaluation + checkpoint, then stop.
                print("final step: %d " % steps)
                if args.do_val:
                    evaluate(test_exe, test_prog, test_loader,
                             [loss.name, accuracy.name, num_seqs.name], "dev")
                save_path = os.path.join(args.save_checkpoint_dir,
                                         "step_" + str(steps))
                fluid.save(train_program, save_path)
                train_loader.reset()
                break

    if args.do_train and args.enable_ce:
        card_num = get_cards()
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except IndexError:
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" %
              (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))

    # evaluate on test set
    if not args.do_train and args.do_val:
        print("Final test result:")
        evaluate(test_exe, test_prog, test_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")

    # infer
    if args.do_infer:
        print("Final infer result:")
        infer(test_exe, test_prog, infer_loader, [probs.name], "infer")
def train():
    """Train the VAE with KL annealing, early-stopping LR decay and CE kpis.

    Fix: the continuous-evaluation block used a bare ``except:``, which also
    swallows ``KeyboardInterrupt``/``SystemExit``; it now catches only the
    ``IndexError`` raised when ``ce_time``/``ce_ppl`` are empty.
    """
    ce_time = []
    ce_ppl = []
    max_epoch = args.max_epoch
    kl_w = args.kl_start          # annealed KL weight
    lr_w = args.learning_rate     # current learning rate
    best_valid_nll = 1e100
    best_epoch_id = -1
    decay_cnt = 0
    max_decay = args.max_decay
    decay_factor = 0.5
    decay_ts = 2                  # epochs without improvement before decay
    steps_not_improved = 0
    for epoch_id in range(max_epoch):
        start_time = time.time()
        if args.enable_ce:
            train_data_iter = reader.get_data_iter(train_data,
                                                   batch_size,
                                                   args.sort_cache,
                                                   args.cache_num,
                                                   enable_ce=True)
        else:
            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   args.sort_cache,
                                                   args.cache_num)

        total_loss = 0
        total_rec_loss = 0
        total_kl_loss = 0
        word_count = 0.0
        batch_count = 0.0
        batch_times = []
        for batch_id, batch in enumerate(train_data_iter):
            batch_start_time = time.time()
            # Anneal the KL term weight toward 1.0.
            kl_w = min(1.0, kl_w + anneal_r)
            kl_weight = kl_w
            input_data_feed, src_word_num, dec_word_sum = prepare_input(
                batch, kl_weight, lr_w)
            fetch_outs = exe.run(program=train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, kl_loss.name, rec_loss.name
                                 ],
                                 use_program_cache=False)
            cost_train = np.array(fetch_outs[0])
            kl_cost_train = np.array(fetch_outs[1])
            rec_cost_train = np.array(fetch_outs[2])

            total_loss += cost_train * batch_size
            total_rec_loss += rec_cost_train * batch_size
            total_kl_loss += kl_cost_train * batch_size
            word_count += dec_word_sum
            batch_count += batch_size
            batch_end_time = time.time()
            batch_time = batch_end_time - batch_start_time
            batch_times.append(batch_time)

            if batch_id > 0 and batch_id % 200 == 0:
                print("-- Epoch:[%d]; Batch:[%d]; Time: %.4f s; "
                      "kl_weight: %.4f; kl_loss: %.4f; rec_loss: %.4f; "
                      "nll: %.4f; ppl: %.4f" %
                      (epoch_id, batch_id, batch_time, kl_w,
                       total_kl_loss / batch_count,
                       total_rec_loss / batch_count,
                       total_loss / batch_count,
                       np.exp(total_loss / word_count)))
                ce_ppl.append(np.exp(total_loss / word_count))

        end_time = time.time()
        epoch_time = end_time - start_time
        ce_time.append(epoch_time)
        print("\nTrain epoch:[%d]; Epoch Time: %.4f; avg_time: %.4f s/step\n"
              % (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))

        val_nll, val_ppl = eval(valid_data)
        print("dev ppl", val_ppl)
        test_nll, test_ppl = eval(test_data)
        print("test ppl", test_ppl)

        if val_nll < best_valid_nll:
            # New best dev score: remember test metrics and checkpoint.
            best_valid_nll = val_nll
            steps_not_improved = 0
            best_nll = test_nll
            best_ppl = test_ppl
            best_epoch_id = epoch_id
            save_path = os.path.join(args.model_path,
                                     "epoch_" + str(best_epoch_id),
                                     "checkpoint")
            print("save model {}".format(save_path))
            fluid.save(main_program, save_path)
        else:
            steps_not_improved += 1
            if steps_not_improved == decay_ts:
                # Decay the LR and roll back to the best checkpoint.
                old_lr = lr_w
                lr_w *= decay_factor
                steps_not_improved = 0
                new_lr = lr_w
                print('-----\nchange lr, old lr: %f, new lr: %f\n-----' %
                      (old_lr, new_lr))
                dir_name = args.model_path + "/epoch_" + str(best_epoch_id)
                fluid.load(main_program, dir_name, exe)
                decay_cnt += 1
                if decay_cnt == max_decay:
                    break

    print('\nbest testing nll: %.4f, best testing ppl %.4f\n' %
          (best_nll, best_ppl))
    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except IndexError:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
def testLoadStaticModel(self):
    # Build a static-graph program containing one instance of each layer
    # type, save its state with fluid.save, then load the state dict into
    # an equivalent dygraph model and check every parameter matches.
    # static mode
    a = fluid.data(name="a", shape=[10, 10])
    conv_in = fluid.data(name="conv_in", shape=[None, 10, 10, 10])
    fc_out1 = fluid.layers.fc(a, 10)
    fc_out2 = fluid.layers.fc(a, 20)
    conv_out_1 = fluid.layers.conv2d(conv_in,
                                     num_filters=10,
                                     filter_size=5,
                                     act="relu")
    conv_out_2 = fluid.layers.conv2d(conv_in,
                                     num_filters=10,
                                     filter_size=5,
                                     act="relu")
    conv3d_in = fluid.data(name='conv3d_in',
                           shape=[None, 3, 12, 32, 32],
                           dtype='float32')
    conv3d_out_1 = fluid.layers.conv3d(input=conv3d_in,
                                       num_filters=2,
                                       filter_size=3,
                                       act="relu")
    conv3d_out_2 = fluid.layers.conv3d(input=conv3d_in,
                                       num_filters=2,
                                       filter_size=3,
                                       act="relu")
    batchnorm_in = fluid.data(name="batchnorm_in",
                              shape=[None, 10],
                              dtype='float32')
    batchnorm_out_1 = fluid.layers.batch_norm(batchnorm_in)
    batchnorm_out_2 = fluid.layers.batch_norm(batchnorm_in)
    emb_in = fluid.data(name='emb_in', shape=[None, 10], dtype='int64')
    emb_out_1 = fluid.embedding(emb_in, [1000, 100])
    emb_out_2 = fluid.embedding(emb_in, [2000, 200])
    layernorm = fluid.data(name="ln", shape=[None, 10], dtype='float32')
    layernorm_1 = fluid.layers.layer_norm(layernorm)
    layernorm_2 = fluid.layers.layer_norm(layernorm)
    nce_in = fluid.data(name="nce_in", shape=[None, 100], dtype='float32')
    nce_label = fluid.data(name="nce_label", shape=[None, 10], dtype='int64')
    nce_out_1 = fluid.layers.nce(nce_in, nce_label, 10000)
    nce_out_2 = fluid.layers.nce(nce_in, nce_label, 10000)
    prelu_in = fluid.data(name="prelu_in",
                          shape=[None, 5, 10, 10],
                          dtype='float32')
    prelu_out_1 = fluid.layers.prelu(prelu_in, "channel")
    prelu_out_2 = fluid.layers.prelu(prelu_in, "channel")
    bilinear_tensor_pro_x = fluid.data("t1", shape=[None, 5],
                                       dtype="float32")
    bilinear_tensor_pro_y = fluid.data("t2", shape=[None, 4],
                                       dtype="float32")
    bilinear_tensor_pro_out_1 = fluid.layers.bilinear_tensor_product(
        x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)
    bilinear_tensor_pro_out_2 = fluid.layers.bilinear_tensor_product(
        x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)
    conv2d_trans_in = fluid.data(name="conv2d_trans_in",
                                 shape=[None, 10, 10, 10])
    conv2d_trans_out_1 = fluid.layers.conv2d_transpose(conv2d_trans_in,
                                                       num_filters=10,
                                                       filter_size=5,
                                                       act="relu")
    conv2d_trans_out_2 = fluid.layers.conv2d_transpose(conv2d_trans_in,
                                                       num_filters=10,
                                                       filter_size=5,
                                                       act="relu")
    conv3d_trans_in = fluid.data(name='conv3d_trans_in',
                                 shape=[None, 3, 12, 32, 32],
                                 dtype='float32')
    conv3d_trans_out_1 = fluid.layers.conv3d_transpose(
        input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")
    conv3d_trans_out_2 = fluid.layers.conv3d_transpose(
        input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")
    groupnorm_in = fluid.data(name='groupnorm_in',
                              shape=[None, 8, 32, 32],
                              dtype='float32')
    groupnorm_out1 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
    groupnorm_out2 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
    '''
    spec_norm = fluid.data(name='spec_norm', shape=[2, 8, 32, 32], dtype='float32')
    spe_norm_out_1 = fluid.layers.spectral_norm(weight=spec_norm, dim=1, power_iters=2)
    spe_norm_out_2 = fluid.layers.spectral_norm(weight=spec_norm, dim=1, power_iters=2)
    '''
    nodes_vector = fluid.data(name='vectors',
                              shape=[None, 10, 5],
                              dtype='float32')
    edge_set = fluid.data(name='edge_set',
                          shape=[None, 10, 2],
                          dtype='float32')
    tree_conv_out1 = fluid.contrib.layers.tree_conv(nodes_vector, edge_set,
                                                    6, 1, 2)
    tree_conv_out2 = fluid.contrib.layers.tree_conv(nodes_vector, edge_set,
                                                    6, 1, 2)
    para1 = fluid.layers.create_parameter([100, 100],
                                          'float32',
                                          name="weight_test_1")
    para2 = fluid.layers.create_parameter([20, 200],
                                          'float32',
                                          name="weight_test_2")

    para_list = fluid.default_main_program().list_vars()

    exe = fluid.Executor(fluid.CPUPlace(
    ) if not fluid.is_compiled_with_cuda() else fluid.CUDAPlace(0))
    out = exe.run(framework.default_startup_program())
    fluid.save(framework.default_main_program(), "./test_1")

    para_dict = fluid.load_program_state("./test_1")

    # Static fc layers are named "fc_*"; the dygraph equivalents are
    # Linear layers, so rename the keys before loading.
    new_dict = {}
    for k, v in para_dict.items():
        if k.startswith("fc"):
            name = k.replace("fc", "linear", 1)
            new_dict[name] = v
        else:
            new_dict[k] = v

    with fluid.dygraph.guard():

        class MyTest(fluid.dygraph.Layer):
            # Dygraph twin of the static graph above, one sub-layer per
            # static layer. NOTE(review): the attribute names 'lienar2' and
            # 'gourp_norm2' look like typos, but set_dict is called with
            # use_structured_name=False so matching is by variable name and
            # behavior is unaffected.
            def __init__(self):
                super(MyTest, self).__init__()
                self.linear1 = Linear(10, 10)
                self.lienar2 = Linear(10, 20)
                self.conv2d_1 = Conv2D(num_channels=10,
                                       num_filters=10,
                                       filter_size=5,
                                       act="relu")
                self.conv2d_2 = Conv2D(num_channels=10,
                                       num_filters=10,
                                       filter_size=5,
                                       act="relu")
                self.conv3d_1 = Conv3D(num_channels=3,
                                       num_filters=2,
                                       filter_size=3,
                                       act="relu")
                self.conv3d_2 = Conv3D(num_channels=3,
                                       num_filters=2,
                                       filter_size=3,
                                       act="relu")
                self.batch_norm_1 = BatchNorm(10)
                self.batch_norm_2 = BatchNorm(10)
                self.emb1 = Embedding([1000, 100])
                self.emb2 = Embedding([2000, 200])
                self.layer_norm_1 = LayerNorm([10])
                self.layer_norm_2 = LayerNorm(10)
                self.nce1 = NCE(10000, 100)
                self.nce2 = NCE(10000, 100)
                self.prelu1 = PRelu("channel", channel=5)
                self.prelu2 = PRelu("channel", channel=5)
                self.group_norm1 = GroupNorm(8, 4)
                self.gourp_norm2 = GroupNorm(8, 4)
                self.w_1 = self.create_parameter([100, 100],
                                                 dtype='float32',
                                                 attr="weight_test_1")
                self.w_2 = self.create_parameter([20, 200],
                                                 dtype='float32',
                                                 attr="weight_test_2")

        my_test = MyTest()
        my_test.set_dict(new_dict, use_structured_name=False)
        # Every dygraph parameter must match the value saved from the
        # static program.
        for k, v in my_test.state_dict().items():
            self.assertTrue(np.array_equal(v.numpy(), new_dict[v.name]))
def train_static(args, batch_generator):
    """Build and run static-graph training for the Transformer model.

    Constructs the train/startup programs under program/unique-name guards,
    wires a DataLoader to ``batch_generator``, trains for ``args.epoch``
    epochs (early-stopping at the module-level ``STEP_NUM``), and returns
    the recorded average losses as a numpy array.

    Args:
        args: parsed config namespace (model sizes, dropout rates,
            optimizer hyper-parameters, logging/save settings).
        batch_generator: callable yielding training batches for the loader.

    Returns:
        np.ndarray of the average per-token cost sampled every
        ``args.print_step`` steps.

    Note: relies on module-level globals ``SEED``, ``place`` and ``STEP_NUM``.
    """
    # Fix seeds so static-graph results are comparable with the dygraph run.
    paddle.manual_seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            # define input and reader
            input_field_names = util.encoder_data_input_fields + \
                util.decoder_data_input_fields[:-1] + util.label_data_input_fields
            input_descs = util.get_input_descs(args)
            input_slots = [{
                "name": name,
                "shape": input_descs[name][0],
                "dtype": input_descs[name][1]
            } for name in input_field_names]
            input_field = util.InputField(input_slots)
            # Define DataLoader
            data_loader = fluid.io.DataLoader.from_generator(
                input_field.feed_list, capacity=60)
            data_loader.set_batch_generator(batch_generator, places=place)
            # define model
            transformer = Transformer(
                args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
                args.n_layer, args.n_head, args.d_key, args.d_value,
                args.d_model, args.d_inner_hid, args.prepostprocess_dropout,
                args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
                args.postprocess_cmd, args.weight_sharing, args.bos_idx,
                args.eos_idx)
            # First 7 feed slots are encoder/decoder inputs; last 2 are labels.
            logits = transformer(*input_field.feed_list[:7])
            # define loss
            criterion = CrossEntropyCriterion(args.label_smooth_eps)
            lbl_word, lbl_weight = input_field.feed_list[7:]
            sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
                                                      lbl_weight)
            # define optimizer
            learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                args.d_model, args.warmup_steps, args.learning_rate)
            optimizer = fluid.optimizer.Adam(
                learning_rate=learning_rate,
                beta1=args.beta1,
                beta2=args.beta2,
                epsilon=float(args.eps))
            optimizer.minimize(avg_cost)
    # the best cross-entropy value with label smoothing
    loss_normalizer = -((1. - args.label_smooth_eps) * np.log(
        (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(
            args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
    step_idx = 0
    total_batch_num = 0
    avg_loss = []
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    for pass_id in range(args.epoch):
        batch_id = 0
        for feed_dict in data_loader:
            outs = exe.run(program=train_prog,
                           feed=feed_dict,
                           fetch_list=[sum_cost.name, token_num.name])
            if step_idx % args.print_step == 0:
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
                    1])
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                avg_loss.append(total_avg_cost)
                if step_idx == 0:
                    # First step: no elapsed-time baseline yet, so no speed.
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                    avg_batch_time = time.time()
                else:
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, speed: %.2f steps/s" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         args.print_step / (time.time() - avg_batch_time)))
                    avg_batch_time = time.time()
            batch_id += 1
            step_idx += 1
            total_batch_num = total_batch_num + 1
            if step_idx == STEP_NUM:
                # NOTE(review): condition tests save_dygraph_model_path but the
                # path used is save_static_model_path — looks inconsistent;
                # confirm against the dygraph counterpart of this test.
                if args.save_dygraph_model_path:
                    model_path = os.path.join(args.save_static_model_path,
                                              "transformer")
                    fluid.save(train_prog, model_path)
                break
    return np.array(avg_loss)
end_level = 3 for i in range(start_level, end_level + 1): if i == 0: w = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.conv.weight' % (i,)] scale = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.gn.weight' % (i,)] offset = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.gn.bias' % (i,)] copy_conv_gn('mask_feat_head.convs_all_levels.%d.conv0' % (i,), w, None, scale, offset) continue for j in range(i): w = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.conv.weight' % (i, j)] scale = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.gn.weight' % (i, j)] offset = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.gn.bias' % (i, j)] copy_conv_gn('mask_feat_head.convs_all_levels.%d.conv%d' % (i, j), w, None, scale, offset) w = mask_feat_head_dic['mask_feat_head.conv_pred.0.conv.weight'] scale = mask_feat_head_dic['mask_feat_head.conv_pred.0.gn.weight'] offset = mask_feat_head_dic['mask_feat_head.conv_pred.0.gn.bias'] copy_conv_gn('mask_feat_head.conv_pred.0', w, None, scale, offset) import os if not os.path.exists('output/'): os.mkdir('output/') if not os.path.exists('output/solov2_light_448_r50_fpn_8gpu_3x/'): os.mkdir('output/solov2_light_448_r50_fpn_8gpu_3x/') fluid.save(fluid.default_startup_program(), 'output/solov2_light_448_r50_fpn_8gpu_3x/model_final') print('\nDone.')
def save_model(): if not os.path.exists(cfg.model_path): os.makedirs(cfg.model_path) fluid.save(program=fluid.default_main_program(), model_path=os.path.join(cfg.model_path, "model")) print("Saved model to: %s" % cfg.model_path)
def train(args):
    """Train the SequenceSemanticRetrieval (SSR) model with Adagrad.

    Builds the train program from ``ssr.train()``, streams batches through a
    DataLoader, optionally runs under ParallelExecutor, saves a checkpoint
    after each epoch to ``<args.model_dir>/epoch_<n>``, and emits CE (kpi)
    metrics when ``args.enable_ce`` is set.

    Args:
        args: parsed command-line namespace (paths, sizes, learning rate,
            device and CE flags).

    Side effects: writes checkpoints via ``fluid.save`` and prints/logs
    progress; returns nothing.
    """
    if args.enable_ce:
        # Fixed seed so continuous-evaluation runs are reproducible.
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = True if args.use_cuda else False
    parallel = True if args.parallel else False
    print("use_cuda:", use_cuda, "parallel:", parallel)
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # Train program
    train_input_data, cos_pos, avg_cost, acc = ssr.train()
    # Optimization to minimize lost
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)
    data_list = [var.name for var in train_input_data]
    print(data_list)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    loader = fluid.io.DataLoader.from_generator(
        feed_list=train_input_data, capacity=10000, iterable=True)
    loader.set_sample_list_generator(train_reader, places=place)
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe
    total_time = 0.0
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(loader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=data, fetch_list=[avg_cost.name, acc.name])
            # Record per-batch accuracy for CE reporting.
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id,
                           np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.save(fluid.default_main_program(), model_path=save_dir)
        print("model saved in %s" % save_dir)
    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            # Second-to-last batch accuracy (the last batch may be partial).
            ce_acc = ce_info[-2]
        except IndexError:
            # Was a bare `except:`; only IndexError can occur here (fewer
            # than two recorded batches) — keep the best-effort fallback
            # without swallowing unrelated errors like KeyboardInterrupt.
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))
def main(args):
    """Driver for ERNIE-based sequence classification.

    Depending on ``args.do_train`` / ``args.do_val`` / ``args.do_infer``,
    builds the corresponding fluid programs (sharing one startup program),
    optionally restores an init checkpoint, runs the training loop with
    periodic checkpointing/validation, and finally runs the dev-set
    evaluation and/or test-set inference.

    Raises:
        ValueError: when none of the three do_* flags is set, or when
            validation/inference is requested without an init checkpoint.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    reader = task_reader.ClassifyReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        random_seed=args.random_seed)
    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")
    # One startup program shared by train/eval/infer graphs.
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")
        num_train_examples = reader.get_num_examples(args.train_set)
        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                train_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name='train_pyreader')
                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(
                        ernie_inputs, ernie_config=ernie_config)
                # user defined model based on ernie embeddings
                loss, accuracy, num_seqs = create_model(
                    args, embeddings, labels=labels, is_prediction=False)
                optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
                optimizer.minimize(loss)
        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))
    if args.do_val:
        test_data_generator = reader.data_generator(
            input_file=args.dev_set,
            batch_size=args.batch_size,
            phase='dev',
            epoch=1,
            shuffle=False)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                test_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name='eval_reader')
                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(
                        ernie_inputs, ernie_config=ernie_config)
                # user defined model based on ernie embeddings
                loss, accuracy, num_seqs = create_model(
                    args, embeddings, labels=labels, is_prediction=False)
        test_prog = test_prog.clone(for_test=True)
    if args.do_infer:
        infer_data_generator = reader.data_generator(
            input_file=args.test_set,
            batch_size=args.batch_size,
            phase='infer',
            epoch=1,
            shuffle=False)
        infer_prog = fluid.Program()
        with fluid.program_guard(infer_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name="infer_pyreader")
                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(
                        ernie_inputs, ernie_config=ernie_config)
                # In prediction mode create_model returns probabilities only.
                probs = create_model(
                    args, embeddings, labels=labels, is_prediction=True)
        infer_prog = infer_prog.clone(for_test=True)
    exe.run(startup_prog)
    if args.do_train:
        # For training a checkpoint is optional (warm start).
        if args.init_checkpoint:
            init_checkpoint(
                exe, args.init_checkpoint, main_program=train_program)
    elif args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
    elif args.do_infer:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=infer_prog)
    if args.do_train:
        train_exe = exe
        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
    if args.do_val:
        test_exe = exe
        test_pyreader.set_batch_generator(test_data_generator)
    if args.do_infer:
        test_exe = exe
        infer_pyreader.set_batch_generator(infer_data_generator)
    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                # Only fetch metrics every skip_steps to reduce overhead.
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_list = []
                outputs = train_exe.run(program=train_program,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                if steps % args.skip_steps == 0:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    # Weight the running sums by sequence count so the
                    # reported averages are per-sequence.
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        print(verbose)
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps),
                                             "checkpoint")
                    fluid.save(train_program, save_path)
                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
            except fluid.core.EOFException:
                # Data exhausted: save a final checkpoint and stop.
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps), "checkpoint")
                fluid.save(train_program, save_path)
                train_pyreader.reset()
                break
    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")
    # final eval on test set
    if args.do_infer:
        print("Final test result:")
        infer(exe, infer_prog, infer_pyreader, [probs.name], "infer")
def save_controller(self, program, output_dir): fluid.save(program, output_dir)