def train_loop(args, train_program, reader, py_reader, loss, trainer_id,
               weight, lr):
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(weight, args.batch_size, reader.train()))

    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())

    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))

    train_exe = exe

    for pass_id in range(args.num_passes):
        py_reader.start()
        time.sleep(10)
        epoch_start = time.time()
        batch_id = 0
        start = time.time()
        try:
            while True:
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

                if batch_id % args.print_batch == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id, loss_val.mean(),
                               py_reader.queue.size()))
                if args.with_speed:
                    if batch_id % 500 == 0 and batch_id != 0:
                        elapsed = (time.time() - start)
                        start = time.time()
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))

                lr.step()

                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        # paddle.static.save takes the program first, then the path
                        paddle.static.save(train_program, model_dir)
                        print("model saved in %s" % model_dir)
                batch_id += 1

        except paddle.fluid.core.EOFException:
            py_reader.reset()
            epoch_end = time.time()
            logger.info("Epoch: {0}, Train total elapsed: {1} ".format(
                pass_id, epoch_end - epoch_start))

            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
                paddle.static.save(train_program, model_dir)
                print("model saved in %s" % model_dir)
def prepare_reader(is_train, pyreader, args, pass_id=0):
    if is_train:
        reader = train(data_dir=args.data_dir, pass_id_as_seed=pass_id)
    else:
        reader = val(data_dir=args.data_dir)
    if is_train:
        # integer division: paddle.batch expects an int batch size
        bs = args.batch_size // get_device_num()
    else:
        bs = 16
    pyreader.decorate_paddle_reader(paddle.batch(reader, batch_size=bs))
def prepare_reader(is_train, pyreader, args, pass_id=1):
    # NOTE: always use infinite reader for dist training
    if is_train:
        reader = train(
            data_dir=args.data_dir, pass_id_as_seed=pass_id, infinite=True)
    else:
        reader = val(data_dir=args.data_dir)
    if is_train:
        bs = args.batch_size // get_device_num()
    else:
        bs = 16
    pyreader.decorate_paddle_reader(paddle.batch(reader, batch_size=bs))
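# Both prepare_reader variants above call get_device_num(), which is defined
# elsewhere in their repos. A minimal sketch of such a helper (an assumption,
# not the original implementation), following the CUDA_VISIBLE_DEVICES /
# CPU_NUM convention used by the other snippets in this collection:
import os

def get_device_num():
    visible = os.getenv("CUDA_VISIBLE_DEVICES")
    if visible:
        # count the GPUs this process is allowed to see
        return len(visible.split(","))
    # CPU fallback: number of CPU devices the batch is split across
    return int(os.getenv("CPU_NUM", "1"))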
def retrain(modelpath):
    model = module.Module(module_dir=args.hub_module_path)
    feed_list, fetch_list, program, generator = model(
        sign_name="feature_map", trainable=False)
    test_program = program.clone()

    # get the dog cat dataset
    train_reader = paddle.batch(reader.train(args.data_dir), batch_size=32)
    val_reader = paddle.batch(reader.val(args.data_dir), batch_size=32)

    with fluid.program_guard(main_program=program):
        with fluid.unique_name.guard(generator):
            img = feed_list[0]
            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
            feature_map = fetch_list[0]

            fc = fluid.layers.fc(input=feature_map, size=2, act="softmax")
            cost = fluid.layers.cross_entropy(input=fc, label=label)
            avg_cost = fluid.layers.mean(cost)
            acc = fluid.layers.accuracy(input=fc, label=label)

            # define the loss
            optimizer = fluid.optimizer.Adam(learning_rate=0.001)
            optimizer.minimize(avg_cost)

            # running on gpu
            place = fluid.CUDAPlace(0)
            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
            exe = fluid.Executor(place)

            # init all param
            exe.run(fluid.default_startup_program())
            step = 0
            sample_num = 0
            epochs = 50

            # start to train
            for i in range(epochs):
                for batch in train_reader():
                    cost, accuracy = exe.run(
                        feed=feeder.feed(batch),
                        fetch_list=[avg_cost.name, acc.name])
                    step += 1
                    print(
                        "epoch %d and step %d: train cost is %.2f, train acc is %.2f%%"
                        % (i, step, cost, accuracy * 100))

                for iter, batch in enumerate(val_reader()):
                    cost, accuracy = exe.run(
                        feed=feeder.feed(batch),
                        fetch_list=[avg_cost.name, acc.name])
                    print("batch %d: val cost is %.2f, val acc is %.2f%%" %
                          (iter, cost, accuracy * 100))
def quantize(args):
    # calibration data for post-training quantization is drawn from the
    # training set here
    val_reader = reader.train()
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()

    assert os.path.exists(args.model_path), "args.model_path doesn't exist"
    assert os.path.isdir(args.model_path), "args.model_path must be a dir"

    exe = fluid.Executor(place)
    quant_post(
        executor=exe,
        model_dir=args.model_path,
        quantize_model_path=args.save_path,
        sample_generator=val_reader,
        model_filename=args.model_filename,
        params_filename=args.params_filename,
        batch_size=args.batch_size,
        batch_nums=args.batch_num)
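# quant_post writes a post-training-quantized inference model to
# args.save_path. A minimal usage sketch for loading it back with the
# standard fluid inference API; the directory, filenames, and input shape
# below are assumptions for illustration:
import numpy as np
import paddle.fluid as fluid

infer_place = fluid.CPUPlace()
infer_exe = fluid.Executor(infer_place)
# pass model_filename/params_filename=None if the model was saved with
# one file per parameter
[infer_prog, feed_names, fetch_targets] = fluid.io.load_inference_model(
    dirname="./quant_model",
    executor=infer_exe,
    model_filename="__model__",
    params_filename="__params__")
fake_input = np.zeros([1, 3, 224, 224], dtype="float32")
results = infer_exe.run(infer_prog,
                        feed={feed_names[0]: fake_input},
                        fetch_list=fetch_targets)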
def prepare_reader(epoch_id, train_py_reader, train_bs, val_bs, trn_dir,
                   img_dim, min_scale, rect_val, args):
    train_reader = reader.train(
        traindir="%s/%strain" % (args.data_dir, trn_dir),
        sz=img_dim,
        min_scale=min_scale,
        shuffle_seed=epoch_id + 1)
    train_py_reader.decorate_paddle_reader(
        paddle.batch(train_reader, batch_size=train_bs))
    test_reader = reader.test(
        valdir="%s/%svalidation" % (args.data_dir, trn_dir),
        bs=val_bs * DEVICE_NUM,
        sz=img_dim,
        rect_val=rect_val)
    test_batched_reader = paddle.batch(
        test_reader, batch_size=val_bs * DEVICE_NUM)
    return test_batched_reader
def create_net(self, tokens=None):
    """Create a network for training by tokens.
    """
    if tokens is None:
        tokens = self.init_tokens()

    bottleneck_params_list = get_bottleneck_params_list(tokens)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        bottleneck_params_list=bottleneck_params_list)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        bottleneck_params_list=bottleneck_params_list)
    test_prog = test_prog.clone(for_test=True)

    train_batch_size = batch_size // 4
    test_batch_size = batch_size
    train_reader = paddle.batch(
        reader.train(), batch_size=train_batch_size, drop_last=True)
    test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)

    with fluid.program_guard(train_prog, startup_prog):
        train_py_reader.decorate_paddle_reader(train_reader)
    with fluid.program_guard(test_prog, startup_prog):
        test_py_reader.decorate_paddle_reader(test_reader)

    return startup_prog, train_prog, test_prog, (
        train_cost, train_acc1, train_acc5, global_lr), (
            test_cost, test_acc1, test_acc5), train_py_reader, test_py_reader
def prepare_reader(epoch_id, train_py_reader, train_bs, val_bs, trn_dir,
                   img_dim, min_scale, rect_val, args=None):
    num_trainers = args.dist_env["num_trainers"] \
        if args.update_method != 'local' else 1
    trainer_id = args.dist_env["trainer_id"] \
        if args.update_method != 'local' else 0
    train_reader = reader.train(
        traindir="%s/%strain" % (args.data_dir, trn_dir),
        sz=img_dim,
        min_scale=min_scale,
        shuffle_seed=epoch_id + 1,
        rank_id=trainer_id,
        size=num_trainers)
    train_py_reader.decorate_paddle_reader(
        paddle.batch(
            train_reader, batch_size=train_bs))
    test_reader = reader.test(
        valdir="%s/%svalidation" % (args.data_dir, trn_dir),
        bs=val_bs * DEVICE_NUM,
        sz=img_dim,
        rect_val=rect_val)
    test_batched_reader = paddle.batch(
        test_reader, batch_size=val_bs * DEVICE_NUM)
    return test_batched_reader
def train_async(args):
    # parameters from arguments
    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    if args.enable_ce:
        assert args.model == "ResNet50"
        assert args.loss_name == "arcmargin"
        np.random.seed(0)
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
        tmp_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_feas, image, label = build_program(
        is_train=False,
        main_prog=tmp_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
    ]
    test_fetch_list = [test_feas.name]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = int(os.environ.get('CPU_NUM', 1))
    assert (args.train_batch_size % devicenum) == 0
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_reader = paddle.batch(
        reader.train(args), batch_size=train_batch_size, drop_last=True)
    test_reader = paddle.batch(
        reader.test(args), batch_size=test_batch_size, drop_last=False)
    test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_py_reader.decorate_paddle_reader(train_reader)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=args.use_gpu,
        loss_name=train_cost.name)

    totalruntime = 0
    train_py_reader.start()
    iter_no = 0
    train_info = [0, 0, 0, 0]
    while iter_no <= args.total_iter_num:
        t1 = time.time()
        lr, loss, acc1, acc5 = train_exe.run(fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1
        lr = np.mean(np.array(lr))
        train_info[0] += np.mean(np.array(loss))
        train_info[1] += np.mean(np.array(acc1))
        train_info[2] += np.mean(np.array(acc5))
        train_info[3] += 1

        if iter_no % args.display_iter_step == 0:
            avgruntime = totalruntime / args.display_iter_step
            avg_loss = train_info[0] / train_info[3]
            avg_acc1 = train_info[1] / train_info[3]
            avg_acc5 = train_info[2] / train_info[3]
            print("[%s] trainbatch %d, lr %.6f, loss %.6f, "
                  "acc1 %.4f, acc5 %.4f, time %2.2f sec" %
                  (fmt_time(), iter_no, lr, avg_loss, avg_acc1, avg_acc5,
                   avgruntime))
            sys.stdout.flush()
            totalruntime = 0
        if iter_no % 1000 == 0:
            train_info = [0, 0, 0, 0]

        totalruntime += period

        if iter_no % args.test_iter_step == 0 and iter_no != 0:
            f, l = [], []
            for batch_id, data in enumerate(test_reader()):
                t1 = time.time()
                [feas] = exe.run(test_prog,
                                 fetch_list=test_fetch_list,
                                 feed=test_feeder.feed(data))
                label = np.asarray([x[1] for x in data])
                f.append(feas)
                l.append(label)
                t2 = time.time()
                period = t2 - t1
                if batch_id % 20 == 0:
                    print("[%s] testbatch %d, time %2.2f sec" %
                          (fmt_time(), batch_id, period))
            f = np.vstack(f)
            l = np.hstack(l)
            recall = recall_topk(f, l, k=1)
            print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" %
                  (fmt_time(), len(f), iter_no, recall))
            sys.stdout.flush()

        if iter_no % args.save_iter_step == 0 and iter_no != 0:
            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(iter_no))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe, model_path,
                                       main_program=train_prog)

        iter_no += 1

    # This is for continuous evaluation only
    if args.enable_ce:
        # Use the mean cost/acc for training
        print("kpis\ttrain_cost\t{}".format(avg_loss))
        print("kpis\ttest_recall\t{}".format(recall))
def train(args):
    # parameters from arguments
    model_name = args.model
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    train_py_reader, train_cost, train_acc, image, prediction, feature_map = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc, image, prediction, feature_map = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.create_module:
        assert pretrained_model, "need a pretrained module to create a hub module"
        sign1 = hub.create_signature(
            "classification", inputs=[image], outputs=[prediction])
        sign2 = hub.create_signature(
            "feature_map", inputs=[image], outputs=[feature_map])
        sign3 = hub.create_signature(inputs=[image], outputs=[prediction])
        hub.create_module(
            sign_arr=[sign1, sign2, sign3],
            program=train_prog,
            module_dir="hub_module_" + args.model)
        exit()

    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(['nvidia-smi',
                                              '-L']).decode().count('\n')

    train_batch_size = args.batch_size // device_num
    test_batch_size = 16

    train_reader = paddle.batch(
        reader.train(), batch_size=train_batch_size, drop_last=True)
    test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        loss_name=train_cost.name)

    train_fetch_list = [train_cost.name, train_acc.name]
    test_fetch_list = [test_cost.name, test_acc.name]

    params = nets.__dict__[args.model]().params

    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()

        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc = train_exe.run(fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc = np.mean(np.array(acc))
                train_info[0].append(loss)
                train_info[1].append(acc)
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc {3}, time {4}".format(
                              pass_id, batch_id, loss, acc,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc = np.array(train_info[1]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc = exe.run(program=test_prog,
                                    fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc = np.mean(acc)
                test_info[0].append(loss)
                test_info[1].append(acc)
                if test_batch_id % 10 == 0:
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc {3}, time {4}".format(
                              pass_id, test_batch_id, loss, acc,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc = np.array(test_info[1]).mean()

        print("End pass {0}, train_loss {1}, train_acc {2}, "
              "test_loss {3}, test_acc {4}".format(pass_id, train_loss,
                                                   train_acc, test_loss,
                                                   test_acc))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)
def train(args):
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization and use_ngraph:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1

    train_batch_size = args.batch_size // device_num
    test_batch_size = 16
    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(
            batch_size=train_batch_size, shuffle_seed=shuffle_seed)
        test_reader = reader.val(batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        build_strategy.fuse_all_reduce_ops = 1

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10

        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        time_record = []
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                time_record.append(t2 - t1)

                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(t2 - t1)

                if batch_id % 10 == 0:
                    period = np.mean(time_record)
                    time_record = []
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, time {5}".format(
                              pass_id, test_batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss, "%.5f" % test_acc1,
                  "%.5f" % test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training
                print("kpis train_cost %s" % train_loss)
                print("kpis train_acc_top1 %s" % train_acc1)
                print("kpis train_acc_top5 %s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis test_cost %s" % test_loss)
                print("kpis test_acc_top1 %s" % test_acc1)
                print("kpis test_acc_top5 %s" % test_acc5)
                print("kpis train_speed %s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis train_cost_card%s %s" % (device_num, train_loss))
                print("kpis train_acc_top1_card%s %s" %
                      (device_num, train_acc1))
                print("kpis train_acc_top5_card%s %s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis test_cost_card%s %s" % (device_num, test_loss))
                print("kpis test_acc_top1_card%s %s" %
                      (device_num, test_acc1))
                print("kpis test_acc_top5_card%s %s" %
                      (device_num, test_acc5))
                print("kpis train_speed_card%s %s" %
                      (device_num, train_speed))
def parallel_exe(args,
                 train_file_list,
                 val_file_list,
                 data_args,
                 learning_rate,
                 batch_size,
                 num_passes,
                 model_save_dir='model',
                 pretrained_model=None):
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    if data_args.dataset == 'coco':
        num_classes = 81
    elif data_args.dataset == 'pascalvoc':
        num_classes = 21

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
    nmsed_out = fluid.layers.detection_output(
        locs, confs, box, box_var, nms_threshold=0.45)
    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
    loss = fluid.layers.reduce_sum(loss)

    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    if data_args.dataset == 'coco':
        # learning rate decay in 12, 19 pass, respectively
        if '2014' in train_file_list:
            epocs = 82783 // batch_size
            boundaries = [epocs * 12, epocs * 19]
        elif '2017' in train_file_list:
            epocs = 118287 // batch_size
            boundaries = [epocs * 12, epocs * 19]
    elif data_args.dataset == 'pascalvoc':
        epocs = 19200 // batch_size
        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
    values = [
        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
        learning_rate * 0.1, learning_rate * 0.01
    ]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    fluid.default_startup_program().random_seed = 1000
    exe.run(fluid.default_startup_program())

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu, loss_name=loss.name)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def save_model(postfix):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % model_path)
        fluid.io.save_persistables(exe, model_path)

    best_map = 0.

    def test(pass_id, best_map):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        test_map = None
        for data in test_reader():
            test_map = exe.run(test_program,
                               feed=feeder.feed(data),
                               fetch_list=[accum_map])
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model')
        print("Test {0}, map {1}".format(pass_id, test_map[0]))

    train_num = 0
    total_train_time = 0.0
    total_iters = 0
    for pass_id in range(num_passes):
        every_pass_loss = []
        iter = 0
        pass_duration = 0.0
        for batch_id, data in enumerate(train_reader()):
            batch_start = time.time()
            if iter == args.iterations:
                break
            if len(data) < devices_num:
                continue
            if args.parallel:
                loss_v, = train_exe.run(
                    fetch_list=[loss.name], feed=feeder.feed(data))
            else:
                loss_v, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
            loss_v = np.mean(np.array(loss_v))
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v, time.time() - batch_start))
            if iter >= args.skip_batch_num or pass_id != 0:
                batch_duration = time.time() - batch_start
                pass_duration += batch_duration
                train_num += len(data)
            every_pass_loss.append(loss_v)
            iter += 1
            total_iters += 1
        #test(pass_id, best_map)
        total_train_time += pass_duration
        print("Pass:%d, Loss:%f, Handle Images Duration: %f\n" %
              (pass_id, np.mean(every_pass_loss), pass_duration))
        if pass_id == num_passes - 1:
            examples_per_sec = train_num / total_train_time
            train_cost_kpi.add_record(np.mean(every_pass_loss))
            train_speed_kpi.add_record(
                np.array(examples_per_sec, dtype='float'))
            four_card_speed_kpi.add_record(
                np.array(examples_per_sec, dtype='float'))
    if args.gpu_card_num == 1:
        train_cost_kpi.persist()
        train_speed_kpi.persist()
    else:
        four_card_speed_kpi.persist()
    print("Best test map {0}".format(best_map))
def train_loop(args, train_program, reader, py_reader, loss, trainer_id,
               weight):
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(weight, args.batch_size, reader.train()))

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = True

    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
    exec_strategy.num_threads = int(os.getenv("CPU_NUM"))

    build_strategy = fluid.BuildStrategy()
    if int(os.getenv("CPU_NUM")) > 1:
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce

    train_exe = fluid.ParallelExecutor(
        use_cuda=False,
        loss_name=loss.name,
        main_program=train_program,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    for pass_id in range(args.num_passes):
        py_reader.start()
        time.sleep(10)
        epoch_start = time.time()
        batch_id = 0
        start = time.time()
        try:
            while True:
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

                if batch_id % args.print_batch == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id, loss_val.mean(),
                               py_reader.queue.size()))
                if args.with_speed:
                    if batch_id % 500 == 0 and batch_id != 0:
                        elapsed = (time.time() - start)
                        start = time.time()
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))

                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        fluid.io.save_params(executor=exe, dirname=model_dir)
                        print("model saved in %s" % model_dir)
                batch_id += 1

        except fluid.core.EOFException:
            py_reader.reset()
            epoch_end = time.time()
            logger.info("Epoch: {0}, Train total elapsed: {1} ".format(
                pass_id, epoch_end - epoch_start))

            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
                fluid.io.save_params(executor=exe, dirname=model_dir)
                print("model saved in %s" % model_dir)
def train():
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    devices_num = get_device_num() if cfg.use_gpu else 1
    print("Found {} CUDA/CPU devices.".format(devices_num))

    if cfg.debug or args.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    if not os.path.exists(cfg.model_save_dir):
        os.makedirs(cfg.model_save_dir)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
        if cfg.use_data_parallel else fluid.CUDAPlace(0)

    with fluid.dygraph.guard(place):
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()
        model = YOLOv3(3, is_train=True)
        if cfg.pretrain:
            restore, _ = fluid.load_dygraph(cfg.pretrain)
            model.block.set_dict(restore)
        if cfg.finetune:
            restore, _ = fluid.load_dygraph(cfg.finetune)
            model.set_dict(restore, use_structured_name=True)
        if args.use_data_parallel:
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        boundaries = cfg.lr_steps
        gamma = cfg.lr_gamma
        step_num = len(cfg.lr_steps)
        learning_rate = cfg.learning_rate
        values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

        lr = fluid.dygraph.PiecewiseDecay(
            boundaries=boundaries, values=values, begin=args.start_iter)
        lr = fluid.layers.linear_lr_warmup(
            learning_rate=lr,
            warmup_steps=cfg.warm_up_iter,
            start_lr=0.0,
            end_lr=cfg.learning_rate, )

        optimizer = fluid.optimizer.Momentum(
            learning_rate=lr,
            regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
            momentum=cfg.momentum,
            parameter_list=model.parameters())

        start_time = time.time()
        snapshot_loss = 0
        snapshot_time = 0
        total_sample = 0

        input_size = cfg.input_size
        shuffle = True
        shuffle_seed = None
        total_iter = cfg.max_iter - cfg.start_iter
        mixup_iter = total_iter - cfg.no_mixup_iter

        random_sizes = [cfg.input_size]
        if cfg.random_shape:
            random_sizes = [32 * i for i in range(10, 20)]

        train_reader = reader.train(
            input_size,
            batch_size=cfg.batch_size,
            shuffle=shuffle,
            shuffle_seed=shuffle_seed,
            total_iter=total_iter * devices_num,
            mixup_iter=mixup_iter * devices_num,
            random_sizes=random_sizes,
            use_multiprocess_reader=cfg.use_multiprocess_reader,
            num_workers=cfg.worker_num)
        if args.use_data_parallel:
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)
        smoothed_loss = SmoothedValue()

        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()

            img = np.array([x[0] for x in data]).astype('float32')
            img = to_variable(img)
            gt_box = np.array([x[1] for x in data]).astype('float32')
            gt_box = to_variable(gt_box)
            gt_label = np.array([x[2] for x in data]).astype('int32')
            gt_label = to_variable(gt_label)
            gt_score = np.array([x[3] for x in data]).astype('float32')
            gt_score = to_variable(gt_score)

            loss = model(img, gt_box, gt_label, gt_score, None, None)
            smoothed_loss.add_value(np.mean(loss.numpy()))
            snapshot_loss += loss.numpy()
            snapshot_time += start_time - prev_start_time
            total_sample += 1

            print("Iter {:d}, loss {:.6f}, time {:.5f}".format(
                iter_id,
                smoothed_loss.get_mean_value(),
                start_time - prev_start_time))

            # backward exactly once: scale the loss and sync gradients only
            # in data-parallel mode
            if args.use_data_parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()

            optimizer.minimize(loss)
            model.clear_gradients()

            save_parameters = (not args.use_data_parallel) or (
                args.use_data_parallel and
                fluid.dygraph.parallel.Env().local_rank == 0)
            if save_parameters and iter_id > 1 \
                    and iter_id % cfg.snapshot_iter == 0:
                fluid.save_dygraph(
                    model.state_dict(),
                    args.model_save_dir + "/yolov3_{}".format(iter_id))
def get_model(args, is_train, main_prog, startup_prog):
    pyreader = None
    class_dim = 1000
    if args.data_format == 'NCHW':
        dshape = [3, 224, 224]
    else:
        dshape = [224, 224, 3]
    if is_train:
        reader = train(data_dir=args.data_dir)
    else:
        reader = val(data_dir=args.data_dir)

    trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            pyreader = fluid.layers.py_reader(
                capacity=args.batch_size * args.gpus,
                shapes=([-1] + dshape, (-1, 1)),
                dtypes=('float32', 'int64'),
                name="train_reader" if is_train else "test_reader",
                use_double_buffer=True)
            input, label = fluid.layers.read_file(pyreader)
            model_def = models.__dict__[args.model](layers=50,
                                                    is_train=is_train)
            predict = model_def.net(input, class_dim=class_dim)

            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)

            batch_acc1 = fluid.layers.accuracy(input=predict, label=label, k=1)
            batch_acc5 = fluid.layers.accuracy(input=predict, label=label, k=5)

            optimizer = None
            if is_train:
                start_lr = args.learning_rate
                # n * worker * repeat
                end_lr = args.learning_rate * trainer_count * \
                    args.multi_batch_repeat
                total_images = 1281167 // trainer_count
                step = int(total_images /
                           (args.batch_size * args.gpus *
                            args.multi_batch_repeat) + 1)
                warmup_steps = step * 5  # warmup 5 passes
                epochs = [30, 60, 80]
                bd = [step * e for e in epochs]
                base_lr = end_lr
                lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
                # NOTE: we put weight decay in layers config, and remove
                # weight decay on bn layers, so don't add weight decay in
                # optimizer config.
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=utils.learning_rate.lr_warmup(
                        fluid.layers.piecewise_decay(
                            boundaries=bd, values=lr),
                        warmup_steps, start_lr, end_lr),
                    momentum=0.9)
                optimizer.minimize(avg_cost)

    batched_reader = None
    pyreader.decorate_paddle_reader(
        paddle.batch(reader, batch_size=args.batch_size))

    return avg_cost, optimizer, [batch_acc1, batch_acc5], batched_reader, \
        pyreader
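# The schedule arithmetic in get_model above can be checked numerically.
# A worked example with assumed values (1 trainer, 8 GPUs, batch size 32,
# base learning rate 0.1, multi_batch_repeat 1):
trainer_count, multi_batch_repeat = 1, 1
batch_size, gpus, learning_rate = 32, 8, 0.1

end_lr = learning_rate * trainer_count * multi_batch_repeat  # 0.1
total_images = 1281167 // trainer_count
step = int(total_images / (batch_size * gpus * multi_batch_repeat) + 1)  # 5005
warmup_steps = step * 5                # warm up over 5 passes -> 25025 steps
bd = [step * e for e in [30, 60, 80]]  # [150150, 300300, 400400]
lr = [end_lr * (0.1 ** i) for i in range(len(bd) + 1)]
# lr ~= [0.1, 0.01, 0.001, 0.0001]; warmup ramps from start_lr up to end_lr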
def train(args, data_args, train_params, train_file_list, val_file_list):
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = args.parallel
    enable_ce = args.enable_ce
    is_shuffle = True

    if not use_gpu:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        devices_num = fluid.core.get_cuda_device_count()

    batch_size = train_params['batch_size']
    epoc_num = train_params['epoc_num']
    batch_size_per_device = batch_size // devices_num
    num_workers = 8

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    if enable_ce:
        import random
        random.seed(0)
        np.random.seed(0)
        is_shuffle = False
        startup_prog.random_seed = 111
        train_prog.random_seed = 111
        test_prog.random_seed = 111

    train_py_reader, loss = build_program(
        main_prog=train_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=True)
    test_py_reader, map_eval, _, _ = build_program(
        main_prog=test_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=False)

    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if parallel:
        loss.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = True
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name,
            build_strategy=build_strategy)

    train_reader = reader.train(
        data_args,
        train_file_list,
        batch_size_per_device,
        shuffle=is_shuffle,
        num_workers=num_workers,
        enable_ce=enable_ce)
    test_reader = reader.test(data_args, val_file_list, batch_size)
    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    def save_model(postfix, main_prog):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=main_prog)

    best_map = 0.

    def test(epoc_id, best_map):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        every_epoc_map = []  # for CE
        test_py_reader.start()
        try:
            batch_id = 0
            while True:
                test_map, = exe.run(test_prog, fetch_list=[accum_map])
                if batch_id % 10 == 0:
                    every_epoc_map.append(test_map)
                    print("Batch {0}, map {1}".format(batch_id, test_map))
                batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()
        mean_map = np.mean(every_epoc_map)
        print("Epoc {0}, test map {1}".format(epoc_id, test_map[0]))
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model', test_prog)
        return best_map, mean_map

    total_time = 0.0
    for epoc_id in range(epoc_num):
        train_reader = reader.train(
            data_args,
            train_file_list,
            batch_size_per_device,
            shuffle=is_shuffle,
            num_workers=num_workers,
            enable_ce=enable_ce)
        train_py_reader.decorate_paddle_reader(train_reader)
        epoch_idx = epoc_id + 1
        start_time = time.time()
        prev_start_time = start_time
        every_epoc_loss = []
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if parallel:
                    loss_v, = train_exe.run(fetch_list=[loss.name])
                else:
                    loss_v, = exe.run(train_prog, fetch_list=[loss])
                loss_v = np.mean(np.array(loss_v))
                every_epoc_loss.append(loss_v)
                if batch_id % 10 == 0:
                    print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}".
                          format(epoc_id, batch_id, loss_v,
                                 start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                train_reader().close()
                train_py_reader.reset()
                break

        end_time = time.time()
        total_time += end_time - start_time

        if epoc_id % 10 == 0 or epoc_id == epoc_num - 1:
            best_map, mean_map = test(epoc_id, best_map)
            print("Best test map {0}".format(best_map))

        # save model
        save_model(str(epoc_id), train_prog)

    if enable_ce:
        train_avg_loss = np.mean(every_epoc_loss)
        if devices_num == 1:
            print("kpis train_cost %s" % train_avg_loss)
            print("kpis test_acc %s" % mean_map)
            print("kpis train_speed %s" % (total_time / epoch_idx))
        else:
            print("kpis train_cost_card%s %s" % (devices_num, train_avg_loss))
            print("kpis test_acc_card%s %s" % (devices_num, mean_map))
            print("kpis train_speed_card%s %f" %
                  (devices_num, total_time / epoch_idx))
def train(train_file_list,
          data_args,
          init_model_path,
          save_dir,
          dev_file_list=None):
    optimizer = paddle.optimizer.Momentum(
        momentum=cfg.TRAIN.MOMENTUM,
        learning_rate=cfg.TRAIN.LEARNING_RATE,
        regularization=paddle.optimizer.L2Regularization(
            rate=cfg.TRAIN.L2REGULARIZATION),
        learning_rate_decay_a=cfg.TRAIN.LEARNING_RATE_DECAY_A,
        learning_rate_decay_b=cfg.TRAIN.LEARNING_RATE_DECAY_B,
        learning_rate_schedule=cfg.TRAIN.LEARNING_RATE_SCHEDULE)

    cost, detect_out = vgg_ssd_net.net_conf("train")

    parameters = paddle.parameters.create(cost)
    if init_model_path is not None:
        assert os.path.isfile(init_model_path), "Invalid model."
        parameters.init_from_tar(gzip.open(init_model_path))

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        extra_layers=[detect_out],
        update_equation=optimizer)

    feeding = {"image": 0, "bbox": 1}

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list),
        batch_size=cfg.TRAIN.BATCH_SIZE)  # generate a batch image each time

    if dev_file_list is not None:
        dev_reader = paddle.batch(
            reader.test(data_args, dev_file_list),
            batch_size=cfg.TRAIN.BATCH_SIZE)

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if (event.batch_id + 1) % 1 == 0:
                print("Pass %d, Batch %d, TrainCost %f, Detection mAP=%f" %
                      (event.pass_id, event.batch_id + 1, event.cost,
                       event.metrics["detection_evaluator"]))
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            if not (event.pass_id + 1) % 20:
                with gzip.open(
                        os.path.join(save_dir, "params_pass_%05d.tar.gz" %
                                     event.pass_id), "w") as f:
                    trainer.save_parameter_to_tar(f)
            if dev_file_list is not None:
                result = trainer.test(reader=dev_reader, feeding=feeding)
                print("Test with Pass %d, TestCost: %f, Detection mAP=%g" %
                      (event.pass_id, result.cost,
                       result.metrics["detection_evaluator"]))

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=cfg.TRAIN.NUM_PASS,
        feeding=feeding)
def train(args, data_args, train_params, train_file_list, val_file_list):
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = args.parallel
    enable_ce = args.enable_ce
    is_shuffle = True

    if not use_gpu:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        devices_num = fluid.core.get_cuda_device_count()

    batch_size = train_params['batch_size']
    epoc_num = train_params['epoc_num']
    batch_size_per_device = batch_size // devices_num
    num_workers = 8

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    train_py_reader, loss = build_program(
        main_prog=train_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=True)
    test_py_reader, map_var, _, _ = build_program(
        main_prog=test_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=False)

    test_prog = test_prog.clone(for_test=True)

    for param in train_prog.global_block().all_parameters():
        if 'conv' in param.name:
            print(param.name, param.shape)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    test_reader = reader.test(data_args, val_file_list, batch_size)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_reader = reader.train(
        data_args,
        train_file_list,
        batch_size_per_device,
        shuffle=is_shuffle,
        use_multiprocess=args.use_multiprocess,
        num_workers=num_workers,
        enable_ce=enable_ce)
    train_py_reader.decorate_paddle_reader(train_reader)

    train_fetch_list = [("loss", loss.name)]
    val_fetch_list = [("map", map_var.name)]

    compressor = Compressor(
        place,
        fluid.global_scope(),
        train_prog,
        train_reader=train_py_reader,
        train_feed_list=None,
        train_fetch_list=train_fetch_list,
        eval_program=test_prog,
        eval_reader=test_py_reader,
        eval_feed_list=None,
        eval_fetch_list=val_fetch_list,
        train_optimizer=None)
    compressor.config('./compress.yaml')
    compressor.run()
def train(args,
          data_args,
          train_file_list,
          learning_rate,
          batch_size,
          num_passes,
          model_save_dir,
          pretrained_model=None,
          with_memory_optimization=None):
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))

    image = fluid.layers.data(
        name='image', shape=[3, 1920, 1080], dtype='float32')
    size = [1920, 1080]
    ground_truth = fluid.layers.data(
        name='ground_truth', shape=[1, size[1], size[0]], dtype='float32')

    csr_net = net.CSRNet(image, size)
    cost = fluid.layers.cos_sim(csr_net, ground_truth)
    avg_cost = fluid.layers.mean(x=cost)

    epocs = 2859 // batch_size
    optimizer = fluid.optimizer.SGD(1e-6)
    optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu, loss_name=avg_cost.name)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, ground_truth])

    def save_model(postfix):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path)

    best_map = 0.
    train_num = 0
    total_train_time = 0.0
    for pass_id in range(num_passes):
        start_time = time.time()
        prev_start_time = start_time
        # end_time = 0
        every_pass_loss = []
        iter = 0
        pass_duration = 0.0
        for batch_id, data in enumerate(train_reader()):
            print(data[0][1])
            print(np.array(data[0][1]))
            prev_start_time = start_time
            start_time = time.time()
            if args.for_model_ce and iter == args.iterations:
                break
            if len(data) < devices_num:
                print("There are too few data to train on all devices.")
                continue
            if args.parallel:
                loss_v, = train_exe.run(
                    fetch_list=[avg_cost.name], feed=feeder.feed(data))
            else:
                loss_v, = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])
            # end_time = time.time()
            loss_v = np.mean(np.array(loss_v))
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v,
                    start_time - prev_start_time))
            if args.for_model_ce and iter >= args.skip_batch_num \
                    or pass_id != 0:
                batch_duration = time.time() - start_time
                pass_duration += batch_duration
                train_num += len(data)
            every_pass_loss.append(loss_v)
            iter += 1
        total_train_time += pass_duration

        if args.for_model_ce and pass_id == num_passes - 1:
            examples_per_sec = train_num / total_train_time
            cost = np.mean(every_pass_loss)
            with open("train_speed_factor.txt", 'w') as f:
                f.write('{:f}\n'.format(examples_per_sec))
            with open("train_cost_factor.txt", 'a+') as f:
                f.write('{:f}\n'.format(cost))

        #best_map = test(pass_id, best_map)
        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            save_model(str(pass_id))
def train():
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    if cfg.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        import random
        random.seed(0)
        np.random.seed(0)

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    if cfg.enable_ce:
        use_random = False
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    lr = exponential_with_warmup_decay(
        learning_rate=learning_rate,
        boundaries=boundaries,
        values=values,
        warmup_iter=cfg.warm_up_iter,
        warmup_factor=cfg.warm_up_factor)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)
    fetch_list = fetch_list + [lr]

    for var in fetch_list:
        var.persistable = True

    #fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list))

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = True
        train_exe = fluid.ParallelExecutor(
            use_cuda=bool(cfg.use_gpu),
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    shuffle = True
    if cfg.enable_ce:
        shuffle = False
    if cfg.use_pyreader:
        train_reader = reader.train(
            batch_size=cfg.TRAIN.im_per_batch,
            total_batch_size=total_batch_size,
            padding_total=cfg.TRAIN.padding_minibatch,
            shuffle=shuffle)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(
            batch_size=total_batch_size, shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            prev_start_time = start_time
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id,
                    np.mean(outs[-1]), logs, start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            end_time = time.time()
            total_time = end_time - start_time
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                loss = last_loss
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()

    def train_loop():
        start_time = time.time()
        prev_start_time = start_time
        start = start_time
        train_stats = TrainingStats(cfg.log_window, keys)
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(
                fetch_list=[v.name for v in fetch_list],
                feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id,
                np.mean(outs[-1]), logs, start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()
        # only for ce
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
def train(data_path=None,
          model_type=ModelType.create_classification(),
          batch_size=100,
          num_passes=50,
          class_num=None,
          num_workers=1,
          use_gpu=False):
    '''
    Train the DNN.
    '''
    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    # network config
    input_layer = paddle.layer.data(
        name='input_layer', type=paddle.data_type.dense_vector(feature_dim))
    dnn = create_dnn(input_layer)
    prediction = None
    label = None
    cost = None

    if args.model_type.is_classification():
        prediction = paddle.layer.fc(
            input=dnn, size=class_num, act=paddle.activation.Softmax())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.integer_value(class_num))
        cost = paddle.layer.classification_cost(input=prediction, label=label)
    elif args.model_type.is_regression():
        prediction = paddle.layer.fc(
            input=dnn, size=1, act=paddle.activation.Linear())
        label = paddle.layer.data(
            name='label', type=paddle.data_type.dense_vector(1))
        cost = paddle.layer.mse_cost(input=prediction, label=label)

    # create parameters
    parameters = paddle.parameters.create(cost)
    # create optimizer
    optimizer = paddle.optimizer.Momentum(momentum=0)

    trainer = paddle.trainer.SGD(
        cost=cost,
        extra_layers=paddle.evaluator.auc(input=prediction, label=label),
        parameters=parameters,
        update_equation=optimizer)

    feeding = {'input_layer': 0, 'label': 1}

    # event_handler to print training and testing info
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print("Pass %d, Batch %d, Cost %f, %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics))

        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(
                reader=paddle.batch(
                    reader.test(data_path, feature_dim + 1,
                                args.model_type.is_classification()),
                    batch_size=batch_size),
                feeding=feeding)
            print("Test %d, Cost %f, %s" %
                  (event.pass_id, result.cost, result.metrics))

            model_desc = "{type}".format(type=str(args.model_type))
            with open("%sdnn_%s_pass_%05d.tar" %
                      (args.model_output_prefix, model_desc,
                       event.pass_id), "w") as f:
                parameters.to_tar(f)

    # training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                reader.train(data_path, feature_dim + 1,
                             args.model_type.is_classification()),
                buf_size=batch_size * 10),
            batch_size=batch_size),
        feeding=feeding,
        event_handler=event_handler,
        num_passes=num_passes)
def train_async(args):
    # parameters from arguments
    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    train_loader, train_cost, global_lr, train_feas, train_label = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_loader, test_feas = build_program(
        is_train=False,
        main_prog=tmp_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_feas.name, train_label.name
    ]
    test_fetch_list = [test_feas.name]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if num_trainers <= 1 and args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = place

    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)

    if pretrained_model:
        load_params(exe, train_prog, pretrained_model)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = int(os.environ.get('CPU_NUM', 1))
    assert (args.train_batch_size % devicenum) == 0
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_loader.set_sample_generator(
        reader.train(args),
        batch_size=train_batch_size,
        drop_last=True,
        places=places)
    test_loader.set_sample_generator(
        reader.test(args),
        batch_size=test_batch_size,
        drop_last=False,
        places=place)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=args.use_gpu,
        loss_name=train_cost.name)

    totalruntime = 0
    iter_no = 0
    train_info = [0, 0, 0]
    while iter_no <= args.total_iter_num:
        for train_batch in train_loader():
            t1 = time.time()
            lr, loss, feas, label = train_exe.run(
                feed=train_batch, fetch_list=train_fetch_list)
            t2 = time.time()
            period = t2 - t1
            lr = np.mean(np.array(lr))
            train_info[0] += np.mean(np.array(loss))
            train_info[1] += recall_topk(feas, label, k=1)
            train_info[2] += 1

            if iter_no % args.display_iter_step == 0:
                avgruntime = totalruntime / args.display_iter_step
                avg_loss = train_info[0] / train_info[2]
                avg_recall = train_info[1] / train_info[2]
                print("[%s] trainbatch %d, lr %.6f, loss %.6f, "
                      "recall %.4f, time %2.2f sec" %
                      (fmt_time(), iter_no, lr, avg_loss, avg_recall,
                       avgruntime))
                sys.stdout.flush()
                totalruntime = 0
            if iter_no % 1000 == 0:
                train_info = [0, 0, 0]

            totalruntime += period

            if iter_no % args.test_iter_step == 0 and iter_no != 0:
                f, l = [], []
                for batch_id, test_batch in enumerate(test_loader()):
                    t1 = time.time()
                    [feas] = exe.run(test_prog,
                                     feed=test_batch,
                                     fetch_list=test_fetch_list)
                    label = np.asarray(test_batch[0]['label'])
                    label = np.squeeze(label)
                    f.append(feas)
                    l.append(label)
                    t2 = time.time()
                    period = t2 - t1
                    if batch_id % 20 == 0:
                        print("[%s] testbatch %d, time %2.2f sec" %
                              (fmt_time(), batch_id, period))
                f = np.vstack(f)
                l = np.hstack(l)
                recall = recall_topk(f, l, k=1)
                print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" %
                      (fmt_time(), len(f), iter_no, recall))
                sys.stdout.flush()

            if iter_no % args.save_iter_step == 0 and iter_no != 0:
                model_path = os.path.join(model_save_dir, model_name,
                                          str(iter_no))
                fluid.save(program=train_prog, model_path=model_path)

            iter_no += 1
def compress(args):
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # model definition
    model = models.__dict__[args.model]()
    # use ==, not "is", for string comparison
    if args.model == "GoogleNet":
        out0, out1, out2 = model.net(input=image, class_dim=args.class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)
        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=args.class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    val_program = fluid.default_main_program().clone()

    # piecewise_decay boundaries must be integers, so use //
    if args.quant_only:
        boundaries = [
            args.total_images // args.batch_size * 10,
            args.total_images // args.batch_size * 16
        ]
        values = [1e-4, 1e-5, 1e-6]
    else:
        boundaries = [
            args.total_images // args.batch_size * 30,
            args.total_images // args.batch_size * 60,
            args.total_images // args.batch_size * 90
        ]
        values = [0.1, 0.01, 0.001, 0.0001]

    opt = fluid.optimizer.Momentum(
        momentum=0.9,
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=boundaries, values=values),
        regularization=fluid.regularizer.L2Decay(4e-5))

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.pretrained_model:

        def if_exist(var):
            return os.path.exists(
                os.path.join(args.pretrained_model, var.name))

        fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)

    val_reader = paddle.batch(reader.val(), batch_size=args.batch_size)
    val_feed_list = [('image', image.name), ('label', label.name)]
    val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)]

    train_reader = paddle.batch(
        reader.train(), batch_size=args.batch_size, drop_last=True)
    train_feed_list = [('image', image.name), ('label', label.name)]
    train_fetch_list = [('loss', avg_cost.name)]

    teacher_programs = []
    distiller_optimizer = None
    if args.teacher_model:
        teacher_model = models.__dict__[args.teacher_model]()
        # define teacher program
        teacher_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(teacher_program, startup_program):
            img = teacher_program.global_block()._clone_variable(
                image, force_persistable=False)
            predict = teacher_model.net(
                img,
                class_dim=args.class_dim,
                conv1_name='res_conv1',
                fc_name='res_fc')
        exe.run(startup_program)
        assert args.teacher_pretrained_model and os.path.exists(
            args.teacher_pretrained_model
        ), "teacher_pretrained_model should be set when teacher_model is not None."

        def if_exist(var):
            return os.path.exists(
                os.path.join(args.teacher_pretrained_model, var.name))

        fluid.io.load_vars(
            exe,
            args.teacher_pretrained_model,
            main_program=teacher_program,
            predicate=if_exist)

        distiller_optimizer = opt
        teacher_programs.append(teacher_program.clone(for_test=True))

    com_pass = Compressor(
        place,
        fluid.global_scope(),
        fluid.default_main_program(),
        train_reader=train_reader,
        train_feed_list=train_feed_list,
        train_fetch_list=train_fetch_list,
        eval_program=val_program,
        eval_reader=val_reader,
        eval_feed_list=val_feed_list,
        eval_fetch_list=val_fetch_list,
        teacher_programs=teacher_programs,
        train_optimizer=opt,
        distiller_optimizer=distiller_optimizer)
    com_pass.config(args.compress_config)
    com_pass.run()
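The `piecewise_decay` schedule above expresses epoch boundaries in iterations. A small self-contained sketch (with hypothetical values for total_images and batch_size) of how those boundaries map a global step to a learning rate:

# A minimal sketch of the piecewise-decay schedule above, assuming
# hypothetical values total_images=1281167 and batch_size=256.
total_images, batch_size = 1281167, 256
steps_per_epoch = total_images // batch_size          # 5004 iterations
boundaries = [steps_per_epoch * e for e in (30, 60, 90)]
values = [0.1, 0.01, 0.001, 0.0001]

def lr_at(step):
    # the learning rate used for a given global step
    for b, v in zip(boundaries, values):
        if step < b:
            return v
    return values[-1]

assert lr_at(0) == 0.1 and lr_at(steps_per_epoch * 45) == 0.01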
def train(args, config, train_params, train_file_list):
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    optimizer_method = train_params["optimizer_method"]
    use_pyramidbox = train_params["use_pyramidbox"]

    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size_per_device = batch_size // devices_num
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # only for ce
    if args.enable_ce:
        SEED = 102
        startup_prog.random_seed = SEED
        train_prog.random_seed = SEED
        num_workers = 1
        pretrained_model = ""
        if args.batch_num is not None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params=train_params,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_epoc = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " % (pretrained_model))
        if not os.path.exists(pretrained_model):
            raise ValueError(
                "The pre-trained model path [%s] does not exist." %
                (pretrained_model))

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    train_reader = reader.train(
        config,
        train_file_list,
        batch_size_per_device,
        shuffle=is_shuffle,
        num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog, use_cuda=use_gpu, loss_name=loss.name)

    def save_model(postfix, program):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        start_time = time.time()
        prev_start_time = start_time
        end_time = 0
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if args.parallel:
                    fetch_vars = train_exe.run(
                        fetch_list=[v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
                end_time = time.time()
                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
                face_loss = fetch_vars[0]
                head_loss = fetch_vars[1]
                if batch_id % 10 == 0:
                    if not args.use_pyramidbox:
                        print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".
                              format(pass_id, batch_id, face_loss,
                                     start_time - prev_start_time))
                    else:
                        print("Pass {:d}, batch {:d}, face loss {:.6f}, "
                              "head loss {:.6f}, "
                              "time {:.5f}".format(
                                  pass_id, batch_id, face_loss, head_loss,
                                  start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                train_py_reader.reset()
                break
        epoch_end_time = time.time()
        total_time += epoch_end_time - start_time
        save_model(str(pass_id), train_prog)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_face_loss_card%s\t%s" % (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" % (gpu_num, head_loss))
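The resume logic above overloads `pretrained_model`: a bare digit string is treated as a saved epoch directory under `model_save_dir`, and the starting epoch is bumped past it. A minimal sketch of that convention (the helper name `resolve_resume` is hypothetical):

import os

def resolve_resume(model_save_dir, pretrained_model):
    # checkpoints are saved as model_save_dir/<pass_id>, so "12" means
    # "load epoch 12's weights and restart training at epoch 13"
    start_epoc = 0
    if pretrained_model and pretrained_model.isdigit():
        start_epoc = int(pretrained_model) + 1
        pretrained_model = os.path.join(model_save_dir, pretrained_model)
    return start_epoc, pretrained_model

assert resolve_resume("out", "12") == (13, os.path.join("out", "12"))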
def train_async(args):
    # parameters from arguments
    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    train_loader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        main_prog=train_prog, startup_prog=startup_prog, args=args)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
    ]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if num_trainers <= 1 and args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = place

    exe.run(startup_prog)
    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
    if pretrained_model:
        load_pretrain(train_prog, pretrained_model)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = 1
    assert args.train_batch_size % devicenum == 0
    train_batch_size = args.train_batch_size // devicenum

    train_loader.set_sample_generator(
        reader.train(args),
        batch_size=train_batch_size,
        drop_last=True,
        places=places)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=args.use_gpu,
        loss_name=train_cost.name)

    totalruntime = 0
    iter_no = 0
    train_info = [0, 0, 0, 0]
    while iter_no <= args.total_iter_num:
        for train_batch in train_loader():
            t1 = time.time()
            lr, loss, acc1, acc5 = train_exe.run(
                feed=train_batch, fetch_list=train_fetch_list)
            t2 = time.time()
            period = t2 - t1
            lr = np.mean(np.array(lr))
            train_info[0] += np.mean(np.array(loss))
            train_info[1] += np.mean(np.array(acc1))
            train_info[2] += np.mean(np.array(acc5))
            train_info[3] += 1
            if iter_no % args.display_iter_step == 0:
                avgruntime = totalruntime / args.display_iter_step
                avg_loss = train_info[0] / train_info[3]
                avg_acc1 = train_info[1] / train_info[3]
                avg_acc5 = train_info[2] / train_info[3]
                print("[%s] trainbatch %d, lr %.6f, loss %.6f, "
                      "acc1 %.4f, acc5 %.4f, time %2.2f sec" %
                      (fmt_time(), iter_no, lr, avg_loss, avg_acc1, avg_acc5,
                       avgruntime))
                sys.stdout.flush()
                totalruntime = 0
                # restart the running averages after each display window
                train_info = [0, 0, 0, 0]

            totalruntime += period

            if iter_no % args.save_iter_step == 0 and iter_no != 0:
                model_path = os.path.join(model_save_dir, model_name,
                                          str(iter_no))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                fluid.save(program=train_prog, model_path=model_path)

            iter_no += 1
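Both `train_async` variants accumulate metrics in a plain list (`train_info`) that is averaged at display time and reset once per window. The same pattern as a small hypothetical helper class, for comparison:

# A minimal sketch (the class name is hypothetical) of the accumulator
# pattern used above: per-metric sums plus a count, averaged at display
# time and reset at the start of each window.
class WindowedMeter:
    def __init__(self, n_metrics):
        self.sums = [0.0] * n_metrics
        self.count = 0

    def update(self, *vals):
        for i, v in enumerate(vals):
            self.sums[i] += v
        self.count += 1

    def averages(self):
        return [s / self.count for s in self.sums]

    def reset(self):
        self.__init__(len(self.sums))

meter = WindowedMeter(3)
meter.update(1.0, 0.5, 0.75)
meter.update(0.5, 0.5, 0.25)
assert meter.averages() == [0.75, 0.5, 0.5]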
def train(args):
    # parameters from arguments
    model_name = args.model
    pretrained_fp32_model = args.pretrained_fp32_model
    checkpoint = args.checkpoint
    model_save_dir = args.model_save_dir
    data_dir = args.data_dir
    activation_quant_type = args.act_quant_type
    weight_quant_type = args.wt_quant_type
    print("Using %s as the activation quantize type." % activation_quant_type)
    print("Using %s as the weight quantize type." % weight_quant_type)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    _, _, train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    image, out, test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    main_graph = IrGraph(core.Graph(train_prog.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_prog.desc), for_test=True)

    if pretrained_fp32_model:

        def if_exist(var):
            return os.path.exists(
                os.path.join(pretrained_fp32_model, var.name))

        fluid.io.load_vars(
            exe,
            pretrained_fp32_model,
            main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
        if visible_device:
            device_num = len(visible_device.split(','))
        else:
            device_num = subprocess.check_output(
                ['nvidia-smi', '-L']).decode().count('\n')
    else:
        device_num = 1

    # use integer division so paddle.batch receives an int batch size
    train_batch_size = args.batch_size // device_num
    test_batch_size = 1 if activation_quant_type == 'abs_max' else 8
    train_reader = paddle.batch(
        reader.train(data_dir=data_dir),
        batch_size=train_batch_size,
        drop_last=True)
    test_reader = paddle.batch(
        reader.val(data_dir=data_dir), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_fetch_list = [
        train_cost.name, train_acc1.name, train_acc5.name, global_lr.name
    ]
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

    # 1. Make some quantization transforms in the graph before training and
    # testing. According to the weight and activation quantization type, the
    # graph will have fake quantize and fake dequantize operators inserted.
    transform_pass = QuantizationTransformPass(
        scope=fluid.global_scope(),
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    if checkpoint:
        load_persistable_nodes(exe, checkpoint, main_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=train_cost.name, build_strategy=build_strategy)
    test_prog = test_graph.to_program()

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5, lr = exe.run(
                    binary, fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, loss, acc1, acc5,
                              "%.6f" % lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, time {5}".format(
                              pass_id, test_batch_id, loss, acc1, acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        save_checkpoint_path = os.path.join(model_save_dir, model_name,
                                            str(pass_id))
        if not os.path.isdir(save_checkpoint_path):
            os.makedirs(save_checkpoint_path)
        save_persistable_nodes(exe, save_checkpoint_path, main_graph)

    model_path = os.path.join(model_save_dir, model_name, args.act_quant_type)
    float_path = os.path.join(model_path, 'float')
    int8_path = os.path.join(model_path, 'int8')
    mobile_path = os.path.join(model_path, 'mobile')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    # 2. Freeze the graph after training by adjusting the quantize
    # operators' order for the inference.
    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=float_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=server_program)

    # 3. Convert the weights into int8_t type.
    # (This step is optional.)
    convert_int8_pass = ConvertToInt8Pass(
        scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=int8_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=server_int8_program)

    # 4. Convert the frozen graph for paddle-mobile execution.
    # (This step is optional.)
    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    mobile_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=mobile_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=mobile_program)
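The fake quantize/dequantize operators that `QuantizationTransformPass` inserts simulate low-precision arithmetic while training keeps operating on floats. A minimal numpy sketch of the `abs_max` scheme named above (an illustration, not the pass's actual implementation):

import numpy as np

def fake_quant_abs_max(x, num_bits=8):
    """Simulated (fake) quantization, abs_max style: scale by the tensor's
    max absolute value, round to the integer grid, then dequantize back so
    downstream ops still see float values."""
    scale = np.abs(x).max()
    qmax = 2 ** (num_bits - 1) - 1        # 127 for 8 bits
    q = np.round(x / scale * qmax)        # quantize to the integer grid
    return q * scale / qmax               # dequantize back to float

x = np.array([-0.5, 0.1, 0.25], dtype=np.float32)
print(fake_quant_abs_max(x))  # close to x, up to the 8-bit rounding error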
def train():
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    devices_num = get_device_num()
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,  # res4: [-1, 1024, 84, 84]
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,  # res5: [-1, 2048, 7, 7]
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma ** i) for i in range(step_num + 1)]

    lr = exponential_with_warmup_decay(
        learning_rate=learning_rate,
        boundaries=boundaries,
        values=values,
        warmup_iter=cfg.warm_up_iter,
        warmup_factor=cfg.warm_up_factor)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)
    fetch_list = fetch_list + [lr]

    for var in fetch_list:
        var.persistable = True

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = True

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_iteration_per_drop_scope = 10

        # num_trainers is assumed to be a module-level global (e.g. derived
        # from PADDLE_TRAINERS_NUM) in this script
        if num_trainers > 1 and cfg.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 fluid.default_main_program())
            # the process is fast when num_threads is 1 for multi-process training
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            use_cuda=bool(cfg.use_gpu),
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    shuffle = True
    # NOTE: do not shuffle dataset when using multi-process training
    shuffle_seed = None
    if num_trainers > 1:
        shuffle_seed = 1

    if cfg.use_pyreader:
        train_reader = reader.train(
            batch_size=cfg.TRAIN.im_per_batch,
            total_batch_size=total_batch_size,
            padding_total=cfg.TRAIN.padding_minibatch,
            shuffle=shuffle,
            shuffle_seed=shuffle_seed)
        if num_trainers > 1:
            assert shuffle_seed is not None, \
                "If num_trainers > 1, the shuffle_seed must be set, because " \
                "the order of batch data generated by the reader must be the " \
                "same in the respective processes."
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        if num_trainers > 1:
            shuffle = False
        train_reader = reader.train(
            batch_size=total_batch_size, shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            # remember when the loop began so the total time can be computed
            first_start_time = start_time
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            end_time = time.time()
            total_time = end_time - first_start_time
            last_loss = np.array(outs[0]).mean()
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()

    def train_loop():
        train_stats = TrainingStats(cfg.log_window, keys)
        start_time = time.time()
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(
                fetch_list=[v.name for v in fetch_list],
                feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
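`exponential_with_warmup_decay` above combines a linear warmup with piecewise decay by `gamma` at each boundary. A small sketch of the schedule under that reading (the warmup blend formula follows common detection setups and is an assumption here; the default values are hypothetical):

def lr_at(step, lr=0.01, gamma=0.1, boundaries=(120000, 160000),
          warmup_iter=500, warmup_factor=1.0 / 3.0):
    # warmup: ramp linearly from warmup_factor * lr up to lr
    if step < warmup_iter:
        alpha = step / float(warmup_iter)
        return lr * (warmup_factor * (1 - alpha) + alpha)
    # afterwards: multiply by gamma at every boundary already passed
    decayed = lr
    for b in boundaries:
        if step >= b:
            decayed *= gamma
    return decayed

assert abs(lr_at(0) - 0.01 / 3.0) < 1e-12
assert abs(lr_at(130000) - 0.001) < 1e-12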
def train(args,
          train_file_list,
          val_file_list,
          data_args,
          learning_rate,
          batch_size,
          num_passes,
          model_save_dir,
          pretrained_model=None):
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    num_classes = 2

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
    nmsed_out = fluid.layers.detection_output(
        locs, confs, box, box_var, nms_threshold=args.nms_threshold)
    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
    loss = fluid.layers.reduce_sum(loss)

    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    # boundaries passed to piecewise_decay must be integers
    epocs = 4800 // batch_size
    boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
    values = [
        learning_rate, learning_rate * 0.5, learning_rate * 0.25,
        learning_rate * 0.1, learning_rate * 0.01
    ]

    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu, loss_name=loss.name)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def save_model(postfix):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path)

    best_map = 0.

    def test(pass_id, best_map):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        for batch_id, data in enumerate(test_reader()):
            test_map, = exe.run(
                test_program, feed=feeder.feed(data), fetch_list=[accum_map])
            if batch_id % 20 == 0:
                print("Batch {0}, map {1}".format(batch_id, test_map))
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model')
        print("Pass {0}, test map {1}".format(pass_id, test_map))
        return best_map

    train_num = 0
    total_train_time = 0.0
    for pass_id in range(num_passes):
        start_time = time.time()
        prev_start_time = start_time
        every_pass_loss = []
        iter = 0
        pass_duration = 0.0
        for batch_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            if args.for_model_ce and iter == args.iterations:
                break
            if len(data) < devices_num * 2:
                print("There are too few data to train on all devices.")
                continue
            if args.parallel:
                loss_v, = train_exe.run(
                    fetch_list=[loss.name], feed=feeder.feed(data))
            else:
                loss_v, = exe.run(
                    fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[loss])
            loss_v = np.mean(np.array(loss_v))
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v, start_time - prev_start_time))
            # parentheses make the intended precedence explicit
            if (args.for_model_ce and iter >= args.skip_batch_num) or pass_id != 0:
                batch_duration = time.time() - start_time
                pass_duration += batch_duration
                train_num += len(data)
            every_pass_loss.append(loss_v)
            iter += 1
        total_train_time += pass_duration

        if args.for_model_ce and pass_id == num_passes - 1:
            examples_per_sec = train_num / total_train_time
            cost = np.mean(every_pass_loss)
            with open("train_speed_factor.txt", 'w') as f:
                f.write('{:f}\n'.format(examples_per_sec))
            with open("train_cost_factor.txt", 'a+') as f:
                f.write('{:f}\n'.format(cost))

        best_map = test(pass_id, best_map)
        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            save_model(str(pass_id))
    print("Best test map {0}".format(best_map))
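`DetectionMAP` above counts a detection as a true positive when its overlap with a ground-truth box exceeds `overlap_threshold=0.5`. The overlap in question is intersection-over-union; a minimal sketch:

import numpy as np

def iou(box_a, box_b):
    """Intersection-over-union of two [x1, y1, x2, y2] boxes; a detection
    matches a ground-truth box for mAP purposes when IoU passes the
    overlap threshold (0.5 above)."""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)

print(iou([0, 0, 2, 2], [1, 1, 3, 3]))  # 1/7 ≈ 0.143: not a match at 0.5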
def train():
    if cfg.debug or args.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    if not os.path.exists(cfg.model_save_dir):
        os.makedirs(cfg.model_save_dir)

    model = YOLOv3()
    model.build_model()
    input_size = cfg.input_size
    loss = model.loss()
    loss.persistable = True

    devices_num = get_device_num()
    print("Found {} CUDA devices.".format(devices_num))

    learning_rate = cfg.learning_rate
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma ** i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=cfg.warm_up_iter,
            warmup_factor=cfg.warm_up_factor),
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrain:
        if not os.path.exists(cfg.pretrain):
            print("Pretrain weights not found: {}".format(cfg.pretrain))

        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrain, var.name)) \
                and var.name.find('yolo_output') < 0

        fluid.io.load_vars(exe, cfg.pretrain, predicate=if_exist)

    build_strategy = fluid.BuildStrategy()
    # gc and memory optimize may conflict
    build_strategy.memory_optimize = False
    syncbn = cfg.syncbn
    if (syncbn and devices_num <= 1) or num_trainers > 1:
        print("Disable syncbn for single-device or multi-process training")
        syncbn = False
    build_strategy.sync_batch_norm = syncbn

    exec_strategy = fluid.ExecutionStrategy()
    if cfg.use_gpu and num_trainers > 1:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    compile_program = fluid.compiler.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    random_sizes = [cfg.input_size]
    if cfg.random_shape:
        random_sizes = [32 * i for i in range(10, 20)]

    total_iter = cfg.max_iter - cfg.start_iter
    mixup_iter = total_iter - cfg.no_mixup_iter

    shuffle = True
    if args.enable_ce:
        shuffle = False
    shuffle_seed = None
    # NOTE: yolov3 is a special model; even if num_trainers > 1, each
    # process trains on the complete dataset.
    # if num_trainers > 1: shuffle_seed = 1
    train_reader = reader.train(
        input_size,
        batch_size=cfg.batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        total_iter=total_iter * devices_num,
        mixup_iter=mixup_iter * devices_num,
        random_sizes=random_sizes,
        use_multiprocess_reader=cfg.use_multiprocess_reader)
    py_reader = model.py_reader
    py_reader.decorate_paddle_reader(train_reader)

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    fetch_list = [loss]

    py_reader.start()
    smoothed_loss = SmoothedValue()
    try:
        start_time = time.time()
        prev_start_time = start_time
        snapshot_loss = 0
        snapshot_time = 0
        for iter_id in range(cfg.start_iter, cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            losses = exe.run(
                compile_program, fetch_list=[v.name for v in fetch_list])
            smoothed_loss.add_value(np.mean(np.array(losses[0])))
            snapshot_loss += np.mean(np.array(losses[0]))
            snapshot_time += start_time - prev_start_time
            lr = np.array(
                fluid.global_scope().find_var('learning_rate').get_tensor())
            print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                iter_id, lr[0], smoothed_loss.get_mean_value(),
                start_time - prev_start_time))
            sys.stdout.flush()
            if (iter_id + 1) % cfg.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
                print("Snapshot {} saved, average loss: {}, "
                      "average time: {}".format(
                          iter_id + 1,
                          snapshot_loss / float(cfg.snapshot_iter),
                          snapshot_time / float(cfg.snapshot_iter)))
                if args.enable_ce and iter_id == cfg.max_iter - 1:
                    if devices_num == 1:
                        print("kpis\ttrain_cost_1card\t%f" %
                              (snapshot_loss / float(cfg.snapshot_iter)))
                        print("kpis\ttrain_duration_1card\t%f" %
                              (snapshot_time / float(cfg.snapshot_iter)))
                    else:
                        print("kpis\ttrain_cost_8card\t%f" %
                              (snapshot_loss / float(cfg.snapshot_iter)))
                        print("kpis\ttrain_duration_8card\t%f" %
                              (snapshot_time / float(cfg.snapshot_iter)))
                snapshot_loss = 0
                snapshot_time = 0
    except fluid.core.EOFException:
        py_reader.reset()

    save_model('model_final')
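`SmoothedValue` above smooths the per-iteration loss that gets logged. Its definition lives elsewhere in the repo; a plausible minimal sketch is a fixed-size window whose mean is reported (the window size here is a hypothetical choice):

from collections import deque
import numpy as np

class SmoothedValue:
    """Keep a rolling window of recent values and report their mean."""

    def __init__(self, window_size=20):
        self.values = deque(maxlen=window_size)  # old values fall off

    def add_value(self, value):
        self.values.append(value)

    def get_mean_value(self):
        return float(np.mean(self.values))

sv = SmoothedValue(window_size=3)
for v in (4.0, 2.0, 3.0, 1.0):
    sv.add_value(v)
print(sv.get_mean_value())  # mean of the last 3 values: 2.0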
def parallel_do(args,
                train_file_list,
                val_file_list,
                data_args,
                learning_rate,
                batch_size,
                num_passes,
                model_save_dir,
                pretrained_model=None):
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    if data_args.dataset == 'coco':
        num_classes = 81
    elif data_args.dataset == 'pascalvoc':
        num_classes = 21

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    if args.parallel:
        # NOTE: ParallelDo is a legacy API; ParallelExecutor/CompiledProgram
        # is the preferred data-parallel path in later Fluid releases.
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
        with pd.do():
            image_ = pd.read_input(image)
            gt_box_ = pd.read_input(gt_box)
            gt_label_ = pd.read_input(gt_label)
            difficult_ = pd.read_input(difficult)
            locs, confs, box, box_var = mobile_net(num_classes, image_,
                                                   image_shape)
            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
                                         box, box_var)
            nmsed_out = fluid.layers.detection_output(
                locs, confs, box, box_var, nms_threshold=0.45)
            loss = fluid.layers.reduce_sum(loss)
            pd.write_output(loss)
            pd.write_output(nmsed_out)

        loss, nmsed_out = pd()
        loss = fluid.layers.mean(loss)
    else:
        locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
        nmsed_out = fluid.layers.detection_output(
            locs, confs, box, box_var, nms_threshold=0.45)
        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
                                     box_var)
        loss = fluid.layers.reduce_sum(loss)

    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    if data_args.dataset == 'coco':
        # learning rate decays after the 12th and 19th pass, respectively;
        # use integer division so the boundaries stay ints
        if '2014' in train_file_list:
            boundaries = [82783 // batch_size * 12, 82783 // batch_size * 19]
        elif '2017' in train_file_list:
            boundaries = [118287 // batch_size * 12, 118287 // batch_size * 19]
    elif data_args.dataset == 'pascalvoc':
        boundaries = [40000, 60000]
    values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def test(pass_id):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        test_map = None
        for data in test_reader():
            test_map = exe.run(
                test_program, feed=feeder.feed(data), fetch_list=[accum_map])
        print("Test {0}, map {1}".format(pass_id, test_map[0]))

    for pass_id in range(num_passes):
        start_time = time.time()
        prev_start_time = start_time
        for batch_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            loss_v = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[loss])
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v[0],
                    start_time - prev_start_time))
        test(pass_id)

        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            model_path = os.path.join(model_save_dir, str(pass_id))
            # print must be a function call under Python 3
            print('save models to %s' % (model_path))
            fluid.io.save_persistables(exe, model_path)
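`map_eval` accumulates per-class average precision into `accum_map` as the test set streams through. For reference, a minimal sketch of 11-point interpolated AP from ranked precision/recall arrays, matching `ap_version='11point'` (one of the values `args.ap_version` may take):

import numpy as np

def average_precision_11point(recalls, precisions):
    """11-point interpolated AP: at each recall threshold 0.0, 0.1, ..., 1.0,
    take the max precision among points with at least that recall, then
    average the eleven values."""
    ap = 0.0
    for t in np.arange(0.0, 1.1, 0.1):
        mask = recalls >= t
        p = precisions[mask].max() if mask.any() else 0.0
        ap += p / 11.0
    return ap

recalls = np.array([0.2, 0.4, 0.6, 0.8, 1.0])
precisions = np.array([1.0, 0.8, 0.7, 0.6, 0.5])
print(average_precision_11point(recalls, precisions))  # ≈ 0.745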