def main():
    """Train and evaluate a PTB LSTM language model with PaddlePaddle fluid.

    Reads CLI args, builds train/inference programs, optionally loads
    pretrained params, runs training (plain feed or DataLoader path),
    saves per-epoch checkpoints, and reports test perplexity.

    Fixes vs. previous revision:
      * removed unreachable `return` after `raise Warning(...)`;
      * dropped unused exception binding in the version probe;
      * renamed nested helper `eval` -> `run_eval` (was shadowing the builtin).
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    # Log either to a file or to the console, never both.
    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    if not os.path.exists(args.save_model_dir):
        mkpath(args.save_model_dir)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        # fixed seed for continuous-evaluation reproducibility
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(config.hidden_size,
                                         config.vocab_size,
                                         num_layers=config.num_layers,
                                         num_steps=config.num_steps,
                                         init_scale=config.init_scale,
                                         dropout=config.dropout,
                                         rnn_model=config.rnn_model,
                                         use_dataloader=args.use_dataloader)

            # When a DataLoader is used the model returns it as the last item.
            if args.use_dataloader:
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            # Learning rate is a persistable global var so it can be fed
            # a fresh decayed value every batch (see generate_new_lr).
            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program (no DataLoader, shares weights via unique_name)
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(config.hidden_size,
                              config.vocab_size,
                              num_layers=config.num_layers,
                              num_steps=config.num_steps,
                              init_scale=config.init_scale,
                              dropout=config.dropout,
                              rnn_model=config.rnn_model,
                              use_dataloader=False)
    # Some op behaves differently for train and inference, we need to call
    # this clone function to ensure every op is right for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            # Abort: requested pretrained params are missing.
            # (An unreachable `return` after this raise was removed.)
            raise Warning("The pretrained params do not exist.")
        fluid.load(main_program, args.init_from_pretrain_model)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True
    try:
        # auto fusion needs Paddle >= 1.7.0; degrade gracefully otherwise
        fluid.require_version(min_version='1.7.0')
        build_strategy.enable_auto_fusion = args.enable_auto_fusion
    except Exception:
        logger.info("PaddlePaddle version 1.7.0 or higher is "
                    "required when you want to enable fusion_group.")

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(loss_name=loss.name,
                                             build_strategy=build_strategy,
                                             exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        # Zeroed initial LSTM state for one (multi-device) batch.
        batch_size = config.batch_size * device_count
        init_hidden = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        init_cell = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        # Exponential decay starting at epoch_start_decay, replicated per device.
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        # Build the feed dict for one batch; state/lr entries are optional.
        x, y = batch
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))
        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)
        return res

    def run_eval(data):
        # Renamed from `eval` to avoid shadowing the Python builtin.
        # when eval the batch_size set to 1
        eval_data_iter = reader.get_data_iter(data,
                                              config.batch_size * device_count,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden,
                                            init_cell,
                                            epoch_id=0,
                                            with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            # carry the LSTM state across batches
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        # Log roughly 10 times per epoch, at least every batch.
        num_batchs = data_len // config.batch_size
        epoch_size = (num_batchs - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(
            train_data, config.batch_size * device_count, config.num_steps)

        total_loss = 0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(batch,
                                            init_hidden=init_hidden,
                                            init_cell=init_cell,
                                            epoch_id=epoch_id,
                                            with_lr=True,
                                            device_count=device_count)

            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            # carry the LSTM state into the next batch
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps

            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            # profiler tools for benchmark
            if args.profile and batch_id == log_interval:
                profiler.reset_profiler()
            elif args.profile and batch_id == (log_interval + 5):
                break

        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                # x/y come from the DataLoader; feed only lr and state here.
                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps

                if batch_id > 0 and (log_interval == 0 or
                                     batch_id % log_interval == 0):
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
                # profiler tools for benchmark
                if args.profile and batch_id == log_interval:
                    profiler.reset_profiler()
                elif args.profile and batch_id == (log_interval + 5):
                    break
        except fluid.core.EOFException:
            # normal end of epoch: the DataLoader ran out of data
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        if args.use_dataloader:

            def data_gen():
                # NOTE(review): uses config.batch_size (not * device_count),
                # unlike the feed path — presumably intentional; confirm.
                data_iter_size = config.batch_size
                train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                     config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            if not args.profile:
                # NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100
                # Just skip to avoid error
                def is_valid_data(data, batch_size, num_steps):
                    data_len = len(data)
                    batch_len = data_len // batch_size
                    epoch_size = (batch_len - 1) // num_steps
                    return epoch_size >= 1

                valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                                 config.num_steps)
                if valid_data_valid:
                    valid_ppl = run_eval(valid_data)
                    print("Valid ppl: %.5f" % valid_ppl[0])
                else:
                    print(
                        'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'
                        .format(len(valid_data), config.batch_size,
                                config.num_steps))

                # checkpoint under <save_model_dir>/<epoch_id>/params
                save_model_dir = os.path.join(args.save_model_dir,
                                              str(epoch_id))
                if not os.path.exists(save_model_dir):
                    mkpath(save_model_dir)
                save_model_dir = os.path.join(save_model_dir, 'params')
                fluid.save(main_program, save_model_dir)
                print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile, args.profiler_path):
        train()

    test_ppl = run_eval(test_data)
    print("Test ppl:", test_ppl[0])
def __init__(self, args, network):
    """Initialize the prediction process.

    :param args: prediction arguments (dict-like; must contain
        "app_name" and "use_parallel")
    :param network: network-builder object used by _init_predict_model
    """
    self.args = args
    self.logger = UtilLogging(args, __name__)
    self.app_name = self.args["app_name"]
    self.network = network
    # Ensure the prediction-output directory exists.
    output_dir = get_fullurl('result_predict', self.app_name, 'dir')
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    '''
    创建预测过程
    '''
    # --- Build the prediction program ---
    self.logger.info("Initializing training process...")
    self.predict_main_prog = fluid.Program()
    self.predict_startup_prog = fluid.Program()
    with fluid.program_guard(self.predict_main_prog, self.predict_startup_prog):
        # fluid.unique_name.guard() shares parameters with the test program
        with fluid.unique_name.guard():
            # Initialize the network structure.
            self.logger.info("Initializing predict neural network...")
            predict_data_loader, predict_fetch_data = self._init_predict_model()
            self.logger.info("Training neural network initialized.")
            # Keep the loader and fetch targets as attributes.
            self.predict_data_loader = predict_data_loader
            self.predict_fetch_data = predict_fetch_data
    self.logger.info("Training process initialized.")
    '''
    读取保存的模型
    '''
    # --- Load the saved model ---
    # Define the executor.
    self.executor = fluid.Executor(engine_utils.get_executor_run_places(self.args))
    # Run startup initialization.
    self.executor.run(self.predict_startup_prog)
    # Load the saved model parameters.
    self._load_process(self.executor, self.predict_main_prog)
    self.logger.info("Validation process initialized.")
    '''
    过程并行化
    '''
    # --- Optional data parallelism ---
    USE_PARALLEL = self.args["use_parallel"]
    # Back up the original program: CompiledProgram does not keep it.
    self.origin_train_prog = self.predict_main_prog
    if USE_PARALLEL:
        self.logger.info("Initialize parallel processes...")
        # Parallel strategy is hard-coded for now (making it configurable
        # would require many changes elsewhere).
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        # Build the data-parallel compiled program.
        self.predict_main_prog = fluid.CompiledProgram(self.predict_main_prog).with_data_parallel(
            places=engine_utils.get_data_run_places(self.args),
            build_strategy=build_strategy)
        self.logger.info("Parallel processes initialized.")
def train():
    """Train an RCNN detector (ResNet50 backbone) with PaddlePaddle fluid.

    Builds the model on the default programs, optionally loads pretrained
    weights, runs either the py_reader or the feed-based training loop,
    and saves periodic plus final checkpoints.

    Fix vs. previous revision: `train_loop` ended with
    `return np.mean(every_pass_loss)` where `every_pass_loss` was never
    defined — a guaranteed NameError on normal loop completion. The caller
    ignores the return value, so the statement was removed.
    """
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    if cfg.enable_ce:
        # deterministic run for continuous evaluation
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        import random
        random.seed(0)
        np.random.seed(0)

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    if cfg.enable_ce:
        use_random = False
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    # piecewise lr: learning_rate * gamma^i at each boundary
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    lr = exponential_with_warmup_decay(learning_rate=learning_rate,
                                       boundaries=boundaries,
                                       values=values,
                                       warmup_iter=cfg.warm_up_iter,
                                       warmup_factor=cfg.warm_up_factor)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)
    fetch_list = fetch_list + [lr]

    # fetched vars must survive scope cleanup between iterations
    for var in fetch_list:
        var.persistable = True

    #fluid.memory_optimize(fluid.default_main_program(), skip_opt_set=set(fetch_list))

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            # only load vars that exist in the pretrained directory
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = True
        train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu),
                                           loss_name=loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    else:
        train_exe = exe

    shuffle = True
    if cfg.enable_ce:
        shuffle = False
    if cfg.use_pyreader:
        train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch,
                                    total_batch_size=total_batch_size,
                                    padding_total=cfg.TRAIN.padding_minibatch,
                                    shuffle=shuffle)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        # feed path: feeder is only needed (and defined) here
        train_reader = reader.train(batch_size=total_batch_size,
                                    shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        # overwrite any existing checkpoint with the same name
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            prev_start_time = start_time
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                # last fetch is lr; the rest pair up with `keys`
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            end_time = time.time()
            total_time = end_time - start_time
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                loss = last_loss
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        except (StopIteration, fluid.core.EOFException):
            # reader exhausted before max_iter: reset and stop
            py_reader.reset()

    def train_loop():
        start_time = time.time()
        prev_start_time = start_time
        train_stats = TrainingStats(cfg.log_window, keys)
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                 feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()
        # only for ce
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        # Removed `return np.mean(every_pass_loss)`: `every_pass_loss`
        # was never defined (NameError) and callers ignore the result.

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
def main():
    """Distill a student YOLO model from a pretrained teacher, prune it,
    then train the pruned distillation program and periodically evaluate.

    Fix vs. previous revision: the pruned-ratio range check compared whole
    Python lists (`pruned_ratios > [0]*n`), which is lexicographic — e.g.
    [0.5, 0.0] would pass. Replaced with an element-wise check.
    """
    env = os.environ

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # build student train program (on the default main program)
    model = create(main_arch)
    inputs_def = cfg['TrainReader']['inputs_def']
    train_feed_vars, train_loader = model.build_inputs(**inputs_def)
    train_fetches = model.train(train_feed_vars)
    loss = train_fetches['loss']

    start_iter = 0
    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # build eval program
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['EvalReader']['inputs_def']
            test_feed_vars, eval_loader = model.build_inputs(**inputs_def)
            fetches = model.eval(test_feed_vars)
    eval_prog = eval_prog.clone(True)

    eval_reader = create_reader(cfg.EvalReader)
    # When iterable mode, set set_sample_list_generator(eval_reader, place)
    eval_loader.set_sample_list_generator(eval_reader)

    # build teacher program, cloning the student's feed vars into it
    teacher_cfg = load_config(FLAGS.teacher_config)
    merge_config(FLAGS.opt)
    teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            teacher_feed_vars = OrderedDict()
            for name, var in train_feed_vars.items():
                teacher_feed_vars[name] = teacher_program.global_block(
                )._clone_variable(var, force_persistable=False)
            model = create(teacher_arch)
            train_fetches = model.train(teacher_feed_vars)
            teacher_loss = train_fetches['loss']

    exe.run(teacher_startup_program)
    assert FLAGS.teacher_pretrained, "teacher_pretrained should be set"
    checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained)
    teacher_program = teacher_program.clone(for_test=True)

    target_number = len(model.yolo_head.anchor_masks)

    # map student feed names to the same names inside the teacher program
    data_name_map = {
        'image': 'image',
        'gt_bbox': 'gt_bbox',
        'gt_class': 'gt_class',
        'gt_score': 'gt_score'
    }
    for i in range(target_number):
        data_name_map['target{}'.format(i)] = 'target{}'.format(i)
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)

    # teacher output tensor names per YOLO head scale
    output_names = [
        [
            'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0',
            'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0',
            'strided_slice_4.tmp_0', 'transpose_0.tmp_0'
        ],
        [
            'strided_slice_5.tmp_0', 'strided_slice_6.tmp_0',
            'strided_slice_7.tmp_0', 'strided_slice_8.tmp_0',
            'strided_slice_9.tmp_0', 'transpose_2.tmp_0'
        ],
        [
            'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0',
            'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0',
            'strided_slice_14.tmp_0', 'transpose_4.tmp_0'
        ],
    ]

    yolo_output_names = []
    for i in range(target_number):
        yolo_output_names.extend(output_names[i])

    assert cfg.use_fine_grained_loss, \
        "Only support use_fine_grained_loss=True, Please set it in config file or '-o use_fine_grained_loss=true'"
    distill_loss = split_distill(yolo_output_names, 1000, target_number)
    loss = distill_loss + loss
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    lr = lr_builder()
    opt = optim_builder(lr)
    opt.minimize(loss)

    exe.run(fluid.default_startup_program())
    checkpoint.load_params(exe, fluid.default_main_program(),
                           cfg.pretrain_weights)

    assert FLAGS.pruned_params is not None, \
        "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option."
    pruned_params = FLAGS.pruned_params.strip().split(",")
    logger.info("pruned params: {}".format(pruned_params))
    pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")]
    logger.info("pruned ratios: {}".format(pruned_ratios))
    assert len(pruned_params) == len(pruned_ratios), \
        "The length of pruned params and pruned ratios should be equal."
    # element-wise range check (list comparison would be lexicographic)
    assert all(0 < r < 1 for r in pruned_ratios), \
        "The elements of pruned ratios should be in range (0, 1)."
    assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \
        "unsupported prune criterion {}".format(FLAGS.prune_criterion)
    pruner = Pruner(criterion=FLAGS.prune_criterion)
    distill_prog = pruner.prune(fluid.default_main_program(),
                                fluid.global_scope(),
                                params=pruned_params,
                                ratios=pruned_ratios,
                                place=place,
                                only_graph=False)[0]

    # report FLOPs saved by pruning (eval program pruned graph-only)
    base_flops = flops(eval_prog)
    eval_prog = pruner.prune(eval_prog,
                             fluid.global_scope(),
                             params=pruned_params,
                             ratios=pruned_ratios,
                             place=place,
                             only_graph=True)[0]
    pruned_flops = flops(eval_prog)
    logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format(
        float(base_flops - pruned_flops) / base_flops, base_flops,
        pruned_flops))

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu
    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    parallel_main = fluid.CompiledProgram(distill_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    # parse eval fetches
    extra_keys = []
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg.metric == 'VOC':
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
    eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                     extra_keys)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()
    map_type = cfg.map_type if 'map_type' in cfg else '11point'
    best_box_ap_list = [0.0, 0]  #[map, iter]
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    train_loader.start()
    for step_id in range(start_iter, cfg.max_iters):
        teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run(
            parallel_main,
            fetch_list=[
                'teacher_' + teacher_loss.name, distill_loss.name, loss.name,
                lr.name
            ])
        if step_id % cfg.log_iter == 0:
            logger.info(
                "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}"
                .format(step_id, lr_np[0], loss_np[0], distill_loss_np[0],
                        teacher_loss_np[0]))
        if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1:
            save_name = str(
                step_id) if step_id != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, distill_prog,
                            os.path.join(save_dir, save_name))
            # eval
            results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                               eval_values, eval_cls, cfg)
            resolution = None
            box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes,
                                        resolution, is_bbox_normalized,
                                        FLAGS.output_eval, map_type,
                                        cfg['EvalReader']['dataset'])
            # keep a copy of the best model by box AP
            if box_ap_stats[0] > best_box_ap_list[0]:
                best_box_ap_list[0] = box_ap_stats[0]
                best_box_ap_list[1] = step_id
                checkpoint.save(exe, distill_prog,
                                os.path.join("./", "best_model"))
            logger.info("Best test box ap: {}, in step: {}".format(
                best_box_ap_list[0], best_box_ap_list[1]))
    train_loader.reset()
def train(args):
    """Train a video model with separate train/valid programs.

    Parses configs, builds compiled data-parallel programs sharing
    variables, splits batch sizes across GPUs, wires DataLoaders, and
    delegates the loop to `train_with_dataloader`.

    Fixes vs. previous revision: replaced the empty `if gpus == "": pass
    else: ...` branch with a positive guard, and fixed the "shoud" typo
    in the GPU-count assertion message.
    """
    # parse config
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    valid_config = merge_configs(config, 'valid', vars(args))
    print_configs(train_config, 'Train')
    train_model = models.get_model(args.model_name, train_config, mode='train')
    valid_model = models.get_model(args.model_name, valid_config, mode='valid')

    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    if args.fix_random_seed:
        startup.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(use_dataloader=True)
            train_model.build_model()
            # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label
            train_feeds = train_model.feeds()
            train_fetch_list = train_model.fetches()
            train_loss = train_fetch_list[0]
            optimizer = train_model.optimizer()
            optimizer.minimize(train_loss)
            train_dataloader = train_model.dataloader()

    valid_prog = fluid.Program()
    with fluid.program_guard(valid_prog, startup):
        # parameters are shared with train_prog via unique_name.guard
        with fluid.unique_name.guard():
            valid_model.build_input(use_dataloader=True)
            valid_model.build_model()
            valid_feeds = valid_model.feeds()
            valid_fetch_list = valid_model.fetches()
            valid_dataloader = valid_model.dataloader()

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    if args.pretrain:
        train_model.load_pretrain_params(exe, args.pretrain, train_prog)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    exec_strategy = fluid.ExecutionStrategy()

    compiled_train_prog = fluid.compiler.CompiledProgram(
        train_prog).with_data_parallel(loss_name=train_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)
    compiled_valid_prog = fluid.compiler.CompiledProgram(
        valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    # get reader
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs; the config batch size is the global one and
        # is divided evenly across devices below
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus != "":
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            assert num_gpus == train_config.TRAIN.num_gpus, \
                "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                "should be the same as that " \
                "set in {}({})".format(
                    num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus

    train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                        bs_denominator)
    valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /
                                        bs_denominator)
    train_reader = get_reader(args.model_name.upper(), 'train', train_config)
    valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)

    # get metrics
    train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)
    valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config)

    epochs = args.epoch or train_model.epoch_num()

    exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_dataloader.set_sample_list_generator(train_reader, places=exe_places)
    valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)

    train_with_dataloader(exe,
                          train_prog,
                          compiled_train_prog,
                          train_dataloader,
                          train_fetch_list,
                          train_metrics,
                          epochs=epochs,
                          log_interval=args.log_interval,
                          valid_interval=args.valid_interval,
                          save_dir=args.save_dir,
                          save_model_name=args.model_name,
                          fix_random_seed=args.fix_random_seed,
                          compiled_test_prog=compiled_valid_prog,
                          test_dataloader=valid_dataloader,
                          test_fetch_list=valid_fetch_list,
                          test_metrics=valid_metrics)
def build_model(self):
    """Build and train a DCGAN on 784-dim (MNIST-like) images.

    Alternates discriminator updates (real + fake batch) with
    `num_generator_time` generator updates per batch, periodically saves
    sample images and optional checkpoints.

    Fix vs. previous revision: `fetch_list` was passed a set literal
    (`{g_trainer.fake}`) in two places while every other call uses a
    list; normalized to a list (single element, behavior identical).
    """
    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    noise = fluid.layers.data(name='noise',
                              shape=[self.cfg.noise_size],
                              dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='float32')

    g_trainer = GTrainer(noise, label, self.cfg)
    d_trainer = DTrainer(img, label, self.cfg)

    # prepare environment
    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # fixed noise so periodic sample images are comparable across batches
    const_n = np.random.uniform(
        low=-1.0, high=1.0,
        size=[self.cfg.batch_size, self.cfg.noise_size]).astype('float32')

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, exe, g_trainer, "net_G")
        utility.init_checkpoints(self.cfg, exe, d_trainer, "net_D")

    ### memory optim
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.memory_optimize = False

    g_trainer_program = fluid.CompiledProgram(
        g_trainer.program).with_data_parallel(
            loss_name=g_trainer.g_loss.name, build_strategy=build_strategy)
    d_trainer_program = fluid.CompiledProgram(
        d_trainer.program).with_data_parallel(
            loss_name=d_trainer.d_loss.name, build_strategy=build_strategy)

    t_time = 0
    losses = [[], []]
    for epoch_id in range(self.cfg.epoch):
        for batch_id, data in enumerate(self.train_reader()):
            # skip ragged final batch
            if len(data) != self.cfg.batch_size:
                continue
            noise_data = np.random.uniform(
                low=-1.0, high=1.0,
                size=[self.cfg.batch_size,
                      self.cfg.noise_size]).astype('float32')
            real_image = np.array(list(map(lambda x: x[0], data))).reshape(
                [-1, 784]).astype('float32')
            real_label = np.ones(shape=[real_image.shape[0], 1],
                                 dtype='float32')
            fake_label = np.zeros(shape=[real_image.shape[0], 1],
                                  dtype='float32')
            s_time = time.time()

            # NOTE(review): exe.run returns a list, so generate_image is a
            # one-element list that is fed directly as 'img' below —
            # presumably accepted by fluid's feed conversion; confirm.
            generate_image = exe.run(g_trainer.infer_program,
                                     feed={'noise': noise_data},
                                     fetch_list=[g_trainer.fake])

            # discriminator: one step on real images, one on generated ones
            d_real_loss = exe.run(d_trainer_program,
                                  feed={
                                      'img': real_image,
                                      'label': real_label
                                  },
                                  fetch_list=[d_trainer.d_loss])[0]
            d_fake_loss = exe.run(d_trainer_program,
                                  feed={
                                      'img': generate_image,
                                      'label': fake_label
                                  },
                                  fetch_list=[d_trainer.d_loss])[0]
            d_loss = d_real_loss + d_fake_loss
            losses[1].append(d_loss)

            # several generator steps per discriminator step
            for _ in six.moves.xrange(self.cfg.num_generator_time):
                g_loss = exe.run(g_trainer_program,
                                 feed={'noise': noise_data},
                                 fetch_list=[g_trainer.g_loss])[0]
                losses[0].append(g_loss)

            batch_time = time.time() - s_time
            t_time += batch_time

            if batch_id % self.cfg.print_freq == 0:
                image_path = self.cfg.output + '/images'
                if not os.path.exists(image_path):
                    os.makedirs(image_path)
                # render a sample grid from the fixed noise vector
                generate_const_image = exe.run(g_trainer.infer_program,
                                               feed={'noise': const_n},
                                               fetch_list=[g_trainer.fake])[0]

                generate_image_reshape = np.reshape(
                    generate_const_image, (self.cfg.batch_size, -1))
                total_images = np.concatenate(
                    [real_image, generate_image_reshape])
                fig = utility.plot(total_images)

                print(
                    'Epoch ID={} Batch ID={} D_loss={} G_loss={} Batch_time_cost={:.2f}'
                    .format(epoch_id, batch_id, d_loss[0], g_loss[0],
                            batch_time))
                plt.title('Epoch ID={}, Batch ID={}'.format(
                    epoch_id, batch_id))
                plt.savefig('{}/{:04d}_{:04d}.png'.format(
                    image_path, epoch_id, batch_id),
                            bbox_inches='tight')
                plt.close(fig)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, exe, g_trainer, "net_G")
            utility.checkpoints(epoch_id, self.cfg, exe, d_trainer, "net_D")
def main():
    """Entry point for detection training.

    Builds the train (and optional eval) programs, loads pretrained or
    resumed weights, then runs the iteration loop with periodic logging,
    snapshotting, evaluation, and best-model tracking.

    Fix: removed the stray ``cfg.use_gpu = False`` that unconditionally
    overrode the config right before ``check_gpu(cfg.use_gpu)``. It
    contradicted its own preceding comment and silently forced CPU execution
    (``devices_num``, ``place`` and ``sync_batch_norm`` all branch on
    ``cfg.use_gpu`` below).
    """
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        # Per-trainer seed so workers do not draw identical random streams.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        # Continuous-evaluation mode: fixed seeds for reproducible runs.
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only support prediction,'
                                  ' training stage is not implemented now')
    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                # Scale loss up for fp16 gradient stability, undo after
                # minimize so logged values are in the true scale.
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(cfg['ema_decay'],
                                               thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        assert six.PY3, "VisualDL requires Python >= 3.5"
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # Last fetch is the learning rate; the rest map to train_keys.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            # Restore raw (non-EMA) weights before continuing training.
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
def train(args):
    """Run ERNIE masked-LM pretraining.

    Builds train/test programs, optionally transpiles for NCCL2 distributed
    training, then runs the step loop with periodic logging, checkpointing,
    and validation until ``args.num_train_steps`` is reached.
    """
    print("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='train_reader', ernie_config=ernie_config)
            scheduled_lr = optimization(loss=total_loss,
                                        warmup_steps=args.warmup_steps,
                                        num_train_steps=args.num_train_steps,
                                        learning_rate=args.learning_rate,
                                        train_program=train_program,
                                        startup_prog=startup_prog,
                                        weight_decay=args.weight_decay,
                                        scheduler=args.lr_scheduler,
                                        use_fp16=args.use_fp16,
                                        loss_scaling=args.loss_scaling)

            # Exclude fetched vars from memory reuse so their values survive
            # until they are read back each logging step.
            fluid.memory_optimize(input_program=train_program,
                                  skip_opt_set=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                pyreader_name='test_reader', ernie_config=ernie_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM',
                                       multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)
    print("theoretical memory usage: ")
    print(
        fluid.contrib.memory_usage(program=train_program,
                                   batch_size=args.batch_size //
                                   args.max_seq_len))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        # Endpoint layout comes from environment variables set by the launcher.
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            # Give other workers time to come up before rank 0 proceeds.
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
        trainer_id:{}".format(worker_endpoints, trainers_num,
                              current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program,
                        args.use_fp16)

    data_reader = ErnieDataReader(filelist=args.train_filelist,
                                  batch_size=args.batch_size,
                                  vocab_path=args.vocab_path,
                                  voc_size=ernie_config['vocab_size'],
                                  epoch=args.epoch,
                                  max_seq_len=args.max_seq_len,
                                  generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    if args.use_fast_executor:
        exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = min(10, args.skip_steps)

    build_strategy = fluid.BuildStrategy()
    build_strategy.remove_unnecessary_lock = False

    train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=total_loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy,
                                       main_program=train_program,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

    if args.valid_filelist and args.valid_filelist != "":
        predict = predict_wrapper(args,
                                  exe,
                                  ernie_config,
                                  test_prog=test_prog,
                                  pyreader=test_pyreader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_pyreader.decorate_tensor_provider(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += nccl2_num_trainers
            skip_steps = args.skip_steps * nccl2_num_trainers
            # Only rank 0 fetches and logs; other ranks just step the graph.
            if nccl2_trainer_id != 0:
                train_exe.run(fetch_list=[])
                continue
            if steps % skip_steps != 0:
                # Non-logging step: run without fetches for speed.
                train_exe.run(fetch_list=[])
            else:
                each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = train_exe.run(
                    fetch_list=[
                        next_sent_acc.name, mask_lm_loss.name,
                        total_loss.name, scheduled_lr.name
                    ])
                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)
                print("feed_queue size", train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                print("current learning_rate:%f" % np_lr[0])
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
                       np.mean(np.array(cost)),
                       np.mean(np.exp(np.array(lm_cost))),
                       np.mean(np.array(acc)), skip_steps / used_time,
                       current_file, mask_type))
                # Reset running accumulators for the next logging window.
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)

            if args.valid_filelist and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
        except fluid.core.EOFException:
            # Data exhausted before reaching num_train_steps.
            train_pyreader.reset()
            break
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
               token_num, predict):
    """Transformer training loop over ``TrainTaskConfig.pass_num`` epochs.

    Feeds per-device batches with custom gradient scaling (1 / token count),
    logs smoothed loss/ppl every 100 batches, and saves checkpoint +
    inference model after each pass.

    NOTE(review): this block is Python-2 only as written: it uses bare
    ``xrange`` and concatenates ``dict.items()`` results with ``+``, both of
    which fail on Python 3.
    """
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        # Resume from checkpoint instead of running startup initialization.
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        logging.info("init fluid.framework.default_startup_program")
        exe.run(fluid.framework.default_startup_program())

    logging.info("begin reader")
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        token_delimiter=args.token_delimiter,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)

    logging.info("begin read multiple")
    # One sub-batch per device when batching by token count.
    train_data = read_multiple(
        reader=train_data.batch_generator,
        count=dev_count if args.use_token_batch else 1)

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name,
                                       main_program=train_progm,
                                       build_strategy=build_strategy)

    data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
                                                                             -1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    if args.val_file_pattern is not None:
        test = test_context(train_progm, avg_cost, train_exe, dev_count,
                            data_input_names, util_input_names, sum_cost,
                            token_num)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
        (1. - TrainTaskConfig.label_smooth_eps)) +
                        TrainTaskConfig.label_smooth_eps *
                        np.log(TrainTaskConfig.label_smooth_eps /
                               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))

    logging.info("begin train:")
    # `init` tracks whether position-encoding tables were already fed once.
    init = False
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        logging.info("pass_id:{0}".format(pass_id))
        avg_batch_time = time.time()
        for batch_id, data in enumerate(train_data()):
            logging.info("batch_id:{0} data_len:{1}".format(
                batch_id, len(data)))
            feed_list = []
            total_num_token = 0
            if args.local:
                lr_rate = lr_scheduler.update_learning_rate()
            # Build one feed dict per device.
            for place_id, data_buffer in enumerate(
                    split_data(data, num_part=dev_count)):
                data_input_dict, util_input_dict, num_token = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                # NOTE(review): Python-2 idiom — items() returns lists there.
                feed_kv_pairs = data_input_dict.items(
                ) + util_input_dict.items()
                if args.local:
                    feed_kv_pairs += {
                        lr_scheduler.learning_rate.name: lr_rate
                    }.items()
                feed_list.append(dict(feed_kv_pairs))

                if not init:
                    # Feed the position-encoding tables on the first batch only.
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc

            # Custom gradient scale: average cost over the total token count.
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token

            #outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
            #                     feed=feed_list)
            # Fetch only on logging batches to avoid per-step sync overhead.
            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name]
                                 if batch_id % 100 == 0 else [],
                                 feed=feed_list)
            if batch_id % 100 == 0 and batch_id > 0:
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                    outs[1])
                total_sum_cost = sum_cost_val.sum(
                )  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                logging.info(
                    "epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
                    " ppl: %f" %
                    (pass_id, batch_id, total_avg_cost,
                     total_avg_cost - loss_normalizer,
                     np.exp([min(total_avg_cost, 100)])))
                logging.info("speed: {0} batch/s".format(
                    100.0 / (time.time() - avg_batch_time)))
            """
            if batch_id > 0 and batch_id % 1000 == 0:
                fluid.io.save_persistables(
                    exe,
                    os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint"))
            """
            init = True
            if batch_id % 100 == 0 and batch_id > 0:
                avg_batch_time = time.time()

        time_consumed = time.time() - pass_start_time
        # Validate and save the model for inference.
        if args.val_file_pattern is not None:
            val_avg_cost, val_ppl = test()
            logging.info(
                "epoch: %d, val avg loss: %f, val normalized loss: %f, val ppl: %f,"
                " consumed %fs" %
                (pass_id, val_avg_cost, val_avg_cost - loss_normalizer,
                 val_ppl, time_consumed))
        else:
            logging.info("epoch: %d, consumed %fs" %
                         (pass_id, time_consumed))
        fluid.io.save_persistables(
            exe,
            os.path.join(TrainTaskConfig.ckpt_dir,
                         "pass_" + str(pass_id) + ".checkpoint"))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            data_input_names[:-2] + util_input_names, [predict], exe)

    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
        print("kpis\ttest_cost_card%d\t%f" % (dev_count, val_avg_cost))
        print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
def do_train(args):
    """Train an ERNIE-based sequence model.

    Builds the train program (Adam + global-norm gradient clipping), clones a
    test program, loads a checkpoint or pretrained params, optionally compiles
    for multi-CPU data parallelism, then runs the epoch/step loop with
    periodic printing, checkpointing, and validation.
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = 1
    else:
        # Cap requested CPU workers at what the machine actually has.
        dev_count = min(multiprocessing.cpu_count(), args.cpu_num)
        if (dev_count < args.cpu_num):
            print(
                "WARNING: The total CPU NUM in this machine is %d, which is less than cpu_num parameter you set. "
                "Change the cpu_num from %d to %d" %
                (dev_count, args.cpu_num, dev_count))
        os.environ['CPU_NUM'] = str(dev_count)
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    train_program = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            # user defined model based on ernie embeddings
            train_ret = creator.create_ernie_model(args, ernie_config)
            # ernie pyreader
            train_pyreader = creator.create_pyreader(
                args,
                file_name=args.train_data,
                feed_list=train_ret['feed_list'],
                model="ernie",
                place=place)
            # Clone for eval before the optimizer ops are appended.
            test_program = train_program.clone(for_test=True)
            test_pyreader = creator.create_pyreader(
                args,
                file_name=args.test_data,
                feed_list=train_ret['feed_list'],
                model="ernie",
                place=place)
            optimizer = fluid.optimizer.Adam(
                learning_rate=args.base_learning_rate)
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
            optimizer.minimize(train_ret["avg_cost"])

    lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size)
    print("Theoretical memory usage in training: %.3f - %.3f %s" %
          (lower_mem, upper_mem, unit))
    print("Device count: %d" % dev_count)

    exe.run(startup_prog)

    # load checkpoints
    if args.init_checkpoint and args.init_pretraining_params:
        print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
              "both are set! Only arg 'init_checkpoint' is made valid.")
    if args.init_checkpoint:
        utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
    elif args.init_pretraining_params:
        utils.init_pretraining_params(exe, args.init_pretraining_params,
                                      startup_prog)

    if dev_count > 1 and not args.use_cuda:
        device = "GPU" if args.use_cuda else "CPU"
        print("%d %s are used to train model" % (dev_count, device))
        # multi cpu/gpu config
        exec_strategy = fluid.ExecutionStrategy()
        build_strategy = fluid.BuildStrategy()
        compiled_prog = fluid.compiler.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=train_ret['avg_cost'].name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        compiled_prog = fluid.compiler.CompiledProgram(train_program)

    # start training
    steps = 0
    for epoch_id in range(args.epoch):
        for data in train_pyreader():
            steps += 1
            # Only fetch metrics on printing steps; empty fetch is faster.
            if steps % args.print_steps == 0:
                fetch_list = [
                    train_ret["avg_cost"],
                    train_ret["precision"],
                    train_ret["recall"],
                    train_ret["f1_score"],
                ]
            else:
                fetch_list = []

            start_time = time.time()
            outputs = exe.run(program=compiled_prog,
                              feed=data[0],
                              fetch_list=fetch_list)
            end_time = time.time()

            if steps % args.print_steps == 0:
                loss, precision, recall, f1_score = [
                    np.mean(x) for x in outputs
                ]
                print(
                    "[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f, "
                    "pyreader queue_size: %d " %
                    (steps, loss, precision, recall, f1_score,
                     end_time - start_time, train_pyreader.queue.size()))

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.model_save_dir,
                                         "step_" + str(steps))
                print("\tsaving model as %s" % (save_path))
                fluid.io.save_persistables(exe, save_path, train_program)

            if steps % args.validation_steps == 0:
                evaluate(exe, test_program, test_pyreader, train_ret)

    # Final save after the last epoch.
    save_path = os.path.join(args.model_save_dir, "step_" + str(steps))
    fluid.io.save_persistables(exe, save_path, train_program)
def main(args):
    """Run DARTS architecture search.

    Builds the data program (inputs + cosine-decay LR + unrolled architecture
    step), the weight-training program, and a test clone; compiles all of them
    for data parallelism; then alternates architecture/weight training and
    validation per epoch, tracking the best validation accuracy.
    """
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    # Only `train_portion` of the train set is used for weight training;
    # the remainder drives the architecture (validation) step.
    step_per_epoch = int(args.trainset_num * args.train_portion /
                         args.batch_size)
    is_shuffle = True

    startup_prog = fluid.Program()
    data_prog = fluid.Program()
    test_prog = fluid.Program()
    image_shape = [int(m) for m in args.image_shape.split(",")]

    logger.info("Constructing graph...")
    with fluid.unique_name.guard():
        with fluid.program_guard(data_prog, startup_prog):
            image_train = fluid.data(name="image_train",
                                     shape=[None] + image_shape,
                                     dtype="float32")
            label_train = fluid.data(name="label_train",
                                     shape=[None, 1],
                                     dtype="int64")
            image_val = fluid.data(name="image_val",
                                   shape=[None] + image_shape,
                                   dtype="float32")
            label_val = fluid.data(name="label_val",
                                   shape=[None, 1],
                                   dtype="int64")
            train_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_train, label_train],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            valid_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_val, label_val],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            learning_rate = fluid.layers.cosine_decay(args.learning_rate,
                                                      4 * step_per_epoch,
                                                      args.epochs)
            # Pytorch CosineAnnealingLR
            # Rescale so the schedule decays to learning_rate_min instead of 0.
            learning_rate = learning_rate / args.learning_rate * (
                args.learning_rate -
                args.learning_rate_min) + args.learning_rate_min
            arch_progs_list, fetch = architect.compute_unrolled_step(
                image_train, label_train, image_val, label_val, data_prog,
                startup_prog, learning_rate, args)

        # Weight-training program shares the input/LR ops via clone.
        train_prog = data_prog.clone()
        with fluid.program_guard(train_prog, startup_prog):
            logits, loss = model(image_train,
                                 label_train,
                                 args.init_channels,
                                 args.class_num,
                                 args.layers,
                                 name="model")
            top1 = fluid.layers.accuracy(input=logits, label=label_train, k=1)
            top5 = fluid.layers.accuracy(input=logits, label=label_train, k=5)
            logger.info("param size = {:.6f}MB".format(
                utility.count_parameters_in_MB(
                    train_prog.global_block().all_parameters(), 'model')))
            test_prog = train_prog.clone(for_test=True)
            # Optimize only the model ("follower") weights, not the
            # architecture parameters — those are updated by `architect`.
            model_var = utility.get_parameters(
                train_prog.global_block().all_parameters(), 'model')[1]
            clip = fluid.clip.GradientClipByGlobalNorm(
                clip_norm=args.grad_clip)
            follower_opt = fluid.optimizer.MomentumOptimizer(
                learning_rate,
                args.momentum,
                regularization=fluid.regularizer.L2DecayRegularizer(
                    args.weight_decay),
                grad_clip=clip)
            follower_opt.minimize(loss,
                                  parameter_list=[v.name for v in model_var])
    logger.info("Construct graph done")

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    train_reader, valid_reader = reader.train_search(
        batch_size=args.batch_size,
        train_portion=args.train_portion,
        is_shuffle=is_shuffle,
        args=args)
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_batch_generator(train_reader, places=places)
    valid_loader.set_batch_generator(valid_reader, places=places)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 4 * devices_num
    build_strategy = fluid.BuildStrategy()
    if args.with_mem_opt:
        # Keep fetched vars persistable so memory reuse cannot clobber them.
        learning_rate.persistable = True
        loss.persistable = True
        top1.persistable = True
        top5.persistable = True
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = True

    arch_progs_list[0] = fluid.CompiledProgram(
        arch_progs_list[0]).with_data_parallel(
            loss_name=fetch[0].name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    arch_progs_list[1] = fluid.CompiledProgram(
        arch_progs_list[1]).with_data_parallel(
            loss_name=fetch[1].name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    parallel_train_prog = fluid.CompiledProgram(
        train_prog).with_data_parallel(loss_name=loss.name,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)
    compiled_test_prog = fluid.CompiledProgram(
        test_prog).with_data_parallel(build_strategy=build_strategy,
                                      exec_strategy=exec_strategy)

    def save_model(postfix, program):
        # Persist `program`'s parameters under model_save_dir/<postfix>,
        # replacing any previous save with the same name.
        model_path = os.path.join(args.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        logger.info('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    best_acc = 0
    for epoch_id in range(args.epochs):
        # get genotype
        genotype(test_prog, exe, place)
        train_fetch_list = [learning_rate, loss, top1, top5]
        train_top1 = train(epoch_id, train_loader, valid_loader,
                           train_fetch_list, arch_progs_list,
                           parallel_train_prog, exe)
        logger.info("Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
        valid_fetch_list = [loss, top1, top5]
        valid_top1 = valid(epoch_id, valid_loader, valid_fetch_list,
                           compiled_test_prog, exe)
        if valid_top1 > best_acc:
            best_acc = valid_top1
        logger.info("Epoch {}, valid_acc {:.6f}, best_valid_acc {:6f}".format(
            epoch_id, valid_top1, best_acc))
def main():
    """Entry point for quantization-aware training of a detection model.

    Builds train/eval programs, loads pretrained weights, inserts fake
    quantization ops via ``quant_aware`` (optionally with PACT), then runs the
    training loop with periodic evaluation and best-model checkpointing.
    Requires ``--eval`` because best-model selection depends on evaluation.
    """
    if FLAGS.eval is False:
        raise ValueError(
            "Currently only supports `--eval==True` while training in `quantization`."
        )
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env \
        and 'PADDLE_TRAINERS_NUM' in env \
        and int(env['PADDLE_TRAINERS_NUM']) > 1
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Per-trainer seed so workers draw different random streams.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['TrainReader']['inputs_def']
            feed_vars, train_loader = model.build_inputs(**inputs_def)
            if FLAGS.use_pact:
                # PACT needs gradients to flow to the input activation.
                feed_vars['image'].stop_gradient = False
            train_fetches = model.train(feed_vars)
            loss = train_fetches['loss']
            lr = lr_builder()
            optimizer = optim_builder(lr)
            optimizer.minimize(loss)

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        # When iterable mode, set set_sample_list_generator(eval_reader, place)
        eval_loader.set_sample_list_generator(eval_reader)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(
            fetches, eval_prog, extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.fuse_all_reduce_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    # NOTE(review): sync_bn is immediately forced off here, making the line
    # above dead — presumably sync BN is unsupported with quant_aware;
    # confirm this override is intentional.
    sync_bn = False
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu
    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             startup_prog, train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)

    not_quant_pattern = []
    if FLAGS.not_quant_pattern:
        not_quant_pattern = FLAGS.not_quant_pattern
    # PaddleSlim quant_aware configuration.
    config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
        'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
        'not_quant_pattern': not_quant_pattern
    }

    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    # Weights must be loaded BEFORE quant ops are inserted below.
    if cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe,
                               train_prog,
                               cfg.pretrain_weights,
                               ignore_params=ignore_params)

    if FLAGS.use_pact:
        act_preprocess_func = pact
        optimizer_func = get_optimizer
        executor = exe
    else:
        act_preprocess_func = None
        optimizer_func = None
        executor = None

    # insert quantize op in train_prog, return type is CompiledProgram
    train_prog_quant = quant_aware(train_prog,
                                   place,
                                   config,
                                   scope=None,
                                   act_preprocess_func=act_preprocess_func,
                                   optimizer_func=optimizer_func,
                                   executor=executor,
                                   for_test=False)

    compiled_train_prog = train_prog_quant.with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        # insert quantize op in eval_prog
        eval_prog = quant_aware(eval_prog,
                                place,
                                config,
                                scope=None,
                                act_preprocess_func=act_preprocess_func,
                                optimizer_func=optimizer_func,
                                executor=executor,
                                for_test=True)
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    start_iter = 0

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg,
                                 devices_num=devices_num,
                                 num_trainers=num_trainers)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_iter, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_iter)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # Last fetch is the learning rate; the rest map to train_keys.
        stats = {
            k: np.array(v).mean()
            for k, v in zip(train_keys, outs[:-1])
        }

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe,
                                   compiled_eval_prog,
                                   eval_loader,
                                   eval_keys,
                                   eval_values,
                                   eval_cls,
                                   cfg=cfg)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            cfg['EvalReader']['dataset'])
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    save_checkpoint(exe, eval_prog,
                                    os.path.join(save_dir, "best_model"),
                                    train_prog)
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
def main():
    """Train a student detector (YOLOv3-style) with knowledge distillation.

    Builds the student train/eval programs, builds a frozen teacher program
    from ``FLAGS.teacher_config``, merges the teacher graph into the default
    main program, adds a distillation loss on top of the student loss, and
    runs the iteration loop with periodic logging, checkpointing and mAP
    evaluation.  Reads globals: ``FLAGS``, ``logger`` and the project helpers
    (``create``, ``create_reader``, ``merge``, ``l2_distill`` ...).
    """
    env = os.environ
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    # Distributed launchers export FLAGS_selected_gpus per trainer process.
    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # build program
    # NOTE: the student graph is built into fluid.default_main_program().
    model = create(main_arch)
    inputs_def = cfg['TrainReader']['inputs_def']
    train_feed_vars, train_loader = model.build_inputs(**inputs_def)
    train_fetches = model.train(train_feed_vars)
    loss = train_fetches['loss']

    start_iter = 0
    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    # When iterable mode, set set_sample_list_generator(train_reader, place)
    train_loader.set_sample_list_generator(train_reader)

    # get all student variables
    student_vars = []
    for v in fluid.default_main_program().list_vars():
        try:
            # some vars (e.g. reader holders) have no shape; skip them
            student_vars.append((v.name, v.shape))
        except:
            pass
    # uncomment the following lines to print all student variables
    # print("="*50 + "student_model_vars" + "="*50)
    # print(student_vars)

    # Build a separate eval program sharing the startup program so that the
    # student parameters are reused for evaluation.
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            model = create(main_arch)
            inputs_def = cfg['EvalReader']['inputs_def']
            test_feed_vars, eval_loader = model.build_inputs(**inputs_def)
            fetches = model.eval(test_feed_vars)
    eval_prog = eval_prog.clone(True)

    eval_reader = create_reader(cfg.EvalReader)
    # When iterable mode, set set_sample_list_generator(eval_reader, place)
    eval_loader.set_sample_list_generator(eval_reader)

    # parse eval fetches
    extra_keys = []
    if cfg.metric == 'COCO':
        extra_keys = ['im_info', 'im_id', 'im_shape']
    if cfg.metric == 'VOC':
        extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
    eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                     extra_keys)

    # Build the teacher graph in its own program; its feed vars are clones of
    # the student feed vars so both nets consume the same batch.
    teacher_cfg = load_config(FLAGS.teacher_config)
    merge_config(FLAGS.opt)
    teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            teacher_feed_vars = OrderedDict()
            for name, var in train_feed_vars.items():
                teacher_feed_vars[name] = teacher_program.global_block(
                )._clone_variable(
                    var, force_persistable=False)
            model = create(teacher_arch)
            train_fetches = model.train(teacher_feed_vars)
            teacher_loss = train_fetches['loss']

    # get all teacher variables
    teacher_vars = []
    for v in teacher_program.list_vars():
        try:
            teacher_vars.append((v.name, v.shape))
        except:
            pass
    # uncomment the following lines to print all teacher variables
    # print("="*50 + "teacher_model_vars" + "="*50)
    # print(teacher_vars)

    exe.run(teacher_startup_program)
    assert FLAGS.teacher_pretrained, "teacher_pretrained should be set"
    checkpoint.load_params(exe, teacher_program, FLAGS.teacher_pretrained)
    # Freeze the teacher: cloned for_test so no gradients flow through it.
    teacher_program = teacher_program.clone(for_test=True)

    # Reload the student config (the teacher config overwrote the global one).
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    data_name_map = {
        'target0': 'target0',
        'target1': 'target1',
        'target2': 'target2',
        'image': 'image',
        'gt_bbox': 'gt_bbox',
        'gt_class': 'gt_class',
        'gt_score': 'gt_score'
    }
    # Graft the (prefixed) teacher graph into the student's main program,
    # wiring shared inputs according to data_name_map.
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)

    # NOTE(review): the variable names below are tied to the exact op order of
    # the generated graphs; they break if the architecture changes — verify
    # against the current model definition.
    yolo_output_names = [
        'strided_slice_0.tmp_0', 'strided_slice_1.tmp_0',
        'strided_slice_2.tmp_0', 'strided_slice_3.tmp_0',
        'strided_slice_4.tmp_0', 'transpose_0.tmp_0', 'strided_slice_5.tmp_0',
        'strided_slice_6.tmp_0', 'strided_slice_7.tmp_0',
        'strided_slice_8.tmp_0', 'strided_slice_9.tmp_0', 'transpose_2.tmp_0',
        'strided_slice_10.tmp_0', 'strided_slice_11.tmp_0',
        'strided_slice_12.tmp_0', 'strided_slice_13.tmp_0',
        'strided_slice_14.tmp_0', 'transpose_4.tmp_0'
    ]

    # Teacher/student feature-map pairs used for the plain L2 distillation.
    distill_pairs = [['teacher_conv2d_6.tmp_1', 'conv2d_20.tmp_1'],
                     ['teacher_conv2d_14.tmp_1', 'conv2d_28.tmp_1'],
                     ['teacher_conv2d_22.tmp_1', 'conv2d_36.tmp_1']]

    # Two distillation modes: plain L2 on feature maps (weight 100) vs.
    # fine-grained split distillation on YOLO head outputs (weight 1000).
    distill_loss = l2_distill(
        distill_pairs, 100) if not cfg.use_fine_grained_loss else split_distill(
            yolo_output_names, 1000)
    loss = distill_loss + loss
    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')
    lr = lr_builder()
    opt = optim_builder(lr)
    opt.minimize(loss)

    exe.run(fluid.default_startup_program())
    # affine_channel backbones can fold BN into conv weights while loading
    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe,
                                   fluid.default_main_program(),
                                   FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe,
                                   fluid.default_main_program(),
                                   cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe,
            fluid.default_main_program(),
            cfg.pretrain_weights,
            ignore_params=ignore_params)

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1

    parallel_main = fluid.CompiledProgram(fluid.default_main_program(
    )).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()
    map_type = cfg.map_type if 'map_type' in cfg else '11point'
    best_box_ap_list = [0.0, 0]  #[map, iter]
    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    train_loader.start()
    for step_id in range(start_iter, cfg.max_iters):
        # Fetch by name: teacher loss vars carry the 'teacher_' prefix added
        # by merge().
        teacher_loss_np, distill_loss_np, loss_np, lr_np = exe.run(
            parallel_main,
            fetch_list=[
                'teacher_' + teacher_loss.name, distill_loss.name, loss.name,
                lr.name
            ])
        if step_id % cfg.log_iter == 0:
            logger.info(
                "step {} lr {:.6f}, loss {:.6f}, distill_loss {:.6f}, teacher_loss {:.6f}".
                format(step_id, lr_np[0], loss_np[0], distill_loss_np[0],
                       teacher_loss_np[0]))
        # Snapshot on every snapshot_iter (except step 0) and always on the
        # final step, which is saved as "model_final".
        if step_id % cfg.snapshot_iter == 0 and step_id != 0 or step_id == cfg.max_iters - 1:
            save_name = str(
                step_id) if step_id != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe,
                            fluid.default_main_program(),
                            os.path.join(save_dir, save_name))
            if FLAGS.save_inference:
                feeded_var_names = ['image', 'im_size']
                targets = list(fetches.values())
                fluid.io.save_inference_model(save_dir + '/infer',
                                              feeded_var_names, targets, exe,
                                              eval_prog)
            # eval
            results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                               eval_values, eval_cls, cfg)
            resolution = None
            box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes,
                                        resolution, is_bbox_normalized,
                                        FLAGS.output_eval, map_type,
                                        cfg['EvalReader']['dataset'])
            # Track the best mAP and keep a "best_model" checkpoint for it.
            if box_ap_stats[0] > best_box_ap_list[0]:
                best_box_ap_list[0] = box_ap_stats[0]
                best_box_ap_list[1] = step_id
                checkpoint.save(exe,
                                fluid.default_main_program(),
                                os.path.join(save_dir, "best_model"))
                if FLAGS.save_inference:
                    feeded_var_names = ['image', 'im_size']
                    targets = list(fetches.values())
                    fluid.io.save_inference_model(save_dir + '/infer',
                                                  feeded_var_names, targets,
                                                  exe, eval_prog)
            logger.info("Best test box ap: {}, in step: {}".format(
                best_box_ap_list[0], best_box_ap_list[1]))
    train_loader.reset()
def train(args):
    """Train and evaluate an image-classification model epoch by epoch.

    Builds train/test programs via ``build_program``, optionally restores a
    checkpoint or pretrained weights, then for each epoch runs the training
    reader to exhaustion, evaluates on the test reader, prints metrics and
    saves persistables under ``model_save_dir/model_name/<pass_id>``.

    Args:
        args: parsed CLI namespace (model, batch_size, use_gpu, enable_ce,
            checkpoint, pretrained_model, with_mem_opt, with_inplace, ...).

    Reads module globals: ``num_trainers``, ``reader``, ``models``,
    ``dist_utils``, ``get_device_num``, ``build_program``.
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # fixed seeds for continuous-evaluation reproducibility
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization and use_ngraph:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            # only load vars that actually exist in the pretrained dir
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # BUGFIX: use floor division — '/' yields a float under Python 3, which
    # is then passed to the reader as batch_size and used in speed math.
    train_batch_size = args.batch_size // device_num

    test_batch_size = 16
    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(
            batch_size=train_batch_size, shuffle_seed=shuffle_seed)
        test_reader = reader.val(batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        # use an explicit bool (the flag is boolean-typed)
        build_strategy.fuse_all_reduce_ops = True

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10

        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    # mark fetch targets persistable so they survive scope cleanup
    train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    params = models.__dict__[args.model]().params

    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        time_record = []
        # ---- training loop: runs until the reader is exhausted ----
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                time_record.append(t2 - t1)
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(t2 - t1)

                if batch_id % 10 == 0:
                    period = np.mean(time_record)
                    time_record = []
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}"
                          .format(pass_id, batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            # reader exhausted: normal end of epoch
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # seconds per image across all devices
        train_speed = np.array(train_time).mean() / (
            train_batch_size * device_num)

        # ---- evaluation loop ----
        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, "
                          "acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, "%.5f" % loss,
                                  "%.5f" % acc1, "%.5f" % acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss, "%.5f" % test_acc1,
                  "%.5f" % test_acc5))
        sys.stdout.flush()

        # save per-epoch persistables
        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training
                print("kpis train_cost %s" % train_loss)
                print("kpis train_acc_top1 %s" % train_acc1)
                print("kpis train_acc_top5 %s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis test_cost %s" % test_loss)
                print("kpis test_acc_top1 %s" % test_acc1)
                print("kpis test_acc_top5 %s" % test_acc5)
                print("kpis train_speed %s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis train_cost_card%s %s" % (device_num, train_loss))
                print("kpis train_acc_top1_card%s %s" %
                      (device_num, train_acc1))
                print("kpis train_acc_top5_card%s %s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis test_cost_card%s %s" % (device_num, test_loss))
                print("kpis test_acc_top1_card%s %s" % (device_num, test_acc1))
                print("kpis test_acc_top5_card%s %s" % (device_num, test_acc5))
                print("kpis train_speed_card%s %s" % (device_num, train_speed))
def train(cfg):
    """Train a segmentation student with a frozen teacher (distillation).

    Builds the student net into ``fluid.default_main_program()``, builds and
    freezes a teacher program, merges the teacher into the student program,
    adds a weighted L2 distillation loss and runs the epoch loop with
    logging, checkpointing and optional evaluation/visualization.
    Reads module globals: ``args``, ``build_model``, ``merge``, ``l2_loss``,
    ``solver``, ``dist_utils`` and the checkpoint helpers.
    """
    # startup_prog = fluid.Program()
    # train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # Yields one sample (image, label, mask) at a time; batching is done
        # by the DataLoader via set_sample_generator below.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    # Student net goes into the default main program.
    data_loader, loss, lr, pred, grts, masks, image = build_model(
        phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)

    # Switch global cfg to the teacher config while building the teacher.
    cfg.update_from_file(args.teacher_cfg_file)
    # teacher_arch = teacher_cfg.architecture
    teacher_program = fluid.Program()
    teacher_startup_program = fluid.Program()

    with fluid.program_guard(teacher_program, teacher_startup_program):
        with fluid.unique_name.guard():
            # Teacher shares the student's input/label/mask variables.
            _, teacher_loss, _, _, _, _, _ = build_model(
                teacher_program,
                teacher_startup_program,
                phase=ModelPhase.TRAIN,
                image=image,
                label=grts,
                mask=masks)

    exe.run(teacher_startup_program)

    # Freeze the teacher for inference-only execution.
    teacher_program = teacher_program.clone(for_test=True)
    ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR
    assert ckpt_dir is not None
    print('load teacher model:', ckpt_dir)
    if os.path.exists(ckpt_dir):
        # Best-effort: try the new-style save format first, fall back to the
        # legacy params layout. The bare except is deliberate here.
        try:
            fluid.load(teacher_program, os.path.join(ckpt_dir, 'model'), exe)
        except:
            fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program)

    # cfg = load_config(FLAGS.config)
    # Restore the student config after teacher construction.
    cfg.update_from_file(args.cfg_file)
    data_name_map = {
        'image': 'image',
        'label': 'label',
        'mask': 'mask',
    }
    # Graft the teacher graph (vars prefixed 'teacher_') into the student's
    # default main program.
    merge(teacher_program, fluid.default_main_program(), data_name_map, place)
    # NOTE(review): these variable names depend on graph op ordering —
    # verify against the generated programs if the model changes.
    distill_pairs = [[
        'teacher_bilinear_interp_2.tmp_0', 'bilinear_interp_0.tmp_0'
    ]]

    def distill(pairs, weight):
        """
        Add 3 pairs of distillation losses, each pair of feature maps is the
        input of teacher and student's yolov3_loss respectively
        """
        # (only pairs[0] is used here; weight scales the L2 term)
        loss = l2_loss(pairs[0][0], pairs[0][1])
        weighted_loss = loss * weight
        return weighted_loss

    distill_loss = distill(distill_pairs, 0.1)
    cfg.update_from_file(args.cfg_file)
    optimizer = solver.Solver(None, None)
    all_loss = loss + distill_loss
    lr = optimizer.optimise(all_loss)

    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=all_loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, fluid.default_main_program())
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, fluid.default_main_program(),
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    #fetch_list = [avg_loss.name, lr.name]
    fetch_list = [
        loss.name, 'teacher_' + teacher_loss.name, distill_loss.name, lr.name
    ]

    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    avg_t_loss = 0.0
    avg_d_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, t_loss, d_loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    avg_t_loss += np.mean(np.array(t_loss))
                    avg_d_loss += np.mean(np.array(d_loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        avg_t_loss /= args.log_steps
                        avg_d_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, avg_t_loss,
                                 avg_d_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        avg_t_loss = 0.0
                        avg_d_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # reader exhausted: end of epoch
                data_loader.reset()
                break
            except Exception as e:
                # NOTE(review): swallows all step errors and keeps looping —
                # deliberate best-effort behavior, preserved as-is.
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)
        if cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(fluid.default_main_program(), epoch)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(fluid.default_main_program(), 'final')
def search_mobilenetv2(config, args, image_size, is_server=True):
    """Run LSTM-controller RL-NAS over MobileNetV2-style search space.

    Each search step samples an architecture from the controller, trains it
    for ``args.retain_epoch`` epochs, evaluates it on the test set and feeds
    top-1 accuracy back to the controller as the reward.

    Args:
        config: search-space configuration passed to RLNAS.
        args: parsed CLI namespace (server_address, port, search_steps,
            retain_epoch, data, batch_size, use_gpu, ...).
        image_size: input image height/width (channels fixed at 3).
        is_server: when True this process hosts the controller server;
            otherwise it connects as a client only.
    """
    if is_server:
        ### start a server and a client
        rl_nas = RLNAS(
            key='lstm',
            configs=config,
            is_sync=False,
            server_addr=(args.server_address, args.port),
            controller_batch_size=1,
            controller_decay_steps=1000,
            controller_decay_rate=0.8,
            lstm_num_layers=1,
            hidden_size=10,
            temperature=1.0)
    else:
        ### start a client
        rl_nas = RLNAS(
            key='lstm',
            configs=config,
            is_sync=False,
            server_addr=(args.server_address, args.port),
            lstm_num_layers=1,
            hidden_size=10,
            temperature=1.0,
            controller_batch_size=1,
            controller_decay_steps=1000,
            controller_decay_rate=0.8,
            is_server=False)

    image_shape = [3, image_size, image_size]
    for step in range(args.search_steps):
        # sample one candidate architecture from the controller
        archs = rl_nas.next_archs(1)[0][0]

        # fresh programs per candidate so graphs don't accumulate
        train_program = fluid.Program()
        test_program = fluid.Program()
        startup_program = fluid.Program()
        train_loader, avg_cost, acc_top1, acc_top5 = build_program(
            train_program, startup_program, image_shape, archs, args)

        test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program(
            test_program,
            startup_program,
            image_shape,
            archs,
            args,
            is_test=True)
        test_program = test_program.clone(for_test=True)

        place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)

        if args.data == 'cifar10':
            train_reader = paddle.fluid.io.batch(
                paddle.reader.shuffle(
                    paddle.dataset.cifar.train10(cycle=False), buf_size=1024),
                batch_size=args.batch_size,
                drop_last=True)

            test_reader = paddle.fluid.io.batch(
                paddle.dataset.cifar.test10(cycle=False),
                batch_size=args.batch_size,
                drop_last=False)
        elif args.data == 'imagenet':
            train_reader = paddle.fluid.io.batch(
                imagenet_reader.train(),
                batch_size=args.batch_size,
                drop_last=True)
            test_reader = paddle.fluid.io.batch(
                imagenet_reader.val(),
                batch_size=args.batch_size,
                drop_last=False)

        train_loader.set_sample_list_generator(
            train_reader,
            places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places())
        test_loader.set_sample_list_generator(test_reader, places=place)

        build_strategy = fluid.BuildStrategy()
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy)
        # short training of the sampled architecture
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        # evaluate: average (cost, top1, top5) over all test batches
        reward = []
        for batch_id, data in enumerate(test_loader()):
            test_fetches = [
                test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
            ]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)

            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))
        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        # reward the controller with mean top-1 accuracy
        rl_nas.reward(np.float32(finally_reward[1]))
def train(cfg):
    """Train a segmentation model with logging to VisualDL and wandb.

    Builds train/eval programs, optionally resumes from a checkpoint or
    loads pretrained weights, then runs the epoch loop with periodic
    logging, snapshotting, optional evaluation and visualization.
    Reads module globals: ``args``, ``build_model``, ``SegDataset``,
    ``dist_utils``, ``wandb``, ``profiler`` and the checkpoint helpers.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # fixed seeds for continuous-evaluation reproducibility
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # Yields one sample (image, label, mask) at a time; batching is done
        # by the DataLoader via set_sample_generator below.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    # eval graph built with a throwaway startup program (no new params init)
    build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError((
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  step)
                            # NOTE(review): wandb.log nesting reconstructed
                            # from collapsed source — assumed inside the
                            # use_vdl guard; confirm against the original.
                            wandb.log({'epoch': epoch, 'loss': loss})
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                            # NOTE(review): nesting assumed — see note above.
                            wandb.log(
                                {
                                    'Train/loss': avg_loss,
                                    'Train/lr': lr[0],
                                    'Train/speed': speed
                                },
                                step=step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

                # NOTE : used for benchmark, profiler tools
                if args.is_profiler and epoch == 1 and step == args.log_steps:
                    profiler.start_profiler("All")
                elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

            except fluid.core.EOFException:
                # reader exhausted: end of epoch
                data_loader.reset()
                break
            except Exception as e:
                # deliberate best-effort: log the step error and keep training
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(train_prog, epoch)
            save_infer_program(test_prog, ckpt_dir)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)
                    # NOTE(review): nesting assumed — see note above.
                    wandb.log(
                        {
                            'Evaluate/mean_iou': mean_iou,
                            'Evaluate/mean_acc': mean_acc
                        },
                        step=step)

                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info(
                        "Save best model {} to {}, mIoU = {:.4f}".format(
                            ckpt_dir,
                            os.path.join(cfg.TRAIN.MODEL_SAVE_DIR,
                                         'best_model'), mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        ckpt_dir = save_checkpoint(train_prog, 'final')
        save_infer_program(test_prog, ckpt_dir)
def main():
    """Entry point of the detection training script.

    Builds the train (and optionally eval) static-graph programs, loads
    pretrained/resume weights, then runs the iteration loop with periodic
    logging, snapshotting and evaluation.  Reads all settings from the
    module-level ``FLAGS`` and the YAML config referenced by it.
    """
    env = os.environ
    # Distributed mode is detected purely from the Paddle launcher's env vars.
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Per-trainer seed so shuffling differs across workers but is reproducible.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    # Command-line -o overrides are merged after loading the YAML config.
    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # NOTE: trainer_id is only bound when FLAGS.dist is true; the short-circuit
    # ordering of this condition is what keeps it from raising NameError.
    if not FLAGS.dist or trainer_id == 0:
        print_total_cfg(cfg)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    # Feed (reader) definitions default to '<Arch>TrainFeed' / '<Arch>EvalFeed'
    # unless overridden in the config.
    if 'train_feed' not in cfg:
        train_feed = create(main_arch + 'TrainFeed')
    else:
        train_feed = create(cfg.train_feed)

    if FLAGS.eval:
        if 'eval_feed' not in cfg:
            eval_feed = create(main_arch + 'EvalFeed')
        else:
            eval_feed = create(cfg.eval_feed)

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            train_pyreader, feed_vars = create_feed(train_feed)

            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            # Mixed-precision context scales the loss before minimize() and
            # unscales it afterwards so fetched loss values stay comparable.
            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    # lr is appended last; the logging loop below relies on outs[-1] being lr.
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                eval_pyreader, feed_vars = create_feed(eval_feed)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
        eval_pyreader.decorate_sample_list_generator(eval_reader, place)

        # parse eval fetches
        # Extra fetch keys are metric-specific fields needed by eval_results().
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_box', 'gt_label', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_box']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    # Weight loading priority: resume checkpoint > pretrain (+BN fuse) > pretrain.
    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []
    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(exe, train_prog, cfg.pretrain_weights,
                               ignore_params=ignore_params)

    # Reader size accounts for every device consuming one batch per iteration.
    train_reader = create_reader(train_feed,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 FLAGS.dataset_dir)
    train_pyreader.decorate_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_pyreader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use tb-paddle to log data
    if FLAGS.use_tb:
        from tb_paddle import SummaryWriter
        tb_writer = SummaryWriter(FLAGS.tb_log_dir)
        tb_loss_step = 0
        tb_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        # ETA is estimated from the smoothed per-iter wall time.
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        # outs[:-1] are the model fetches; outs[-1] is the learning rate.
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use tb-paddle to log loss
        if FLAGS.use_tb:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    tb_writer.add_scalar(loss_name, loss_value, tb_loss_step)
                tb_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot (and optionally evaluate) on schedule and at the last iter;
        # only trainer 0 writes checkpoints in distributed mode.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                results = eval_run(exe, compiled_eval_prog, eval_pyreader,
                                   eval_keys, eval_values, eval_cls)
                resolution = None
                if 'mask' in results[0]:
                    resolution = model.mask_head.resolution
                box_ap_stats = eval_results(results, eval_feed, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type)

                # use tb_paddle to log mAP
                if FLAGS.use_tb:
                    tb_writer.add_scalar("mAP", box_ap_stats[0], tb_mAP_step)
                    tb_mAP_step += 1

                # Keep a separate "best_model" checkpoint for the best box AP.
                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_pyreader.reset()
def train(args):
    """Run BERT pretraining.

    Builds train/test programs, optionally prepares NCCL2 multi-trainer
    training via the distribute transpiler, then loops until
    ``args.num_train_steps``, periodically logging, saving checkpoints and
    running validation.

    Args:
        args: parsed command-line namespace with model/optimizer/data/
            distributed settings (see the script's argument parser).
    """
    print("pretraining start")
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)
            scheduled_lr, loss_scaling = optimization(
                loss=total_loss,
                warmup_steps=args.warmup_steps,
                num_train_steps=args.num_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_program,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    # Test program shares the startup program (and hence parameters) with train;
    # unique_name.guard() keeps the generated parameter names aligned.
    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_data_loader, next_sent_acc, mask_lm_loss, total_loss = create_model(
                bert_config=bert_config)

    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    print("Device count %d" % dev_count)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("worker_endpoints")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("current_endpoint")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            # Give the other workers time to come up before NCCL init.
            print("train_id == 0, sleep 60s")
            time.sleep(60)
        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
trainer_id:{}".format(worker_endpoints, trainers_num, current_endpoint,
                      trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.init_checkpoint and args.init_checkpoint != "":
        init_checkpoint(exe, args.init_checkpoint, train_program, args.use_fp16)

    data_reader = DataReader(data_dir=args.data_dir,
                             batch_size=args.batch_size,
                             in_tokens=args.in_tokens,
                             vocab_path=args.vocab_path,
                             voc_size=bert_config['vocab_size'],
                             epoch=args.epoch,
                             max_seq_len=args.max_seq_len,
                             generate_neg_sample=args.generate_neg_sample)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = args.use_fast_executor
    exec_strategy.num_threads = dev_count
    exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

    build_strategy = fluid.BuildStrategy()
    if not sys.platform == "win32":
        build_strategy.num_trainers = nccl2_num_trainers
    elif nccl2_num_trainers > 1:
        raise ValueError(
            "Windows platform doesn't support distributed training!")
    build_strategy.trainer_id = nccl2_trainer_id

    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_compiled_program = fluid.CompiledProgram(
            train_program).with_data_parallel(loss_name=total_loss.name,
                                              exec_strategy=exec_strategy,
                                              build_strategy=build_strategy)

    if args.validation_set_dir and args.validation_set_dir != "":
        predict = predict_wrapper(args,
                                  exe,
                                  bert_config,
                                  test_prog=test_prog,
                                  data_loader=test_data_loader,
                                  fetch_list=[
                                      next_sent_acc.name, mask_lm_loss.name,
                                      total_loss.name
                                  ])

    train_data_loader.set_batch_generator(data_reader.data_generator())
    train_data_loader.start()
    steps = 0
    cost = []
    lm_cost = []
    acc = []
    time_begin = time.time()
    while steps < args.num_train_steps:
        try:
            steps += 1
            skip_steps = args.skip_steps * nccl2_num_trainers

            # Non-zero trainers just advance the graph; only trainer 0 fetches
            # metrics, saves checkpoints and runs validation.
            if nccl2_trainer_id != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
                continue

            # Fetch metrics only every skip_steps steps to reduce overhead.
            if steps % args.skip_steps != 0:
                if use_ngraph:
                    exe.run(fetch_list=[], program=train_program)
                else:
                    exe.run(fetch_list=[], program=train_compiled_program)
            else:
                fetch_list = [
                    next_sent_acc.name, mask_lm_loss.name, total_loss.name,
                    scheduled_lr.name
                ]
                if args.use_fp16:
                    fetch_list.append(loss_scaling.name)

                if use_ngraph:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_program)
                else:
                    outputs = exe.run(fetch_list=fetch_list,
                                      program=train_compiled_program)

                if args.use_fp16:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr, np_scaling = outputs
                else:
                    each_next_acc, each_mask_lm_cost, each_total_cost, np_lr = outputs

                acc.extend(each_next_acc)
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file = data_reader.get_progress(
                )
                if args.verbose:
                    verbose = "feed_queue size: %d, " % train_data_loader.queue.size(
                    )
                    verbose += "current learning_rate: %f, " % np_lr[0]
                    if args.use_fp16:
                        verbose += "loss scaling: %f" % np_scaling[0]
                    print(verbose)

                # ppl is exp of the mean masked-LM loss over the window.
                print(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s" %
                    (epoch, current_file_index, total_file, steps,
                     np.mean(np.array(cost)), np.mean(np.exp(
                         np.array(lm_cost))), np.mean(np.array(acc)),
                     skip_steps / used_time, current_file))
                cost = []
                lm_cost = []
                acc = []
                time_begin = time.time()

            if steps % args.save_steps == 0:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.save(program=train_program, model_path=save_path)

            if args.validation_set_dir and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                print("[validation_set] epoch: %d, step: %d, "
                      "loss: %f, global ppl: %f, batch-averged ppl: %f, "
                      "next_sent_acc: %f, speed: %f steps/s" %
                      (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
                       np.exp(np.mean(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.exp(np.array(vali_lm_cost) / vali_steps)),
                       np.mean(np.array(vali_acc) / vali_steps), vali_speed))
        except fluid.core.EOFException:
            # Data exhausted: reset the loader and stop training.
            train_data_loader.reset()
            break
def build_model(self):
    """Build and train the GAN (feed-dict / LoDTensor style).

    Declares input layers, constructs generator/discriminator trainers,
    then alternates optimization: the discriminator is updated on every
    batch except each ``num_discriminator_time``-th one, where the
    generator is updated instead.  Optionally saves test images and
    checkpoints after each epoch.
    """
    # -1 in the batch dimension allows variable batch sizes.
    data_shape = [-1, 3, self.cfg.image_size, self.cfg.image_size]

    image_real = fluid.layers.data(
        name='image_real', shape=data_shape, dtype='float32')
    label_org = fluid.layers.data(
        name='label_org', shape=[self.cfg.c_dim], dtype='float32')
    label_trg = fluid.layers.data(
        name='label_trg', shape=[self.cfg.c_dim], dtype='float32')
    label_org_ = fluid.layers.data(
        name='label_org_', shape=[self.cfg.c_dim], dtype='float32')
    label_trg_ = fluid.layers.data(
        name='label_trg_', shape=[self.cfg.c_dim], dtype='float32')

    gen_trainer = GTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)
    dis_trainer = DTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)

    # prepare environment
    place = fluid.CUDAPlace(0) if self.cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, exe, gen_trainer, "net_G")
        utility.init_checkpoints(self.cfg, exe, dis_trainer, "net_D")

    ### memory optim
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False

    gen_trainer_program = fluid.CompiledProgram(
        gen_trainer.program).with_data_parallel(
            loss_name=gen_trainer.g_loss.name,
            build_strategy=build_strategy)
    dis_trainer_program = fluid.CompiledProgram(
        dis_trainer.program).with_data_parallel(
            loss_name=dis_trainer.d_loss.name,
            build_strategy=build_strategy)

    t_time = 0

    for epoch_id in range(self.cfg.epoch):
        batch_id = 0
        for i in range(self.batch_num):
            image, label_org = next(self.train_reader())
            # Target labels are a random permutation of the source labels.
            label_trg = copy.deepcopy(label_org)
            np.random.shuffle(label_trg)
            # Rescale 0/1 labels to +-thres_int around zero.
            label_org_ = list(
                map(lambda x: (x * 2.0 - 1.0) * self.cfg.thres_int,
                    label_org))
            label_trg_ = list(
                map(lambda x: (x * 2.0 - 1.0) * self.cfg.thres_int,
                    label_trg))
            tensor_img = fluid.LoDTensor()
            tensor_label_org = fluid.LoDTensor()
            tensor_label_trg = fluid.LoDTensor()
            tensor_label_org_ = fluid.LoDTensor()
            tensor_label_trg_ = fluid.LoDTensor()
            tensor_img.set(image, place)
            tensor_label_org.set(label_org, place)
            tensor_label_trg.set(label_trg, place)
            tensor_label_org_.set(label_org_, place)
            tensor_label_trg_.set(label_trg_, place)
            # NOTE(review): label_shape is never used below — candidate for removal.
            label_shape = tensor_label_trg.shape
            s_time = time.time()
            # optimize the discriminator network
            if (batch_id + 1) % self.cfg.num_discriminator_time != 0:
                fetches = [
                    dis_trainer.d_loss.name, dis_trainer.d_loss_real.name,
                    dis_trainer.d_loss_fake.name,
                    dis_trainer.d_loss_cls.name, dis_trainer.d_loss_gp.name
                ]
                d_loss, d_loss_real, d_loss_fake, d_loss_cls, d_loss_gp = exe.run(
                    dis_trainer_program,
                    fetch_list=fetches,
                    feed={
                        "image_real": tensor_img,
                        "label_org": tensor_label_org,
                        "label_org_": tensor_label_org_,
                        "label_trg": tensor_label_trg,
                        "label_trg_": tensor_label_trg_
                    })

                batch_time = time.time() - s_time
                t_time += batch_time
                print("epoch{}: batch{}: \n\
                     d_loss: {}; d_loss_real: {}; d_loss_fake: {}; d_loss_cls: {}; d_loss_gp: {} \n\
                     Batch_time_cost: {:.2f}"
                      .format(epoch_id, batch_id, d_loss[0], d_loss_real[
                          0], d_loss_fake[0], d_loss_cls[0], d_loss_gp[0],
                              batch_time))

            # optimize the generator network
            else:
                d_fetches = [
                    gen_trainer.g_loss_fake.name,
                    gen_trainer.g_loss_rec.name,
                    gen_trainer.g_loss_cls.name, gen_trainer.fake_img.name
                ]
                g_loss_fake, g_loss_rec, g_loss_cls, fake_img = exe.run(
                    gen_trainer_program,
                    fetch_list=d_fetches,
                    feed={
                        "image_real": tensor_img,
                        "label_org": tensor_label_org,
                        "label_org_": tensor_label_org_,
                        "label_trg": tensor_label_trg,
                        "label_trg_": tensor_label_trg_
                    })
                print("epoch{}: batch{}: \n\
                     g_loss_fake: {}; g_loss_rec: {}; g_loss_cls: {}"
                      .format(epoch_id, batch_id, g_loss_fake[0],
                              g_loss_rec[0], g_loss_cls[0]))

            sys.stdout.flush()
            batch_id += 1

        if self.cfg.run_test:
            test_program = gen_trainer.infer_program
            utility.save_test_image(epoch_id, self.cfg, exe, place,
                                    test_program, gen_trainer,
                                    self.test_reader)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, exe, gen_trainer,
                                "net_G")
            utility.checkpoints(epoch_id, self.cfg, exe, dis_trainer,
                                "net_D")
def build_model(self):
    """Build and train the AttGAN model (DataLoader style).

    Declares inputs, constructs a test-time generator trainer on the raw
    labels and train-time generator/discriminator trainers on rescaled
    labels, then trains: the discriminator every batch, the generator every
    ``num_discriminator_time``-th batch.  Emits CE kpi lines when
    ``enable_ce`` is set.
    """
    data_shape = [None, 3, self.cfg.image_size, self.cfg.image_size]

    image_real = fluid.data(name='image_real',
                            shape=data_shape,
                            dtype='float32')
    label_org = fluid.data(name='label_org',
                           shape=[None, self.cfg.c_dim],
                           dtype='float32')
    label_trg = fluid.data(name='label_trg',
                           shape=[None, self.cfg.c_dim],
                           dtype='float32')
    label_org_ = fluid.data(name='label_org_',
                            shape=[None, self.cfg.c_dim],
                            dtype='float32')
    label_trg_ = fluid.data(name='label_trg_',
                            shape=[None, self.cfg.c_dim],
                            dtype='float32')
    # used for continuous evaluation
    if self.cfg.enable_ce:
        fluid.default_startup_program().random_seed = 90

    loader = fluid.io.DataLoader.from_generator(
        feed_list=[image_real, label_org, label_trg],
        capacity=64,
        iterable=True,
        use_double_buffer=True)

    # Test trainer is built BEFORE label_org_/label_trg_ are rebound below,
    # so it sees the raw placeholder variables — the ordering here matters.
    test_gen_trainer = GTrainer(image_real, label_org, label_org_,
                                label_trg, label_trg_, self.cfg,
                                self.batch_num)

    # Rescale 0/1 labels to +-thres_int for the training graphs.
    label_org_ = (label_org * 2.0 - 1.0) * self.cfg.thres_int
    label_trg_ = (label_trg * 2.0 - 1.0) * self.cfg.thres_int

    gen_trainer = GTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)
    dis_trainer = DTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)

    # prepare environment
    place = fluid.CUDAPlace(0) if self.cfg.use_gpu else fluid.CPUPlace()
    loader.set_batch_generator(self.train_reader,
                               places=fluid.cuda_places()
                               if self.cfg.use_gpu else fluid.cpu_places())

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, gen_trainer, "net_G")
        utility.init_checkpoints(self.cfg, dis_trainer, "net_D")

    ### memory optim
    build_strategy = fluid.BuildStrategy()

    gen_trainer_program = fluid.CompiledProgram(
        gen_trainer.program).with_data_parallel(
            loss_name=gen_trainer.g_loss.name,
            build_strategy=build_strategy)
    dis_trainer_program = fluid.CompiledProgram(
        dis_trainer.program).with_data_parallel(
            loss_name=dis_trainer.d_loss.name,
            build_strategy=build_strategy)

    # used for continuous evaluation
    if self.cfg.enable_ce:
        gen_trainer_program.random_seed = 90
        dis_trainer_program.random_seed = 90

    t_time = 0

    for epoch_id in range(self.cfg.epoch):
        batch_id = 0
        for data in loader():
            s_time = time.time()
            # optimize the discriminator network
            fetches = [
                dis_trainer.d_loss.name,
                dis_trainer.d_loss_real.name,
                dis_trainer.d_loss_fake.name,
                dis_trainer.d_loss_cls.name,
                dis_trainer.d_loss_gp.name,
            ]
            d_loss, d_loss_real, d_loss_fake, d_loss_cls, d_loss_gp = exe.run(
                dis_trainer_program, fetch_list=fetches, feed=data)

            # Generator is updated once every num_discriminator_time batches.
            if (batch_id + 1) % self.cfg.num_discriminator_time == 0:
                # optimize the generator network
                d_fetches = [
                    gen_trainer.g_loss_fake.name,
                    gen_trainer.g_loss_rec.name,
                    gen_trainer.g_loss_cls.name, gen_trainer.fake_img.name
                ]
                g_loss_fake, g_loss_rec, g_loss_cls, fake_img = exe.run(
                    gen_trainer_program, fetch_list=d_fetches, feed=data)
                print("epoch{}: batch{}: \n\
                     g_loss_fake: {}; g_loss_rec: {}; g_loss_cls: {}".
                      format(epoch_id, batch_id, g_loss_fake[0],
                             g_loss_rec[0], g_loss_cls[0]))

            batch_time = time.time() - s_time
            t_time += batch_time

            if (batch_id + 1) % self.cfg.print_freq == 0:
                print("epoch{}: batch{}: \n\
                     d_loss: {}; d_loss_real: {}; d_loss_fake: {}; d_loss_cls: {}; d_loss_gp: {} \n\
                     Batch_time_cost: {}".format(epoch_id, batch_id,
                                                 d_loss[0], d_loss_real[0],
                                                 d_loss_fake[0],
                                                 d_loss_cls[0], d_loss_gp[0],
                                                 batch_time))

            sys.stdout.flush()
            batch_id += 1
            # CE runs only a fixed number of batches per epoch.
            if self.cfg.enable_ce and batch_id == 100:
                break

        if self.cfg.run_test:
            image_name = fluid.data(name='image_name',
                                    shape=[None, self.cfg.n_samples],
                                    dtype='int32')
            test_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_real, label_org, label_trg, image_name],
                capacity=32,
                iterable=True,
                use_double_buffer=True)
            test_loader.set_batch_generator(
                self.test_reader,
                places=fluid.cuda_places()
                if self.cfg.use_gpu else fluid.cpu_places())
            test_program = test_gen_trainer.infer_program
            utility.save_test_image(epoch_id, self.cfg, exe, place,
                                    test_program, test_gen_trainer,
                                    test_loader)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, gen_trainer, "net_G")
            utility.checkpoints(epoch_id, self.cfg, dis_trainer, "net_D")

    # used for continuous evaluation
    # NOTE: relies on the loop locals (g_loss_*, d_loss_*, batch_time) from
    # the last executed batch still being in scope here.
    if self.cfg.enable_ce:
        device_num = fluid.core.get_cuda_device_count(
        ) if self.cfg.use_gpu else 1
        print("kpis\tattgan_g_loss_fake_card{}\t{}".format(
            device_num, g_loss_fake[0]))
        print("kpis\tattgan_g_loss_rec_card{}\t{}".format(
            device_num, g_loss_rec[0]))
        print("kpis\tattgan_g_loss_cls_card{}\t{}".format(
            device_num, g_loss_cls[0]))
        print("kpis\tattgan_d_loss_real_card{}\t{}".format(
            device_num, d_loss_real[0]))
        print("kpis\tattgan_d_loss_fake_card{}\t{}".format(
            device_num, d_loss_fake[0]))
        print("kpis\tattgan_d_loss_gp_card{}\t{}".format(
            device_num, d_loss_gp[0]))
        print("kpis\tattgan_Batch_time_cost_card{}\t{}".format(
            device_num, batch_time))
def compress(args):
    """Train a student network with soft-label knowledge distillation.

    Builds the student and teacher programs, loads the (auto-downloaded if
    missing) teacher weights, merges both graphs, adds a distillation loss
    on top of the student's cross-entropy loss, then trains and validates
    for ``args.num_epochs`` epochs.
    """
    if args.data == "cifar10":
        import paddle.dataset.cifar as reader
        train_reader = reader.train10()
        val_reader = reader.test10()
        class_dim = 10
        image_shape = "3,32,32"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_reader = reader.train()
        val_reader = reader.val()
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]

    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    student_program = fluid.Program()
    s_startup = fluid.Program()
    with fluid.program_guard(student_program, s_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(name='image',
                                      shape=image_shape,
                                      dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            train_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            valid_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = fluid.layers.cross_entropy(input=out, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    train_reader = paddle.batch(train_reader,
                                batch_size=args.batch_size,
                                drop_last=True)
    val_reader = paddle.batch(val_reader,
                              batch_size=args.batch_size,
                              drop_last=True)
    # val_program is cloned BEFORE the distill loss/optimizer are appended,
    # so validation runs the plain student graph.
    val_program = student_program.clone(for_test=True)

    # Training feeds all devices; validation feeds a single place.
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_sample_list_generator(train_reader, places)
    valid_loader.set_sample_list_generator(val_reader, place)

    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = fluid.Program()
    t_startup = fluid.Program()
    with fluid.program_guard(teacher_program, t_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(name='image',
                                      shape=image_shape,
                                      dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)

    exe.run(t_startup)
    if not os.path.exists(args.teacher_pretrained_model):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    def if_exist(var):
        # Load only variables that have a matching file in the pretrained dir.
        return os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))

    fluid.io.load_vars(exe,
                       args.teacher_pretrained_model,
                       main_program=teacher_program,
                       predicate=if_exist)

    # Merge teacher graph into the student program, sharing the 'image' input.
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)

    with fluid.program_guard(student_program, s_startup):
        # Distill from the teacher's fc output onto the student's fc output.
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    for epoch_id in range(args.num_epochs):
        for step_id, data in enumerate(train_loader):
            lr_np, loss_1, loss_2, loss_3 = exe.run(
                parallel_main,
                feed=data,
                fetch_list=[
                    lr.name, loss.name, avg_cost.name, distill_loss.name
                ])
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}"
                    .format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
                            loss_3[0]))
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program,
                data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}"
                    .format(epoch_id, step_id, val_loss[0], val_acc1[0],
                            val_acc5[0]))
        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))
def main():
    """Entry point for DALI-fed ImageNet classification training.

    Builds DALI train/val pipelines (validation only on rank 0), constructs
    train/eval programs with piecewise-decay + linear-warmup SGD momentum,
    prepares NCCL2 multi-GPU training via the distribute transpiler, then
    runs the epoch loop with per-epoch validation and checkpointing.
    """
    env = os.environ
    FLAGS.local_rank = int(env.get('PADDLE_TRAINER_ID', 0))
    FLAGS.world_size = int(env.get('PADDLE_TRAINERS_NUM', 1))
    FLAGS.device_id = int(env['FLAGS_selected_gpus'])
    FLAGS.whole_batch_size = FLAGS.world_size * FLAGS.batch_size

    # Training pipeline: each rank reads its own shard of the train set.
    pipe = create_dali_pipeline(batch_size=FLAGS.batch_size,
                                num_threads=FLAGS.num_threads,
                                device_id=FLAGS.device_id,
                                data_dir=os.path.join(FLAGS.data, 'train'),
                                crop=224,
                                size=256,
                                dali_cpu=False,
                                shard_id=FLAGS.local_rank,
                                num_shards=FLAGS.world_size,
                                is_training=True)
    pipe.build()
    sample_per_shard = pipe.epoch_size("Reader") // FLAGS.world_size
    train_loader = DALIClassificationIterator(pipe, reader_name="Reader")

    # Only rank 0 runs validation, over the full (unsharded) val set.
    if FLAGS.local_rank == 0:
        pipe = create_dali_pipeline(batch_size=FLAGS.batch_size,
                                    num_threads=FLAGS.num_threads,
                                    device_id=FLAGS.device_id,
                                    data_dir=os.path.join(FLAGS.data, 'val'),
                                    crop=224,
                                    size=256,
                                    dali_cpu=False,
                                    shard_id=0,
                                    num_shards=1,
                                    is_training=False)
        pipe.build()
        val_loader = DALIClassificationIterator(pipe, reader_name="Reader")

    place = fluid.CUDAPlace(FLAGS.device_id)
    exe = fluid.Executor(place)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    eval_prog = fluid.Program()

    # LR schedule: step decay at epochs 30/60/80, with 5 epochs linear warmup.
    step_per_epoch = int(math.ceil(sample_per_shard / FLAGS.batch_size))
    milestones = [step_per_epoch * e for e in (30, 60, 80)]
    values = [FLAGS.lr * (0.1**i) for i in range(len(milestones) + 1)]

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_fetch_list = build()
            learning_rate = fluid.layers.piecewise_decay(
                boundaries=milestones, values=values)
            learning_rate = fluid.layers.linear_lr_warmup(
                learning_rate=learning_rate,
                warmup_steps=5 * step_per_epoch,
                start_lr=0.,
                end_lr=FLAGS.lr)
            decay = FLAGS.weight_decay
            optimizer = fluid.optimizer.Momentum(
                learning_rate=learning_rate,
                momentum=FLAGS.momentum,
                regularization=fluid.regularizer.L2Decay(decay))
            # By convention of build(), the first fetch is the average loss.
            avg_loss = train_fetch_list[0]
            optimizer.minimize(avg_loss)

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_fetch_list = build()
    eval_prog = eval_prog.clone(True)

    build_strategy = fluid.BuildStrategy()
    build_strategy.trainer_id = FLAGS.local_rank
    build_strategy.num_trainers = FLAGS.world_size

    # Multi-GPU training via NCCL2 transpile of the train program.
    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(FLAGS.local_rank,
                trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
                current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
                startup_program=startup_prog,
                program=train_prog)

    exec_strategy = fluid.ExecutionStrategy()
    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    compiled_eval_prog = fluid.compiler.CompiledProgram(eval_prog)

    total_time = AverageMeter()
    for epoch in range(FLAGS.epochs):
        if FLAGS.local_rank == 0:
            print("==== train epoch {:02d} ====".format(epoch + 1))
        avg_time, _, _ = run(exe, compiled_train_prog, train_fetch_list,
                             train_loader, epoch)
        total_time.update(avg_time)
        # reset DALI iterators
        train_loader.reset()

        # Validation, checkpointing and final reporting happen on rank 0 only.
        if FLAGS.local_rank == 0:
            print("==== validation epoch {:02d} ====".format(epoch + 1))
            _, prec1, prec5 = run(exe, compiled_eval_prog, eval_fetch_list,
                                  val_loader, epoch)
            val_loader.reset()

            ckpt_path = os.path.join('checkpoint', "{:02d}".format(epoch + 1))
            if os.path.isdir(ckpt_path):
                shutil.rmtree(ckpt_path)
            print('Save model to {}.'.format(ckpt_path))
            fluid.io.save_persistables(exe, ckpt_path, train_prog)

            # Throughput in samples/sec across the whole (global) batch.
            time_per_sample = FLAGS.whole_batch_size / total_time.avg
            if epoch == FLAGS.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}\n'
                      '##Perf {2}'.format(prec1 * 100, prec5 * 100,
                                          time_per_sample))
def main():
    """Entry point: channel-prune a detection model and fine-tune it.

    Builds the train (and optionally eval) programs from the YAML config,
    applies a filter ``Pruner`` to the chosen parameters, then runs the
    iteration loop with periodic logging, snapshotting and (optionally)
    evaluation that tracks the best box AP.

    Relies on module-level ``FLAGS`` / ``logger`` and project helpers
    (``create``, ``parse_fetches``, ``checkpoint``, ``eval_run`` etc.).
    """
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        import random
        # Per-trainer seed so shuffling differs across workers but is
        # reproducible for a given trainer id.
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            # Loss is scaled up before minimize and scaled back afterwards so
            # fp16 gradients keep precision.
            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    # Debug aid: dump all parameter names/shapes and exit.
    if FLAGS.print_params:
        param_delimit_str = '-' * 20 + "All parameters in current graph" + '-' * 20
        print(param_delimit_str)
        for block in train_prog.blocks:
            for param in block.all_parameters():
                print("parameter name: {}\tshape: {}".format(param.name,
                                                             param.shape))
        print('-' * len(param_delimit_str))
        return

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    build_strategy.fuse_elewise_add_act_ops = True
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)

    # NOTE(review): fuse_bn is computed but not used in this function — verify
    # whether it is dead code or consumed by code outside this view.
    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    start_iter = 0

    if cfg.pretrain_weights:
        checkpoint.load_params(exe, train_prog, cfg.pretrain_weights)

    pruned_params = FLAGS.pruned_params
    assert FLAGS.pruned_params is not None, \
        "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option."
    pruned_params = FLAGS.pruned_params.strip().split(",")
    logger.info("pruned params: {}".format(pruned_params))
    pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")]
    logger.info("pruned ratios: {}".format(pruned_ratios))
    assert len(pruned_params) == len(pruned_ratios), \
        "The length of pruned params and pruned ratios should be equal."
    # NOTE(review): Python list comparison is lexicographic, so this assert
    # does not actually verify that *every* ratio lies in (0, 1); an
    # all(0 < r < 1 for r in pruned_ratios) check would — confirm intent.
    assert (pruned_ratios > [0] * len(pruned_ratios) and
            pruned_ratios < [1] * len(pruned_ratios)
            ), "The elements of pruned ratios should be in range (0, 1)."
    assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \
        "unsupported prune criterion {}".format(FLAGS.prune_criterion)
    pruner = Pruner(criterion=FLAGS.prune_criterion)
    # only_graph=False: prune the graph AND the parameter values in scope.
    train_prog = pruner.prune(train_prog,
                              fluid.global_scope(),
                              params=pruned_params,
                              ratios=pruned_ratios,
                              place=place,
                              only_graph=False)[0]

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        base_flops = flops(eval_prog)
        # only_graph=True: parameters were already pruned via the train
        # program; only rewrite the eval graph here.
        eval_prog = pruner.prune(eval_prog,
                                 fluid.global_scope(),
                                 params=pruned_params,
                                 ratios=pruned_ratios,
                                 place=place,
                                 only_graph=True)[0]
        pruned_flops = flops(eval_prog)
        logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format(
            float(base_flops - pruned_flops) / base_flops, base_flops,
            pruned_flops))
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()

    train_reader = create_reader(cfg.TrainReader,
                                 (cfg.max_iters - start_iter) * devices_num,
                                 cfg)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    # Baseline evaluation of the freshly pruned model before fine-tuning.
    if FLAGS.eval:
        resolution = None
        if 'Mask' in cfg.architecture:
            resolution = model.mask_head.resolution
        # evaluation
        results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys,
                           eval_values, eval_cls, cfg, resolution=resolution)
        dataset = cfg['EvalReader']['dataset']
        box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes,
                                    resolution, is_bbox_normalized,
                                    FLAGS.output_eval, map_type,
                                    dataset=dataset)

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        # Last fetch in train_values is the learning rate (appended above).
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use VisualDL to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # Snapshot (and optionally evaluate) on schedule, rank 0 only.
        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(exe, compiled_eval_prog, eval_loader,
                                   eval_keys, eval_values, eval_cls, cfg=cfg,
                                   resolution=resolution)
                box_ap_stats = eval_results(results, cfg.metric,
                                            cfg.num_classes, resolution,
                                            is_bbox_normalized,
                                            FLAGS.output_eval, map_type,
                                            dataset=dataset)

                # use VisualDL to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

    train_loader.reset()
def train(cfg):
    """Train a segmentation model described by *cfg*.

    Builds the train program via ``build_model``, feeds it from a
    ``SegDataset`` generator, optionally resumes/loads pretrained weights,
    then runs the epoch loop with logging, optional debug IoU tracking,
    optional TensorBoard (tb_paddle) logging, periodic checkpointing and
    optional evaluation/visualization.

    Relies on module-level ``args`` (command-line namespace) and project
    helpers (``build_model``, ``load_checkpoint``, ``evaluate`` etc.).
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # Yields single samples; batching into cfg.BATCH_SIZE groups is done
        # here so a partial trailing batch can be dropped under sync-BN.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == cfg.BATCH_SIZE:
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    place = places[0]

    # Get number of GPU
    dev_count = len(places)
    print("#GPU-Devices: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print("batch_size_per_dev: {}".format(batch_size_per_dev))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(data_generator,
                                        batch_size=batch_size_per_dev,
                                        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print("Sync BatchNorm strategy will not be effective if GPU device"
                  " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL):
        print('Pretrained model dir:', cfg.TRAIN.PRETRAINED_MODEL)
        load_vars = []

        def var_shape_matched(var, shape):
            """
            Check whehter persitable variable shape is match with current network
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL, var.name))
                if var_shape == shape:
                    return True
                else:
                    print(
                        "Variable[{}] shape does not match current network, skip"
                        " to load it.".format(var.name))
                return False

        # Only load parameters whose on-disk shape matches the network.
        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
        if cfg.MODEL.FP16:
            # If open FP16 training mode, load FP16 var separate
            load_fp16_vars(exe, cfg.TRAIN.PRETRAINED_MODEL, train_prog)
        else:
            fluid.io.load_vars(exe,
                               dirname=cfg.TRAIN.PRETRAINED_MODEL,
                               vars=load_vars)
        print("Pretrained model loaded successfully!")
    else:
        print('Pretrained model dir {} not exists, training from scratch...'.
              format(cfg.TRAIN.PRETRAINED_MODEL))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_tb:
        if not args.tb_log_dir:
            print("Please specify the log directory by --tb_log_dir.")
            exit(1)

        from tb_paddle import SummaryWriter
        if os.path.exists(args.tb_log_dir):
            shutil.rmtree(args.tb_log_dir)
        log_writer = SummaryWriter(args.tb_log_dir)

    # Total number of optimization steps, used only for the ETA estimate.
    global_step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print("Use multiprocess reader")
    else:
        print("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print("Category IoU:", category_iou)
                        print("Category Acc:", category_acc)
                        if args.use_tb:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  global_step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  global_step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        if args.use_tb:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  global_step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  global_step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  global_step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()
            except fluid.core.EOFException:
                # End of epoch: reader exhausted.
                py_reader.reset()
                break
            except Exception as e:
                # NOTE(review): broad catch only prints and continues the
                # loop — confirm this best-effort behavior is intended.
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(cfg=cfg,
                                                    ckpt_dir=ckpt_dir,
                                                    use_gpu=args.use_gpu,
                                                    use_mpio=args.use_mpio)
                if args.use_tb:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          global_step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          global_step)

            # Use Tensorboard to visualize results
            if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(cfg=cfg,
                          use_gpu=args.use_gpu,
                          vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                          vis_dir="visual",
                          ckpt_dir=ckpt_dir,
                          log_writer=log_writer)

    # save final model
    save_checkpoint(exe, train_prog, 'final')
def check_network_convergence(self,
                              network,
                              use_cuda=True,
                              memory_opt=True,
                              use_ir_memory_optimize=True,
                              enable_inplace=True,
                              iter=5):
    """Train *network* for a few steps and check the loss stays finite.

    Builds a small LoD-tensor text-classification graph around *network*,
    runs it under a data-parallel CompiledProgram with the requested memory
    optimization settings, and returns the first and last fetched losses.

    Args:
        network: callable(data, label, dict_size) -> loss variable.
        use_cuda: run on GPU; silently skipped if Paddle lacks CUDA.
        memory_opt: use legacy ``fluid.memory_optimize``; when False, the
            IR-pass knobs below are used instead.
        use_ir_memory_optimize: value for ``build_strategy.memory_optimize``.
        enable_inplace: value for ``build_strategy.enable_inplace``.
        iter: number of training steps (may be overridden by ``self.iter``).

    Returns:
        (first_loss, last_loss) fetched arrays, or None when skipped.

    Exits the process via ``sys.exit`` if either loss is NaN.
    """
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return
    if os.name == 'nt':
        print(
            'Skip use_parallel_executor=True because Paddle comes without parallel support on windows'
        )
        return
    # Fixed seeds so repeated runs of the convergence check are comparable.
    fluid.default_startup_program().random_seed = 100
    fluid.default_main_program().random_seed = 100
    data = fluid.layers.data(name="words",
                             shape=[1],
                             dtype="int64",
                             lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost = network(data, label, len(self.word_dict))
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(cost)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    if memory_opt:
        # Legacy (pre-IR) whole-program memory optimization.
        fluid.memory_optimize(fluid.default_main_program())
    else:
        # BUGFIX: the original assigned each parameter to the *other*
        # strategy field (enable_inplace <- use_ir_memory_optimize and
        # memory_optimize <- enable_inplace); wire each flag to the field
        # its name describes.
        build_strategy.enable_inplace = enable_inplace
        build_strategy.memory_optimize = use_ir_memory_optimize

    # execution
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(self.train_reader, multi_devices=True)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    train_cp = compiler.CompiledProgram(fluid.default_main_program())
    train_cp = train_cp.with_data_parallel(loss_name=cost.name,
                                           build_strategy=build_strategy)
    fetch_list = [cost.name]

    begin = time.time()
    first_loss, last_loss = None, None
    step_id = 0
    # A test case may pin its own step count via a `self.iter` attribute.
    custom_iter = getattr(self, "iter", None)
    if custom_iter is not None:  # was `not custom_iter == None` (PEP 8)
        iter = custom_iter
    # Renamed the loop variable (was `data`) so it no longer shadows the
    # `words` layer variable defined above.
    for batch in reader():
        ret = exe.run(train_cp, feed=batch, fetch_list=fetch_list)
        print(ret)
        step_id += 1
        if step_id == 1:
            first_loss = ret[0]
        if step_id == iter:
            last_loss = ret[0]
            break
    end = time.time()

    print("%.4f Instance per second" % ((self.batch_size * iter) /
                                        (end - begin)))
    print(first_loss, last_loss)
    avg_last_loss_val = np.array(last_loss).mean()
    avg_first_loss_val = np.array(first_loss).mean()
    if math.isnan(float(avg_last_loss_val)) or math.isnan(
            float(avg_first_loss_val)):
        sys.exit("got NaN loss, training failed.")
    return first_loss, last_loss
def train_quant(cfg):
    """Quantization-aware fine-tuning of a segmentation model.

    Same overall flow as the plain trainer, but the train program is wrapped
    with PaddleSlim ``quant_aware`` (and a matching ``for_test`` program is
    derived for checkpointing/evaluation) before compilation. Multi-trainer
    runs are coordinated via ``cfg.NUM_TRAINERS`` / ``cfg.TRAINER_ID``.

    Relies on module-level ``args`` and project helpers (``build_model``,
    ``quant_aware``, ``evaluate``, ``save_checkpoint`` etc.).
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    # Fixed seeds for continuous-evaluation (CE) reproducibility runs.
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST,
                         mode=ModelPhase.TRAIN,
                         shuffle=True,
                         data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        # Per-trainer batching: each trainer consumes
        # BATCH_SIZE // NUM_TRAINERS samples per global batch.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    # place = places[0]
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE can divided by GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # If use multi-gpu training mode, batch data will allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    data_loader.set_sample_generator(data_generator,
                                     batch_size=batch_size_per_dev,
                                     drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(precision=4,
                            suppress=True,
                            linewidth=160,
                            floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    # PaddleSlim quantization-aware training configuration.
    not_quant_pattern = []
    if args.not_quant_pattern:
        not_quant_pattern = args.not_quant_pattern
    config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
        'quantize_op_types': ['depthwise_conv2d', 'mul', 'conv2d'],
        'not_quant_pattern': not_quant_pattern
    }

    # Insert fake-quant/dequant ops; eval_prog is the for_test variant used
    # for checkpoint saving and evaluation.
    compiled_train_prog = quant_aware(train_prog, place, config,
                                      for_test=False)
    eval_prog = quant_aware(train_prog, place, config, for_test=True)

    build_strategy.fuse_all_reduce_ops = False
    build_strategy.sync_batch_norm = False
    compiled_train_prog = compiled_train_prog.with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))

    # Total number of optimization steps, used only for the ETA estimate.
    global_step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, lr = exe.run(program=compiled_train_prog,
                                       fetch_list=fetch_list,
                                       return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    global_step += 1

                    if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, global_step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - global_step, speed)))
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()
            except fluid.core.EOFException:
                # End of epoch: reader exhausted.
                data_loader.reset()
                break
            except Exception as e:
                # NOTE(review): broad catch only prints and continues the
                # loop — confirm this best-effort behavior is intended.
                print(e)

        # Snapshot/evaluate on rank 0; checkpoints are saved from the
        # quantized for_test program so they can be converted for inference.
        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0
                or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, eval_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio,
                    not_quant_pattern=args.not_quant_pattern,
                    convert=False)
                if mean_iou > best_mIoU:
                    best_mIoU = mean_iou
                    update_best_model(ckpt_dir)
                    print_info(
                        "Save best model {} to {}, mIoU = {:.4f}".format(
                            ckpt_dir,
                            os.path.join(cfg.TRAIN.MODEL_SAVE_DIR,
                                         'best_model'), mean_iou))

    # save final model
    if cfg.TRAIN

ER_ID == 0:
        save_checkpoint(exe, eval_prog, 'final')
def main(args):
    """Train and evaluate DANet on Cityscapes with the fluid Executor API.

    Builds train and test programs sharing the same startup program, runs one
    train pass and one test pass per epoch, tracks mIoU/loss with metric
    managers, plots curves, and saves the best train checkpoint plus the best
    test inference model.

    Relies on project helpers (``cityscapes_train``/``cityscapes_val``,
    ``get_model``, ``loss_fn``, ``mean_iou``, ``optimizer_setting``,
    ``save_model``/``load_model``, ``Ploter``) defined elsewhere.
    """
    image_shape = args.crop_size
    image = fluid.layers.data(name='image',
                              shape=[3, image_shape, image_shape],
                              dtype='float32')
    label = fluid.layers.data(name='label',
                              shape=[image_shape, image_shape],
                              dtype='int64')
    batch_size = args.batch_size
    epoch_num = args.epoch_num
    num_classes = args.num_classes
    data_root = args.data_folder
    if args.cuda:
        num = fluid.core.get_cuda_device_count()
        print('The number of GPU: {}'.format(num))
    else:
        num = _cpu_num()
        print('The number of CPU: {}'.format(num))

    # program
    start_prog = fluid.default_startup_program()
    train_prog = fluid.default_main_program()

    # Seed everything for reproducibility.
    start_prog.random_seed = args.seed
    train_prog.random_seed = args.seed
    np.random.seed(args.seed)
    random.seed(args.seed)

    # clone
    test_prog = train_prog.clone(for_test=True)

    logging.basicConfig(
        level=logging.INFO,
        filename='DANet_{}_train_executor.log'.format(args.backbone),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logging.info('DANet')
    logging.info(args)

    with fluid.program_guard(train_prog, start_prog):
        with fluid.unique_name.guard():
            train_py_reader = fluid.io.PyReader(feed_list=[image, label],
                                                capacity=64,
                                                use_double_buffer=True,
                                                iterable=False)
            train_data = cityscapes_train(data_root=data_root,
                                          base_size=args.base_size,
                                          crop_size=args.crop_size,
                                          scale=args.scale,
                                          xmap=True,
                                          batch_size=batch_size,
                                          gpu_num=num)
            batch_train_data = paddle.batch(paddle.reader.shuffle(
                train_data, buf_size=batch_size * 16),
                                            batch_size=batch_size,
                                            drop_last=True)
            train_py_reader.decorate_sample_list_generator(batch_train_data)

            model = get_model(args)
            pred, pred2, pred3 = model(image)
            train_loss = loss_fn(pred,
                                 pred2,
                                 pred3,
                                 label,
                                 num_classes=num_classes)
            train_avg_loss = fluid.layers.mean(train_loss)
            optimizer = optimizer_setting(args)
            optimizer.minimize(train_avg_loss)
            # mIoU here is not the true value (original comment translated)
            miou, wrong, correct = mean_iou(pred,
                                            label,
                                            num_classes=num_classes)

    with fluid.program_guard(test_prog, start_prog):
        with fluid.unique_name.guard():
            test_py_reader = fluid.io.PyReader(feed_list=[image, label],
                                               capacity=64,
                                               iterable=False,
                                               use_double_buffer=True)
            val_data = cityscapes_val(data_root=data_root,
                                      base_size=args.base_size,
                                      crop_size=args.crop_size,
                                      scale=args.scale,
                                      xmap=True)
            batch_test_data = paddle.batch(val_data,
                                           batch_size=batch_size,
                                           drop_last=True)
            test_py_reader.decorate_sample_list_generator(batch_test_data)

            model = get_model(args)
            pred, pred2, pred3 = model(image)
            test_loss = loss_fn(pred,
                                pred2,
                                pred3,
                                label,
                                num_classes=num_classes)
            test_avg_loss = fluid.layers.mean(test_loss)
            # mIoU here is not the true value (original comment translated)
            miou, wrong, correct = mean_iou(pred,
                                            label,
                                            num_classes=num_classes)

    place = fluid.CUDAPlace(0) if args.cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(start_prog)

    if args.use_data_parallel and args.cuda:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
        exec_strategy.num_iteration_per_drop_scope = 100
        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = True
        print("sync_batch_norm = True!")
        compiled_train_prog = fluid.compiler.CompiledProgram(
            train_prog).with_data_parallel(loss_name=train_avg_loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    else:
        compiled_train_prog = fluid.compiler.CompiledProgram(train_prog)

    # Load pretrained model (original comment translated)
    if args.load_pretrained_model:
        assert os.path.exists(
            args.save_model
        ), "your input save_model: {} ,but '{}' is not exists".format(
            args.save_model, args.save_model)
        load_model(args.save_model, exe, program=train_prog)
        print('load pretrained model!')

    # Load best model (original comment translated)
    if args.load_better_model:
        assert os.path.exists(
            args.save_model
        ), "your input save_model: {} ,but '{}' is not exists".format(
            args.save_model, args.save_model)
        load_model(args.save_model, exe, program=train_prog)
        print('load better model!')

    # NOTE(review): fluid.metrics.Accuracy is used here as a generic weighted
    # running-average accumulator for both loss and mIoU.
    train_iou_manager = fluid.metrics.Accuracy()
    train_avg_loss_manager = fluid.metrics.Accuracy()
    test_iou_manager = fluid.metrics.Accuracy()
    test_avg_loss_manager = fluid.metrics.Accuracy()
    better_miou_train = 0
    better_miou_test = 0

    train_loss_title = 'Train_loss'
    test_loss_title = 'Test_loss'
    train_iou_title = 'Train_mIOU'
    test_iou_title = 'Test_mIOU'
    plot_loss = Ploter(train_loss_title, test_loss_title)
    plot_iou = Ploter(train_iou_title, test_iou_title)

    for epoch in range(epoch_num):
        prev_time = datetime.now()
        train_avg_loss_manager.reset()
        train_iou_manager.reset()
        logging.info('training, epoch = {}'.format(epoch + 1))
        train_py_reader.start()
        batch_id = 0
        while True:
            try:
                train_fetch_list = [train_avg_loss, miou, wrong, correct]
                train_avg_loss_value, train_iou_value, w, c = exe.run(
                    program=compiled_train_prog, fetch_list=train_fetch_list)
                train_iou_manager.update(train_iou_value,
                                         weight=int(batch_size * num))
                train_avg_loss_manager.update(train_avg_loss_value,
                                              weight=int(batch_size * num))
                batch_train_str = "epoch: {}, batch: {}, train_avg_loss: {:.6f}, " \
                                  "train_miou: {:.6f}.".format(epoch + 1, batch_id + 1,
                                                               train_avg_loss_value[0],
                                                               train_iou_value[0])
                if batch_id % 40 == 0:
                    logging.info(batch_train_str)
                    print(batch_train_str)
                batch_id += 1
            except fluid.core.EOFException:
                # End of epoch: reader exhausted.
                train_py_reader.reset()
                break
        cur_time = datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = " Time %02d:%02d:%02d" % (h, m, s)
        train_str = "epoch: {}, train_avg_loss: {:.6f}, " \
                    "train_miou: {:.6f}.".format(epoch + 1,
                                                 train_avg_loss_manager.eval()[0],
                                                 train_iou_manager.eval()[0])
        print(train_str + time_str + '\n')
        logging.info(train_str + time_str)
        plot_loss.append(train_loss_title, epoch,
                         train_avg_loss_manager.eval()[0])
        plot_loss.plot('./DANet_loss_executor.jpg')
        plot_iou.append(train_iou_title, epoch, train_iou_manager.eval()[0])
        plot_iou.plot('./DANet_miou_executor.jpg')
        # save_model: keep only the single best-train checkpoint.
        if better_miou_train < train_iou_manager.eval()[0]:
            shutil.rmtree('./checkpoint/DANet_better_train_{:.4f}'.format(
                better_miou_train),
                          ignore_errors=True)
            better_miou_train = train_iou_manager.eval()[0]
            logging.warning(
                '-----------train---------------better_train: {:.6f}, epoch: {}, -----------Train model saved successfully!\n'
                .format(better_miou_train, epoch + 1))
            save_dir = './checkpoint/DANet_better_train_{:.4f}'.format(
                better_miou_train)
            save_model(save_dir, exe, program=train_prog)
        if (epoch + 1) % 5 == 0:
            save_dir = './checkpoint/DANet_epoch_train'
            save_model(save_dir, exe, program=train_prog)

        # test
        test_py_reader.start()
        test_iou_manager.reset()
        test_avg_loss_manager.reset()
        prev_time = datetime.now()
        logging.info('testing, epoch = {}'.format(epoch + 1))
        batch_id = 0
        while True:
            try:
                test_fetch_list = [test_avg_loss, miou, wrong, correct]
                test_avg_loss_value, test_iou_value, _, _ = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                test_iou_manager.update(test_iou_value,
                                        weight=int(batch_size * num))
                test_avg_loss_manager.update(test_avg_loss_value,
                                             weight=int(batch_size * num))
                batch_test_str = "epoch: {}, batch: {}, test_avg_loss: {:.6f}, " \
                                 "test_miou: {:.6f}. ".format(epoch + 1, batch_id + 1,
                                                              test_avg_loss_value[0],
                                                              test_iou_value[0])
                if batch_id % 40 == 0:
                    logging.info(batch_test_str)
                    print(batch_test_str)
                batch_id += 1
            except fluid.core.EOFException:
                test_py_reader.reset()
                break
        cur_time = datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = " Time %02d:%02d:%02d" % (h, m, s)
        test_str = "epoch: {}, test_avg_loss: {:.6f}, " \
                   "test_miou: {:.6f}.".format(epoch + 1,
                                               test_avg_loss_manager.eval()[0],
                                               test_iou_manager.eval()[0])
        print(test_str + time_str + '\n')
        logging.info(test_str + time_str)
        plot_loss.append(test_loss_title, epoch,
                         test_avg_loss_manager.eval()[0])
        plot_loss.plot('./DANet_loss_executor.jpg')
        plot_iou.append(test_iou_title, epoch, test_iou_manager.eval()[0])
        plot_iou.plot('./DANet_miou_executor.jpg')
        # save_model_infer: keep only the single best-test inference model.
        if better_miou_test < test_iou_manager.eval()[0]:
            shutil.rmtree('./checkpoint/infer/DANet_better_test_{:.4f}'.format(
                better_miou_test),
                          ignore_errors=True)
            better_miou_test = test_iou_manager.eval()[0]
            logging.warning(
                '------------test-------------infer better_test: {:.6f}, epoch: {}, ----------------Inference model saved successfully!\n'
                .format(better_miou_test, epoch + 1))
            save_dir = './checkpoint/infer/DANet_better_test_{:.4f}'.format(
                better_miou_test)
            # save_model(save_dir, exe, program=test_prog)
            fluid.io.save_inference_model(save_dir, [image.name],
                                          [pred, pred2, pred3], exe)
            print('Inference model saved successfully')
def train_parallel(args):
    """Build and run multi-device / distributed training with ParallelExecutor.

    Builds a train and a test program, prepares the distributed runtime
    (parameter-server or NCCL2 collective mode, selected by
    ``args.update_method``), then runs ``args.num_epochs`` passes, testing and
    checkpointing after each pass once ``args.start_test_pass`` is reached.

    Side effects: writes persistable checkpoints under
    ``args.model_save_dir/args.model/<pass_id>`` and prints progress to stdout.
    A PSERVER role process runs the server loop and exits inside this function.
    """
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    # build_program returns (pyreader, cost, acc1, acc5); first arg toggles
    # train (True) vs. test (False) graph construction.
    train_pyreader, train_cost, train_acc1, train_acc5 = build_program(
        True, train_prog, startup_prog, args)
    test_pyreader, test_cost, test_acc1, test_acc5 = build_program(
        False, test_prog, startup_prog, args)

    # Transpile/prepare programs for the chosen distributed update method.
    if args.update_method == "pserver":
        train_prog, startup_prog = pserver_prepare(args, train_prog,
                                                   startup_prog)
    elif args.update_method == "nccl2":
        nccl2_prepare(args, startup_prog, main_prog=train_prog)

    # A parameter-server process only serves parameters; it never trains.
    if args.dist_env["training_role"] == "PSERVER":
        run_pserver(train_prog, startup_prog)
        exit(0)

    if args.use_gpu:
        # NOTE: for multi process mode: one process per GPU device.
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    startup_exe = fluid.Executor(place)

    # Replicate batch-norm params so each repeat of a merged batch has its own
    # statistics (multi-batch merge training).
    if args.multi_batch_repeat > 1:
        append_bn_repeat_init_op(train_prog, startup_prog,
                                 args.multi_batch_repeat)
    startup_exe.run(startup_prog)

    if args.checkpoint:
        fluid.io.load_persistables(startup_exe, args.checkpoint,
                                   main_program=train_prog)

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.num_threads
    # num_iteration_per_drop_scope indicates how
    # many iterations to clean up the temp variables which
    # is generated during execution. It may make the execution faster,
    # because the temp variable's shape are the same between two iterations.
    strategy.num_iteration_per_drop_scope = 30

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    build_strategy.enable_sequential_execution = bool(
        args.enable_sequential_execution)

    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce

    if args.update_method == "pserver" or args.update_method == "local":
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0
    else:
        num_trainers = args.dist_env["num_trainers"]
        trainer_id = args.dist_env["trainer_id"]
        # Set this to let build_strategy to add "allreduce_deps_pass" automatically
        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id

    if args.multi_batch_repeat > 1:
        # Insert the batch-merge pass near the end of the pass pipeline
        # (position -4 is required so it runs before the final executor passes
        # -- TODO confirm against the current pass ordering).
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(
            len(pass_builder.all_passes()) - 4, "multi_batch_merge_pass")
        mypass.set("num_repeats", args.multi_batch_repeat)

    exe = fluid.ParallelExecutor(True,
                                 train_cost.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)
    # Uncomment below lines to use ParallelExecutor to run test.
    # test_exe = fluid.ParallelExecutor(
    #     True,
    #     main_program=test_prog,
    #     share_vars_from=exe,
    #     scope=fluid.global_scope().new_scope()
    # )
    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
    # 1. MP mode, batch size for current process should be args.batch_size / GPUs
    # 2. SP/PG mode, batch size for each process should be original args.batch_size
    if os.getenv("FLAGS_selected_gpus"):
        steps_per_pass = args.total_images / (
            args.batch_size / get_device_num()) / args.dist_env["num_trainers"]
    else:
        steps_per_pass = args.total_images / args.batch_size / args.dist_env[
            "num_trainers"]

    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
        batch_id = 1
        # The pyreader is started once and kept running across epochs; the
        # per-pass boundary is enforced below by the steps_per_pass break.
        if pass_id == 0:
            train_pyreader.start()
        while True:
            try:
                if batch_id % 30 == 0:
                    # Fetch (and thus sync) only every 30 batches to reduce
                    # device-to-host copies on the hot path.
                    fetch_ret = exe.run(fetch_list)
                    fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                    print(
                        "Pass [%d/%d], batch [%d/%d], loss %s, acc1: %s, acc5: %s, avg batch time %.4f"
                        % (pass_id, args.num_epochs, batch_id, steps_per_pass,
                           fetched_data[0], fetched_data[1], fetched_data[2],
                           (time.time() - start_time) / batch_id))
                else:
                    fetch_ret = exe.run([])
            except fluid.core.EOFException:
                # Data source exhausted: end this pass.
                break
            except fluid.core.EnforceNotMet:
                # Framework-level error: log the traceback and stop the pass
                # instead of crashing the whole job.
                traceback.print_exc()
                break
            num_samples += args.batch_size
            batch_id += 1
            if batch_id >= steps_per_pass:
                break

        print_train_time(start_time, time.time(), num_samples)

        if pass_id >= args.start_test_pass:
            if args.multi_batch_repeat > 1:
                # Copy the repeated BN statistics back into the canonical
                # parameters before evaluating.
                copyback_repeat_bn_params(train_prog)
            test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
            test_ret = test_single(startup_exe, test_prog, args, test_pyreader,
                                   test_fetch_list)
            # NOTE: switch to below line if you use ParallelExecutor to run test.
            # test_ret = test_parallel(test_exe, test_prog, args, test_pyreader, test_fetch_list)
            print("Pass: %d, Test Loss %s, test acc1: %s, test acc5: %s\n" %
                  (pass_id, test_ret[0], test_ret[1], test_ret[2]))
            model_path = os.path.join(args.model_save_dir + '/' + args.model,
                                      str(pass_id))
            print("saving model to ", model_path)
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(startup_exe, model_path,
                                       main_program=train_prog)
    train_pyreader.reset()
    startup_exe.close()
    print("total train time: ", time.time() - over_all_start)
def freeze_graph(self,
                 use_cuda,
                 seed,
                 activation_quant_type,
                 weight_quant_type='abs_max',
                 for_ci=True,
                 quant_skip_pattern='skip_quant'):
    """Exercise the full quantization pipeline and check its consistency.

    Trains a small MNIST conv net for a few iterations, then applies, in
    order: QuantizationTransformPass (insert fake-quant/dequant ops),
    QuantizationFreezePass (freeze for inference), ConvertToInt8Pass
    (convert weights to int8), and TransformForMobilePass, saving inference
    models along the way. Asserts that the quantized and the frozen graphs
    produce (almost) the same test loss and that the int8 weights round-trip
    through save/load.

    Args:
        use_cuda: run on CUDAPlace(0) if True, else CPUPlace.
        seed: random seed applied to both train and test programs.
        activation_quant_type: activation quantize type for the transform pass.
        weight_quant_type: weight quantize type (default 'abs_max').
        for_ci: if True, suppress graph drawing and debug prints.
        quant_skip_pattern: op-name pattern excluded from quantization.

    Side effects: writes inference models to './server_int8...*' and
    './mobile_int8...*' directories, and (when not for_ci) graph drawings
    to the current directory.
    """

    def build_program(main, startup, is_test):
        # Build the conv net in (main, startup); attach an Adam optimizer
        # only for the training graph. Returns the feed vars and the loss.
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(name='image',
                                        shape=[1, 28, 28],
                                        dtype='float32')
                label = fluid.layers.data(name='label',
                                          shape=[1],
                                          dtype='int64')
                loss = conv_net(img, label, quant_skip_pattern)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.001)
                    opt.minimize(loss)
        return [img, label], loss

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    # IR graphs are what the quantization passes operate on.
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    # A private scope isolates this test's variables from the global scope.
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    # Insert fake quantize/dequantize ops into both train and test graphs.
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        skip_pattern=quant_skip_pattern)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    dev_name = '_gpu_' if use_cuda else '_cpu_'
    if not for_ci:
        # Visualize the graphs with quantize-related nodes highlighted.
        marked_nodes = set()
        for op in main_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw(
            '.', 'main' + dev_name + activation_quant_type + '_' +
            weight_quant_type, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw(
            '.', 'test' + dev_name + activation_quant_type + '_' +
            weight_quant_type, marked_nodes)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    quantized_test_program = test_graph.to_program()

    iters = 5
    batch_size = 8
    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
                                batch_size=batch_size)
    test_reader = paddle.batch(paddle.dataset.mnist.test(),
                               batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)

    # Run a few training iterations so quantization scales are populated.
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if not for_ci:
                print('{}: {}'.format(
                    'loss' + dev_name + activation_quant_type + '_' +
                    weight_quant_type, loss_v))

    test_data = next(test_reader())
    with fluid.program_guard(quantized_test_program):
        # Grab the fake-quantized conv weight so it can be fetched below.
        w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                         quantized_test_program)
    # Testing
    with fluid.scope_guard(scope):
        test_loss1, w_quant = exe.run(program=quantized_test_program,
                                      feed=feeder.feed(test_data),
                                      fetch_list=[loss, w_var])

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw(
            '.', 'test_freeze' + dev_name + activation_quant_type + '_' +
            weight_quant_type, marked_nodes)

    server_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        test_loss2, = exe.run(program=server_program,
                              feed=feeder.feed(test_data),
                              fetch_list=[loss])
    # The frozen graph should reproduce the quantized graph's loss closely.
    self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
    if not for_ci:
        print('{}: {}'.format(
            'test_loss1' + dev_name + activation_quant_type + '_' +
            weight_quant_type, test_loss1))
        print('{}: {}'.format(
            'test_loss2' + dev_name + activation_quant_type + '_' +
            weight_quant_type, test_loss2))
    w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
    # Maybe failed, this is due to the calculation precision
    # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
    if not for_ci:
        print('{}: {}'.format(
            'w_freeze' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(w_freeze)))
        print('{}: {}'.format(
            'w_quant' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(w_quant)))

    # Convert parameter to 8-bit.
    convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
    convert_int8_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw(
            '.', 'test_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, marked_nodes)
    server_program_int8 = test_graph.to_program()
    # Save the 8-bit parameter and model file.
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            server_program_int8)
        # Test whether the 8-bit parameter and model file can be loaded successfully.
        [infer, feed, fetch] = fluid.io.load_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, exe)
    # Check the loaded 8-bit weight.
    w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
    self.assertEqual(w_8bit.dtype, np.int8)
    # int8 conversion must preserve the (already-quantized) weight values,
    # so the sums match exactly.
    self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
    if not for_ci:
        print('{}: {}'.format(
            'w_8bit' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(w_8bit)))
        print('{}: {}'.format(
            'w_freeze' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(w_freeze)))

    # Rewrite the graph into the form expected by mobile (Paddle-Lite) runtime.
    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw(
            '.', 'test_mobile' + dev_name + activation_quant_type + '_' +
            weight_quant_type, marked_nodes)

    mobile_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'mobile_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            mobile_program)