def train(self, loaders):
    args = self.args
    nets = self.nets
    nets_ema = self.nets_ema
    optims = self.optims
    writer = LogWriter(logdir=self.args.checkpoint_dir + "/log/")

    # fetch random validation images for debugging
    fetcher = InputFetcher(loaders.src, loaders.ref, args.latent_dim, 'train')
    fetcher_val = InputFetcher(loaders.val, None, args.latent_dim, 'val')
    inputs_val = next(fetcher_val)

    # resume training if necessary
    if args.resume_iter > 0:
        self._load_checkpoint(args.resume_iter)

    # remember the initial value of ds weight
    initial_lambda_ds = args.lambda_ds

    print('Start training...')
    import tqdm
    start_time = time.time()
    tqdm_descriptor = tqdm.trange(args.resume_iter, args.total_iters)
    for i in tqdm_descriptor:
        # fetch images and labels
        inputs = next(fetcher)
        x_real, y_org = inputs.x_src, inputs.y_src
        x_ref, x_ref2, y_trg = inputs.x_ref, inputs.x_ref2, inputs.y_ref
        z_trg, z_trg2 = inputs.z_trg, inputs.z_trg2

        masks = nets.fan.get_heatmap(x_real) if args.w_hpf > 0 else None

        # train the discriminator
        d_loss, d_losses_latent = compute_d_loss(nets, args, x_real, y_org, y_trg,
                                                 z_trg=z_trg, masks=masks)
        self._reset_grad()
        d_loss.backward()
        optims.discriminator.minimize(d_loss)

        d_loss, d_losses_ref = compute_d_loss(nets, args, x_real, y_org, y_trg,
                                              x_ref=x_ref, masks=masks)
        self._reset_grad()
        d_loss.backward()
        optims.discriminator.minimize(d_loss)

        # train the generator
        if i - args.resume_iter > 1:  # train the discriminator first
            g_loss, g_losses_latent, sample_1 = compute_g_loss(
                nets, args, x_real, y_org, y_trg, z_trgs=[z_trg, z_trg2], masks=masks)
            self._reset_grad()
            g_loss.backward()
            optims.generator.minimize(g_loss)
            optims.mapping_network.minimize(g_loss)
            optims.style_encoder.minimize(g_loss)

            g_loss, g_losses_ref, sample_2 = compute_g_loss(
                nets, args, x_real, y_org, y_trg, x_refs=[x_ref, x_ref2], masks=masks)
            self._reset_grad()
            g_loss.backward()
            optims.generator.minimize(g_loss)

            # # compute moving average of network parameters
            # moving_average(nets.generator, nets_ema.generator, beta=0.999)
            # moving_average(nets.mapping_network, nets_ema.mapping_network, beta=0.999)
            # moving_average(nets.style_encoder, nets_ema.style_encoder, beta=0.999)

            # decay weight for diversity sensitive loss
            if args.lambda_ds > 0:
                args.lambda_ds -= (initial_lambda_ds / args.ds_iter)

            # print out log info
            if (i + 1) % args.print_every == 0:
                elapsed = time.time() - start_time
                elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                log = "Elapsed time [%s], Iteration [%i/%i], " % (
                    elapsed, i + 1, args.total_iters)
                all_losses = dict()
                for loss, prefix in zip(
                        [d_losses_latent, d_losses_ref, g_losses_latent, g_losses_ref],
                        ['D/latent_', 'D/ref_', 'G/latent_', 'G/ref_']):
                    for key, value in loss.items():
                        all_losses[prefix + key] = value
                        writer.add_scalar(tag=prefix + key, step=i + 1, value=value)
                all_losses['G/lambda_ds'] = args.lambda_ds
                log += ' '.join(
                    ['%s: [%.4f]' % (key, value) for key, value in all_losses.items()])
                tqdm_descriptor.set_description(log)
                writer.add_image(
                    "x_fake",
                    (utils.denormalize(sample_1) * 255).numpy().transpose(
                        [1, 2, 0]).astype(np.uint8),
                    i + 1)

            # generate images for debugging
            if (i + 1) % args.sample_every == 0:
                os.makedirs(args.sample_dir, exist_ok=True)
                utils.debug_image(nets_ema, args, inputs=inputs_val, step=i + 1)

            # save model checkpoints
            if (i + 1) % args.save_every == 0:
                self._save_checkpoint(step=i + 1)

            # compute FID and LPIPS if necessary
            if (i + 1) % args.eval_every == 0:
                calculate_metrics(nets_ema, args, i + 1, mode='latent')
                calculate_metrics(nets_ema, args, i + 1, mode='reference')
        else:
            if (i + 1) % args.print_every == 0:
                elapsed = time.time() - start_time
                elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                log = "Elapsed time [%s], Iteration [%i/%i], " % (
                    elapsed, i + 1, args.total_iters)
                all_losses = dict()
                for loss, prefix in zip([d_losses_latent, d_losses_ref],
                                        ['D/latent_', 'D/ref_']):
                    for key, value in loss.items():
                        all_losses[prefix + key] = value
                        writer.add_scalar(tag=prefix + key, step=i + 1, value=value)
                log += ' '.join(
                    ['%s: [%.4f]' % (key, value) for key, value in all_losses.items()])
                tqdm_descriptor.set_description(log)

    writer.close()
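# The commented-out moving_average(...) calls above keep `nets_ema` as an exponential
# moving average of the training networks. A hedged sketch of what such a helper
# usually looks like in Paddle dygraph -- this is not the repository's own code, and
# it assumes both networks were built identically so parameters() line up:
def moving_average(model, model_ema, beta=0.999):
    for p, p_ema in zip(model.parameters(), model_ema.parameters()):
        # EMA update: p_ema <- beta * p_ema + (1 - beta) * p
        p_ema.set_value(beta * p_ema.numpy() + (1.0 - beta) * p.numpy())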
def train(self): d_loss_writer = LogWriter(logdir="./log/UGATIT/train") g_loss_writer = LogWriter(logdir="./log/UGATIT/train") self.start_iter = 1 if self.resume: self.load(os.path.join(self.result_dir, self.dataset, 'model'), self.start_iter_arg, True) # training loop print('training start !') start_time = time.time() for step in range(self.start_iter, self.iteration + 1): self.genA2B.train(), self.genB2A.train(), self.disGA.train( ), self.disGB.train(), self.disLA.train(), self.disLB.train() try: real_A, _ = next(trainA_iter)[0] except: trainA_iter = self.trainA_loader() real_A, _ = next(trainA_iter)[0] try: real_B, _ = next(trainB_iter)[0] except: trainB_iter = self.trainB_loader() real_B, _ = next(trainB_iter)[0] # Update D self.D_optim.clear_gradients() fake_A2B, _, _ = self.genA2B(real_A) fake_B2A, _, _ = self.genB2A(real_B) real_GA_logit, real_GA_cam_logit, _ = self.disGA(real_A) real_LA_logit, real_LA_cam_logit, _ = self.disLA(real_A) real_GB_logit, real_GB_cam_logit, _ = self.disGB(real_B) real_LB_logit, real_LB_cam_logit, _ = self.disLB(real_B) fake_GA_logit, fake_GA_cam_logit, _ = self.disGA(fake_B2A) fake_LA_logit, fake_LA_cam_logit, _ = self.disLA(fake_B2A) fake_GB_logit, fake_GB_cam_logit, _ = self.disGB(fake_A2B) fake_LB_logit, fake_LB_cam_logit, _ = self.disLB(fake_A2B) D_ad_loss_GA = self.MSE_loss( real_GA_logit, layers.ones_like(real_GA_logit)) + self.MSE_loss( fake_GA_logit, layers.zeros_like(fake_GA_logit)) D_ad_cam_loss_GA = self.MSE_loss( real_GA_cam_logit, layers.ones_like(real_GA_cam_logit)) + self.MSE_loss( fake_GA_cam_logit, layers.zeros_like(fake_GA_cam_logit)) D_ad_loss_LA = self.MSE_loss( real_LA_logit, layers.ones_like(real_LA_logit)) + self.MSE_loss( fake_LA_logit, layers.zeros_like(fake_LA_logit)) D_ad_cam_loss_LA = self.MSE_loss( real_LA_cam_logit, layers.ones_like(real_LA_cam_logit)) + self.MSE_loss( fake_LA_cam_logit, layers.zeros_like(fake_LA_cam_logit)) D_ad_loss_GB = self.MSE_loss( real_GB_logit, layers.ones_like(real_GB_logit)) + self.MSE_loss( fake_GB_logit, layers.zeros_like(fake_GB_logit)) D_ad_cam_loss_GB = self.MSE_loss( real_GB_cam_logit, layers.ones_like(real_GB_cam_logit)) + self.MSE_loss( fake_GB_cam_logit, layers.zeros_like(fake_GB_cam_logit)) D_ad_loss_LB = self.MSE_loss( real_LB_logit, layers.ones_like(real_LB_logit)) + self.MSE_loss( fake_LB_logit, layers.zeros_like(fake_LB_logit)) D_ad_cam_loss_LB = self.MSE_loss( real_LB_cam_logit, layers.ones_like(real_LB_cam_logit)) + self.MSE_loss( fake_LB_cam_logit, layers.zeros_like(fake_LB_cam_logit)) D_loss_A = self.adv_weight * (D_ad_loss_GA + D_ad_cam_loss_GA + D_ad_loss_LA + D_ad_cam_loss_LA) D_loss_B = self.adv_weight * (D_ad_loss_GB + D_ad_cam_loss_GB + D_ad_loss_LB + D_ad_cam_loss_LB) Discriminator_loss = D_loss_A + D_loss_B Discriminator_loss.backward() self.D_optim.minimize(Discriminator_loss) # Update G self.G_optim.clear_gradients() fake_A2B, fake_A2B_cam_logit, _ = self.genA2B(real_A) fake_B2A, fake_B2A_cam_logit, _ = self.genB2A(real_B) fake_A2B2A, _, _ = self.genB2A(fake_A2B) fake_B2A2B, _, _ = self.genA2B(fake_B2A) fake_A2A, fake_A2A_cam_logit, _ = self.genB2A(real_A) fake_B2B, fake_B2B_cam_logit, _ = self.genA2B(real_B) fake_GA_logit, fake_GA_cam_logit, _ = self.disGA(fake_B2A) fake_LA_logit, fake_LA_cam_logit, _ = self.disLA(fake_B2A) fake_GB_logit, fake_GB_cam_logit, _ = self.disGB(fake_A2B) fake_LB_logit, fake_LB_cam_logit, _ = self.disLB(fake_A2B) G_ad_loss_GA = self.MSE_loss(fake_GA_logit, layers.ones_like(fake_GA_logit)) G_ad_cam_loss_GA = self.MSE_loss( fake_GA_cam_logit, 
layers.ones_like(fake_GA_cam_logit)) G_ad_loss_LA = self.MSE_loss(fake_LA_logit, layers.ones_like(fake_LA_logit)) G_ad_cam_loss_LA = self.MSE_loss( fake_LA_cam_logit, layers.ones_like(fake_LA_cam_logit)) G_ad_loss_GB = self.MSE_loss(fake_GB_logit, layers.ones_like(fake_GB_logit)) G_ad_cam_loss_GB = self.MSE_loss( fake_GB_cam_logit, layers.ones_like(fake_GB_cam_logit)) G_ad_loss_LB = self.MSE_loss(fake_LB_logit, layers.ones_like(fake_LB_logit)) G_ad_cam_loss_LB = self.MSE_loss( fake_LB_cam_logit, layers.ones_like(fake_LB_cam_logit)) G_recon_loss_A = self.L1_loss(fake_A2B2A, real_A) G_recon_loss_B = self.L1_loss(fake_B2A2B, real_B) G_identity_loss_A = self.L1_loss(fake_A2A, real_A) G_identity_loss_B = self.L1_loss(fake_B2B, real_B) G_cam_loss_A = self.BCELoss( fake_B2A_cam_logit, layers.ones_like(fake_B2A_cam_logit)) + self.BCELoss( fake_A2A_cam_logit, layers.zeros_like(fake_A2A_cam_logit)) G_cam_loss_B = self.BCELoss( fake_A2B_cam_logit, layers.ones_like(fake_A2B_cam_logit)) + self.BCELoss( fake_B2B_cam_logit, layers.zeros_like(fake_B2B_cam_logit)) G_loss_A = self.adv_weight * ( G_ad_loss_GA + G_ad_cam_loss_GA + G_ad_loss_LA + G_ad_cam_loss_LA ) + self.cycle_weight * G_recon_loss_A + self.identity_weight * G_identity_loss_A + self.cam_weight * G_cam_loss_A G_loss_B = self.adv_weight * ( G_ad_loss_GB + G_ad_cam_loss_GB + G_ad_loss_LB + G_ad_cam_loss_LB ) + self.cycle_weight * G_recon_loss_B + self.identity_weight * G_identity_loss_B + self.cam_weight * G_cam_loss_B Generator_loss = G_loss_A + G_loss_B Generator_loss.backward() self.G_optim.minimize(Generator_loss) # clip parameter of AdaILN and ILN, applied after optimizer step clip_rho(self.genA2B) clip_rho(self.genB2A) d_loss_writer.add_scalar(tag="d_loss", step=step, value=Discriminator_loss) g_loss_writer.add_scalar(tag="g_loss", step=step, value=Generator_loss) print("[%5d/%5d] time: %4.4f d_loss: %.8f, g_loss: %.8f" % (step, self.iteration, time.time() - start_time, Discriminator_loss, Generator_loss)) if step % self.print_freq == 0: train_sample_num = 5 test_sample_num = 5 A2B = np.zeros((self.img_size * 7, 0, 3)) B2A = np.zeros((self.img_size * 7, 0, 3)) self.genA2B.eval(), self.genB2A.eval(), self.disGA.eval( ), self.disGB.eval(), self.disLA.eval(), self.disLB.eval() for _ in range(train_sample_num): try: real_A, _ = next(testA_iter)[0] except: testA_iter = self.testA_loader() real_A, _ = next(testA_iter)[0] try: real_B, _ = next(testB_iter)[0] except: testB_iter = self.testB_loader() real_B, _ = next(testB_iter)[0] fake_A2B, _, fake_A2B_heatmap = self.genA2B(real_A) fake_B2A, _, fake_B2A_heatmap = self.genB2A(real_B) fake_A2B2A, _, fake_A2B2A_heatmap = self.genB2A(fake_A2B) fake_B2A2B, _, fake_B2A2B_heatmap = self.genA2B(fake_B2A) fake_A2A, _, fake_A2A_heatmap = self.genB2A(real_A) fake_B2B, _, fake_B2B_heatmap = self.genA2B(real_B) A2B = np.concatenate( (A2B, np.concatenate( (RGB2BGR(tensor2numpy(denorm(real_A[0]))), cam(tensor2numpy(fake_A2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2A[0]))), cam(tensor2numpy(fake_A2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2B[0]))), cam(tensor2numpy(fake_A2B2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2B2A[0])))), 0)), 1) B2A = np.concatenate( (B2A, np.concatenate( (RGB2BGR(tensor2numpy(denorm(real_B[0]))), cam(tensor2numpy(fake_B2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2B[0]))), cam(tensor2numpy(fake_B2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2A[0]))), 
cam(tensor2numpy(fake_B2A2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2A2B[0])))), 0)), 1) for _ in range(test_sample_num): try: real_A, _ = next(testA_iter)[0] except: testA_iter = self.testA_loader() real_A, _ = next(testA_iter)[0] try: real_B, _ = next(testB_iter)[0] except: testB_iter = self.testB_loader() real_B, _ = next(testB_iter)[0] real_A, real_B = real_A, real_B fake_A2B, _, fake_A2B_heatmap = self.genA2B(real_A) fake_B2A, _, fake_B2A_heatmap = self.genB2A(real_B) fake_A2B2A, _, fake_A2B2A_heatmap = self.genB2A(fake_A2B) fake_B2A2B, _, fake_B2A2B_heatmap = self.genA2B(fake_B2A) fake_A2A, _, fake_A2A_heatmap = self.genB2A(real_A) fake_B2B, _, fake_B2B_heatmap = self.genA2B(real_B) A2B = np.concatenate( (A2B, np.concatenate( (RGB2BGR(tensor2numpy(denorm(real_A[0]))), cam(tensor2numpy(fake_A2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2A[0]))), cam(tensor2numpy(fake_A2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2B[0]))), cam(tensor2numpy(fake_A2B2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_A2B2A[0])))), 0)), 1) B2A = np.concatenate( (B2A, np.concatenate( (RGB2BGR(tensor2numpy(denorm(real_B[0]))), cam(tensor2numpy(fake_B2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2B[0]))), cam(tensor2numpy(fake_B2A_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2A[0]))), cam(tensor2numpy(fake_B2A2B_heatmap[0]), self.img_size), RGB2BGR(tensor2numpy(denorm(fake_B2A2B[0])))), 0)), 1) cv2.imwrite( os.path.join(self.result_dir, self.dataset, 'img', 'A2B_%07d.png' % step), A2B * 255.0) cv2.imwrite( os.path.join(self.result_dir, self.dataset, 'img', 'B2A_%07d.png' % step), B2A * 255.0) self.genA2B.train(), self.genB2A.train(), self.disGA.train( ), self.disGB.train(), self.disLA.train(), self.disLB.train() if step in [8000, 9000, 10000]: self.save(os.path.join(self.result_dir, self.dataset, 'model'), step)
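# After each generator update the loop calls clip_rho(...) to keep the learnable
# AdaILN/ILN blending coefficients inside [0, 1]. A minimal sketch of such a helper,
# assuming those layers register the coefficient under a parameter name containing
# "rho" (the project's own implementation may differ):
import numpy as np

def clip_rho(net, vmin=0.0, vmax=1.0):
    for name, param in net.named_parameters():
        if 'rho' in name:
            # clamp after the optimizer step, as the UGATIT paper prescribes
            param.set_value(np.clip(param.numpy(), vmin, vmax))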
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) import random local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.print_params: param_delimit_str = '-' * 20 + "All parameters in current graph" + '-' * 20 print(param_delimit_str) for block in train_prog.blocks: for param in block.all_parameters(): print("parameter name: {}\tshape: {}".format( param.name, param.shape)) print('-' * len(param_delimit_str)) return if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader) # When iterable mode, set set_sample_list_generator(eval_reader, place) eval_loader.set_sample_list_generator(eval_reader) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches( fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. 
exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' start_iter = 0 if cfg.pretrain_weights: checkpoint.load_params(exe, train_prog, cfg.pretrain_weights) pruned_params = FLAGS.pruned_params assert FLAGS.pruned_params is not None, \ "FLAGS.pruned_params is empty!!! Please set it by '--pruned_params' option." pruned_params = FLAGS.pruned_params.strip().split(",") logger.info("pruned params: {}".format(pruned_params)) pruned_ratios = [float(n) for n in FLAGS.pruned_ratios.strip().split(",")] logger.info("pruned ratios: {}".format(pruned_ratios)) assert len(pruned_params) == len(pruned_ratios), \ "The length of pruned params and pruned ratios should be equal." assert (pruned_ratios > [0] * len(pruned_ratios) and pruned_ratios < [1] * len(pruned_ratios) ), "The elements of pruned ratios should be in range (0, 1)." assert FLAGS.prune_criterion in ['l1_norm', 'geometry_median'], \ "unsupported prune criterion {}".format(FLAGS.prune_criterion) pruner = Pruner(criterion=FLAGS.prune_criterion) if FLAGS.eval: base_flops = flops(eval_prog) eval_prog = pruner.prune(eval_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=True)[0] pruned_flops = flops(eval_prog) logger.info("FLOPs -{}; total FLOPs: {}; pruned FLOPs: {}".format( float(base_flops - pruned_flops) / base_flops, base_flops, pruned_flops)) compiled_eval_prog = fluid.CompiledProgram(eval_prog) train_prog = pruner.prune(train_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=False)[0] compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg) train_loader.set_sample_list_generator(train_reader, place) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_iter, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_iter) best_box_ap_list = [0.0, 0] #[map, iter] # use VisualDL to log data if FLAGS.use_vdl: from visualdl import LogWriter vdl_writer = LogWriter(FLAGS.vdl_log_dir) vdl_loss_step = 0 vdl_mAP_step = 0 if FLAGS.eval: resolution = None if 'Mask' in cfg.architecture: resolution = model.mask_head.resolution # evaluation results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg, resolution=resolution) dataset = cfg['EvalReader']['dataset'] box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, dataset=dataset) for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - 
start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use VisualDL to log loss if FLAGS.use_vdl: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step) vdl_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation resolution = None if 'Mask' in cfg.architecture: resolution = model.mask_head.resolution results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg=cfg, resolution=resolution) box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, dataset=dataset) # use VisualDL to log mAP if FLAGS.use_vdl: vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step) vdl_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) train_loader.reset()
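# Note that the assertion on pruned_ratios above compares Python lists, which is
# lexicographic: it effectively checks only the first differing element, so a later
# out-of-range ratio can slip through. A hedged, element-wise alternative (the helper
# name is ours, not part of the script):
def validate_pruned_ratios(pruned_ratios):
    for i, ratio in enumerate(pruned_ratios):
        if not 0.0 < ratio < 1.0:
            raise ValueError("pruned_ratios[%d] = %s must lie in (0, 1)" % (i, ratio))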
class DeepSpeech2Model(object): """DeepSpeech2Model class. :param vocab_size: Decoding vocabulary size. :type vocab_size: int :param num_conv_layers: Number of stacking convolution layers. :type num_conv_layers: int :param num_rnn_layers: Number of stacking RNN layers. :type num_rnn_layers: int :param rnn_layer_size: RNN layer size (number of RNN cells). :type rnn_layer_size: int :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool :param share_rnn_weights: Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. :type share_rnn_weights: bool :param place: Program running place. :type place: CPUPlace or CUDAPlace :param init_from_pretrained_model: Pretrained model path. If None, will train from stratch. :type init_from_pretrained_model: string|None :param output_model_dir: Output model directory. If None, output to current directory. :type output_model_dir: string|None """ def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size, use_gru=False, share_rnn_weights=True, place=fluid.CPUPlace(), init_from_pretrained_model=None, output_model_dir=None, is_infer=False, error_rate_type='cer', vocab_list=None): self._vocab_size = vocab_size self._num_conv_layers = num_conv_layers self._num_rnn_layers = num_rnn_layers self._rnn_layer_size = rnn_layer_size self._use_gru = use_gru self._share_rnn_weights = share_rnn_weights self._place = place self._init_from_pretrained_model = init_from_pretrained_model self._output_model_dir = output_model_dir self._ext_scorer = None self.logger = logging.getLogger("") self.logger.setLevel(level=logging.INFO) if not is_infer: shutil.rmtree('log', ignore_errors=True) self.writer = LogWriter(logdir='log') self.error_rate_type = error_rate_type self.vocab_list = vocab_list self.save_model_path = '' # 预测相关的参数 self.infer_program = None self.infer_compiled_prog = None self.infer_feeder = None self.infer_log_probs = None self.infer_exe = None if is_infer: self.init_infer_program() def create_network(self, is_infer=False): """Create data layers and model network. :param is_training: Whether to create a network for training. :type is_training: bool :return reader: Reader for input. :rtype reader: read generater :return log_probs: An output unnormalized log probability layer. :rtype lig_probs: Varable :return loss: A ctc loss layer. 
:rtype loss: Variable """ if not is_infer: input_fields = { 'names': ['audio_data', 'text_data', 'seq_len_data', 'masks'], 'shapes': [[None, 161, None], [None, 1], [None, 1], [None, 32, 81, None]], 'dtypes': ['float32', 'int32', 'int64', 'float32'], 'lod_levels': [0, 1, 0, 0] } inputs = [ fluid.data(name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names'])) ] reader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=128, iterable=False, use_double_buffer=True) (audio_data, text_data, seq_len_data, masks) = inputs else: audio_data = fluid.data(name='audio_data', shape=[None, 161, None], dtype='float32', lod_level=0) seq_len_data = fluid.data(name='seq_len_data', shape=[None, 1], dtype='int64', lod_level=0) masks = fluid.data(name='masks', shape=[None, 32, 81, None], dtype='float32', lod_level=0) text_data = None reader = fluid.DataFeeder([audio_data, seq_len_data, masks], self._place) log_probs, loss = deep_speech_v2_network(audio_data=audio_data, text_data=text_data, seq_len_data=seq_len_data, masks=masks, dict_size=self._vocab_size, num_conv_layers=self._num_conv_layers, num_rnn_layers=self._num_rnn_layers, rnn_size=self._rnn_layer_size, use_gru=self._use_gru, share_rnn_weights=self._share_rnn_weights) return reader, log_probs, loss def init_from_pretrained_model(self, exe, program): '''Init params from pretrain model. ''' assert isinstance(self._init_from_pretrained_model, str) if not os.path.exists(self._init_from_pretrained_model): print(self._init_from_pretrained_model) raise Warning("The pretrained params do not exist.") fluid.io.load_params(executor=exe, dirname=self._init_from_pretrained_model, main_program=program, filename="params.pdparams") print("成功加载了预训练模型:%s" % self._init_from_pretrained_model) pre_epoch = 0 dir_name = self._init_from_pretrained_model.split('_') if len(dir_name) >= 2 and dir_name[-2].endswith('epoch') and dir_name[-1].isdigit(): pre_epoch = int(dir_name[-1]) return pre_epoch + 1 def save_param(self, exe, program, dirname): '''Save model params to dirname''' assert isinstance(self._output_model_dir, str) param_dir = os.path.join(self._output_model_dir) if not os.path.exists(param_dir): os.mkdir(param_dir) self.save_model_path = os.path.join(param_dir, dirname) fluid.io.save_params(executor=exe, dirname=os.path.join(param_dir, dirname), main_program=program, filename="params.pdparams") print("save parameters at %s" % self.save_model_path) return True def test(self, test_reader): '''Test the model. :param test_reader: Reader of test. :type test_reader: Reader :return: Wer/Cer rate. :rtype: float ''' errors_sum, len_refs = 0.0, 0 errors_func = char_errors if self.error_rate_type == 'cer' else word_errors # 初始化预测程序 self.init_infer_program() for infer_data in test_reader(): # 执行预测 probs_split = self.infer_batch_probs(infer_data=infer_data) # 使用最优路径解码 result_transcripts = self.decode_batch_greedy(probs_split=probs_split, vocab_list=self.vocab_list) target_transcripts = infer_data[1] # 计算字错率 for target, result in zip(target_transcripts, result_transcripts): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref return errors_sum / len_refs def train(self, train_batch_reader, dev_batch_reader, learning_rate, gradient_clipping, num_epoch, batch_size, num_samples, test_off=False): """Train the model. :param train_batch_reader: Train data reader. 
:type train_batch_reader: callable :param dev_batch_reader: Validation data reader. :type dev_batch_reader: callable :param feeding_dict: Feeding is a map of field name and tuple index of the data that reader returns. :type feeding_dict: dict|list :param learning_rate: Learning rate for ADAM optimizer. :type learning_rate: float :param gradient_clipping: Gradient clipping threshold. :type gradient_clipping: float :param num_epoch: Number of training epochs. :type num_epoch: int :param batch_size: Number of batch size. :type batch_size: int :param num_samples: The num of train samples. :type num_samples: int :param num_iterations_print: Number of training iterations for printing a training loss. :type num_iteratons_print: int :param only_train_batch:Every epoch only train only_train_batch batch. Avoid insufficient video memory :type only_train_batch:int :param test_off: Turn off testing. :type test_off: bool """ # prepare model output directory if not os.path.exists(self._output_model_dir): mkpath(self._output_model_dir) if isinstance(self._place, fluid.CUDAPlace): dev_count = fluid.core.get_cuda_device_count() learning_rate = learning_rate * dev_count else: dev_count = int(os.environ.get('CPU_NUM', 1)) # prepare the network train_program = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_reader, _, ctc_loss = self.create_network() # 学习率 learning_rate = fluid.layers.exponential_decay( learning_rate=learning_rate, decay_steps=num_samples / batch_size / dev_count, decay_rate=0.83, staircase=True) # 准备优化器 optimizer = fluid.optimizer.AdamOptimizer( learning_rate=learning_rate, regularization=fluid.regularizer.L2Decay(0.0001), grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=gradient_clipping)) optimizer.minimize(loss=ctc_loss) exe = fluid.Executor(self._place) exe.run(startup_prog) # init from some pretrain models, to better solve the current task pre_epoch = 0 if self._init_from_pretrained_model: pre_epoch = self.init_from_pretrained_model(exe, train_program) build_strategy = compiler.BuildStrategy() exec_strategy = fluid.ExecutionStrategy() # pass the build_strategy to with_data_parallel API train_compiled_prog = compiler.CompiledProgram(train_program).with_data_parallel(loss_name=ctc_loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) train_reader.set_batch_generator(train_batch_reader) train_step = 0 test_step = 0 num_batch = -1 # run train for epoch_id in range(num_epoch): train_reader.start() epoch_loss = [] time_begin = time.time() batch_id = 0 while True: try: fetch_list = [ctc_loss.name, learning_rate.name] if batch_id % 100 == 0: fetch = exe.run(program=train_compiled_prog, fetch_list=fetch_list, return_numpy=False) each_loss = fetch[0] each_learning_rate = np.array(fetch[1])[0] epoch_loss.extend(np.array(each_loss[0]) / batch_size) print("Train [%s] epoch: [%d/%d], batch: [%d/%d], learning rate: %f, train loss: %f\n" % (datetime.now(), epoch_id, num_epoch, batch_id, num_batch, each_learning_rate, np.mean(each_loss[0]) / batch_size)) # 记录训练损失值 self.writer.add_scalar('Train loss', np.mean(each_loss[0]) / batch_size, train_step) self.writer.add_scalar('Learning rate', each_learning_rate, train_step) train_step += 1 else: _ = exe.run(program=train_compiled_prog, fetch_list=[], return_numpy=False) # 每2000个batch保存一次模型 if batch_id % 2000 == 0 and batch_id != 0: self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch)) batch_id = batch_id + 1 except 
fluid.core.EOFException: train_reader.reset() break num_batch = batch_id # 每一个epoch保存一次模型 self.save_param(exe, train_program, "epoch_" + str(epoch_id + pre_epoch)) used_time = time.time() - time_begin if test_off: print('======================last Train=====================') print("Train time: %f sec, epoch: %d, train loss: %f\n" % (used_time, epoch_id, np.mean(np.array(epoch_loss)))) print('======================last Train=====================') else: print('\n======================Begin test=====================') # 设置临时模型的路径 self._init_from_pretrained_model = self.save_model_path # 执行测试 test_result = self.test(test_reader=dev_batch_reader) print("Train time: %f sec, epoch: %d, train loss: %f, test %s: %f" % (used_time, epoch_id + pre_epoch, np.mean(np.array(epoch_loss)), self.error_rate_type, test_result)) print('======================Stop Train=====================\n') # 记录测试结果 self.writer.add_scalar('Test %s' % self.error_rate_type, test_result, test_step) test_step += 1 self.save_param(exe, train_program, "step_final") print("\n------------Training finished!!!-------------") # 预测一个batch的音频 def infer_batch_probs(self, infer_data): """Infer the prob matrices for a batch of speech utterances. :param infer_data: List of utterances to infer, with each utterance consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list :param feeding_dict: Feeding is a map of field name and tuple index of the data that reader returns. :type feeding_dict: dict|list :return: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. :rtype: List of matrix """ # define inferer infer_results = [] data = [] if isinstance(self._place, fluid.CUDAPlace): num_places = fluid.core.get_cuda_device_count() else: num_places = int(os.environ.get('CPU_NUM', 1)) # 开始预测 for i in range(infer_data[0].shape[0]): # 使用多卡推理 data.append([[infer_data[0][i], infer_data[2][i], infer_data[3][i]]]) if len(data) == num_places: each_log_probs = self.infer_exe.run(program=self.infer_compiled_prog, feed=list(self.infer_feeder.feed_parallel( iterable=data, num_places=num_places)), fetch_list=[self.infer_log_probs], return_numpy=False) data = [] infer_results.extend(np.array(each_log_probs[0])) # 如果数据是单数,就获取最后一个计算 last_data_num = infer_data[0].shape[0] % num_places if last_data_num != 0: for i in range(infer_data[0].shape[0] - last_data_num, infer_data[0].shape[0]): each_log_probs = self.infer_exe.run(program=self.infer_program, feed=self.infer_feeder.feed( [[infer_data[0][i], infer_data[2][i], infer_data[3][i]]]), fetch_list=[self.infer_log_probs], return_numpy=False) infer_results.extend(np.array(each_log_probs[0])) # slice result infer_results = np.array(infer_results) seq_len = (infer_data[2] - 1) // 3 + 1 start_pos = [0] * (infer_data[0].shape[0] + 1) for i in range(infer_data[0].shape[0]): start_pos[i + 1] = start_pos[i] + seq_len[i][0] probs_split = [ infer_results[start_pos[i]:start_pos[i + 1]] for i in range(0, infer_data[0].shape[0]) ] return probs_split # 初始化预测程序,加预训练模型 def init_infer_program(self): # define inferer self.infer_program = fluid.Program() startup_prog = fluid.Program() # prepare the network with fluid.program_guard(self.infer_program, startup_prog): with fluid.unique_name.guard(): self.infer_feeder, self.infer_log_probs, _ = self.create_network(is_infer=True) self.infer_program = self.infer_program.clone(for_test=True) self.infer_exe = fluid.Executor(self._place) self.infer_exe.run(startup_prog) # init param from pretrained_model if not 
self._init_from_pretrained_model: exit("预训练模型文件不存在!") self.init_from_pretrained_model(self.infer_exe, self.infer_program) # 支持多卡推理 build_strategy = compiler.BuildStrategy() exec_strategy = fluid.ExecutionStrategy() self.infer_compiled_prog = compiler.CompiledProgram(self.infer_program).with_data_parallel( build_strategy=build_strategy, exec_strategy=exec_strategy) # 单个音频预测 def infer(self, feature): """Infer the prob matrices for a batch of speech utterances. :param infer_data: List of utterances to infer, with each utterance consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list :param feeding_dict: Feeding is a map of field name and tuple index of the data that reader returns. :type feeding_dict: dict|list :return: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. :rtype: List of matrix """ audio_len = feature[0].shape[1] mask_shape0 = (feature[0].shape[0] - 1) // 2 + 1 mask_shape1 = (feature[0].shape[1] - 1) // 3 + 1 mask_max_len = (audio_len - 1) // 3 + 1 mask_ones = np.ones((mask_shape0, mask_shape1)) mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1)) mask = np.repeat(np.reshape(np.concatenate((mask_ones, mask_zeros), axis=1), (1, mask_shape0, mask_max_len)), 32, axis=0) infer_data = [np.array(feature[0]).astype('float32'), None, np.array(audio_len).astype('int64'), np.array(mask).astype('float32')] # run inference each_log_probs = self.infer_exe.run(program=self.infer_program, feed=self.infer_feeder.feed( [[infer_data[0], infer_data[2], infer_data[3]]]), fetch_list=[self.infer_log_probs], return_numpy=False) infer_result = np.array(each_log_probs[0]) # slice result seq_len = (infer_data[2] - 1) // 3 + 1 start_pos = [0, 0] start_pos[1] = start_pos[0] + seq_len probs_split = [infer_result[start_pos[0]:start_pos[1]]] return probs_split def decode_batch_greedy(self, probs_split, vocab_list): """Decode by best path for a batch of probs matrix input. :param probs_split: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. :param probs_split: List of matrix :param vocab_list: List of tokens in the vocabulary, for decoding. :type vocab_list: list :return: List of transcription texts. :rtype: List of str """ results = [] for i, probs in enumerate(probs_split): output_transcription = ctc_greedy_decoder( probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) return results def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path, vocab_list): """Initialize the external scorer. :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. :type beam_beta: float :param language_model_path: Filepath for language model. If it is empty, the external scorer will be set to None, and the decoding method will be pure beam search without scorer. :type language_model_path: str|None :param vocab_list: List of tokens in the vocabulary, for decoding. 
:type vocab_list: list """ if language_model_path != '': self.logger.info("begin to initialize the external scorer for decoding") self._ext_scorer = Scorer(beam_alpha, beam_beta, language_model_path, vocab_list) lm_char_based = self._ext_scorer.is_character_based() lm_max_order = self._ext_scorer.get_max_order() lm_dict_size = self._ext_scorer.get_dict_size() self.logger.info("language model: " "is_character_based = %d," % lm_char_based + " max_order = %d," % lm_max_order + " dict_size = %d" % lm_dict_size) self.logger.info("end initializing scorer") else: self._ext_scorer = None self.logger.info("no language model provided, decoding by pure beam search without scorer.") def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, vocab_list, num_processes): """Decode by beam search for a batch of probs matrix input. :param probs_split: List of 2-D probability matrix, and each consists of prob vectors for one speech utterancce. :param probs_split: List of matrix :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. :type beam_beta: float :param beam_size: Width for Beam search. :type beam_size: int :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning. :type cutoff_prob: float :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n characters with highest probs in vocabulary will be used in beam search, default 40. :type cutoff_top_n: int :param vocab_list: List of tokens in the vocabulary, for decoding. :type vocab_list: list :param num_processes: Number of processes (CPU) for decoder. :type num_processes: int :return: List of transcription texts. :rtype: List of str """ if self._ext_scorer is not None: self._ext_scorer.reset_params(beam_alpha, beam_beta) # beam search decode num_processes = min(num_processes, len(probs_split)) beam_search_results = ctc_beam_search_decoder_batch(probs_split=probs_split, vocabulary=vocab_list, beam_size=beam_size, num_processes=num_processes, ext_scoring_func=self._ext_scorer, cutoff_prob=cutoff_prob, cutoff_top_n=cutoff_top_n) results = [result[0][1] for result in beam_search_results] return results
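# For orientation, a hypothetical inference-only use of the class above. Every value
# below (vocabulary, checkpoint directory, input spectrogram) is a placeholder rather
# than something taken from the source, and a real run needs a trained checkpoint.
import numpy as np

vocab_list = ['a', 'b', 'c']                              # placeholder vocabulary
model = DeepSpeech2Model(vocab_size=len(vocab_list),
                         num_conv_layers=2,
                         num_rnn_layers=3,
                         rnn_layer_size=1024,
                         use_gru=True,
                         place=fluid.CPUPlace(),
                         init_from_pretrained_model='models/epoch_50',  # placeholder path
                         is_infer=True,
                         error_rate_type='cer',
                         vocab_list=vocab_list)
feature = (np.random.rand(161, 300).astype('float32'), '')  # (spectrogram, transcript)
probs_split = model.infer(feature)
text = model.decode_batch_greedy(probs_split, vocab_list)[0]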
def train_loop(self, num_epochs, train_dataset, train_batch_size, eval_dataset=None, save_interval_epochs=1, log_interval_steps=10, save_dir='output', use_vdl=False, early_stop=False, early_stop_patience=5): if train_dataset.num_samples < train_batch_size: raise Exception( 'The amount of training datset must be larger than batch size.') if not osp.isdir(save_dir): if osp.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) if use_vdl: from visualdl import LogWriter vdl_logdir = osp.join(save_dir, 'vdl_log') # 给transform添加arrange操作 input_channel = getattr(self, 'input_channel', 3) arrange_transforms( model_type=self.model_type, class_name=self.__class__.__name__, transforms=train_dataset.transforms, mode='train', input_channel=input_channel) # 构建train_data_loader self.build_train_data_loader( dataset=train_dataset, batch_size=train_batch_size) if eval_dataset is not None: self.eval_transforms = eval_dataset.transforms self.test_transforms = copy.deepcopy(eval_dataset.transforms) # 获取实时变化的learning rate lr = self.optimizer._learning_rate if isinstance(lr, fluid.framework.Variable): self.train_outputs['lr'] = lr # 在多卡上跑训练 if self.parallel_train_prog is None: build_strategy = fluid.compiler.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False if paddlex.env_info['place'] != 'cpu' and len(self.places) > 1: build_strategy.sync_batch_norm = self.sync_bn exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_iteration_per_drop_scope = 1 self.parallel_train_prog = fluid.CompiledProgram( self.train_prog).with_data_parallel( loss_name=self.train_outputs['loss'].name, build_strategy=build_strategy, exec_strategy=exec_strategy) total_num_steps = math.floor(train_dataset.num_samples / train_batch_size) num_steps = 0 time_stat = list() time_train_one_epoch = None time_eval_one_epoch = None total_num_steps_eval = 0 # 模型总共的评估次数 total_eval_times = math.ceil(num_epochs / save_interval_epochs) # 检测目前仅支持单卡评估,训练数据batch大小与显卡数量之商为验证数据batch大小。 eval_batch_size = train_batch_size if self.model_type == 'detector': eval_batch_size = self._get_single_card_bs(train_batch_size) if eval_dataset is not None: total_num_steps_eval = math.ceil(eval_dataset.num_samples / eval_batch_size) if use_vdl: # VisualDL component log_writer = LogWriter(vdl_logdir) thresh = 0.0001 if early_stop: earlystop = EarlyStop(early_stop_patience, thresh) best_accuracy_key = "" best_accuracy = -1.0 best_model_epoch = -1 start_epoch = self.completed_epochs # task_id: 目前由PaddleX GUI赋值 # 用于在VisualDL日志中注明所属任务id task_id = getattr(paddlex, "task_id", "") for i in range(start_epoch, num_epochs): records = list() step_start_time = time.time() epoch_start_time = time.time() for step, data in enumerate(self.train_data_loader()): outputs = self.exe.run( self.parallel_train_prog, feed=data, fetch_list=list(self.train_outputs.values())) outputs_avg = np.mean(np.array(outputs), axis=1) records.append(outputs_avg) # 训练完成剩余时间预估 current_time = time.time() step_cost_time = current_time - step_start_time step_start_time = current_time if len(time_stat) < 20: time_stat.append(step_cost_time) else: time_stat[num_steps % 20] = step_cost_time # 每间隔log_interval_steps,输出loss信息 num_steps += 1 if num_steps % log_interval_steps == 0: step_metrics = OrderedDict( zip(list(self.train_outputs.keys()), outputs_avg)) if use_vdl: for k, v in step_metrics.items(): log_writer.add_scalar( '{}-Metrics/Training(Step): {}'.format( task_id, k), v, num_steps) # 估算剩余时间 avg_step_time = np.mean(time_stat) if time_train_one_epoch is not None: eta = (num_epochs - i - 1) * 
time_train_one_epoch + ( total_num_steps - step - 1) * avg_step_time else: eta = ((num_epochs - i) * total_num_steps - step - 1 ) * avg_step_time if time_eval_one_epoch is not None: eval_eta = ( total_eval_times - i // save_interval_epochs ) * time_eval_one_epoch else: eval_eta = ( total_eval_times - i // save_interval_epochs ) * total_num_steps_eval * avg_step_time eta_str = seconds_to_hms(eta + eval_eta) logging.info( "[TRAIN] Epoch={}/{}, Step={}/{}, {}, time_each_step={}s, eta={}" .format(i + 1, num_epochs, step + 1, total_num_steps, dict2str(step_metrics), round(avg_step_time, 2), eta_str)) train_metrics = OrderedDict( zip(list(self.train_outputs.keys()), np.mean( records, axis=0))) logging.info('[TRAIN] Epoch {} finished, {} .'.format( i + 1, dict2str(train_metrics))) time_train_one_epoch = time.time() - epoch_start_time epoch_start_time = time.time() # 每间隔save_interval_epochs, 在验证集上评估和对模型进行保存 self.completed_epochs += 1 eval_epoch_start_time = time.time() if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1: current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1)) if not osp.isdir(current_save_dir): os.makedirs(current_save_dir) if getattr(self, 'use_ema', False): self.exe.run(self.ema.apply_program) if eval_dataset is not None and eval_dataset.num_samples > 0: self.eval_metrics, self.eval_details = self.evaluate( eval_dataset=eval_dataset, batch_size=eval_batch_size, epoch_id=i + 1, return_details=True) logging.info('[EVAL] Finished, Epoch={}, {} .'.format( i + 1, dict2str(self.eval_metrics))) # 保存最优模型 best_accuracy_key = list(self.eval_metrics.keys())[0] current_accuracy = self.eval_metrics[best_accuracy_key] if current_accuracy > best_accuracy: best_accuracy = current_accuracy best_model_epoch = i + 1 best_model_dir = osp.join(save_dir, "best_model") self.save_model(save_dir=best_model_dir) if use_vdl: for k, v in self.eval_metrics.items(): if isinstance(v, list): continue if isinstance(v, np.ndarray): if v.size > 1: continue log_writer.add_scalar( "{}-Metrics/Eval(Epoch): {}".format( task_id, k), v, i + 1) self.save_model(save_dir=current_save_dir) if getattr(self, 'use_ema', False): self.exe.run(self.ema.restore_program) time_eval_one_epoch = time.time() - eval_epoch_start_time eval_epoch_start_time = time.time() if best_model_epoch > 0: logging.info( 'Current evaluated best model in eval_dataset is epoch_{}, {}={}' .format(best_model_epoch, best_accuracy_key, best_accuracy)) if eval_dataset is not None and early_stop: if earlystop(current_accuracy): break
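# train_loop relies on an EarlyStop(early_stop_patience, thresh) helper that is not
# shown here. A minimal sketch of the behaviour the loop needs -- return True once the
# monitored metric has failed to improve by more than `thresh` for `patience`
# consecutive evaluations (the real paddlex utility may differ in detail):
class EarlyStop:
    def __init__(self, patience, thresh):
        self.patience = patience
        self.thresh = thresh
        self.best = float('-inf')
        self.counter = 0

    def __call__(self, current_score):
        if current_score > self.best + self.thresh:
            self.best = current_score          # metric improved enough: reset the counter
            self.counter = 0
        else:
            self.counter += 1                  # no meaningful improvement this evaluation
        return self.counter >= self.patience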
def do_train(args): paddle.set_device(args.device) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": args.dp_degree, "mp_degree": args.mp_degree, "pp_degree": args.pp_degree, "sharding_degree": args.sharding_degree } accumulate_steps = args.local_batch_size // args.micro_batch_size strategy.pipeline_configs = { "accumulate_steps": accumulate_steps, "micro_batch_size": args.micro_batch_size } # set control in tensor parallel strategy.tensor_parallel_configs = {"tensor_init_seed": args.seed} fleet.init(is_collective=True, strategy=strategy) # obtain rank message of hybrid parallel hcg = fleet.get_hybrid_communicate_group() global_rank = hcg.get_global_rank() mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() # sharding stage2/3 not support hybrid parallel if args.sharding_stage in [2, 3]: assert args.dp_degree == args.mp_degree == args.pp_degree == 1, "sharding stage2/3 will support hybrid parallel later" sharding_size = hcg.get_sharding_parallel_world_size() data_world_rank = dp_rank * sharding_size + sharding_rank data_world_size = args.dp_degree * args.sharding_degree local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, data_world_rank, mp_rank, pp_rank) default_global_tokens_num = args.global_batch_size * args.max_seq_len model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) # Define log writer log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_pure_fp16_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_pure_fp16, False, global_rank).lower()) if os.path.exists(log_writer_path): import shutil shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config["hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model_config['num_partitions'] = args.mp_degree model_config['use_recompute'] = args.use_recompute if args.pp_degree == 1: model = GPTForPretraining(GPTModel(**model_config)) else: model_config['topology'] = hcg.topology() model = GPTForPretrainingPipe(**model_config) else: model = GPTForPretraining.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob) # Create the critrion for the gpt model criterion = GPTPretrainingCriterion() if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps lr_scheduler = None if args.lr_decay_style == "none": lr_scheduler = None elif args.lr_decay_style == "cosine": lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] if args.sharding_stage == 1 and args.sharding_degree > 1: optimizer = DygraphShardingOptimizer( hcg=fleet.get_hybrid_communicate_group(), user_defined_strategy=strategy, params=model.parameters(), inner_optimizer_class=paddle.optimizer.AdamW, learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_params) else: optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_params, # TODO: remove 'multi_precision' in definition of optimizer # and add it to 'paddle.amp.decorate' multi_precision=args.use_pure_fp16) if args.use_pure_fp16: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) # level O2 means converting the network to FP16 if args.sharding_stage not in [2, 3]: scaler = fleet.distributed_scaler(scaler) model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') # wrap sharding stage2/3 and add collective group # TODO(Baibaifan): combine ShardingStage1/2/3 and fleet.distributed_model in feature if args.sharding_stage in [2, 3]: scaler = scaler if args.use_pure_fp16 else None model, optimizer, scaler = wrap_sharding_2_3(model, optimizer, scaler, args.sharding_offload) elif paddle.distributed.get_world_size() > 1: model = fleet.distributed_model(model) optimizer = fleet.distributed_optimizer(optimizer) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt") if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) optimizer.set_state_dict(opt_dict) else: logger.warning("No optimizer checkpoint file found in %s." % opt_path) global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): files = get_train_data_file(args) files.sort() num_files = len(files) for f_id in range(num_files): data_file = files[f_id] train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, [data_file], local_rank=local_rank, data_world_size=data_world_size, data_world_rank=data_world_rank, eos_id=tokenizer.eos_token_id) # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() # time count train_reader_cost = 0.0 train_run_cost = 0.0 reader_start = time.time() for step, batch in enumerate(train_data_loader()): train_reader_cost += time.time() - reader_start train_start = time.time() global_step += 1 tokens, loss_mask, position_ids, labels = batch loss_mask.stop_gradient = True labels.stop_gradient = True position_ids.stop_gradient = True if args.pp_degree == 1: # In ParallelMode of DataParallel, 'no_sync' can be used for improving # performance of model by gradient accumulation. 
loss = 0.0 for i in range(accumulate_steps): start_index = i * args.micro_batch_size end_index = start_index + args.micro_batch_size with paddle.amp.auto_cast( args.use_pure_fp16, custom_black_list=[ "reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div" ], level='O2'): preds = model( tokens[start_index:end_index, :], position_ids[start_index:end_index, :]) loss_mbs = criterion( preds, labels[start_index:end_index, :], loss_mask[start_index:end_index, :]) loss_mbs = loss_mbs / accumulate_steps if args.use_pure_fp16: scaler.scale(loss_mbs).backward() else: loss_mbs.backward() loss = loss + loss_mbs if args.use_pure_fp16: if args.sharding_stage in [2, 3]: scaler.step(optimizer) scaler.update() else: scaler.minimize(optimizer, loss) else: optimizer.step() if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() else: data = [(tokens, position_ids), (labels, loss_mask)] with paddle.amp.auto_cast( args.use_pure_fp16, custom_black_list=[ "reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div" ], level='O2'): loss = model.train_batch( data, optimizer=optimizer, lr_scheduler=lr_scheduler, scaler=scaler if args.use_pure_fp16 else None) # Sync for profile time, delete it may be a little faster paddle.device.cuda.synchronize() train_run_cost += time.time() - train_start # Profile for model benchmark profiler.add_profiler_step(args.profiler_options) if global_step % args.logging_freq == 0: avg_loss = loss.numpy() speed = args.logging_freq / (train_reader_cost + train_run_cost) avg_reader_cost = train_reader_cost / args.logging_freq logger.info( "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e" % (global_step, epoch, step, avg_loss, avg_reader_cost, 1. / speed, speed, speed * default_global_tokens_num, optimizer.get_lr())) log_writer.add_scalar("loss", float(loss), global_step) log_writer.add_scalar("learning_rate", optimizer.get_lr(), global_step) tic_train = time.time() train_reader_cost = 0.0 train_run_cost = 0.0 if args.check_accuracy: if global_step >= args.max_steps: return else: continue if global_step % args.eval_freq == 0: # Since the valid data broardcast to all devices, we do evaluate on all device. run_evaluate(args, valid_data_loader, model, criterion, args.eval_iters, log_writer, global_step, epoch, "valid") # TODO: 1. merge paramters while saving model. 2. 
ensure that the model is saved and loaded correctly # only dp_rank = 0 save model if (global_step % args.save_steps == 0 or global_step >= args.max_steps) and dp_rank == 0: model_to_save = model._layers if paddle.distributed.get_world_size( ) > 1 and args.sharding_stage not in [2, 3] else model output_dir = os.path.join(args.output_dir, "step_%d" % global_step) os.makedirs(output_dir, exist_ok=True) logger.info("Save model to %s" % output_dir) if args.pp_degree > 1: if mp_rank == 0 and sharding_rank == 0 and pp_rank == 0: tokenizer.save_pretrained(output_dir) model_to_save.save_state_dict(output_dir) paddle.save( optimizer.state_dict(), os.path.join( output_dir, "model_state_mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}.pdopt" .format(mp_rank, sharding_rank, pp_rank))) else: if args.sharding_stage == 3: # If parameter need to convert to cpu, please add convert2cpu=True model_to_save.get_all_parameters(convert2cpu=False) if mp_rank == 0 and sharding_rank == 0: tokenizer.save_pretrained(output_dir) model_to_save.save_pretrained(output_dir) paddle.save( optimizer.state_dict(), os.path.join( output_dir, "model_state_mp_{:0>2d}_sharding_{:0>2d}.pdopt" .format(mp_rank, sharding_rank))) if global_step >= args.max_steps: run_evaluate(args, test_data_loader, model, criterion, args.test_iters, log_writer, global_step, epoch, "test") logger.info("The training process is complete.") del train_data_loader return reader_start = time.time() del train_data_loader
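# The pp_degree == 1 branch above implements gradient accumulation by hand: the local
# batch is sliced into accumulate_steps micro-batches, each micro-batch loss is scaled
# by 1/accumulate_steps, and the gradients sum up before a single optimizer step. A
# self-contained toy version of that pattern (toy model and data, no AMP or
# hybrid-parallel handling):
import paddle

local_batch_size, micro_batch_size = 8, 2
accumulate_steps = local_batch_size // micro_batch_size       # 8 // 2 == 4

toy_model = paddle.nn.Linear(16, 1)
toy_optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                       parameters=toy_model.parameters())
x = paddle.randn([local_batch_size, 16])
y = paddle.randn([local_batch_size, 1])

total_loss = 0.0
for i in range(accumulate_steps):
    s, e = i * micro_batch_size, (i + 1) * micro_batch_size
    loss_mbs = paddle.nn.functional.mse_loss(toy_model(x[s:e]), y[s:e]) / accumulate_steps
    loss_mbs.backward()                        # gradients accumulate across micro-batches
    total_loss = total_loss + loss_mbs         # report the full-batch loss
toy_optimizer.step()
toy_optimizer.clear_grad()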
def train(self, num_epochs, train_dataset, train_batch_size=2, eval_dataset=None, save_interval_epochs=1, log_interval_steps=2, save_dir='output', pretrained_weights=None, resume_weights=None, optimizer=None, learning_rate=0.01, lr_decay_power=0.9, regularization_coeff=4e-5, use_vdl=False, quant=False): self.labels = train_dataset.labels self.train_transforms = train_dataset.transforms self.train_init = locals() self.begin_epoch = 0 if optimizer is None: num_steps_each_epoch = train_dataset.num_samples // train_batch_size optimizer = self.default_optimizer( learning_rate=learning_rate, num_epochs=num_epochs, num_steps_each_epoch=num_steps_each_epoch, lr_decay_power=lr_decay_power, regularization_coeff=regularization_coeff) self.optimizer = optimizer self.build_program() self.net_initialize( startup_prog=fluid.default_startup_program(), pretrained_weights=pretrained_weights, resume_weights=resume_weights) # 进行量化 if quant: # 当 for_test=False ,返回类型为 fluid.CompiledProgram # 当 for_test=True ,返回类型为 fluid.Program self.train_prog = slim.quant.quant_aware( self.train_prog, self.exe.place, for_test=False) self.test_prog = slim.quant.quant_aware( self.test_prog, self.exe.place, for_test=True) # self.parallel_train_prog = self.train_prog.with_data_parallel( # loss_name=self.train_outputs['loss'].name) self.status = 'Quant' if self.begin_epoch >= num_epochs: raise ValueError( ("begin epoch[{}] is larger than num_epochs[{}]").format( self.begin_epoch, num_epochs)) if not osp.isdir(save_dir): if osp.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) # add arrange op tor transforms self.arrange_transform( transforms=train_dataset.transforms, mode='train') self.build_train_data_loader( dataset=train_dataset, batch_size=train_batch_size) if eval_dataset is not None: self.eval_transforms = eval_dataset.transforms self.test_transforms = copy.deepcopy(eval_dataset.transforms) lr = self.optimizer._learning_rate lr.persistable = True if isinstance(lr, fluid.framework.Variable): self.train_outputs['lr'] = lr # 多卡训练 if self.parallel_train_prog is None: build_strategy = fluid.compiler.BuildStrategy() if self.env_info['place'] != 'cpu' and len(self.places) > 1: build_strategy.sync_batch_norm = self.sync_bn exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_iteration_per_drop_scope = 1 if quant: build_strategy.fuse_all_reduce_ops = False build_strategy.sync_batch_norm = False self.parallel_train_prog = self.train_prog.with_data_parallel( loss_name=self.train_outputs['loss'].name, build_strategy=build_strategy, exec_strategy=exec_strategy) else: self.parallel_train_prog = fluid.CompiledProgram( self.train_prog).with_data_parallel( loss_name=self.train_outputs['loss'].name, build_strategy=build_strategy, exec_strategy=exec_strategy) total_num_steps = math.floor( train_dataset.num_samples / train_batch_size) num_steps = 0 time_stat = list() time_train_one_epoch = None time_eval_one_epoch = None total_num_steps_eval = 0 # eval times total_eval_times = math.ceil(num_epochs / save_interval_epochs) eval_batch_size = train_batch_size if eval_dataset is not None: total_num_steps_eval = math.ceil( eval_dataset.num_samples / eval_batch_size) if use_vdl: from visualdl import LogWriter vdl_logdir = osp.join(save_dir, 'vdl_log') log_writer = LogWriter(vdl_logdir) best_miou = -1.0 best_model_epoch = 1 for i in range(self.begin_epoch, num_epochs): records = list() step_start_time = time.time() epoch_start_time = time.time() for step, data in enumerate(self.train_data_loader()): outputs = self.exe.run( 
self.parallel_train_prog, feed=data, fetch_list=list(self.train_outputs.values())) outputs_avg = np.mean(np.array(outputs), axis=1) records.append(outputs_avg) # time estimated to complete the training currend_time = time.time() step_cost_time = currend_time - step_start_time step_start_time = currend_time if len(time_stat) < 20: time_stat.append(step_cost_time) else: time_stat[num_steps % 20] = step_cost_time num_steps += 1 if num_steps % log_interval_steps == 0: step_metrics = OrderedDict( zip(list(self.train_outputs.keys()), outputs_avg)) if use_vdl: for k, v in step_metrics.items(): log_writer.add_scalar( step=num_steps, tag='train/{}'.format(k), value=v) # 计算剩余时间 avg_step_time = np.mean(time_stat) if time_train_one_epoch is not None: eta = (num_epochs - i - 1) * time_train_one_epoch + ( total_num_steps - step - 1) * avg_step_time else: eta = ((num_epochs - i) * total_num_steps - step - 1) * avg_step_time if time_eval_one_epoch is not None: eval_eta = (total_eval_times - i // save_interval_epochs ) * time_eval_one_epoch else: eval_eta = (total_eval_times - i // save_interval_epochs ) * total_num_steps_eval * avg_step_time eta_str = seconds_to_hms(eta + eval_eta) logging.info( "[TRAIN] Epoch={}/{}, Step={}/{}, {}, time_each_step={}s, eta={}" .format(i + 1, num_epochs, step + 1, total_num_steps, dict2str(step_metrics), round( avg_step_time, 2), eta_str)) train_metrics = OrderedDict( zip(list(self.train_outputs.keys()), np.mean(records, axis=0))) logging.info('[TRAIN] Epoch {} finished, {} .'.format( i + 1, dict2str(train_metrics))) time_train_one_epoch = time.time() - epoch_start_time eval_epoch_start_time = time.time() if (i + 1) % save_interval_epochs == 0 or i == num_epochs - 1: current_save_dir = osp.join(save_dir, "epoch_{}".format(i + 1)) if not osp.isdir(current_save_dir): os.makedirs(current_save_dir) if eval_dataset is not None: self.eval_metrics = self.evaluate( eval_dataset=eval_dataset, batch_size=eval_batch_size, epoch_id=i + 1) # 保存最优模型 current_miou = self.eval_metrics['miou'] if current_miou > best_miou: best_miou = current_miou best_model_epoch = i + 1 best_model_dir = osp.join(save_dir, "best_model") self.save_model(save_dir=best_model_dir) if use_vdl: for k, v in self.eval_metrics.items(): if isinstance(v, list): continue if isinstance(v, np.ndarray): if v.size > 1: continue log_writer.add_scalar( step=num_steps, tag='evaluate/{}'.format(k), value=v) self.save_model(save_dir=current_save_dir) time_eval_one_epoch = time.time() - eval_epoch_start_time if eval_dataset is not None: logging.info( 'Current evaluated best model in eval_dataset is epoch_{}, miou={}' .format(best_model_epoch, best_miou)) if quant: if osp.exists(osp.join(save_dir, "best_model")): fluid.load( program=self.test_prog, model_path=osp.join(save_dir, "best_model"), executor=self.exe) self.export_quant_model( save_dir=osp.join(save_dir, "best_model_export"), quant_type="online")
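The ETA printed by the loop above comes from a sliding window of the last 20 step times. A small self-contained sketch of that bookkeeping follows; run_one_step and total_num_steps are placeholders, not names from the original code.

import time

def seconds_to_hms(seconds):
    # Format a duration as h:mm:ss for the training log.
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return '{}:{:02d}:{:02d}'.format(h, m, s)

time_stat = []          # ring buffer holding the last 20 step costs
num_steps = 0
step_start = time.time()
for step in range(total_num_steps):    # total_num_steps assumed known
    run_one_step()                     # hypothetical training step
    cost = time.time() - step_start
    step_start = time.time()
    if len(time_stat) < 20:
        time_stat.append(cost)
    else:
        time_stat[num_steps % 20] = cost
    num_steps += 1
    avg_step_time = sum(time_stat) / len(time_stat)
    eta = (total_num_steps - step - 1) * avg_step_time
    print('eta', seconds_to_hms(eta))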
train_step = 0
test_step = 0
params_name = fluid.default_startup_program().global_block().all_parameters()[0].name
# train for 10 passes
for pass_id in range(10):
    # training
    for batch_id, data in enumerate(train_reader()):
        train_cost, train_acc, params = exe.run(
            program=fluid.default_main_program(),
            feed=feeder.feed(data),
            fetch_list=[avg_cost, acc, params_name])
        # log the training data
        train_step += 1
        writer.add_scalar(tag="训练/损失值", step=train_step, value=train_cost[0])
        writer.add_scalar(tag="训练/准确率", step=train_step, value=train_acc[0])
        writer.add_histogram(tag="训练/参数分布", step=train_step,
                             values=params.flatten(), buckets=50)
        # print information every 100 batches
        if batch_id % 100 == 0:
            print('Pass:%d, Batch:%d, Cost:%0.5f, Accuracy:%0.5f' %
                  (pass_id, batch_id, train_cost[0], train_acc[0]))
    # testing
    test_accs = []
    test_costs = []
    for batch_id, data in enumerate(test_reader()):
train_loader = get_train_loader(c)
val_loader = get_val_loader(c)
epoch_num = c['epoch_num']

if args.restore != -1:
    avg_loss, mAP_score, auc_score, dprime = evaluate(args.restore, val_loader,
                                                      model, bce_loss)
    print(f'average map at epoch {args.restore} is {mAP_score}')
    print(f'auc_score: {auc_score}')
    print(f'd-prime: {dprime}')
    best_mAP = mAP_score
    log_writer.add_scalar(tag="eval mAP", step=args.restore, value=mAP_score)
    log_writer.add_scalar(tag="eval auc", step=args.restore, value=auc_score)
    log_writer.add_scalar(tag="eval dprime", step=args.restore, value=dprime)
else:
    best_mAP = 0.0

step = 0
for epoch in range(start_epoch, epoch_num):
    avg_loss = 0.0
    avg_preci = 0.0
def train(model, train_dataset, val_dataset=None, optimizer=None, save_dir='output', iters=10000, batch_size=2, resume_model=None, save_interval=1000, log_iters=10, num_workers=0, use_vdl=False, losses=None, keep_checkpoint_max=5, test_config=None, fp16=False, profiler_options=None): """ Launch training. Args: model(nn.Layer): A sementic segmentation model. train_dataset (paddle.io.Dataset): Used to read and process training datasets. val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. optimizer (paddle.optimizer.Optimizer): The optimizer. save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'. iters (int, optional): How may iters to train the model. Defualt: 10000. batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2. resume_model (str, optional): The path of resume model. save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000. log_iters (int, optional): Display logging information at every log_iters. Default: 10. num_workers (int, optional): Num workers for data loader. Default: 0. use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False. losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']). The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient. keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5. test_config(dict, optional): Evaluation config. fp16 (bool, optional): Whether to use amp. profiler_options (str, optional): The option of train profiler. """ model.train() nranks = paddle.distributed.ParallelEnv().nranks local_rank = paddle.distributed.ParallelEnv().local_rank start_iter = 0 if resume_model is not None: start_iter = resume(model, optimizer, resume_model) if not os.path.isdir(save_dir): if os.path.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) if nranks > 1: paddle.distributed.fleet.init(is_collective=True) optimizer = paddle.distributed.fleet.distributed_optimizer( optimizer) # The return is Fleet object ddp_model = paddle.distributed.fleet.distributed_model(model) batch_sampler = paddle.io.DistributedBatchSampler(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) loader = paddle.io.DataLoader( train_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, worker_init_fn=worker_init_fn, ) # use amp if fp16: logger.info('use amp to train') scaler = paddle.amp.GradScaler(init_loss_scaling=1024) if use_vdl: from visualdl import LogWriter log_writer = LogWriter(save_dir) avg_loss = 0.0 avg_loss_list = [] iters_per_epoch = len(batch_sampler) best_mean_iou = -1.0 best_model_iter = -1 reader_cost_averager = TimeAverager() batch_cost_averager = TimeAverager() save_models = deque() batch_start = time.time() iter = start_iter while iter < iters: for data in loader: iter += 1 if iter > iters: version = paddle.__version__ if version == '2.1.2': continue else: break reader_cost_averager.record(time.time() - batch_start) images = data[0] labels = data[1].astype('int64') edges = None if len(data) == 3: edges = data[2].astype('int64') if hasattr(model, 'data_format') and model.data_format == 'NHWC': images = images.transpose((0, 2, 3, 1)) if fp16: with paddle.amp.auto_cast( enable=True, custom_white_list={ "elementwise_add", "batch_norm", "sync_batch_norm" }, 
custom_black_list={'bilinear_interp_v2'}): if nranks > 1: logits_list = ddp_model(images) else: logits_list = model(images) loss_list = loss_computation(logits_list=logits_list, labels=labels, losses=losses, edges=edges) loss = sum(loss_list) scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward if isinstance(optimizer, paddle.distributed.fleet.Fleet): scaler.minimize(optimizer.user_defined_optimizer, scaled) else: scaler.minimize(optimizer, scaled) # update parameters else: if nranks > 1: logits_list = ddp_model(images) else: logits_list = model(images) loss_list = loss_computation(logits_list=logits_list, labels=labels, losses=losses, edges=edges) loss = sum(loss_list) loss.backward() optimizer.step() lr = optimizer.get_lr() # update lr if isinstance(optimizer, paddle.distributed.fleet.Fleet): lr_sche = optimizer.user_defined_optimizer._learning_rate else: lr_sche = optimizer._learning_rate if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler): lr_sche.step() train_profiler.add_profiler_step(profiler_options) model.clear_gradients() avg_loss += loss.numpy()[0] if not avg_loss_list: avg_loss_list = [l.numpy() for l in loss_list] else: for i in range(len(loss_list)): avg_loss_list[i] += loss_list[i].numpy() batch_cost_averager.record(time.time() - batch_start, num_samples=batch_size) if (iter) % log_iters == 0 and local_rank == 0: avg_loss /= log_iters avg_loss_list = [l[0] / log_iters for l in avg_loss_list] remain_iters = iters - iter avg_train_batch_cost = batch_cost_averager.get_average() avg_train_reader_cost = reader_cost_averager.get_average() eta = calculate_eta(remain_iters, avg_train_batch_cost) logger.info( "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}" .format((iter - 1) // iters_per_epoch + 1, iter, iters, avg_loss, lr, avg_train_batch_cost, avg_train_reader_cost, batch_cost_averager.get_ips_average(), eta)) if use_vdl: log_writer.add_scalar('Train/loss', avg_loss, iter) # Record all losses if there are more than 2 losses. 
if len(avg_loss_list) > 1: avg_loss_dict = {} for i, value in enumerate(avg_loss_list): avg_loss_dict['loss_' + str(i)] = value for key, value in avg_loss_dict.items(): log_tag = 'Train/' + key log_writer.add_scalar(log_tag, value, iter) log_writer.add_scalar('Train/lr', lr, iter) log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost, iter) log_writer.add_scalar('Train/reader_cost', avg_train_reader_cost, iter) avg_loss = 0.0 avg_loss_list = [] reader_cost_averager.reset() batch_cost_averager.reset() if (iter % save_interval == 0 or iter == iters) and (val_dataset is not None): num_workers = 1 if num_workers > 0 else 0 if test_config is None: test_config = {} mean_iou, acc, _, _, _ = evaluate(model, val_dataset, num_workers=num_workers, **test_config) model.train() if (iter % save_interval == 0 or iter == iters) and local_rank == 0: current_save_dir = os.path.join(save_dir, "iter_{}".format(iter)) if not os.path.isdir(current_save_dir): os.makedirs(current_save_dir) paddle.save(model.state_dict(), os.path.join(current_save_dir, 'model.pdparams')) paddle.save(optimizer.state_dict(), os.path.join(current_save_dir, 'model.pdopt')) save_models.append(current_save_dir) if len(save_models) > keep_checkpoint_max > 0: model_to_remove = save_models.popleft() shutil.rmtree(model_to_remove) if val_dataset is not None: if mean_iou > best_mean_iou: best_mean_iou = mean_iou best_model_iter = iter best_model_dir = os.path.join(save_dir, "best_model") paddle.save( model.state_dict(), os.path.join(best_model_dir, 'model.pdparams')) logger.info( '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.' .format(best_mean_iou, best_model_iter)) if use_vdl: log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter) log_writer.add_scalar('Evaluate/Acc', acc, iter) batch_start = time.time() # Calculate flops. if local_rank == 0: _, c, h, w = images.shape _ = paddle.flops( model, [1, c, h, w], custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn}) # Sleep for half a second to let dataloader release resources. time.sleep(0.5) if use_vdl: log_writer.close()
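keep_checkpoint_max above is implemented with a deque of saved directories; once the limit is exceeded the oldest snapshot is deleted. A minimal sketch of that rotation, assuming a dygraph model and optimizer:

import os
import shutil
from collections import deque

import paddle

def save_rotating_checkpoint(model, optimizer, save_dir, iter_id,
                             save_models, keep_checkpoint_max=5):
    # Save the current snapshot and drop the oldest one beyond the limit.
    current_save_dir = os.path.join(save_dir, 'iter_{}'.format(iter_id))
    os.makedirs(current_save_dir, exist_ok=True)
    paddle.save(model.state_dict(), os.path.join(current_save_dir, 'model.pdparams'))
    paddle.save(optimizer.state_dict(), os.path.join(current_save_dir, 'model.pdopt'))
    save_models.append(current_save_dir)
    if len(save_models) > keep_checkpoint_max > 0:
        shutil.rmtree(save_models.popleft())

save_models = deque()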
def train(model, train_dataset, val_dataset=None, optimizer=None, save_dir='output', iters=10000, batch_size=2, resume_model=None, save_interval=1000, log_iters=10, num_workers=0, use_vdl=False, losses=None): """ Launch training. Args: model(nn.Layer): A sementic segmentation model. train_dataset (paddle.io.Dataset): Used to read and process training datasets. val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. optimizer (paddle.optimizer.Optimizer): The optimizer. save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'. iters (int, optional): How may iters to train the model. Defualt: 10000. batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2. resume_model (str, optional): The path of resume model. save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000. log_iters (int, optional): Display logging information at every log_iters. Default: 10. num_workers (int, optional): Num workers for data loader. Default: 0. use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False. losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']). The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient. """ nranks = paddle.distributed.ParallelEnv().nranks local_rank = paddle.distributed.ParallelEnv().local_rank start_iter = 0 if resume_model is not None: start_iter = resume(model, optimizer, resume_model) if not os.path.isdir(save_dir): if os.path.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) if nranks > 1: # Initialize parallel training environment. paddle.distributed.init_parallel_env() ddp_model = paddle.DataParallel(model) batch_sampler = paddle.io.DistributedBatchSampler(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) loader = paddle.io.DataLoader( train_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, ) if use_vdl: from visualdl import LogWriter log_writer = LogWriter(save_dir) timer = Timer() avg_loss = 0.0 iters_per_epoch = len(batch_sampler) best_mean_iou = -1.0 best_model_iter = -1 train_reader_cost = 0.0 train_batch_cost = 0.0 timer.start() iter = start_iter while iter < iters: for data in loader: iter += 1 if iter > iters: break train_reader_cost += timer.elapsed_time() images = data[0] labels = data[1].astype('int64') edges = None if len(data) == 3: edges = data[2].astype('int64') if nranks > 1: logits_list = ddp_model(images) else: logits_list = model(images) loss = loss_computation(logits_list=logits_list, labels=labels, losses=losses, edges=edges) loss.backward() optimizer.step() lr = optimizer.get_lr() if isinstance(optimizer._learning_rate, paddle.optimizer.lr.LRScheduler): optimizer._learning_rate.step() model.clear_gradients() avg_loss += loss.numpy()[0] train_batch_cost += timer.elapsed_time() if (iter) % log_iters == 0 and local_rank == 0: avg_loss /= log_iters avg_train_reader_cost = train_reader_cost / log_iters avg_train_batch_cost = train_batch_cost / log_iters train_reader_cost = 0.0 train_batch_cost = 0.0 remain_iters = iters - iter eta = calculate_eta(remain_iters, avg_train_batch_cost) logger.info( "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}" .format((iter - 1) // iters_per_epoch + 1, iter, iters, avg_loss, lr, avg_train_batch_cost, avg_train_reader_cost, eta)) 
if use_vdl: log_writer.add_scalar('Train/loss', avg_loss, iter) log_writer.add_scalar('Train/lr', lr, iter) log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost, iter) log_writer.add_scalar('Train/reader_cost', avg_train_reader_cost, iter) avg_loss = 0.0 if (iter % save_interval == 0 or iter == iters) and (val_dataset is not None): num_workers = 1 if num_workers > 0 else 0 mean_iou, acc = evaluate(model, val_dataset, num_workers=num_workers) model.train() if (iter % save_interval == 0 or iter == iters) and local_rank == 0: current_save_dir = os.path.join(save_dir, "iter_{}".format(iter)) if not os.path.isdir(current_save_dir): os.makedirs(current_save_dir) paddle.save(model.state_dict(), os.path.join(current_save_dir, 'model.pdparams')) paddle.save(optimizer.state_dict(), os.path.join(current_save_dir, 'model.pdopt')) if val_dataset is not None: if mean_iou > best_mean_iou: best_mean_iou = mean_iou best_model_iter = iter best_model_dir = os.path.join(save_dir, "best_model") paddle.save( model.state_dict(), os.path.join(best_model_dir, 'model.pdparams')) logger.info( '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.' .format(best_mean_iou, best_model_iter)) if use_vdl: log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter) log_writer.add_scalar('Evaluate/Acc', acc, iter) timer.restart() # Sleep for half a second to let dataloader release resources. time.sleep(0.5) if use_vdl: log_writer.close()
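Both segmentation loops keep a separate best_model directory that is refreshed only when the validation mIoU improves. A condensed, hedged sketch of that bookkeeping:

import os
import paddle

def maybe_save_best(model, mean_iou, iter_id, save_dir,
                    best_mean_iou, best_model_iter):
    # Overwrite best_model only when the new validation mIoU is higher.
    if mean_iou > best_mean_iou:
        best_mean_iou, best_model_iter = mean_iou, iter_id
        best_model_dir = os.path.join(save_dir, 'best_model')
        os.makedirs(best_model_dir, exist_ok=True)
        paddle.save(model.state_dict(),
                    os.path.join(best_model_dir, 'model.pdparams'))
    return best_mean_iou, best_model_iter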
def train(cfg): startup_prog = fluid.Program() train_prog = fluid.Program() drop_last = True dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST, mode=ModelPhase.TRAIN, shuffle=True, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): if args.use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() batch_data = [] for b in data_gen: batch_data.append(b) if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS): for item in batch_data: yield item[0], item[1], item[2] batch_data = [] # If use sync batch norm strategy, drop last batch if number of samples # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues if not cfg.TRAIN.SYNC_BATCH_NORM: for item in batch_data: yield item[0], item[1], item[2] # Get device environment # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # place = places[0] gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # Get number of GPU dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places) print_info("#Device count: {}".format(dev_count)) # Make sure BATCH_SIZE can divided by GPU cards assert cfg.BATCH_SIZE % dev_count == 0, ( 'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format( cfg.BATCH_SIZE, dev_count)) # If use multi-gpu training mode, batch data will allocated to each GPU evenly batch_size_per_dev = cfg.BATCH_SIZE // dev_count print_info("batch_size_per_dev: {}".format(batch_size_per_dev)) data_loader, avg_loss, lr, pred, grts, masks = build_model( train_prog, startup_prog, phase=ModelPhase.TRAIN) data_loader.set_sample_generator(data_generator, batch_size=batch_size_per_dev, drop_last=drop_last) exe = fluid.Executor(place) exe.run(startup_prog) exec_strategy = fluid.ExecutionStrategy() # Clear temporary variables every 100 iteration if args.use_gpu: exec_strategy.num_threads = fluid.core.get_cuda_device_count() exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() if cfg.NUM_TRAINERS > 1 and args.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) exec_strategy.num_threads = 1 if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu: if dev_count > 1: # Apply sync batch norm strategy print_info("Sync BatchNorm strategy is effective.") build_strategy.sync_batch_norm = True else: print_info( "Sync BatchNorm strategy will not be effective if GPU device" " count <= 1") pruned_params = cfg.SLIM.PRUNE_PARAMS.strip().split(',') pruned_ratios = cfg.SLIM.PRUNE_RATIOS if isinstance(pruned_ratios, float): pruned_ratios = [pruned_ratios] * len(pruned_params) elif isinstance(pruned_ratios, (list, tuple)): pruned_ratios = list(pruned_ratios) else: raise ValueError( 'expect SLIM.PRUNE_RATIOS type is float, list, tuple, ' 'but received {}'.format(type(pruned_ratios))) # Resume training begin_epoch = cfg.SOLVER.BEGIN_EPOCH if cfg.TRAIN.RESUME_MODEL_DIR: begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) load_vars = [] load_fail_vars = [] def var_shape_matched(var, shape): """ Check whehter persitable variable shape is match with current network """ var_exist = os.path.exists( os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) if var_exist: var_shape = 
parse_shape_from_file( os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) return var_shape == shape return False for x in train_prog.list_vars(): if isinstance(x, fluid.framework.Parameter): shape = tuple(fluid.global_scope().find_var( x.name).get_tensor().shape()) if var_shape_matched(x, shape): load_vars.append(x) else: load_fail_vars.append(x) fluid.io.load_vars(exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) for var in load_vars: print_info("Parameter[{}] loaded sucessfully!".format(var.name)) for var in load_fail_vars: print_info( "Parameter[{}] don't exist or shape does not match current network, skip" " to load it.".format(var.name)) print_info("{}/{} pretrained parameters loaded successfully!".format( len(load_vars), len(load_vars) + len(load_fail_vars))) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. format(cfg.TRAIN.PRETRAINED_MODEL_DIR)) fetch_list = [avg_loss.name, lr.name] if args.debug: # Fetch more variable info and use streaming confusion matrix to # calculate IoU results if in debug mode np.set_printoptions(precision=4, suppress=True, linewidth=160, floatmode="fixed") fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) if args.use_vdl: if not args.vdl_log_dir: print_info("Please specify the log directory by --vdl_log_dir.") exit(1) from visualdl import LogWriter log_writer = LogWriter(args.vdl_log_dir) pruner = Pruner() train_prog = pruner.prune(train_prog, fluid.global_scope(), params=pruned_params, ratios=pruned_ratios, place=place, only_graph=False)[0] compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=avg_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1) avg_loss = 0.0 timer = Timer() timer.start() if begin_epoch > cfg.SOLVER.NUM_EPOCHS: raise ValueError(( "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format( begin_epoch, cfg.SOLVER.NUM_EPOCHS)) if args.use_mpio: print_info("Use multiprocess reader") else: print_info("Use multi-thread reader") for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1): data_loader.start() while True: try: if args.debug: # Print category IoU and accuracy to check whether the # traning process is corresponed to expectation loss, lr, pred, grts, masks = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() category_iou, mean_iou = cm.mean_iou() print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() timer.restart() else: # If not in debug mode, 
avoid unnessary log and calculate loss, lr = exe.run(program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 timer.restart() except fluid.core.EOFException: data_loader.reset() break except Exception as e: print(e) if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0: ckpt_dir = save_prune_checkpoint(exe, train_prog, epoch) if args.do_eval: print("Evaluation start") _, mean_iou, _, mean_acc = evaluate(cfg=cfg, ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize(cfg=cfg, use_gpu=args.use_gpu, vis_file_list=cfg.DATASET.VIS_FILE_LIST, vis_dir="visual", ckpt_dir=ckpt_dir, log_writer=log_writer) # save final model if cfg.TRAINER_ID == 0: save_prune_checkpoint(exe, train_prog, 'final')
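The pretrained-weight loading above checks each persistable variable's shape and skips the ones that no longer match the pruned network. The following is a dygraph analogue of that filter, offered as an assumption rather than the original static-graph code:

import paddle

def load_matched_weights(model, pretrained_path):
    # Keep only parameters whose name and shape match the current network.
    pretrained = paddle.load(pretrained_path)      # a .pdparams state dict
    model_state = model.state_dict()
    matched, skipped = {}, []
    for name, value in pretrained.items():
        if name in model_state and list(model_state[name].shape) == list(value.shape):
            matched[name] = value
        else:
            skipped.append(name)
    model.set_state_dict(matched)
    print('{}/{} pretrained parameters loaded successfully!'.format(
        len(matched), len(matched) + len(skipped)))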
def train(model, train_dataset, places=None, eval_dataset=None, optimizer=None, save_dir='output', num_epochs=100, batch_size=2, pretrained_model=None, resume_model=None, save_interval_epochs=1, log_steps=10, num_classes=None, num_workers=8, use_vdl=False): ignore_index = model.ignore_index nranks = ParallelEnv().nranks start_epoch = 0 if resume_model is not None: start_epoch = resume(model, optimizer, resume_model) elif pretrained_model is not None: load_pretrained_model(model, pretrained_model) if not os.path.isdir(save_dir): if os.path.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) if nranks > 1: strategy = fluid.dygraph.prepare_context() ddp_model = fluid.dygraph.DataParallel(model, strategy) batch_sampler = DistributedBatchSampler(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) loader = DataLoader( train_dataset, batch_sampler=batch_sampler, places=places, num_workers=num_workers, return_list=True, ) if use_vdl: from visualdl import LogWriter log_writer = LogWriter(save_dir) timer = Timer() avg_loss = 0.0 steps_per_epoch = len(batch_sampler) total_steps = steps_per_epoch * (num_epochs - start_epoch) num_steps = 0 best_mean_iou = -1.0 best_model_epoch = -1 train_reader_cost = 0.0 train_batch_cost = 0.0 for epoch in range(start_epoch, num_epochs): timer.start() for step, data in enumerate(loader): train_reader_cost += timer.elapsed_time() images = data[0] labels = data[1].astype('int64') if nranks > 1: loss = ddp_model(images, labels) # apply_collective_grads sum grads over multiple gpus. loss = ddp_model.scale_loss(loss) loss.backward() ddp_model.apply_collective_grads() else: loss = model(images, labels) loss.backward() optimizer.minimize(loss) model.clear_gradients() avg_loss += loss.numpy()[0] lr = optimizer.current_step_lr() num_steps += 1 train_batch_cost += timer.elapsed_time() if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0: avg_loss /= log_steps avg_train_reader_cost = train_reader_cost / log_steps avg_train_batch_cost = train_batch_cost / log_steps train_reader_cost = 0.0 train_batch_cost = 0.0 remain_steps = total_steps - num_steps eta = calculate_eta(remain_steps, avg_train_batch_cost) logging.info( "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}" .format(epoch + 1, num_epochs, step + 1, steps_per_epoch, avg_loss * nranks, lr, avg_train_batch_cost, avg_train_reader_cost, eta)) if use_vdl: log_writer.add_scalar('Train/loss', avg_loss * nranks, num_steps) log_writer.add_scalar('Train/lr', lr, num_steps) log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost, num_steps) log_writer.add_scalar('Train/reader_cost', avg_train_reader_cost, num_steps) avg_loss = 0.0 timer.restart() if ((epoch + 1) % save_interval_epochs == 0 or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0: current_save_dir = os.path.join(save_dir, "epoch_{}".format(epoch + 1)) if not os.path.isdir(current_save_dir): os.makedirs(current_save_dir) fluid.save_dygraph(model.state_dict(), os.path.join(current_save_dir, 'model')) fluid.save_dygraph(optimizer.state_dict(), os.path.join(current_save_dir, 'model')) if eval_dataset is not None: mean_iou, avg_acc = evaluate(model, eval_dataset, model_dir=current_save_dir, num_classes=num_classes, ignore_index=ignore_index, epoch_id=epoch + 1) if mean_iou > best_mean_iou: best_mean_iou = mean_iou best_model_epoch = epoch + 1 best_model_dir = os.path.join(save_dir, "best_model") fluid.save_dygraph(model.state_dict(), os.path.join(best_model_dir, 
'model')) logging.info( 'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}' .format(best_model_epoch, best_mean_iou)) if use_vdl: log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1) log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1) model.train() if use_vdl: log_writer.close()
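Several of these loops build their data pipeline the same way: a DistributedBatchSampler so each rank sees a disjoint shard, wrapped by paddle.io.DataLoader. A minimal sketch with a hypothetical train_dataset:

import paddle

batch_sampler = paddle.io.DistributedBatchSampler(
    train_dataset,            # any paddle.io.Dataset
    batch_size=2,
    shuffle=True,
    drop_last=True)
loader = paddle.io.DataLoader(
    train_dataset,
    batch_sampler=batch_sampler,
    num_workers=0,
    return_list=True)

for data in loader:
    images, labels = data[0], data[1].astype('int64')
    # ... forward / backward / optimizer step ...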
def train_net(cfg): # Set up data augmentation IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W train_transforms = utils.data_transforms.Compose([ utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE), utils.data_transforms.RandomBackground(cfg.TRAIN.RANDOM_BG_COLOR_RANGE), utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION), utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD), utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD), utils.data_transforms.RandomFlip(), utils.data_transforms.RandomPermuteRGB(), utils.data_transforms.ToTensor(), ]) val_transforms = utils.data_transforms.Compose([ utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE), utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE), utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD), utils.data_transforms.ToTensor(), ]) # Set up data loader train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TRAIN_DATASET](cfg) val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[cfg.DATASET.TEST_DATASET](cfg) train_data_loader = paddle.io.DataLoader(dataset=train_dataset_loader.get_dataset( utils.data_loaders.DatasetType.TRAIN, cfg.CONST.N_VIEWS_RENDERING, train_transforms), batch_size=cfg.CONST.BATCH_SIZE, #num_workers=0 , # cfg.TRAIN.NUM_WORKER>0时报错,因为dev/shm/太小 https://blog.csdn.net/ctypyb2002/article/details/107914643 #pin_memory=True, use_shared_memory=False, shuffle=True, drop_last=True) val_data_loader = paddle.io.DataLoader(dataset=val_dataset_loader.get_dataset( utils.data_loaders.DatasetType.VAL, cfg.CONST.N_VIEWS_RENDERING, val_transforms), batch_size=1, #num_workers=1, #pin_memory=True, shuffle=False) # Set up networks # paddle.Model prepare fit save res_gru_net = Res_Gru_Net(cfg) print('[DEBUG] %s Parameters in Merger: %d.' % (dt.now(), utils.network_utils.count_parameters(res_gru_net))) # Set up learning rate scheduler to decay learning rates dynamically res_gru_net_lr_scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=cfg.TRAIN.RES_GRU_NET_LEARNING_RATE, milestones=cfg.TRAIN.RES_GRU_NET_LR_MILESTONES, gamma=cfg.TRAIN.GAMMA, verbose=True) # Set up solver # if cfg.TRAIN.POLICY == 'adam': res_gru_net_solver = paddle.optimizer.Adam(learning_rate=res_gru_net_lr_scheduler, parameters=res_gru_net.parameters()) # Set up loss functions bce_loss = paddle.nn.BCELoss() # Load pretrained model if exists init_epoch = 0 best_iou = -1 best_epoch = -1 if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN: print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS)) # load res_gru_net_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net.pdparams")) res_gru_net_solver_state_dict = paddle.load(os.path.join(cfg.CONST.WEIGHTS, "res_gru_net_solver.pdopt")) res_gru_net.set_state_dict(res_gru_net_state_dict) res_gru_net_solver.set_state_dict(res_gru_net_solver_state_dict) print('[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' 
% (dt.now(), init_epoch, best_iou, best_epoch)) # Summary writer for TensorBoard output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat()) log_dir = output_dir % 'logs' ckpt_dir = output_dir % 'checkpoints' # train_writer = SummaryWriter() # val_writer = SummaryWriter(os.path.join(log_dir, 'test')) train_writer=LogWriter(os.path.join(log_dir, 'train')) val_writer=LogWriter(os.path.join(log_dir, 'val')) # Training loop for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES): # Tick / tock epoch_start_time = time() # Batch average meterics batch_time = utils.network_utils.AverageMeter() data_time = utils.network_utils.AverageMeter() res_gru_net_losses = utils.network_utils.AverageMeter() # # switch models to training mode res_gru_net.train() batch_end_time = time() n_batches = len(train_data_loader) for batch_idx, (rendering_images, ground_truth_volumes) in enumerate(train_data_loader()): # if batch_idx>1: # exit() # Measure data time data_time.update(time() - batch_end_time) rendering_images = rendering_images.cuda() ground_truth_volumes = ground_truth_volumes.cuda() # print(rendering_images.shape) # print(ground_truth_volumes.shape) # [64, 5, 3, 224, 224] # [64, 32, 32, 32] # print("ground_truth_volumes", ground_truth_volumes) # Train the res_gru_net generated_volumes = res_gru_net(rendering_images) # print("generated_volumes", generated_volumes) res_gru_net_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10 res_gru_net_loss.backward() res_gru_net_solver.step() # Gradient decent res_gru_net_solver.clear_grad () # Append loss to average metrics res_gru_net_losses.update(res_gru_net_loss) # Append loss to TensorBoard n_itr = epoch_idx * n_batches + batch_idx train_writer.add_scalar(tag='Res_Gru_Net/BatchLoss', step=n_itr, value=res_gru_net_loss) # Tick / tock batch_time.update(time() - batch_end_time) batch_end_time = time() n_batches = len(train_data_loader) if (batch_idx % int(cfg.CONST.INFO_BATCH )) == 0: print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f' % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches, batch_time.val, data_time.val, res_gru_net_loss)) # Append epoch loss to TensorBoard train_writer.add_scalar(tag='Res_Gru_Net/EpochLoss', step=epoch_idx + 1, value=res_gru_net_losses.avg) # Tick / tock epoch_end_time = time() print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f' % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, epoch_end_time - epoch_start_time, res_gru_net_losses.avg)) # Update Rendering Views if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING: n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING) train_data_loader.dataset.set_n_views_rendering(n_views_rendering) print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d' % (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering)) # Validate the training models iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer, res_gru_net) # Save weights to file if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0: if not os.path.exists(ckpt_dir): os.makedirs(ckpt_dir) utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'ckpt-epoch-%04d' % (epoch_idx + 1)), epoch_idx + 1, res_gru_net, res_gru_net_solver, best_iou, best_epoch) if iou > best_iou: if not os.path.exists(ckpt_dir): os.makedirs(ckpt_dir) best_iou = iou best_epoch = epoch_idx + 1 utils.network_utils.save_checkpoints(cfg, os.path.join(ckpt_dir, 'best-ckpt'), epoch_idx + 1, res_gru_net, res_gru_net_solver, best_iou, 
best_epoch)
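utils.network_utils.AverageMeter is used above to keep running batch statistics; its implementation is not shown in this excerpt, but a typical version (an assumption, not the project's code) looks like:

class AverageMeter(object):
    """Track the latest value, running sum, count and average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.sum = 0
        self.count = 0
        self.avg = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count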
def main(args): paddle.seed(12345) # load config config = load_yaml(args.config_yaml) dy_model_class = load_dy_model_class(args.abs_dir) config["config_abs_dir"] = args.abs_dir # tools.vars use_gpu = config.get("runner.use_gpu", True) use_visual = config.get("runner.use_visual", False) train_data_dir = config.get("runner.train_data_dir", None) epochs = config.get("runner.epochs", None) print_interval = config.get("runner.print_interval", None) model_save_path = config.get("runner.model_save_path", "model_output") model_init_path = config.get("runner.model_init_path", None) logger.info("**************common.configs**********") logger.info( "use_gpu: {}, use_visual: {}, train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}" .format(use_gpu, use_visual, train_data_dir, epochs, print_interval, model_save_path)) logger.info("**************common.configs**********") place = paddle.set_device('gpu' if use_gpu else 'cpu') dy_model = dy_model_class.create_model(config) # Create a log_visual object and store the data in the path if use_visual: from visualdl import LogWriter log_visual = LogWriter(args.abs_dir + "/visualDL_log/train") if model_init_path is not None: load_model(model_init_path, dy_model) # to do : add optimizer function optimizer = dy_model_class.create_optimizer(dy_model, config) logger.info("read data") train_dataloader = create_data_loader(config=config, place=place) last_epoch_id = config.get("last_epoch", -1) step_num = 0 for epoch_id in range(last_epoch_id + 1, epochs): # set train mode dy_model.train() metric_list, metric_list_name = dy_model_class.create_metrics() #auc_metric = paddle.metric.Auc("ROC") epoch_begin = time.time() interval_begin = time.time() train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() for batch_id, batch in enumerate(train_dataloader()): train_reader_cost += time.time() - reader_start optimizer.clear_grad() train_start = time.time() batch_size = len(batch[0]) loss, metric_list, tensor_print_dict = dy_model_class.train_forward( dy_model, metric_list, batch, config) loss.backward() optimizer.step() train_run_cost += time.time() - train_start total_samples += batch_size if batch_id % print_interval == 0: metric_str = "" for metric_id in range(len(metric_list_name)): metric_str += (metric_list_name[metric_id] + ":{:.6f}, ".format( metric_list[metric_id].accumulate())) if use_visual: log_visual.add_scalar( tag="train/" + metric_list_name[metric_id], step=step_num, value=metric_list[metric_id].accumulate()) tensor_print_str = "" if tensor_print_dict is not None: for var_name, var in tensor_print_dict.items(): tensor_print_str += ("{}:".format(var_name) + str(var.numpy()) + ",") if use_visual: log_visual.add_scalar(tag="train/" + var_name, step=step_num, value=var.numpy()) logger.info( "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) + metric_str + tensor_print_str + " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec" .format( train_reader_cost / print_interval, (train_reader_cost + train_run_cost) / print_interval, total_samples / print_interval, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() step_num = step_num + 1 metric_str = "" for metric_id in range(len(metric_list_name)): metric_str += ( metric_list_name[metric_id] + ": {:.6f},".format(metric_list[metric_id].accumulate())) logger.info("epoch: {} done, ".format(epoch_id) + metric_str + "epoch 
time: {:.2f} s".format(time.time() - epoch_begin)) save_model(dy_model, optimizer, model_save_path, epoch_id, prefix='rec')
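The PaddleRec loop separates time spent waiting on the dataloader (train_reader_cost) from time spent in forward/backward (train_run_cost) and resets both every print_interval batches. The pattern in isolation, with hypothetical dataloader and train_one_batch names:

import time

train_reader_cost = 0.0
train_run_cost = 0.0
print_interval = 10

reader_start = time.time()
for batch_id, batch in enumerate(dataloader()):    # dataloader is assumed
    train_reader_cost += time.time() - reader_start
    train_start = time.time()
    train_one_batch(batch)                          # hypothetical training step
    train_run_cost += time.time() - train_start
    if batch_id % print_interval == 0:
        print('avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec'.format(
            train_reader_cost / print_interval,
            (train_reader_cost + train_run_cost) / print_interval))
        train_reader_cost = 0.0
        train_run_cost = 0.0
    reader_start = time.time()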
for t in range(int(args.timesteps)):
    episode_timesteps += 1
    # choose an action from the current state
    action = (
        policy.select_action(np.array(state)) +
        np.random.normal(0, max_action * args.expl_noise, size=action_dim)
    ).clip(-max_action, max_action)
    action[0] *= 3
    print('action', action)
    # execute the action in the environment
    next_state, reward, done, _ = env.step(action)
    print('reward', reward)
    writer.add_scalar(tag='reward', step=t, value=reward)
    # store the transition in the replay buffer
    replay_buffer.add(state, action, next_state, reward, done)
    # update the state
    state = next_state
    episode_reward += reward
    # train the policy
    policy.train(replay_buffer, args.batch_size)
    # end of this interaction round
    if done:
        # print episode info and reset the state
        print(f'Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}')
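The excerpt stops right after the episode summary. In a typical TD3-style loop the done branch also resets the environment and the per-episode counters; the continuation below is a sketch under that assumption, not the original source.

        # assumed continuation: reset for the next episode
        state, done = env.reset(), False
        episode_reward = 0.0
        episode_timesteps = 0
        episode_num += 1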
class Trainer(object): ''' Model trainer Args: model(paddle.nn.Layer) : Model to train or evaluate. optimizer(paddle.optimizer.Optimizer) : Optimizer for loss. use_gpu(bool) : Whether to use gpu to run. use_vdl(bool) : Whether to use visualdl to record training data. checkpoint_dir(str) : Directory where the checkpoint is saved, and the trainer will restore the state and model parameters from the checkpoint. compare_metrics(callable) : The method of comparing the model metrics. If not specified, the main metric return by `validation_step` will be used for comparison by default, the larger the value, the better the effect. This method will affect the saving of the best model. If the default behavior does not meet your requirements, please pass in a custom method. Example: .. code-block:: python def compare_metrics(old_metric: dict, new_metric: dict): mainkey = list(new_metric.keys())[0] return old_metric[mainkey] < new_metric[mainkey] ''' def __init__(self, model: paddle.nn.Layer, optimizer: paddle.optimizer.Optimizer, use_gpu: bool = False, use_vdl: bool = True, checkpoint_dir: str = None, compare_metrics: Callable = None, **kwargs): paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') self.nranks = paddle.distributed.get_world_size() self.local_rank = paddle.distributed.get_rank() self.model = model self.optimizer = optimizer self.checkpoint_dir = checkpoint_dir if checkpoint_dir else 'ckpt_{}'.format(time.time()) if not isinstance(self.model, paddle.nn.Layer): raise TypeError('The model {} is not a `paddle.nn.Layer` object.'.format(self.model.__name__)) if self.local_rank == 0 and not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.use_vdl = use_vdl if self.local_rank == 0 and self.use_vdl: vdl_dir = os.path.join(self.checkpoint_dir, 'visualization') self.log_writer = LogWriter(vdl_dir) self.current_epoch = 0 self.best_metrics = defaultdict(int) if self.nranks > 1: paddle.distributed.init_parallel_env() self.model = paddle.DataParallel(self.model) self.compare_metrics = self._compare_metrics if not compare_metrics else compare_metrics self._load_checkpoint() def _load_checkpoint(self): '''Load checkpoint and state dict''' max_epoch = -1 for file in os.listdir(self.checkpoint_dir): if not file.startswith('epoch_'): continue _epoch = file.split('_')[-1] if not _epoch.isdigit(): continue max_epoch = max(max_epoch, int(_epoch)) if max_epoch == -1: if self.local_rank == 0: logger.warning('PaddleHub model checkpoint not found, start from scratch...') return # load best metrics self._load_metrics() self.current_epoch = max_epoch metric_msg = ['{}={:.4f}'.format(metric, value) for metric, value in self.best_metrics.items()] metric_msg = ' '.join(metric_msg) if self.local_rank == 0: logger.info('PaddleHub model checkpoint loaded. 
current_epoch={} [{}]'.format( self.current_epoch, metric_msg)) model_path = os.path.join(self.checkpoint_dir, 'epoch_{}'.format(self.current_epoch)) self.load_model(model_path) def load_model(self, load_dir: str): """load model""" # load model checkpoint model_params_path = os.path.join(load_dir, 'model.pdparams') state_dict = paddle.load(model_params_path) self.model.set_state_dict(state_dict) # load optimizer checkpoint optim_params_path = os.path.join(load_dir, 'model.pdopt') state_dict = paddle.load(optim_params_path) self.optimizer.set_state_dict(state_dict) def _save_checkpoint(self): '''Save model checkpoint and state dict''' model_path = os.path.join(self.checkpoint_dir, 'epoch_{}'.format(self.current_epoch)) logger.info('Saving model checkpoint to {}'.format(model_path)) self.save_model(model_path) def save_model(self, save_dir: str): '''Save model''' model_params_path = os.path.join(save_dir, 'model.pdparams') optim_params_path = os.path.join(save_dir, 'model.pdopt') paddle.save(self.model.state_dict(), model_params_path) paddle.save(self.optimizer.state_dict(), optim_params_path) def _save_metrics(self): with open(os.path.join(self.checkpoint_dir, 'metrics.pkl'), 'wb') as file: pickle.dump(self.best_metrics, file) def _load_metrics(self): metrics_file = os.path.join(self.checkpoint_dir, 'metrics.pkl') if not os.path.exists(metrics_file): return with open(metrics_file, 'rb') as file: self.best_metrics = pickle.load(file) def train(self, train_dataset: paddle.io.Dataset, epochs: int = 1, batch_size: int = 1, num_workers: int = 0, eval_dataset: paddle.io.Dataset = None, log_interval: int = 10, save_interval: int = 10, collate_fn: Callable = None): ''' Train a model with specific config. Args: train_dataset(paddle.io.Dataset) : Dataset to train the model epochs(int) : Number of training loops, default is 1. batch_size(int) : Batch size of per step, default is 1. num_workers(int) : Number of subprocess to load data, default is 0. eval_dataset(paddle.io.Dataset) : The validation dataset, deafult is None. If set, the Trainer will execute evaluate function every `save_interval` epochs. log_interval(int) : Log the train infomation every `log_interval` steps. save_interval(int) : Save the checkpoint every `save_interval` epochs. collate_fn(callable): function to generate mini-batch data by merging the sample list. None for only stack each fields of sample in axis 0(same as :attr::`np.stack(..., axis=0)`). 
Default None ''' if eval_dataset is not None: if isinstance(self.model, paddle.DataParallel): model = self.model._layers else: model = self.model if not hasattr(model, 'validation_step'): raise NotImplementedError('The specified finetuning model does not support evaluation.') batch_sampler = paddle.io.DistributedBatchSampler( train_dataset, batch_size=batch_size, shuffle=True, drop_last=False) loader = paddle.io.DataLoader( train_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, use_buffer_reader=True, collate_fn=collate_fn) steps_per_epoch = len(batch_sampler) timer = Timer(steps_per_epoch * epochs) timer.start() for i in range(epochs): self.current_epoch += 1 avg_loss = 0 avg_metrics = defaultdict(int) self.model.train() for batch_idx, batch in enumerate(loader): loss, metrics = self.training_step(batch, batch_idx) self.optimizer_step(self.current_epoch, batch_idx, self.optimizer, loss) self.optimizer_zero_grad(self.current_epoch, batch_idx, self.optimizer) # calculate metrics and loss avg_loss += loss.numpy()[0] for metric, value in metrics.items(): if isinstance(value, paddle.Tensor): value = value.numpy() avg_metrics[metric] += value timer.count() if (batch_idx + 1) % log_interval == 0 and self.local_rank == 0: lr = self.optimizer.get_lr() avg_loss /= log_interval if self.use_vdl: self.log_writer.add_scalar(tag='TRAIN/loss', step=timer.current_step, value=avg_loss) print_msg = 'Epoch={}/{}, Step={}/{}'.format(self.current_epoch, epochs, batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) for metric, value in avg_metrics.items(): value /= log_interval if self.use_vdl: self.log_writer.add_scalar( tag='TRAIN/{}'.format(metric), step=timer.current_step, value=value) if isinstance(value, np.ndarray): value = value.item() print_msg += ' {}={:.4f}'.format(metric, value) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(lr, timer.timing, timer.eta) logger.train(print_msg) avg_loss = 0 avg_metrics = defaultdict(int) if self.current_epoch % save_interval == 0 and batch_idx + 1 == steps_per_epoch and self.local_rank == 0: if eval_dataset: result = self.evaluate(eval_dataset, batch_size, num_workers, collate_fn=collate_fn) eval_loss = result.get('loss', None) eval_metrics = result.get('metrics', {}) if self.use_vdl: if eval_loss: self.log_writer.add_scalar(tag='EVAL/loss', step=timer.current_step, value=eval_loss) for metric, value in eval_metrics.items(): self.log_writer.add_scalar( tag='EVAL/{}'.format(metric), step=timer.current_step, value=value) if not self.best_metrics or self.compare_metrics(self.best_metrics, eval_metrics): self.best_metrics = eval_metrics best_model_path = os.path.join(self.checkpoint_dir, 'best_model') self.save_model(best_model_path) self._save_metrics() metric_msg = [ '{}={:.4f}'.format(metric, value) for metric, value in self.best_metrics.items() ] metric_msg = ' '.join(metric_msg) logger.eval('Saving best model to {} [best {}]'.format(best_model_path, metric_msg)) self._save_checkpoint() def evaluate(self, eval_dataset: paddle.io.Dataset, batch_size: int = 1, num_workers: int = 0, collate_fn: Callable = None): ''' Run evaluation and returns metrics. Args: eval_dataset(paddle.io.Dataset) : The validation dataset batch_size(int) : Batch size of per step, default is 1. num_workers(int) : Number of subprocess to load data, default is 0. collate_fn(callable): function to generate mini-batch data by merging the sample list. 
None for only stack each fields of sample in axis 0(same as :attr::`np.stack(..., axis=0)`). Default None ''' if self.local_rank == 0: batch_sampler = paddle.io.BatchSampler(eval_dataset, batch_size=batch_size, shuffle=False, drop_last=False) loader = paddle.io.DataLoader( eval_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, collate_fn=collate_fn) self.model.eval() avg_loss = num_samples = 0 sum_metrics = defaultdict(int) avg_metrics = defaultdict(int) with logger.processing('Evaluation on validation dataset'): with paddle.no_grad(): for batch_idx, batch in enumerate(loader): result = self.validation_step(batch, batch_idx) loss = result.get('loss', None) metrics = result.get('metrics', {}) bs = batch[0].shape[0] num_samples += bs if loss: avg_loss += loss.numpy()[0] * bs for metric, value in metrics.items(): sum_metrics[metric] += value * bs # print avg metrics and loss print_msg = '[Evaluation result]' if loss: avg_loss /= num_samples print_msg += ' avg_loss={:.4f}'.format(avg_loss) for metric, value in sum_metrics.items(): avg_metrics[metric] = float(value) / num_samples print_msg += ' avg_{}={:.4f}'.format(metric, avg_metrics[metric]) logger.eval(print_msg) if loss: return {'loss': avg_loss, 'metrics': avg_metrics} return {'metrics': avg_metrics} def training_step(self, batch: List[paddle.Tensor], batch_idx: int): ''' One step for training, which should be called as forward computation. Args: batch(list[paddle.Tensor]) : The one batch data batch_idx(int) : The index of batch. ''' if self.nranks > 1: result = self.model._layers.training_step(batch, batch_idx) else: result = self.model.training_step(batch, batch_idx) # process result if not isinstance(result, dict): raise RuntimeError('The return value of `trainning_step` in {} is not a dict'.format(self.model.__class__)) loss = result.get('loss', None) if loss is None: raise RuntimeError('Cannot find loss attribute in the return value of `trainning_step` of {}'.format( self.model.__class__)) metrics = result.get('metrics', {}) # back prop loss.backward() return loss, metrics def validation_step(self, batch: Any, batch_idx: int): ''' One step for validation, which should be called as forward computation. Args: batch(list[paddle.Tensor]) : The one batch data batch_idx(int) : The index of batch. ''' if self.nranks > 1: result = self.model._layers.validation_step(batch, batch_idx) else: result = self.model.validation_step(batch, batch_idx) return result def optimizer_step(self, epoch_idx: int, batch_idx: int, optimizer: paddle.optimizer.Optimizer, loss: paddle.Tensor): ''' One step for optimize. Args: epoch_idx(int) : The index of epoch. batch_idx(int) : The index of batch. optimizer(paddle.optimizer.Optimizer) : Optimizer used. loss(paddle.Tensor) : Loss tensor. ''' self.optimizer.step() self.learning_rate_step(epoch_idx, batch_idx, self.optimizer._learning_rate, loss) def learning_rate_step(self, epoch_idx: int, batch_idx: int, learning_rate: Generic, loss: paddle.Tensor): if isinstance(learning_rate, paddle.optimizer.lr.LRScheduler): learning_rate.step() def optimizer_zero_grad(self, epoch_idx: int, batch_idx: int, optimizer: paddle.optimizer.Optimizer): ''' One step for clear gradients. Args: epoch_idx(int) : The index of epoch. batch_idx(int) : The index of batch. optimizer(paddle.optimizer.Optimizer) : Optimizer used. loss(paddle.Tensor) : Loss tensor. 
        '''
        self.model.clear_gradients()

    def _compare_metrics(self, old_metric: dict, new_metric: dict):
        '''Compare whether the new metric value is better than the old one.'''
        mainkey = list(new_metric.keys())[0]
        return old_metric[mainkey] < new_metric[mainkey]
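Putting the Trainer together end to end might look like the following sketch; MyModule and MyDataset are placeholders for a paddle.nn.Layer that implements training_step/validation_step and a paddle.io.Dataset, and are not part of the original code.

import paddle

model = MyModule()          # hypothetical paddle.nn.Layer with training_step/validation_step
optimizer = paddle.optimizer.Adam(learning_rate=5e-5, parameters=model.parameters())

trainer = Trainer(model, optimizer, use_gpu=True, use_vdl=True, checkpoint_dir='ckpt_demo')
trainer.train(
    train_dataset=MyDataset('train'),    # hypothetical dataset
    epochs=3,
    batch_size=32,
    eval_dataset=MyDataset('dev'),
    log_interval=10,
    save_interval=1)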
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() fleet.init(is_collective=True) # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() assert args.pp_degree == 1, "Please use gpt-3 example to train GPT with pipline prallelism." assert args.mp_degree == 1, "Please use gpt-3 example to train GPT with model prallelism." topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) dist_strategy = dist_optimizer(args, topo) # Create log write, train results show on last card of pipeline. if topo.is_last: log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) if os.path.exists(log_writer_path): import shutil shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode model_class, tokenizer_class = MODEL_CLASSES[args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) data_file = get_train_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): with paddle.utils.unique_name.guard(): with paddle.static.device_guard('gpu:0'): data_holders = create_data_holder(args) [tokens, loss_mask, attention_mask, position_ids, labels] = data_holders tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path) eos_id = tokenizer.eos_token_id train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, eos_id=eos_id, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=data_holders, pipeline_mode=False, ) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config[ "hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model_config["topo"] = topo model = GPTForPretraining(GPTModel(**model_config)) else: model, _ = GPTForPretraining.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args. 
attention_probs_dropout_prob, topo=topo) # Create the model for the gpt pretrain preds = model(tokens, position_ids, attention_mask) criterion = GPTPretrainingCriterion(topo) loss = criterion(preds, labels, loss_mask) # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps # TODO @ZHUI Use paddle network to support lr scheduler lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_param) # alias optimizer.apply_optimize = optimizer._apply_optimize if args.use_recompute: dist_strategy.recompute = True dist_strategy.recompute_configs = { "checkpoints": model.gpt.checkpoints } # Use the fleet api to compile the distributed optimizer optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(loss) logger.info(f'final strategy: {fleet._final_strategy()}') logger.info("The training meta optimizer is/are %s" % fleet._get_applied_meta_list()) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open(program_desc_dir + "/main_program.txt.%d" % worker_index, 'w') as f: f.write(str(main_program)) with open(program_desc_dir + "/startup_program.txt.%d" % worker_index, 'w') as f: f.write(str(startup_program)) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) test_program = main_program.clone(for_test=True) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) dygrah_path = os.path.join(args.model_name_or_path, "model_state.pdparams") static_path = os.path.join(args.model_name_or_path, "static_vars") flag_loaded = False if os.path.exists(static_path): if args.mp_degree > 1: logger.warning("MP should init with dygraph params") else: logger.info("Loading parameters from %s" % static_path) paddle.static.load(main_program, static_path, exe) flag_loaded = True if not flag_loaded and os.path.exists(dygrah_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygrah_path) init_static_with_params( model, paddle.load(dygrah_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") global_step = 0 tic_train = time.time() epoch = 0 learning_rate = main_program.global_block().vars["learning_rate_0"] while True: fetchs = [] if topo.is_last: fetchs = [loss, learning_rate] # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. 
valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() for step, batch in enumerate(train_data_loader()): global_step += 1 ret = exe.run(main_program, feed=batch, fetch_list=fetchs, use_program_cache=True) # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_freq == 0: if topo.is_last: loss_return, lr_return = ret speed = args.logging_freq / (time.time() - tic_train) logger.info( "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e" % (global_step, epoch, step, loss_return[0], speed, speed * args.global_batch_size * args.max_seq_len, lr_return[0])) log_writer.add_scalar("loss", loss_return[0], global_step) log_writer.add_scalar("learning_rate", lr_return[0], global_step) tic_train = time.time() if args.check_accuracy: if global_step >= args.max_steps: return else: continue if global_step % args.eval_freq == 0: # TODO, check the input data of validation eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(valid_data_loader, exe, test_program, args.eval_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "valid") tic_train = time.time() if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) logger.debug("saving models to {}".format(output_dir)) save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step == args.save_steps: model.init_config["init_args"][0].init_config.pop( "topo", None) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tic_train = time.time() if global_step >= args.max_steps: eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(test_data_loader, exe, test_program, args.test_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "test") del train_data_loader return epoch += 1
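Stripped of the GPT-specific pieces, the static-graph loop above boils down to building a program under program_guard, running it with an Executor and a fetch_list, and logging the fetched scalars. A self-contained toy sketch of just that pattern (the network, shapes, and log directory are assumptions, not part of the script above):

import numpy as np
import paddle
from visualdl import LogWriter

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.square(pred - y))
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
writer = LogWriter(logdir='./toy_log')  # assumed log directory
for step in range(10):
    feed = {'x': np.random.rand(8, 4).astype('float32'),
            'y': np.random.rand(8, 1).astype('float32')}
    # exe.run returns one numpy array per entry in fetch_list.
    loss_val = exe.run(main_prog, feed=feed, fetch_list=[loss])[0]
    writer.add_scalar(tag='train/loss', step=step, value=float(np.mean(loss_val)))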
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() worker_index = paddle.distributed.get_rank() worker_num = paddle.distributed.get_world_size() local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) set_seed(args) # Now, we only support data parallel in dygraph mode for now. topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=worker_num) default_global_batch_size = topo.data_info.size * args.micro_batch_size default_global_tokens_num = default_global_batch_size * args.max_seq_len model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) # Define log writer log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.micro_batch_size * topo.data_info.size, False, False, worker_index).lower()) if os.path.exists(log_writer_path): import shutil shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config["hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model = GPTForPretraining(GPTModel(**model_config)) else: model = GPTForPretraining.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob) # Create the critrion for the gpt model criterion = GPTPretrainingCriterion() if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps lr_scheduler = None if args.lr_decay_style == "none": lr_scheduler = None elif args.lr_decay_style == "cosine": lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_params) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt") if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) optimizer.set_state_dict(opt_dict) else: logger.warning("No optimizer checkpoint file found in %s." 
% opt_path) global_step = 0 epoch = 0 tic_train = time.time() while True: files = get_train_data_file(args) files.sort() num_files = len(files) for f_id in range(num_files): data_file = files[f_id] train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, [data_file], local_rank=local_rank, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, eos_id=tokenizer.eos_token_id) # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() # time count train_reader_cost = 0.0 train_run_cost = 0.0 reader_start = time.time() for step, batch in enumerate(train_data_loader()): train_reader_cost += time.time() - reader_start train_start = time.time() global_step += 1 tokens, loss_mask, attention_mask, position_ids, labels = batch loss_mask.stop_gradient = True attention_mask.stop_gradient = True with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"], custom_black_list=[ "reduce_sum", "c_softmax_with_cross_entropy", "c_embedding" ]): preds = model(tokens, position_ids, attention_mask) loss = criterion(preds, labels, loss_mask) if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() if lr_scheduler is not None: lr_scheduler.step() optimizer.clear_grad() loss_numpy = loss.numpy() train_run_cost += time.time() - train_start # Profile for model benchmark profiler.add_profiler_step(args.profiler_options) if global_step % args.logging_freq == 0: speed = args.logging_freq / (train_reader_cost + train_run_cost) avg_reader_cost = train_reader_cost / args.logging_freq logger.info( "global step %d, epoch: %d, batch: %d, loss: %.9f, avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e" % (global_step, epoch, step, loss_numpy, avg_reader_cost, 1. / speed, speed, speed * default_global_tokens_num, optimizer.get_lr())) log_writer.add_scalar("loss", loss_numpy, global_step) log_writer.add_scalar("learning_rate", optimizer.get_lr(), global_step) tic_train = time.time() train_reader_cost = 0.0 train_run_cost = 0.0 if args.check_accuracy: if global_step >= args.max_steps: return else: continue if global_step % args.eval_freq == 0: # Since the valid data broardcast to all devices, we do evaluate on all device. run_evaluate(valid_data_loader, model, criterion, args.eval_iters, log_writer, global_step, epoch, "valid") if global_step % args.save_steps == 0 or global_step >= args.max_steps: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model logger.info("Save model to %s" % output_dir) model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) paddle.save( optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) if global_step >= args.max_steps: run_evaluate(test_data_loader, model, criterion, args.test_iters, log_writer, global_step, epoch, "test") logger.info("The training process is complete.") del train_data_loader return reader_start = time.time() del train_data_loader
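The mixed-precision pattern inside the loop above (auto_cast around the forward pass, GradScaler for the scaled backward and the optimizer update) is independent of GPT. A minimal dygraph sketch, using a toy stand-in model rather than the real network:

import paddle

model = paddle.nn.Linear(16, 2)  # toy stand-in for the real model
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)  # role of args.scale_loss

x = paddle.randn([4, 16])
label = paddle.randint(0, 2, [4, 1])
with paddle.amp.auto_cast(custom_black_list=["reduce_sum"]):
    logits = model(x)
    loss = paddle.nn.functional.cross_entropy(logits, label)
scaler.scale(loss).backward()      # scaled backward, as in the script above
scaler.minimize(optimizer, loss)   # unscales gradients and applies the update
optimizer.clear_grad()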
def train(cfg): # startup_prog = fluid.Program() # train_prog = fluid.Program() drop_last = True dataset = SegDataset(file_list=cfg.DATASET.TRAIN_FILE_LIST, mode=ModelPhase.TRAIN, shuffle=True, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): if args.use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() batch_data = [] for b in data_gen: batch_data.append(b) if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS): for item in batch_data: yield item[0], item[1], item[2] batch_data = [] # If use sync batch norm strategy, drop last batch if number of samples # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues if not cfg.TRAIN.SYNC_BATCH_NORM: for item in batch_data: yield item[0], item[1], item[2] # Get device environment # places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # place = places[0] gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # Get number of GPU dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places) print_info("#Device count: {}".format(dev_count)) # Make sure BATCH_SIZE can divided by GPU cards assert cfg.BATCH_SIZE % dev_count == 0, ( 'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format( cfg.BATCH_SIZE, dev_count)) # If use multi-gpu training mode, batch data will allocated to each GPU evenly batch_size_per_dev = cfg.BATCH_SIZE // dev_count print_info("batch_size_per_dev: {}".format(batch_size_per_dev)) data_loader, loss, lr, pred, grts, masks, image = build_model( phase=ModelPhase.TRAIN) data_loader.set_sample_generator(data_generator, batch_size=batch_size_per_dev, drop_last=drop_last) exe = fluid.Executor(place) cfg.update_from_file(args.teacher_cfg_file) # teacher_arch = teacher_cfg.architecture teacher_program = fluid.Program() teacher_startup_program = fluid.Program() with fluid.program_guard(teacher_program, teacher_startup_program): with fluid.unique_name.guard(): _, teacher_loss, _, _, _, _, _ = build_model( teacher_program, teacher_startup_program, phase=ModelPhase.TRAIN, image=image, label=grts, mask=masks) exe.run(teacher_startup_program) teacher_program = teacher_program.clone(for_test=True) ckpt_dir = cfg.SLIM.KNOWLEDGE_DISTILL_TEACHER_MODEL_DIR assert ckpt_dir is not None print('load teacher model:', ckpt_dir) fluid.io.load_params(exe, ckpt_dir, main_program=teacher_program) # cfg = load_config(FLAGS.config) cfg.update_from_file(args.cfg_file) data_name_map = { 'image': 'image', 'label': 'label', 'mask': 'mask', } merge(teacher_program, fluid.default_main_program(), data_name_map, place) distill_pairs = [[ 'teacher_bilinear_interp_2.tmp_0', 'bilinear_interp_0.tmp_0' ]] def distill(pairs, weight): """ Add 3 pairs of distillation losses, each pair of feature maps is the input of teacher and student's yolov3_loss respectively """ loss = l2_loss(pairs[0][0], pairs[0][1]) weighted_loss = loss * weight return weighted_loss distill_loss = distill(distill_pairs, 0.1) cfg.update_from_file(args.cfg_file) optimizer = solver.Solver(None, None) all_loss = loss + distill_loss lr = optimizer.optimise(all_loss) exe.run(fluid.default_startup_program()) exec_strategy = fluid.ExecutionStrategy() # Clear temporary variables every 100 iteration if args.use_gpu: exec_strategy.num_threads = fluid.core.get_cuda_device_count() 
exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_elewise_add_act_ops = True if cfg.NUM_TRAINERS > 1 and args.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, fluid.default_main_program()) exec_strategy.num_threads = 1 if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu: if dev_count > 1: # Apply sync batch norm strategy print_info("Sync BatchNorm strategy is effective.") build_strategy.sync_batch_norm = True else: print_info( "Sync BatchNorm strategy will not be effective if GPU device" " count <= 1") compiled_train_prog = fluid.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=all_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) # Resume training begin_epoch = cfg.SOLVER.BEGIN_EPOCH if cfg.TRAIN.RESUME_MODEL_DIR: begin_epoch = load_checkpoint(exe, fluid.default_main_program()) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR) load_vars = [] load_fail_vars = [] def var_shape_matched(var, shape): """ Check whether the persistable variable shape matches the current network """ var_exist = os.path.exists( os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) if var_exist: var_shape = parse_shape_from_file( os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name)) return var_shape == shape return False for x in fluid.default_main_program().list_vars(): if isinstance(x, fluid.framework.Parameter): shape = tuple(fluid.global_scope().find_var( x.name).get_tensor().shape()) if var_shape_matched(x, shape): load_vars.append(x) else: load_fail_vars.append(x) fluid.io.load_vars(exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars) for var in load_vars: print_info("Parameter[{}] loaded successfully!".format(var.name)) for var in load_fail_vars: print_info( "Parameter[{}] doesn't exist or its shape does not match the current network," " skipping it.".format(var.name)) print_info("{}/{} pretrained parameters loaded successfully!".format( len(load_vars), len(load_vars) + len(load_fail_vars))) else: print_info( 'Pretrained model dir {} does not exist, training from scratch...'.
format(cfg.TRAIN.PRETRAINED_MODEL_DIR)) #fetch_list = [avg_loss.name, lr.name] fetch_list = [ loss.name, 'teacher_' + teacher_loss.name, distill_loss.name, lr.name ] if args.debug: # Fetch more variable info and use streaming confusion matrix to # calculate IoU results if in debug mode np.set_printoptions(precision=4, suppress=True, linewidth=160, floatmode="fixed") fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) if args.use_vdl: if not args.vdl_log_dir: print_info("Please specify the log directory by --vdl_log_dir.") exit(1) from visualdl import LogWriter log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1) avg_loss = 0.0 avg_t_loss = 0.0 avg_d_loss = 0.0 best_mIoU = 0.0 timer = Timer() timer.start() if begin_epoch > cfg.SOLVER.NUM_EPOCHS: raise ValueError(( "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format( begin_epoch, cfg.SOLVER.NUM_EPOCHS)) if args.use_mpio: print_info("Use multiprocess reader") else: print_info("Use multi-thread reader") for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1): data_loader.start() while True: try: if args.debug: # Print category IoU and accuracy to check whether the # traning process is corresponed to expectation loss, lr, pred, grts, masks = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() category_iou, mean_iou = cm.mean_iou() print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() timer.restart() else: # If not in debug mode, avoid unnessary log and calculate loss, t_loss, d_loss, lr = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) avg_t_loss += np.mean(np.array(t_loss)) avg_d_loss += np.mean(np.array(d_loss)) step += 1 if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps avg_t_loss /= args.log_steps avg_d_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, avg_t_loss, avg_d_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 avg_t_loss = 0.0 avg_d_loss = 0.0 timer.restart() except 
fluid.core.EOFException: data_loader.reset() break except Exception as e: print(e) if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(exe, fluid.default_main_program(), epoch) if args.do_eval: print("Evaluation start") _, mean_iou, _, mean_acc = evaluate(cfg=cfg, ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou update_best_model(ckpt_dir) print_info( "Save best model {} to {}, mIoU = {:.4f}".format( ckpt_dir, os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize(cfg=cfg, use_gpu=args.use_gpu, vis_file_list=cfg.DATASET.VIS_FILE_LIST, vis_dir="visual", ckpt_dir=ckpt_dir, log_writer=log_writer) if cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(exe, fluid.default_main_program(), epoch) # save final model if cfg.TRAINER_ID == 0: save_checkpoint(exe, fluid.default_main_program(), 'final')
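The distillation step above relies on paddleslim's `merge` (which prefixes the teacher program's variables with `teacher_`) and its `l2_loss` between one teacher/student feature-map pair; conceptually the added loss is just a weighted mean squared difference between the two feature maps. A hedged sketch of that idea in plain fluid ops, with illustrative variable names:

import paddle.fluid as fluid

def feature_l2_loss(teacher_var, student_var, weight=0.1):
    # Mean squared difference between a teacher and a student feature map,
    # matching the intent of the single distill pair used above
    # (0.1 is the same weight constant as in the script).
    diff = fluid.layers.elementwise_sub(teacher_var, student_var)
    return fluid.layers.reduce_mean(fluid.layers.square(diff)) * weight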
def main(args): local_rank = dg.parallel.Env().local_rank nranks = dg.parallel.Env().nranks parallel = nranks > 1 with open(args.config) as f: cfg = yaml.load(f, Loader=yaml.Loader) global_step = 0 place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace() if not os.path.exists(args.output): os.mkdir(args.output) writer = LogWriter(os.path.join(args.output, 'log')) if local_rank == 0 else None fluid.enable_dygraph(place) network_cfg = cfg['network'] model = TransformerTTS( network_cfg['embedding_size'], network_cfg['hidden_size'], network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'], cfg['audio']['num_mels'], network_cfg['outputs_per_step'], network_cfg['decoder_num_head'], network_cfg['decoder_n_layers']) model.train() optimizer = fluid.optimizer.AdamOptimizer( learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] * (cfg['train']['learning_rate']**2)), cfg['train']['warm_up_step']), parameter_list=model.parameters(), grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][ 'grad_clip_thresh'])) # Load parameters. global_step = io.load_parameters( model=model, optimizer=optimizer, checkpoint_dir=os.path.join(args.output, 'checkpoints'), iteration=args.iteration, checkpoint_path=args.checkpoint) print("Rank {}: checkpoint loaded.".format(local_rank)) if parallel: strategy = dg.parallel.prepare_context() model = fluid.dygraph.parallel.DataParallel(model, strategy) reader = LJSpeechLoader( cfg['audio'], place, args.data, cfg['train']['batch_size'], nranks, local_rank, shuffle=True).reader iterator = iter(tqdm(reader)) global_step += 1 while global_step <= cfg['train']['max_iteration']: try: batch = next(iterator) except StopIteration as e: iterator = iter(tqdm(reader)) batch = next(iterator) character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model( character, mel_input, pos_text, pos_mel) mel_loss = layers.mean( layers.abs(layers.elementwise_sub(mel_pred, mel))) post_mel_loss = layers.mean( layers.abs(layers.elementwise_sub(postnet_pred, mel))) loss = mel_loss + post_mel_loss stop_loss = cross_entropy( stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight']) loss = loss + stop_loss if local_rank == 0: writer.add_scalar('training_loss/mel_loss', mel_loss.numpy(), global_step) writer.add_scalar('training_loss/post_mel_loss', post_mel_loss.numpy(), global_step) writer.add_scalar('stop_loss', stop_loss.numpy(), global_step) if parallel: writer.add_scalar('alphas/encoder_alpha', model._layers.encoder.alpha.numpy(), global_step) writer.add_scalar('alphas/decoder_alpha', model._layers.decoder.alpha.numpy(), global_step) else: writer.add_scalar('alphas/encoder_alpha', model.encoder.alpha.numpy(), global_step) writer.add_scalar('alphas/decoder_alpha', model.decoder.alpha.numpy(), global_step) writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step) if global_step % cfg['train']['image_interval'] == 1: for i, prob in enumerate(attn_probs): for j in range(cfg['network']['decoder_num_head']): x = np.uint8( cm.viridis(prob.numpy()[j * cfg['train'][ 'batch_size'] // nranks]) * 255) writer.add_image( 'Attention_%d_0' % global_step, x, i * 4 + j) for i, prob in enumerate(attn_enc): for j in range(cfg['network']['encoder_num_head']): x = np.uint8( cm.viridis(prob.numpy()[j * cfg['train'][ 'batch_size'] // nranks]) * 255) writer.add_image( 'Attention_enc_%d_0' % global_step, x, i * 4 + j) for i, prob in enumerate(attn_dec): for j in 
range(cfg['network']['decoder_num_head']): x = np.uint8( cm.viridis(prob.numpy()[j * cfg['train'][ 'batch_size'] // nranks]) * 255) writer.add_image( 'Attention_dec_%d_0' % global_step, x, i * 4 + j) if parallel: loss = model.scale_loss(loss) loss.backward() model.apply_collective_grads() else: loss.backward() optimizer.minimize(loss) model.clear_gradients() # save checkpoint if local_rank == 0 and global_step % cfg['train'][ 'checkpoint_interval'] == 0: io.save_parameters( os.path.join(args.output, 'checkpoints'), global_step, model, optimizer) global_step += 1 if local_rank == 0: writer.close()
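The attention visualisation above colour-maps each probability matrix with matplotlib's viridis and writes the result to VisualDL as an HWC uint8 image. A standalone sketch of that conversion, with random data standing in for a real attention matrix and an assumed log directory:

import numpy as np
from matplotlib import cm
from visualdl import LogWriter

writer = LogWriter(logdir='./attn_log')   # assumed log directory
attn = np.random.rand(64, 64)             # stand-in for one attention head's weights
img = np.uint8(cm.viridis(attn) * 255)    # HxWx4 RGBA uint8 image
writer.add_image(tag='Attention_demo', img=img, step=0)
writer.close()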
def do_train(args): paddle.set_device(args.device) worker_index = paddle.distributed.get_rank() worker_num = paddle.distributed.get_world_size() local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) if worker_num > 1: paddle.distributed.init_parallel_env() if args.dp_degree * args.sharding_degree == 1: args.dp_degree = worker_num args.sharding_degree = 1 args_post_process(args, worker_num) logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit)) for arg in vars(args): logger.info('{:20}:{}'.format(arg, getattr(args, arg))) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": args.dp_degree, "mp_degree": 1, "pp_degree": 1, "sharding_degree": 1 } fleet.init(is_collective=True, strategy=strategy) hcg = fleet.get_hybrid_communicate_group() worker_index = paddle.distributed.get_rank() worker_num = paddle.distributed.get_world_size() local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) # Create the random seed for the worker set_seed(args) assert args.dp_degree * args.sharding_degree == worker_num, \ "The product of degree num should be equal to worker_num." # Create log write, log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) # load config in checkpoint global_step = 0 consumed_samples = 0 checkpoint_dir = os.path.join(args.output_dir, "model_last") if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f: step_config = yaml.load(f, Loader=yaml.FullLoader) assert step_config[ "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. 
Folder: {}".format( checkpoint_dir) consumed_samples = step_config["consumed_samples"] global_step = step_config["global_step"] if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config["hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model = model_class(base_class(**model_config)) else: model = model_class.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob) criterion = criterion_class() # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps lr_scheduler = LinearDecayWithWarmup(args.max_lr, args.max_steps, args.warmup_rate, last_epoch=global_step) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] logger.info("Using paddle.optimizer.AdamW.") optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, grad_clip=clip, apply_decay_param_fun=lambda x: x in decay_param, multi_precision=args.use_amp) if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) scaler = fleet.distributed_scaler(scaler) model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') if paddle.distributed.get_world_size() > 1: model = fleet.distributed_model(model) optimizer = fleet.distributed_optimizer(optimizer) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) data_file = get_train_data_file(args) train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, tokenizer, data_world_size=worker_num, data_world_rank=worker_index, max_seq_len=args.max_seq_len, current_step=global_step) # load checkpoint vars if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): logger.info("Try to load checkpoint from %s " % checkpoint_dir) opt_path = os.path.join(checkpoint_dir, "model_state.pdopt") params_path = os.path.join(checkpoint_dir, "model_state.pdparams") if os.path.exists(opt_path): opt_dict = paddle.load(opt_path) optimizer.set_state_dict(opt_dict) model_dict = paddle.load(params_path) model.set_state_dict(model_dict) else: logger.warning("No optimizer checkpoint file found in %s." % opt_path) logger.info( "Checkpoint loaded from global step: {}".format(global_step)) tic_train = time.time() while True: # If not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() # time count train_reader_cost = 0.0 train_run_cost = 0.0 reader_start = time.time() for step, batch in enumerate(train_data_loader()): train_reader_cost += time.time() - reader_start train_start = time.time() # 0. input_ids, # 1. segment_ids, # 2. input_mask, # 3. masked_lm_positions, # 4. masked_lm_labels, # 5. 
next_sentence_labels input_ids, segment_ids, input_mask, masked_lm_positions, \ masked_lm_labels, next_sentence_labels = batch with paddle.amp.auto_cast(args.use_amp, custom_black_list=[ "reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div" ], level='O2'): # Create the model for the ernie pretrain prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, position_ids=None, attention_mask=input_mask, masked_positions=masked_lm_positions) lm_loss, sop_loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() optimizer.clear_grad() train_run_cost += time.time() - train_start # Skip for accumulate_steps in global step if (step + 1) % args.accumulate_steps != 0: continue global_step += 1 if global_step % args.logging_freq == 0: speed = args.logging_freq / (time.time() - tic_train) common_loginfo = "global step %d, loss: %.9f, lm_loss: %.6f, sop_loss: %.6f, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % ( global_step, loss.item(), lm_loss.item(), sop_loss.item(), speed, speed * args.global_batch_size, lr_scheduler.get_lr()) addition_info = "" if args.use_amp: addition_info = " loss_scaling: %.1f, incr_count: %d, decr_count: %d" % ( scaler._scale.numpy(), scaler._incr_count, scaler._decr_count) logger.info(common_loginfo + addition_info) log_writer.add_scalar("loss", loss.item(), global_step) log_writer.add_scalar("lm_loss", lm_loss.item(), global_step) log_writer.add_scalar("sop_loss", sop_loss.item(), global_step) tic_train = time.time() if lr_scheduler is not None: lr_scheduler.step() if global_step % args.eval_freq == 0: # TODO, check the input data of validation run_evaluate(valid_data_loader, model, criterion, args.eval_iters, log_writer, global_step, args, task_name="valid") tic_train = time.time() def save_ckpt(output_dir, model, tokenizer, args, global_step): step_config = { "model_name": args.model_name_or_path, "global_step": global_step, "global_batch_size": args.global_batch_size, "consumed_samples": global_step * args.global_batch_size, } logger.debug("saving models to {}".format(output_dir)) model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) paddle.save(optimizer.state_dict(), os.path.join(output_dir, "model_state.pdopt")) with open(os.path.join(output_dir, "config.yml"), "w") as f: yaml.dump(step_config, f, encoding='utf-8', allow_unicode=True) if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if worker_index == 0: save_ckpt(output_dir, model, tokenizer, args, global_step) if worker_num > 1: paddle.distributed.barrier() tic_train = time.time() if global_step % args.checkpoint_steps == 0: output_dir = os.path.join(args.output_dir, "model_last") if worker_index == 0: if not os.path.exists(output_dir): os.mkdir(output_dir) output_dir_bak = os.path.join(args.output_dir, "model_last_bak") if os.path.exists(output_dir): if os.path.exists(output_dir_bak): shutil.rmtree(output_dir_bak) shutil.move(output_dir, output_dir_bak) os.mkdir(output_dir) save_ckpt(output_dir, model, tokenizer, args, global_step) if worker_num > 1: paddle.distributed.barrier() if global_step >= args.max_steps: run_evaluate(test_data_loader, model, criterion, 
args.test_iters, log_writer, global_step, args, task_name="test") del train_data_loader return
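The `model_last`/`model_last_bak` handling above keeps exactly one previous checkpoint by rotating directories before each save; the rotation itself is plain shutil work. A small sketch of just that step, assuming the same directory layout as the script:

import os
import shutil

def rotate_last_checkpoint(output_dir):
    # Move the previous "model_last" to "model_last_bak" (dropping any older
    # backup), then recreate an empty "model_last" for the new checkpoint.
    last_dir = os.path.join(output_dir, "model_last")
    bak_dir = os.path.join(output_dir, "model_last_bak")
    if os.path.exists(last_dir):
        if os.path.exists(bak_dir):
            shutil.rmtree(bak_dir)
        shutil.move(last_dir, bak_dir)
    os.makedirs(last_dir)
    return last_dir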
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() fleet.init(is_collective=True) # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() assert args.dp_degree * args.sharding_degree * args.mp_degree * args.pp_degree == worker_num, \ "The product of degree num should be equal to worker_num." topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) dist_strategy = dist_optimizer(args, topo) # Create log write, train results show on last card of pipeline. if topo.is_last: log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) # if os.path.exists(log_writer_path): # shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) # load config in checkpoint global_step = 0 consumed_samples = 0 checkpoint_dir = os.path.join(args.output_dir, "model_last") if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f: step_config = yaml.load(f, Loader=yaml.FullLoader) assert step_config[ "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format( checkpoint_dir) consumed_samples = step_config["consumed_samples"] global_step = step_config["global_step"] data_file = get_train_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): data_holders = create_data_holder(args) # 0. input_ids, # 1. segment_ids, # 2. input_mask, # 3. masked_lm_positions, # 4. masked_lm_labels, # 5. 
next_sentence_labels [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels ] = data_holders tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, tokenizer, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=data_holders, current_step=global_step) fleet.init(is_collective=True) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] if model_config["vocab_size"] % 8 != 0: model_config["vocab_size"] += 8 - (model_config["vocab_size"] % 8) model_config["hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model = model_class(base_class(**model_config)) else: model, _ = model_class.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, ) # Create the model for the gpt pretrain prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, position_ids=None, attention_mask=input_mask, masked_positions=masked_lm_positions) criterion = criterion_class(with_nsp_loss=args.binary_head) if args.binary_head: lm_loss, sop_loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss else: loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels) # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps lr_scheduler = LinearAnnealingWithWarmupDecay( args.max_lr, args.min_lr, warmup_step=args.warmup_rate * args.max_steps, decay_step=args.decay_steps, last_epoch=global_step) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] logger.info("Using paddle.optimizer.AdamW.") optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_param) # alias optimizer.apply_optimize = optimizer._apply_optimize # if args.use_recompute: # dist_strategy.recompute = True # dist_strategy.recompute_configs = { # "checkpoints": model.ernie.checkpoints # } # Use the fleet api to compile the distributed optimizer optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(loss) logger.info(f'final strategy: {fleet._final_strategy()}') logger.info("The training meta optimizer is/are %s" % fleet._get_applied_meta_list()) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open(program_desc_dir + "/main_program.txt.%d" % worker_index, 'w') as f: f.write(str(main_program)) with open(program_desc_dir + "/startup_program.txt.%d" % worker_index, 'w') as f: f.write(str(startup_program)) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) test_program = main_program.clone(for_test=True) if args.model_name_or_path not in pretrained_models_list: 
logger.info("Try to load checkpoint from %s " % args.model_name_or_path) dygrah_path = os.path.join(args.model_name_or_path, "model_state.pdparams") static_path = os.path.join(args.model_name_or_path, "static_vars") flag_loaded = False if os.path.exists(static_path): if args.mp_degree > 1: logger.warning("MP should init with dygraph params") else: logger.info("Loading parameters from %s" % static_path) paddle.static.load(main_program, static_path, exe) flag_loaded = True if not flag_loaded and os.path.exists(dygrah_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygrah_path) init_static_with_params( model, paddle.load(dygrah_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") # load checkpoint vars if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): paddle.static.load(main_program, os.path.join(checkpoint_dir, "static_vars"), exe) fetch_loss_vars = collections.OrderedDict() fetch_other_vars = collections.OrderedDict() fetch_loss_vars["loss"] = loss if args.binary_head: fetch_loss_vars["lm_loss"] = lm_loss fetch_loss_vars["sop_loss"] = sop_loss fetch_other_vars["learning_rate"] = main_program.global_block( ).vars["learning_rate_0"] additional_vars = collections.OrderedDict() if args.use_amp: for key in ["loss_scaling", "num_good_steps", "num_bad_steps"]: additional_vars[key] = main_program.global_block().vars[key + "_0"] tic_train = time.time() while True: fetchs = [] fetchs_keys = [] if topo.is_last: fetchs = list(fetch_loss_vars.values()) + list( fetch_other_vars.values()) + list(additional_vars.values()) fetchs_keys = list(fetch_loss_vars.keys()) + list( fetch_other_vars.keys()) + list(additional_vars.keys()) # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. 
valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() for step, batch in enumerate(train_data_loader()): ret = exe.run(main_program, feed=batch, fetch_list=fetchs, use_program_cache=True) # Skip for accumulate_steps in global step if (step + 1) % args.accumulate_steps != 0: continue global_step += 1 # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_freq == 0: if topo.is_last: res = collections.defaultdict(float) for k, v in zip(fetchs_keys, ret): res[k] = v[0] speed = args.logging_freq / (time.time() - tic_train) loss_info = "loss: %.6f, lm_loss: %.6f, sop_loss: %.6f" loss_info = ", ".join([ "{}: {:.6f}".format(k, res[k]) for k in fetch_loss_vars.keys() ]) common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % ( global_step, loss_info, speed, speed * args.global_batch_size, res["learning_rate"]) additional_loginfo = ", ".join([ "{}: {}".format(k, res[k]) for k in additional_vars.keys() ]) if additional_loginfo: common_loginfo += ", " + additional_loginfo logger.info(common_loginfo) for k, v in res.items(): log_writer.add_scalar("train/" + k, v, global_step) tic_train = time.time() #if args.check_accuracy: # if global_step >= args.max_steps: # return # else: # continue if global_step % args.eval_freq == 0: # TODO, check the input data of validation eval_fetch = collections.OrderedDict() if topo.is_last: eval_fetch["loss"] = loss if args.binary_head: eval_fetch["lm_loss"] = lm_loss eval_fetch["sop_loss"] = sop_loss run_evaluate(valid_data_loader, exe, test_program, args.eval_iters, log_writer, global_step, args, topo.is_last, eval_fetch, "valid") tic_train = time.time() if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) logger.debug("saving models to {}".format(output_dir)) save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step == args.save_steps: model.init_config["init_args"][0].init_config.pop( "topo", None) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tic_train = time.time() if global_step % args.checkpoint_steps == 0: output_dir = os.path.join(args.output_dir, "model_last") if worker_index == 0: if not os.path.exists(output_dir): os.mkdir(output_dir) output_dir_bak = os.path.join(args.output_dir, "model_last_bak") if os.path.exists(output_dir): if os.path.exists(output_dir_bak): shutil.rmtree(output_dir_bak) shutil.move(output_dir, output_dir_bak) os.mkdir(output_dir) step_config = { "model_name": args.model_name_or_path, "global_step": global_step, "global_batch_size": args.global_batch_size, "consumed_samples": global_step * args.global_batch_size, } with open(os.path.join(output_dir, "config.yml"), "w") as f: yaml.dump(step_config, f, encoding='utf-8', allow_unicode=True) fleet.barrier_worker() logger.debug("saving models to {}".format(output_dir)) if args.sharding_degree <= 1: # Save on the first worker by default. 
if worker_index == 0: paddle.static.save( main_program, os.path.join(output_dir, "static_vars")) else: # Use save_persistables under sharding, which is slower save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step >= args.max_steps: eval_fetch = collections.OrderedDict() if topo.is_last: eval_fetch["loss"] = loss if args.binary_head: eval_fetch["lm_loss"] = lm_loss eval_fetch["sop_loss"] = sop_loss run_evaluate(test_data_loader, exe, test_program, args.test_iters, log_writer, global_step, args, topo.is_last, eval_fetch, "test") del train_data_loader return
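The logging block in the loop above pairs the ordered fetch names with the values returned by `exe.run`; the bookkeeping is just a zip over two ordered collections. A reduced sketch with dummy values in place of the executor output:

import collections

fetch_vars = collections.OrderedDict(loss=None, lm_loss=None, sop_loss=None)
ret = [[0.91], [0.55], [0.36]]   # stand-in for the list returned by exe.run(...)
res = {k: v[0] for k, v in zip(fetch_vars.keys(), ret)}
loss_info = ", ".join("{}: {:.6f}".format(k, res[k]) for k in fetch_vars)
print(loss_info)                 # loss: 0.910000, lm_loss: 0.550000, sop_loss: 0.360000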
# Import the VisualDL package from visualdl import LogWriter # Create a LogWriter; the logdir argument specifies the directory where the data is stored writer = LogWriter(logdir="./random_log") # Write the data for step in range(1000): # Record scalar data 1 under the test category writer.add_scalar(tag="测试/数据1", step=step, value=step * 1. / 1000) # Record scalar data 2 under the test category writer.add_scalar(tag="测试/数据2", step=step, value=1. - step * 1. / 1000)
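In recent VisualDL releases, LogWriter can also be used as a context manager so the log file is closed automatically; the written scalars are then browsed with the visualdl command line. A short sketch with an illustrative tag name:

from visualdl import LogWriter

# Same scalar logging as above, with automatic close on exit.
with LogWriter(logdir="./random_log") as writer:
    for step in range(1000):
        writer.add_scalar(tag="demo/loss", step=step, value=1. - step * 1. / 1000)

# Then start the web UI (shell command):
#   visualdl --logdir ./random_log --port 8080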
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env \ and 'PADDLE_TRAINERS_NUM' in env \ and int(env['PADDLE_TRAINERS_NUM']) > 1 num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1)) if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) if FLAGS.enable_ce: random.seed(0) np.random.seed(0) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() save_only = getattr(cfg, 'save_prediction_only', False) if save_only: raise NotImplementedError('The config file only support prediction,' ' training stage is not implemented now') main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() if FLAGS.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() if 'use_ema' in cfg and cfg['use_ema']: global_steps = _decay_step_counter() ema = ExponentialMovingAverage(cfg['ema_decay'], thres_steps=global_steps) ema.update() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader, devices_num=1) # When iterable mode, set set_sample_list_generator(eval_reader, place) eval_loader.set_sample_list_generator(eval_reader) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches( fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() 
# iteration number when CompiledProgram tries to drop local execution scopes. # Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: compiled_eval_prog = fluid.CompiledProgram(eval_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params(exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) train_reader = create_reader(cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg, devices_num=devices_num, num_trainers=num_trainers) # When iterable mode, set set_sample_list_generator(train_reader, place) train_loader.set_sample_list_generator(train_reader) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_iter, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_iter) best_box_ap_list = [0.0, 0] #[map, iter] # use VisualDL to log data if FLAGS.use_vdl: assert six.PY3, "VisualDL requires Python >= 3.5" from visualdl import LogWriter vdl_writer = LogWriter(FLAGS.vdl_log_dir) vdl_loss_step = 0 vdl_mAP_step = 0 for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use vdl-paddle to log loss if FLAGS.use_vdl: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step) vdl_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): ips = float(cfg['TrainReader']['batch_size']) / time_cost strs = 'iter: {}, lr: {:.6f}, {}, eta: {}, batch_cost: {:.5f} sec, ips: {:.5f} images/sec'.format( it, np.mean(outs[-1]), logs, eta, time_cost, ips) logger.info(strs) # NOTE : profiler tools, used for benchmark if FLAGS.is_profiler and it == 5: profiler.start_profiler("All") elif FLAGS.is_profiler and it == 10: profiler.stop_profiler("total", FLAGS.profiler_path) return if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not 
FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.apply_program) checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation resolution = None if 'Mask' in cfg.architecture: resolution = model.mask_head.resolution results = eval_run(exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg, resolution=resolution) box_ap_stats = eval_results(results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) # use vdl_paddle to log mAP if FLAGS.use_vdl: vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step) vdl_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.restore_program) train_loader.reset()
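The ETA printed in the loop above is derived from a sliding window of recent per-iteration wall times kept in a deque. A small sketch of that computation with stand-in measurements:

import datetime
from collections import deque
import numpy as np

def eta_string(time_stat, max_iters, it):
    # time_stat: deque of recent per-iteration wall times, as in the loop above.
    time_cost = float(np.mean(time_stat))
    return str(datetime.timedelta(seconds=int((max_iters - it) * time_cost)))

times = deque([0.21, 0.19, 0.20], maxlen=20)      # stand-in measurements
print(eta_string(times, max_iters=1000, it=100))  # e.g. 0:03:00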
def main(args):
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    config["config_abs_dir"] = args.abs_dir
    # modify config from the command line
    if args.opt:
        for parameter in args.opt:
            parameter = parameter.strip()
            key, value = parameter.split("=")
            if type(config.get(key)) is int:
                value = int(value)
            if type(config.get(key)) is float:
                value = float(value)
            if type(config.get(key)) is bool:
                value = (True if value.lower() == "true" else False)
            config[key] = value

    # load static model class
    static_model_class = load_static_model_class(config)

    input_data = static_model_class.create_feeds(is_infer=True)
    input_data_names = [data.name for data in input_data]

    fetch_vars = static_model_class.infer_net(input_data)
    logger.info("cpu_num: {}".format(os.getenv("CPU_NUM")))

    use_gpu = config.get("runner.use_gpu", True)
    use_xpu = config.get("runner.use_xpu", False)
    use_auc = config.get("runner.use_auc", False)
    use_visual = config.get("runner.use_visual", False)
    auc_num = config.get("runner.auc_num", 1)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)
    batch_size = config.get("runner.infer_batch_size", None)
    use_save_data = config.get("runner.use_save_data", False)
    reader_type = config.get("runner.reader_type", "DataLoader")
    use_fleet = config.get("runner.use_fleet", False)
    os.environ["CPU_NUM"] = str(config.get("runner.thread_num", 1))
    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_xpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, use_xpu, use_visual, batch_size, test_data_dir,
                start_epoch, end_epoch, print_interval, model_load_path))
    logger.info("**************common.configs**********")

    if use_xpu:
        xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
        place = paddle.set_device(xpu_device)
    else:
        place = paddle.set_device('gpu' if use_gpu else 'cpu')

    exe = paddle.static.Executor(place)
    # initialize
    exe.run(paddle.static.default_startup_program())

    if reader_type == 'DataLoader':
        test_dataloader = create_data_loader(
            config=config, place=place, mode="test")
    elif reader_type == "CustomizeDataLoader":
        test_dataloader = static_model_class.create_data_loader()

    # Create a log_visual object and store the data in the given path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/infer")
    step_num = 0

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        load_static_model(
            paddle.static.default_main_program(),
            model_path,
            prefix='rec_static')

        epoch_begin = time.time()
        interval_begin = time.time()
        infer_reader_cost = 0.0
        infer_run_cost = 0.0
        reader_start = time.time()

        if use_auc:
            reset_auc(use_fleet, auc_num)

        # the last incomplete batch is dropped when the dataset size is not
        # divisible by the batch size
        assert any(
            test_dataloader()
        ), "test_dataloader's size is null, please ensure batch size < dataset size!"

        for batch_id, batch_data in enumerate(test_dataloader()):
            infer_reader_cost += time.time() - reader_start
            infer_start = time.time()
            fetch_batch_var = exe.run(
                program=paddle.static.default_main_program(),
                feed=dict(zip(input_data_names, batch_data)),
                fetch_list=[var for _, var in fetch_vars.items()])
            infer_run_cost += time.time() - infer_start

            if batch_id % print_interval == 0:
                metric_str = ""
                for var_idx, var_name in enumerate(fetch_vars):
                    metric_str += "{}: {}, ".format(
                        var_name, fetch_batch_var[var_idx][0])
                    if use_visual:
                        log_visual.add_scalar(
                            tag="infer/" + var_name,
                            step=step_num,
                            value=fetch_batch_var[var_idx][0])
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str +
                    "avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.2f} ins/s"
                    .format(infer_reader_cost / print_interval,
                            (infer_reader_cost + infer_run_cost) /
                            print_interval, batch_size,
                            print_interval * batch_size /
                            (time.time() - interval_begin)))
                interval_begin = time.time()
                infer_reader_cost = 0.0
                infer_run_cost = 0.0
            reader_start = time.time()
            step_num = step_num + 1

        metric_str = ""
        for var_idx, var_name in enumerate(fetch_vars):
            metric_str += "{}: {}, ".format(var_name,
                                            fetch_batch_var[var_idx][0])
        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    "epoch time: {:.2f} s".format(time.time() - epoch_begin))

        if use_save_data:
            save_data(fetch_batch_var, model_load_path)
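
# A small sketch that factors out the cost/throughput bookkeeping used in the
# loop above (avg_reader_cost, avg_batch_cost, ips), so the arithmetic is
# explicit. `IntervalTimer` is an illustrative helper, not part of the repo.
import time


class IntervalTimer(object):
    def __init__(self, print_interval, batch_size):
        self.print_interval = print_interval
        self.batch_size = batch_size
        self.reset()

    def reset(self):
        self.reader_cost = 0.0
        self.run_cost = 0.0
        self.interval_begin = time.time()

    def log_line(self):
        avg_reader = self.reader_cost / self.print_interval
        avg_batch = (self.reader_cost + self.run_cost) / self.print_interval
        ips = self.print_interval * self.batch_size / (
            time.time() - self.interval_begin)
        return ("avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, "
                "avg_samples: {:.5f}, ips: {:.2f} ins/s".format(
                    avg_reader, avg_batch, self.batch_size, ips))
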
def train(cfg):
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    drop_last = True

    dataset = SegDataset(
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR)

    def data_generator():
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    if cfg.DATASET.INPUT_IMAGE_NUM == 1:
                        yield item[0], item[1], item[2]
                    else:
                        yield item[0], item[1], item[2], item[3]
                batch_data = []
        # When the sync batch norm strategy is used, drop the last batch if the
        # number of samples in batch_data is less than cfg.BATCH_SIZE to avoid
        # NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                if cfg.DATASET.INPUT_IMAGE_NUM == 1:
                    yield item[0], item[1], item[2]
                else:
                    yield item[0], item[1], item[2], item[3]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPUs
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#Device count: {}".format(dev_count))

    # Make sure BATCH_SIZE is divisible by the number of GPU cards
    assert cfg.BATCH_SIZE % dev_count == 0, (
        'BATCH_SIZE:{} not divisible by number of GPUs:{}'.format(
            cfg.BATCH_SIZE, dev_count))
    # In multi-GPU training mode, batch data will be allocated to each GPU evenly
    batch_size_per_dev = cfg.BATCH_SIZE // dev_count
    print_info("batch_size_per_dev: {}".format(batch_size_per_dev))

    data_loader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL)
    data_loader.set_sample_generator(
        data_generator, batch_size=batch_size_per_dev, drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iterations
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        load_pretrained_weights(exe, train_prog,
                                cfg.TRAIN.PRETRAINED_MODEL_DIR)
    else:
        print_info(
            'Pretrained model dir {} does not exist, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variables and use a streaming confusion matrix to
        # calculate IoU results when in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    avg_loss = 0.0
    best_mIoU = 0.0

    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            "begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]".format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        data_loader.start()
        while True:
            try:
                if args.debug:
                    # Print per-category IoU and accuracy to check whether the
                    # training process matches expectations
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou, step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc, step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnecessary logging and calculation
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        print((
                            "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, step, lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

                # NOTE: used for benchmark / profiler tools
                if args.is_profiler and epoch == 1 and step == args.log_steps:
                    profiler.start_profiler("All")
                elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
                    profiler.stop_profiler("total", args.profiler_path)
                    return

            except fluid.core.EOFException:
                data_loader.reset()
                break
            except Exception as e:
                print(e)

        if (epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 or
                epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(train_prog, epoch)
            save_infer_program(test_prog, ckpt_dir)

            if args.do_eval:
                print("Evaluation start")
                cate_iou, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

                if cate_iou[0] >= best_mIoU:
                    best_mIoU = cate_iou[0]
                    update_best_model(ckpt_dir)
                    print_info("Save best model {} to {}, mIoU = {:.4f}".format(
                        ckpt_dir,
                        os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
                        mean_iou))

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        ckpt_dir = save_checkpoint(train_prog, 'final')
        save_infer_program(test_prog, ckpt_dir)
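
# A plausible sketch of the `calculate_eta` helper referenced in the training
# logs above: remaining steps divided by steps/sec, formatted as HH:MM:SS. The
# repo's actual helper may differ in its formatting details.
def calculate_eta(remain_step, speed):
    if speed <= 0:
        return "--:--:--"
    remain_time = int(remain_step / speed)
    hours = remain_time // 3600
    minutes = (remain_time % 3600) // 60
    seconds = remain_time % 60
    return "{:0>2}:{:0>2}:{:0>2}".format(hours, minutes, seconds)
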
def main(args):
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir
    # modify config from the command line
    if args.opt:
        for parameter in args.opt:
            parameter = parameter.strip()
            key, value = parameter.split("=")
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_visual = config.get("runner.use_visual", False)
    test_data_dir = config.get("runner.test_data_dir", None)
    print_interval = config.get("runner.print_interval", None)
    infer_batch_size = config.get("runner.infer_batch_size", None)
    model_load_path = config.get("runner.infer_load_path", "model_output")
    start_epoch = config.get("runner.infer_start_epoch", 0)
    end_epoch = config.get("runner.infer_end_epoch", 10)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_visual: {}, infer_batch_size: {}, test_data_dir: {}, start_epoch: {}, end_epoch: {}, print_interval: {}, model_load_path: {}"
        .format(use_gpu, use_visual, infer_batch_size, test_data_dir,
                start_epoch, end_epoch, print_interval, model_load_path))
    logger.info("**************common.configs**********")

    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)

    # Create a log_visual object and store the data in the given path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/infer")

    # TODO: add optimizer function
    # optimizer = dy_model_class.create_optimizer(dy_model, config)

    logger.info("read data")
    test_dataloader = create_data_loader(config=config, place=place, mode="test")

    epoch_begin = time.time()
    interval_begin = time.time()
    metric_list, metric_list_name = dy_model_class.create_metrics()
    step_num = 0

    for epoch_id in range(start_epoch, end_epoch):
        logger.info("load model epoch {}".format(epoch_id))
        model_path = os.path.join(model_load_path, str(epoch_id))
        try:
            load_model(model_path, dy_model)
        except Exception as e:
            print(e)
            continue
        dy_model.eval()
        infer_reader_cost = 0.0
        infer_run_cost = 0.0
        reader_start = time.time()

        for batch_id, batch in enumerate(test_dataloader()):
            infer_reader_cost += time.time() - reader_start
            infer_start = time.time()
            batch_size = len(batch[0])

            metric_list, tensor_print_dict = dy_model_class.infer_forward(
                dy_model, metric_list, batch, config)

            infer_run_cost += time.time() - infer_start

            if batch_id % print_interval == 0:
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += ("{}:".format(var_name) +
                                             str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(
                                tag="infer/" + var_name,
                                step=step_num,
                                value=var.numpy())
                metric_str = ""
                for metric_id in range(len(metric_list_name)):
                    metric_str += (
                        metric_list_name[metric_id] +
                        ": {:.6f},".format(metric_list[metric_id].accumulate()))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="infer/" + metric_list_name[metric_id],
                            step=step_num,
                            value=metric_list[metric_id].accumulate())
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.2f} ins/s"
                    .format(infer_reader_cost / print_interval,
                            (infer_reader_cost + infer_run_cost) /
                            print_interval, infer_batch_size,
                            print_interval * batch_size /
                            (time.time() - interval_begin)))
                interval_begin = time.time()
                infer_reader_cost = 0.0
                infer_run_cost = 0.0
            step_num = step_num + 1
            reader_start = time.time()

        metric_str = ""
        for metric_id in range(len(metric_list_name)):
            metric_str += (
                metric_list_name[metric_id] +
                ": {:.6f},".format(metric_list[metric_id].accumulate()))

        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += ("{}:".format(var_name) +
                                     str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str +
                    " epoch time: {:.2f} s".format(time.time() - epoch_begin))
        epoch_begin = time.time()
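
# A minimal sketch of the contract assumed by `create_metrics()` and the
# `.accumulate()` calls above: each metric object is updated inside
# `infer_forward` and reports a running value via accumulate(). `RunningMean`
# is a hypothetical example of such a metric, not a class from the repo.
class RunningMean(object):
    def __init__(self):
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        # accumulate a batch-level value weighted by the number of samples
        self.total += float(value) * n
        self.count += n

    def accumulate(self):
        # running average over everything seen so far
        return self.total / max(self.count, 1)

    def reset(self):
        self.total, self.count = 0.0, 0

# create_metrics() would then return something like:
#   ([RunningMean(), RunningMean()], ["loss", "acc"])
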
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    writer = LogWriter(logdir="./log/scalar_test/train")

    train_ds = load_dataset(
        read_simcse_text, data_path=args.train_set_file, lazy=False)

    pretrained_model = ppnlp.transformers.ErnieModel.from_pretrained(
        args.model_name_or_path,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)
    print("loading model from {}".format(args.model_name_or_path))

    tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # title_segment
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    model = SimCSE(
        pretrained_model,
        margin=args.margin,
        scale=args.scale,
        output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from: {}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    time_start = time.time()
    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch

            loss = model(
                query_input_ids=query_input_ids,
                title_input_ids=title_input_ids,
                query_token_type_ids=query_token_type_ids,
                title_token_type_ids=title_token_type_ids)

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                writer.add_scalar(tag="loss", step=global_step, value=loss)
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

    time_end = time.time()
    print('total training cost:', time_end - time_start)
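
# A simplified sketch of the in-batch-negative objective a SimCSE-style model
# optimizes: each query's positive is the title at the same row index, and all
# other titles in the batch act as negatives. This is illustrative only; the
# repo's SimCSE class wraps this logic together with the margin/scale arguments
# passed above, and its exact implementation may differ.
import paddle
import paddle.nn.functional as F


def simcse_in_batch_negative_loss(query_emb, title_emb, scale=20.0):
    # L2-normalize both views so the dot product equals cosine similarity
    query_emb = F.normalize(query_emb, axis=-1)
    title_emb = F.normalize(title_emb, axis=-1)
    # [batch, batch] similarity matrix; diagonal entries are the positives
    sim = paddle.matmul(query_emb, title_emb, transpose_y=True) * scale
    labels = paddle.arange(0, sim.shape[0], dtype="int64")
    # cross entropy over rows pushes each query toward its own title
    return F.cross_entropy(sim, labels)
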
def train(args):
    # Enable multi-GPU training
    if len(args.gpus.split(',')) > 1:
        dist.init_parallel_env()

    if dist.get_rank() == 0:
        shutil.rmtree('log', ignore_errors=True)
        # Log writer
        writer = LogWriter(logdir='log')

    # Shape of the input data
    input_shape = eval(args.input_shape)
    # Build the datasets
    train_dataset = CustomDataset(args.train_list_path, model='train', spec_len=input_shape[3])
    # Use a distributed sampler when training on multiple GPUs
    if len(args.gpus.split(',')) > 1:
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
    else:
        train_batch_sampler = paddle.io.BatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_sampler=train_batch_sampler,
                              num_workers=args.num_workers)

    test_dataset = CustomDataset(args.test_list_path, model='test', spec_len=input_shape[3])
    test_batch_sampler = paddle.io.BatchSampler(test_dataset, batch_size=args.batch_size)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_sampler=test_batch_sampler,
                             num_workers=args.num_workers)

    # Build the model
    model = resnet34()
    metric_fc = ArcNet(feature_dim=512, class_dim=args.num_classes)
    if dist.get_rank() == 0:
        paddle.summary(model, input_size=input_shape)

    # Wrap with DataParallel for multi-GPU training
    if len(args.gpus.split(',')) > 1:
        model = paddle.DataParallel(model)
        metric_fc = paddle.DataParallel(metric_fc)

    # Initial epoch number
    last_epoch = 0
    # Learning-rate decay
    scheduler = paddle.optimizer.lr.StepDecay(
        learning_rate=args.learning_rate, step_size=10, gamma=0.1, verbose=True)
    # Set up the optimizer
    optimizer = paddle.optimizer.Momentum(
        parameters=model.parameters() + metric_fc.parameters(),
        learning_rate=scheduler,
        momentum=0.9,
        weight_decay=paddle.regularizer.L2Decay(5e-4))

    # Load a pretrained model
    if args.pretrained_model is not None:
        model_dict = model.state_dict()
        param_state_dict = paddle.load(os.path.join(args.pretrained_model, 'model.pdparams'))
        for name, weight in model_dict.items():
            if name in param_state_dict.keys():
                if weight.shape != list(param_state_dict[name].shape):
                    print('{} not used, shape {} unmatched with {} in model.'.format(
                        name, list(param_state_dict[name].shape), weight.shape))
                    param_state_dict.pop(name, None)
            else:
                print('Lack weight: {}'.format(name))
        model.set_dict(param_state_dict)
        print('Successfully loaded the pretrained model parameters')

    # Resume training
    if args.resume is not None:
        model.set_state_dict(paddle.load(os.path.join(args.resume, 'model.pdparams')))
        metric_fc.set_state_dict(paddle.load(os.path.join(args.resume, 'metric_fc.pdparams')))
        optimizer_state = paddle.load(os.path.join(args.resume, 'optimizer.pdopt'))
        optimizer.set_state_dict(optimizer_state)
        # Recover the epoch at which the checkpoint was saved
        last_epoch = optimizer_state['LR_Scheduler']['last_epoch']
        print('Successfully loaded the model and optimizer parameters')

    # Loss function
    loss = paddle.nn.CrossEntropyLoss()
    train_step = 0
    test_step = 0
    sum_batch = len(train_loader) * (args.num_epoch - last_epoch)
    # Start training
    for epoch in range(last_epoch, args.num_epoch):
        loss_sum = []
        accuracies = []
        for batch_id, (spec_mag, label) in enumerate(train_loader()):
            start = time.time()
            feature = model(spec_mag)
            output = metric_fc(feature, label)
            # Compute the loss
            los = loss(output, label)
            los.backward()
            optimizer.step()
            optimizer.clear_grad()
            # Compute the accuracy
            label = paddle.reshape(label, shape=(-1, 1))
            acc = accuracy(input=paddle.nn.functional.softmax(output), label=label)
            accuracies.append(acc.numpy()[0])
            loss_sum.append(los)
            # Only rank 0 prints logs when training on multiple GPUs
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (
                    sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader),
                    sum(loss_sum) / len(loss_sum),
                    sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', los, train_step)
                train_step += 1
                loss_sum = []
        # Only rank 0 runs evaluation and saves the model when training on multiple GPUs
        if dist.get_rank() == 0:
            acc = test(model, metric_fc, test_loader)
            print('=' * 70)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('=' * 70)
            writer.add_scalar('Test acc', acc, test_step)
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()
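
# A rough sketch of the additive-angular-margin ("ArcFace"-style) head that the
# ArcNet(feature_dim=512, class_dim=...) used above implements: features and
# class weights are L2-normalized, the margin is added to the angle of the
# ground-truth class, and the result is scaled before cross entropy. Exact
# hyperparameters and clipping details in the repo's ArcNet may differ.
import math

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class ArcMarginHead(nn.Layer):
    def __init__(self, feature_dim, class_dim, margin=0.2, scale=30.0):
        super().__init__()
        self.weight = self.create_parameter([feature_dim, class_dim])
        self.margin = margin
        self.scale = scale

    def forward(self, feature, label):
        # cosine similarity between normalized features and class weight vectors
        cosine = paddle.matmul(
            F.normalize(feature, axis=1), F.normalize(self.weight, axis=0))
        sine = paddle.sqrt(paddle.clip(1.0 - cosine * cosine, min=0.0))
        # cos(theta + m), applied only to the ground-truth class below
        phi = cosine * math.cos(self.margin) - sine * math.sin(self.margin)
        one_hot = F.one_hot(label.reshape([-1]), num_classes=cosine.shape[1])
        logits = one_hot * phi + (1.0 - one_hot) * cosine
        return logits * self.scale
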