def train(config):
    """Run the WaveFlow training loop, optionally data-parallel across GPUs."""
    use_gpu = config.use_gpu

    # Distributed context: this process's rank and the total process count.
    rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    if rank == 0:
        # Print the whole config setting (chief process only).
        pprint(vars(config))

    # Run/checkpoint directories are needed on every rank (the model saves
    # into checkpoint_dir).
    run_dir = os.path.join("runs", config.model, config.name)
    checkpoint_dir = os.path.join(run_dir, "checkpoint")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Only rank 0 writes VisualDL logs; other ranks carry None.
    vdl = LogWriter(os.path.join(run_dir, "logs")) if rank == 0 else None

    # Pick the execution device for this process.
    place = fluid.CUDAPlace(rank) if use_gpu else fluid.CPUPlace()

    with dg.guard(place):
        # Fix all random seeds for reproducibility.
        seed = config.seed
        random.seed(seed)
        np.random.seed(seed)
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        print("Random Seed: ", seed)

        # Build the model; build() returns the iteration to resume from.
        model = WaveFlow(config, checkpoint_dir, parallel, rank, nranks, vdl)
        iteration = model.build()

        while iteration < config.max_iterations:
            # Single optimization step.
            model.train_step(iteration)
            iteration += 1

            if iteration % config.test_every == 0:
                # Periodic validation.
                model.valid_step(iteration)

            if rank == 0 and iteration % config.save_every == 0:
                # Periodic checkpointing, chief process only.
                model.save(iteration)

    # Close the VisualDL writer on the chief process.
    if rank == 0:
        vdl.close()
def synthesis(text_input, args):
    """Synthesize speech from text with FastSpeech and write a wav to disk.

    Args:
        text_input (str): Raw text to synthesize.
        args: Parsed CLI arguments; uses config, checkpoint, output, alpha,
            vocoder, checkpoint_vocoder and use_gpu.

    Raises:
        ValueError: If ``args.vocoder`` is neither 'griffin-lim' nor
            'waveflow'.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # VisualDL writer used to record the synthesized audio sample.
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    writer = LogWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    # Token ids plus 1-based position ids, both batched to shape (1, T).
    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # Synthesis using Griffin-Lim.
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        # Fail fast: the original code only printed here and then crashed on
        # the undefined `wav` below with a NameError.
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, '
            'but received %s.' % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])

    sample_dir = os.path.join(args.output, 'samples')
    if not os.path.exists(sample_dir):
        os.mkdir(sample_dir)
    write(os.path.join(sample_dir, args.vocoder + '.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
class VisualDL(Callback):
    """Training callback that writes scalar logs to VisualDL.

    Records every ``freq``-th iteration, from the rank-0 process only.
    """

    def __init__(self, log_dir="./log", freq=1):
        super(VisualDL, self).__init__()
        self.log_dir = log_dir
        self.freq = freq

    def on_train_begin(self, logs=None):
        # Create the writer lazily, once training actually starts.
        self.writer = LogWriter(self.log_dir)

    def on_iter_end(self, iter, logs=None):
        if not logs:
            logs = {}
        # Record only on the logging cadence and only from rank 0.
        if iter % self.freq == 0 and ParallelEnv().local_rank == 0:
            for name, value in logs.items():
                self.writer.add_scalar("Train/{}".format(name), value, iter)
            self.writer.flush()

    def on_train_end(self, logs=None):
        self.writer.close()
class VisualHook(Hook):
    """Trainer hook that logs epoch losses and parameter histograms to VisualDL.

    All work happens on rank 0 only; other ranks return immediately.
    """

    def __init__(self, priority=1):
        self.priority = priority

    def run_begin(self, trainer):
        if dist.get_rank() != 0:
            return
        # Create the log directory under the trainer's output dir.
        logdir = os.path.join(trainer.output_dir, 'visual_dl')
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        self.writer = LogWriter(logdir=logdir)
        # app.run(logdir=logdir, port=8040, host="0.0.0.0")

    def train_epoch_end(self, trainer):
        if dist.get_rank() != 0:
            return
        epoch = trainer.current_epoch
        # One scalar per tracked output: the epoch-average from trainer.logs.
        for key in trainer.outputs.keys():
            self.writer.add_scalar(tag='train/{}'.format(key),
                                   step=epoch,
                                   value=trainer.logs[key].avg)
        with paddle.no_grad():
            # DataParallel wraps the real network in `_layers`; unwrap it so
            # both the distributed and single-process cases share one loop.
            if dist.get_world_size() > 1:
                network = trainer.model._layers
            else:
                network = trainer.model
            for name, param in network.named_parameters():
                # Batch-norm parameters are skipped.
                if 'bn' not in name:
                    self.writer.add_histogram(name, param.numpy(), epoch)

    def run_end(self, trainer):
        if dist.get_rank() != 0:
            return
        self.writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5):
    """
    Launch training.

    Args:
        model(nn.Layer): A sementic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How may iters to train the model. Defualt: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
    """
    model.train()
    # Distributed context: process count and this process's rank.
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        # Resume returns the iteration the checkpoint was saved at.
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir exists and is a directory (replace a plain file).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Running accumulators reset at every logging interval.
    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    # Checkpoint dirs kept on disk, oldest-first, capped by keep_checkpoint_max.
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            # data layout: [images, labels] with optional third element edges.
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                edges = data[2].astype('int64')

            # Forward through the DataParallel wrapper when distributed.
            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)
            loss_list = loss_computation(logits_list=logits_list,
                                         labels=labels,
                                         losses=losses,
                                         edges=edges)
            loss = sum(loss_list)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Advance the LR scheduler per-iteration if one is attached.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            # Accumulate each component loss separately for logging.
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            # Periodic console/VisualDL logging (rank 0 only).
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                # Reset accumulators for the next logging window.
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # Periodic evaluation (all ranks run evaluate; results are used
            # below only on rank 0).
            if (iter % save_interval == 0
                    or iter == iters) and (val_dataset is not None):
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc, class_iou, _, _ = evaluate(
                    model, val_dataset, num_workers=num_workers)
                model.train()

            # Periodic checkpointing (rank 0 only); prunes oldest snapshots.
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

                # Track and persist the best model by validation mIoU.
                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        for i, iou in enumerate(class_iou):
                            log_writer.add_scalar('Evaluate/IoU {}'.format(i),
                                                  float(iou), iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:

        def count_syncbn(m, x, y):
            # Custom FLOPs rule: 2 ops per element for SyncBatchNorm.
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        # Uses the shape of the last training batch; assumes the loop ran at
        # least once (images is otherwise undefined) — TODO confirm.
        _, c, h, w = images.shape
        flops = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def do_train(args):
    """Pretrain an ERNIE-Health (ELECTRA-style) model: generator plus
    discriminator, with optional AMP, distributed data parallelism and
    resume-from-checkpoint."""
    # Expression-statement toggle: static graph mode unless eager is requested.
    paddle.enable_static() if not args.eager_run else None
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    # Per-rank seed offset for dataloader worker initialization.
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initialize a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models:
        # Fresh start from a known pretrained identifier.
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-generator']))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-discriminator']))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            # run_states.json records the original model name and progress.
            with open(
                    os.path.join(args.model_name_or_path, 'run_states.json'),
                    'r') as f:
                config_dict = json.load(f)
                model_name = config_dict['model_name']
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-generator']))
                discriminator = ErnieHealthDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-discriminator']))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     'model_state.pdparams')))
            else:
                raise ValueError(
                    'initialize a model from ckpt need model_name '
                    'in model_config_file. The supported model_name '
                    'are as follows: {}'.format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                'initialize a model need identifier or the '
                'directory of storing model. if use identifier, the supported model '
                'identifiers are as follows: {}, if use directory, '
                'make sure set init_from_ckpt as True'.format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ErnieHealthPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config['vocab_size'],
        model.gen_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    logger.info('start load data : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))

    train_dataset = MedicalCorpus(data_path=args.input_dir,
                                  tokenizer=tokenizer)
    logger.info('load data done, total : %s s' % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm_prob=args.mlm_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        mode='train',
        use_gpu=True if args.device in 'gpu' else False,
        data_collator=data_collator)

    # Derive total steps and the epoch count needed to reach them.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_epochs)
    args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    logger.info('start train : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    # t_loss: running totals since start; log_loss: totals at last log point.
    trained_global_step = global_step = 0
    t_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    log_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    loss_list = defaultdict(list)
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        # Restore optimizer state and skip already-trained steps.
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, 'model_state.pdopt')))
        trained_global_step = global_step = config_dict['global_step']
        if trained_global_step < num_training_steps:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s'
                % (trained_global_step, trained_global_step + 1))
        else:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !'
                % (trained_global_step, num_training_steps))
            exit(0)

    if paddle.distributed.get_rank() == 0:
        writer = LogWriter(os.path.join(args.output_dir, 'loss_log'))

    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader):
            # Fast-forward through batches already covered by the checkpoint.
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            masked_input_ids, input_ids, gen_labels = batch

            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                        input_ids=masked_input_ids,
                        raw_input_ids=input_ids,
                        generator_labels=gen_labels)
                    loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                        gen_logits, gen_labels, logits_rtd, logits_mts,
                        logits_csp, disc_labels, masks)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                    input_ids=masked_input_ids,
                    raw_input_ids=input_ids,
                    generator_labels=gen_labels)
                loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                    gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp,
                    disc_labels, masks)
                loss.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                # Window-average = (running total - total at last log) / steps.
                local_loss = dict([
                    (k, (t_loss[k] - log_loss[k]) / args.logging_steps)
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                ])
                if paddle.distributed.get_world_size() > 1:
                    # Gather per-rank window averages; rank 0 logs the mean.
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']:
                        paddle.distributed.all_gather(loss_list[k],
                                                      local_loss[k])
                    if paddle.distributed.get_rank() == 0:
                        tmp_loss = dict([
                            (k, float((paddle.stack(loss_list[k]).sum() /
                                       len(loss_list[k])).numpy()))
                            for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                        ])
                        log_str = (
                            'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                            'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                            'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                        ).format(global_step, num_training_steps, epoch, step,
                                 tmp_loss['loss'], tmp_loss['gen'],
                                 tmp_loss['rtd'], tmp_loss['mts'],
                                 tmp_loss['csp'], optimizer.get_lr(),
                                 (time.time() - tic_train) /
                                 args.logging_steps)
                        logger.info(log_str)
                        log_list.append(log_str)
                        # rtd/mts are rescaled for plotting; presumably to put
                        # the curves on a comparable scale — TODO confirm.
                        writer.add_scalar('generator_loss', tmp_loss['gen'],
                                          global_step)
                        writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50,
                                          global_step)
                        writer.add_scalar('mts_loss', tmp_loss['mts'] * 20,
                                          global_step)
                        writer.add_scalar('csp_loss', tmp_loss['csp'],
                                          global_step)
                        writer.add_scalar('total_loss', tmp_loss['loss'],
                                          global_step)
                        writer.add_scalar('lr', optimizer.get_lr(),
                                          global_step)
                    loss_list = defaultdict(list)
                else:
                    # Single-process path: log this process's window averages.
                    local_loss = dict([(k, v.numpy()[0])
                                       for k, v in local_loss.items()])
                    log_str = (
                        'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                        'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                        'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                    ).format(global_step, num_training_steps, epoch, step,
                             local_loss['loss'], local_loss['gen'],
                             local_loss['rtd'], local_loss['mts'],
                             local_loss['csp'], optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    logger.info(log_str)
                    log_list.append(log_str)
                    loss_dict = {
                        'generator_loss': local_loss['gen'],
                        'rtd_loss': local_loss['rtd'] * 50,
                        'mts_loss': local_loss['mts'] * 20,
                        'csp_loss': local_loss['csp']
                    }
                    for k, v in loss_dict.items():
                        writer.add_scalar('loss/%s' % k, v, global_step)
                    writer.add_scalar('total_loss', local_loss['loss'],
                                      global_step)
                    writer.add_scalar('lr', optimizer.get_lr(), global_step)
                # Snapshot running totals so the next window diffs against them.
                log_loss = dict(t_loss)
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              'model_%d.pdparams' %
                                              global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Unwrap DataParallel before reading the config to save.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        'model_name': model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        'global_step': global_step,
                        'epoch': epoch,
                        'step': step,
                    }
                    with open(os.path.join(output_dir, 'model_config.json'),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, 'run_states.json'),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, 'model_state.pdparams'))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, 'model_state.pdopt'))
                    # Persist the accumulated console log alongside the model.
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, 'train.log'),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            # Stop once the step budget is reached.
            if global_step >= num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    writer.close()
                return
def main(args):
    """Train the (TransformerTTS-style) vocoder on LJSpeech, optionally
    data-parallel across GPUs."""
    # Distributed context for this process.
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes VisualDL logs; other ranks carry None.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    # Noam LR schedule: base scale 1 / (warmup * lr^2) with warmup steps.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    # NOTE(review): the outer loop iterates max_iteration times over epochs,
    # not optimization steps — the naming is misleading; confirm intent.
    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            # Predict magnitude spectrogram; L1 loss against ground truth.
            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                # Scale loss and all-reduce gradients for data parallelism.
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None):
    """
    Launch training.

    Args:
        model(nn.Layer): A sementic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How may iters to train the model. Defualt: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
    """
    # Distributed context: process count and this process's rank.
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        # Resume returns the iteration the checkpoint was saved at.
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir exists and is a directory (replace a plain file).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel training environment.
        paddle.distributed.init_parallel_env()
        strategy = paddle.distributed.prepare_context()
        ddp_model = paddle.DataParallel(model, strategy)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    # Accumulators reset at every logging interval.
    avg_loss = 0.0
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    timer.start()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            train_reader_cost += timer.elapsed_time()
            # data layout: [images, labels] with optional third element edges.
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                edges = data[2].astype('int64')

            # Forward through the DataParallel wrapper when distributed.
            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)
            loss = loss_computation(logits_list=logits_list,
                                    labels=labels,
                                    losses=losses,
                                    edges=edges)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Advance the LR scheduler per-iteration if one is attached.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            train_batch_cost += timer.elapsed_time()

            # Periodic console/VisualDL logging (rank 0 only).
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_train_reader_cost = train_reader_cost / log_iters
                avg_train_batch_cost = train_batch_cost / log_iters
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_iters = iters - iter
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0

            # Periodic evaluation; results used in the save block below.
            if (iter % save_interval == 0
                    or iter == iters) and (val_dataset is not None):
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc = evaluate(model,
                                         val_dataset,
                                         num_workers=num_workers)
                model.train()

            # Periodic checkpointing (rank 0 only).
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))

                # Track and persist the best model by validation mIoU.
                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            timer.restart()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          threshold=0.1,
          nms_kernel=7,
          top_k=200):
    """
    Launch training.

    Args:
        model(nn.Layer): A sementic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How may iters to train the model. Defualt: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        threshold (float, optional): A Float, threshold applied to center heatmap score. Default: 0.1.
        nms_kernel (int, optional): An Integer, NMS max pooling kernel size. Default: 7.
        top_k (int, optional): An Integer, top k centers to keep. Default: 200.
    """
    model.train()
    # Distributed context: process count and this process's rank.
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank
    start_iter = 0
    if resume_model is not None:
        # Resume returns the iteration the checkpoint was saved at.
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir exists and is a directory (replace a plain file).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Accumulators reset at every logging interval; best-PQ tracking for the
    # panoptic "best_model" checkpoint.
    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_pq = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            # Panoptic batch layout: image plus per-task targets and weights.
            images = data[0]
            semantic = data[1]
            semantic_weights = data[2]
            center = data[3]
            center_weights = data[4]
            offset = data[5]
            offset_weights = data[6]
            foreground = data[7]

            # Forward through the DataParallel wrapper when distributed.
            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)

            loss_list = loss_computation(logits_list=logits_list,
                                         losses=losses,
                                         semantic=semantic,
                                         semantic_weights=semantic_weights,
                                         center=center,
                                         center_weights=center_weights,
                                         offset=offset,
                                         offset_weights=offset_weights)
            loss = sum(loss_list)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Advance the LR scheduler per-iteration if one is attached.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()

            avg_loss += loss.numpy()[0]
            # Accumulate each component loss separately for logging.
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            # Periodic console/VisualDL logging (rank 0 only).
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                # Component order follows loss_computation: semantic, center,
                # offset — TODO confirm against its definition.
                logger.info(
                    "[LOSS] loss={:.4f}, semantic_loss={:.4f}, center_loss={:.4f}, offset_loss={:.4f}"
                    .format(avg_loss, avg_loss_list[0], avg_loss_list[1],
                            avg_loss_list[2]))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                # Reset accumulators for the next logging window.
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # save model
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Prune oldest snapshots beyond keep_checkpoint_max.
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            # eval model
            # Evaluation only starts in the second half of training.
            if (iter % save_interval == 0 or iter == iters) and (
                    val_dataset is
                    not None) and local_rank == 0 and iter > iters // 2:
                num_workers = 1 if num_workers > 0 else 0
                panoptic_results, semantic_results, instance_results = evaluate(
                    model,
                    val_dataset,
                    threshold=threshold,
                    nms_kernel=nms_kernel,
                    top_k=top_k,
                    num_workers=num_workers,
                    print_detail=False)
                pq = panoptic_results['pan_seg']['All']['pq']
                miou = semantic_results['sem_seg']['mIoU']
                map = instance_results['ins_seg']['mAP']
                map50 = instance_results['ins_seg']['mAP50']
                logger.info(
                    "[EVAL] PQ: {:.4f}, mIoU: {:.4f}, mAP: {:.4f}, mAP50: {:.4f}"
                    .format(pq, miou, map, map50))
                model.train()

            # save best model and add evaluate results to vdl
            # Relies on pq/miou/map/map50 set by the eval block above under
            # the same guard conditions.
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                if val_dataset is not None and iter > iters // 2:
                    if pq > best_pq:
                        best_pq = pq
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation pq ({:.4f}) was saved at iter {}.'
                        .format(best_pq, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/PQ', pq, iter)
                        log_writer.add_scalar('Evaluate/mIoU', miou, iter)
                        log_writer.add_scalar('Evaluate/mAP', map, iter)
                        log_writer.add_scalar('Evaluate/mAP50', map50, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:

        def count_syncbn(m, x, y):
            # Custom FLOPs rule: 2 ops per element for SyncBatchNorm.
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        # Uses the shape of the last training batch; assumes the loop ran at
        # least once (images is otherwise undefined) — TODO confirm.
        _, c, h, w = images.shape
        flops = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
class Engine(object):
    """Unified runner for a PaddleClas-style pipeline.

    One instance drives exactly one of four modes ("train", "eval", "infer",
    "export"); the constructor builds only the pieces the chosen mode needs
    (dataloaders, losses, metrics, optimizer, postprocess).
    """

    def __init__(self, config, mode="train"):
        """Build every component required by `mode` from the config dict.

        Args:
            config: nested dict-like configuration ("Global", "Arch",
                "DataLoader", "Loss", "Metric", "Optimizer", optional "AMP",
                "Infer" sections).
            mode: one of "train", "eval", "infer", "export".
        """
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        # A model counts as "recognition" when it declares a Head or is
        # explicitly flagged via Arch.is_rec.
        if "Head" in self.config["Arch"] or self.config["Arch"].get(
                "is_rec", False):
            self.is_rec = True
        else:
            self.is_rec = False

        # set seed
        # NOTE: `seed or seed == 0` accepts 0 as a valid seed while treating
        # the False default (seed absent) as "do not seed".
        seed = self.config["Global"].get("seed", False)
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        assert self.eval_mode in ["classification", "retrieval"], logger.error(
            "Invalid eval mode: {}".format(self.eval_mode))
        self.train_epoch_func = train_epoch
        # Resolves to evaluation.classification_eval or
        # evaluation.retrieval_eval.
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl
        # Only rank 0 of a distributed training run writes VisualDL logs.
        self.vdl_writer = None
        if self.config['Global'][
                'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in [
            "cpu", "gpu", "xpu", "npu", "mlu"
        ]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        # AMP is only enabled for training and only when an "AMP" section
        # exists in the config.
        self.amp = True if "AMP" in self.config and self.mode == "train" else False
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_max_inplace_grad_add': 8,
            }
            if paddle.is_compiled_with_cuda():
                AMP_RELATED_FLAGS_SETTING.update(
                    {'FLAGS_cudnn_batchnorm_spatial_persistent': 1})
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        # Migrate the deprecated Global.class_num to Arch.class_num; an
        # explicit Arch.class_num always wins.
        if "class_num" in config["Global"]:
            global_class_num = config["Global"]["class_num"]
            if "class_num" not in config["Arch"]:
                config["Arch"]["class_num"] = global_class_num
                msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
            else:
                msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
            logger.warning(msg)
        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})

        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(
                self.config["DataLoader"], "Train", self.device, self.use_dali)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                # Retrieval eval either uses one combined gallery+query
                # loader (single key under DataLoader.Eval) or separate
                # Gallery and Query loaders.
                self.gallery_query_dataloader = None
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)
                else:
                    self.eval_loss_func = None
            else:
                self.eval_loss_func = None

        # build metric
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    # A custom collate_fn implies batch_transform_ops (e.g.
                    # mixup), which makes per-sample TopkAcc meaningless.
                    if hasattr(
                            self.train_dataloader, "collate_fn"
                    ) and self.train_dataloader.collate_fn is not None:
                        for m_idx, m in enumerate(metric_config):
                            if "TopkAcc" in m:
                                msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                                logger.warning(msg)
                                break
                        metric_config.pop(m_idx)
                    self.train_metric_func = build_metrics(metric_config)
                else:
                    self.train_metric_func = None
            else:
                self.train_metric_func = None

        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                # Default retrieval metric when none is configured.
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)
        else:
            self.eval_metric_func = None

        # build model
        self.model = build_model(self.config)
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # load_pretrain
        if self.config["Global"]["pretrained_model"] is not None:
            if self.config["Global"]["pretrained_model"].startswith("http"):
                load_dygraph_pretrain_from_url(
                    self.model, self.config["Global"]["pretrained_model"])
            else:
                load_dygraph_pretrain(
                    self.model, self.config["Global"]["pretrained_model"])

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for amp training
        if self.amp:
            self.scaler = paddle.amp.GradScaler(
                init_loss_scaling=self.scale_loss,
                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
            amp_level = self.config['AMP'].get("level", "O1")
            if amp_level not in ["O1", "O2"]:
                msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'."
                logger.warning(msg)
                self.config['AMP']["level"] = "O1"
                amp_level = "O1"
            self.model, self.optimizer = paddle.amp.decorate(
                models=self.model,
                optimizers=self.optimizer,
                level=amp_level,
                save_dtype='float32')

        # for distributed
        world_size = dist.get_world_size()
        self.config["Global"]["distributed"] = world_size != 1
        # Shipped configs assume 4 GPUs; warn when the actual world size
        # differs so the user rescales LR / batch size.
        if world_size != 4 and self.mode == "train":
            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
            logger.warning(msg)
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])

    def train(self):
        """Run the full training loop: per-epoch training, periodic eval,
        best/periodic/latest checkpointing, and VisualDL scalar logging."""
        assert self.mode == "train"
        print_batch_step = self.config['Global']['print_batch_step']
        save_interval = self.config["Global"]["save_interval"]
        best_metric = {
            "metric": 0.0,
            "epoch": 0,
        }
        # key:
        # val: metrics list word
        self.output_info = dict()
        self.time_info = {
            "batch_cost": AverageMeter(
                "batch_cost", '.5f', postfix=" s,"),
            "reader_cost": AverageMeter(
                "reader_cost", ".5f", postfix=" s,"),
        }
        # global iter counter
        self.global_step = 0

        # Resuming from a checkpoint restores the best metric/epoch so the
        # epoch range below continues where training stopped.
        if self.config["Global"]["checkpoints"] is not None:
            metric_info = init_model(self.config["Global"], self.model,
                                     self.optimizer)
            if metric_info is not None:
                best_metric.update(metric_info)

        # On Windows the last dataloader batch is skipped (known platform
        # quirk in this codebase).
        self.max_iter = len(self.train_dataloader) - 1 if platform.system(
        ) == "Windows" else len(self.train_dataloader)
        for epoch_id in range(best_metric["epoch"] + 1,
                              self.config["Global"]["epochs"] + 1):
            acc = 0.0
            # for one epoch train
            self.train_epoch_func(self, epoch_id, print_batch_step)

            if self.use_dali:
                self.train_dataloader.reset()
            metric_msg = ", ".join([
                "{}: {:.5f}".format(key, self.output_info[key].avg)
                for key in self.output_info
            ])
            logger.info("[Train][Epoch {}/{}][Avg]{}".format(
                epoch_id, self.config["Global"]["epochs"], metric_msg))
            self.output_info.clear()

            # eval model and save model if possible
            if self.config["Global"][
                    "eval_during_train"] and epoch_id % self.config["Global"][
                        "eval_interval"] == 0:
                acc = self.eval(epoch_id)
                if acc > best_metric["metric"]:
                    best_metric["metric"] = acc
                    best_metric["epoch"] = epoch_id
                    save_load.save_model(
                        self.model,
                        self.optimizer,
                        best_metric,
                        self.output_dir,
                        model_name=self.config["Arch"]["name"],
                        prefix="best_model")
                logger.info("[Eval][Epoch {}][best metric: {}]".format(
                    epoch_id, best_metric["metric"]))
                logger.scaler(
                    name="eval_acc",
                    value=acc,
                    step=epoch_id,
                    writer=self.vdl_writer)

                # eval() leaves the model in eval mode; switch back.
                self.model.train()

            # save model
            if epoch_id % save_interval == 0:
                save_load.save_model(
                    self.model,
                    self.optimizer, {"metric": acc,
                                     "epoch": epoch_id},
                    self.output_dir,
                    model_name=self.config["Arch"]["name"],
                    prefix="epoch_{}".format(epoch_id))
            # save the latest model
            save_load.save_model(
                self.model,
                self.optimizer, {"metric": acc,
                                 "epoch": epoch_id},
                self.output_dir,
                model_name=self.config["Arch"]["name"],
                prefix="latest")

        if self.vdl_writer is not None:
            self.vdl_writer.close()

    @paddle.no_grad()
    def eval(self, epoch_id=0):
        """Evaluate the model and return the eval metric (model is restored
        to train mode before returning)."""
        assert self.mode in ["train", "eval"]
        self.model.eval()
        eval_result = self.eval_func(self, epoch_id)
        self.model.train()
        return eval_result

    @paddle.no_grad()
    def infer(self):
        """Run batched inference over Infer.infer_imgs and print the
        postprocessed results; images are sharded across ranks."""
        assert self.mode == "infer" and self.eval_mode == "classification"
        total_trainer = dist.get_world_size()
        local_rank = dist.get_rank()
        image_list = get_image_list(self.config["Infer"]["infer_imgs"])
        # data split
        image_list = image_list[local_rank::total_trainer]

        batch_size = self.config["Infer"]["batch_size"]
        self.model.eval()
        batch_data = []
        image_file_list = []
        for idx, image_file in enumerate(image_list):
            with open(image_file, 'rb') as f:
                x = f.read()
            for process in self.preprocess_func:
                x = process(x)
            batch_data.append(x)
            image_file_list.append(image_file)
            # Flush a full batch, or the final partial batch.
            if len(batch_data) >= batch_size or idx == len(image_list) - 1:
                batch_tensor = paddle.to_tensor(batch_data)
                out = self.model(batch_tensor)
                # Unwrap the various output containers models may return.
                if isinstance(out, list):
                    out = out[0]
                if isinstance(out, dict) and "logits" in out:
                    out = out["logits"]
                if isinstance(out, dict) and "output" in out:
                    out = out["output"]
                result = self.postprocess_func(out, image_file_list)
                print(result)
                batch_data.clear()
                image_file_list.clear()

    def export(self):
        """Export the model for inference (quantized path when a quanter is
        attached, otherwise via paddle.jit.to_static)."""
        assert self.mode == "export"
        use_multilabel = self.config["Global"].get("use_multilabel", False)
        model = ExportModel(self.config["Arch"], self.model, use_multilabel)
        if self.config["Global"]["pretrained_model"] is not None:
            load_dygraph_pretrain(model.base_model,
                                  self.config["Global"]["pretrained_model"])
        model.eval()
        save_path = os.path.join(self.config["Global"]["save_inference_dir"],
                                 "inference")
        if model.quanter:
            model.quanter.save_quantized_model(
                model.base_model,
                save_path,
                input_spec=[
                    paddle.static.InputSpec(
                        shape=[None] + self.config["Global"]["image_shape"],
                        dtype='float32')
                ])
        else:
            model = paddle.jit.to_static(
                model,
                input_spec=[
                    paddle.static.InputSpec(
                        shape=[None] + self.config["Global"]["image_shape"],
                        dtype='float32')
                ])
            paddle.jit.save(model, save_path)
def synthesis(text_input, args):
    """Synthesize speech for `text_input` with TransformerTTS + a vocoder.

    Runs autoregressive mel decoding, logs attention maps and the generated
    audio to VisualDL, and writes `<vocoder>.wav` under `args.output/samples`.

    Args:
        text_input: plain text to synthesize.
        args: namespace with config, checkpoint_transformer, use_gpu, output,
            max_len, stop_threshold, vocoder, checkpoint_vocoder.

    Raises:
        ValueError: if `args.vocoder` is not 'griffin-lim' or 'waveflow'.
    """
    # Fail fast on a bad vocoder choice, before loading any model.
    # (Previously this was only printed, and the code then crashed with a
    # NameError on the undefined `wav`.)
    if args.vocoder not in ('griffin-lim', 'waveflow'):
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, but received %s.'
            % args.vocoder)

    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # init input: token ids plus 1-based position ids; decoding starts
        # from a single all-zero mel frame.
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(
            dg.to_variable(text).astype(np.int64), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(
            dg.to_variable(pos_text).astype(np.int64), [0])

        # Autoregressive decoding: append the last postnet frame each step
        # until the stop token fires or max_len is reached.
        for i in range(args.max_len):
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(
                dg.to_variable(pos_mel).astype(np.int64), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            if stop_preds.numpy()[0, -1] > args.stop_threshold:
                break
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

        # Log attention maps (first 4 heads per layer) for inspection.
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image('Attention_%d_0' % global_step, x, i * 4 + j)

        if args.vocoder == 'griffin-lim':
            #synthesis use griffin-lim
            wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
        else:
            # synthesis use waveflow
            wav = synthesis_with_waveflow(postnet_pred, args,
                                          args.checkpoint_vocoder, place)

        writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                         cfg['audio']['sr'])
        if not os.path.exists(os.path.join(args.output, 'samples')):
            os.mkdir(os.path.join(args.output, 'samples'))
        write(
            os.path.join(
                os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
            cfg['audio']['sr'], wav)
        print("Synthesis completed !!!")
        writer.close()
def main(args):
    """Train TransformerTTS on LJSpeech.

    Supports multi-GPU data-parallel training; only rank 0 writes VisualDL
    scalars/images and saves checkpoints.

    Args:
        args: namespace with config, use_gpu, output, data, iteration,
            checkpoint.
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 owns a log writer; other ranks keep None.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    # Noam LR schedule with warm-up plus global-norm gradient clipping.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        # Restart the reader when an epoch is exhausted so training runs
        # purely on an iteration budget.
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        # L1 losses on the pre- and post-net mel predictions, plus a
        # weighted stop-token loss.
        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(), global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(), global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            # DataParallel wraps the model; reach the real layers via
            # _layers when parallel.
            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(), global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(), global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            # Periodically log attention maps (per layer and head) as
            # images.
            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step, x, i * 4 + j)

        if parallel:
            # Scale the loss and all-reduce gradients across ranks.
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
def train(args):
    """Train a face-recognition model with Paddle static graph.

    Builds train/test static programs via StaticModel, runs the step-based
    training loop, periodically validates (rank 0 only) and checkpoints
    every epoch.

    Args:
        args: parsed CLI namespace (dataset, schedule, model and fp16
            options).
    """
    writer = LogWriter(logdir=args.logdir)

    # Rank/world size come from the Paddle launcher environment; defaults
    # give a single-process run.
    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    if world_size > 1:
        import paddle.distributed.fleet as fleet

        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = CommonDataset(
            root_dir=args.data_dir,
            label_file=args.label_file,
            fp16=args.fp16,
            is_bin=args.is_bin)

    num_image = len(trainset)
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    # train_unit selects whether warmup/train/decay numbers are expressed
    # in epochs or directly in steps.
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = [x for x in args.decay_boundaries]
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    if rank == 0:
        logging.info('world_size: {}'.format(world_size))
        logging.info('total_batch_size: {}'.format(total_batch_size))
        logging.info('warmup_steps: {}'.format(warmup_steps))
        logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
        logging.info('total_steps: {}'.format(total_steps))
        logging.info('total_epoch: {}'.format(total_epoch))
        logging.info('decay_steps: {}'.format(decay_steps))

    # Linear LR scaling rule: base LR is proportional to the global batch
    # size (reference batch 512).
    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    train_program = paddle.static.Program()
    test_program = paddle.static.Program()
    startup_program = paddle.static.Program()

    # NOTE(review): eval() resolves the loss class by name from `losses`;
    # args.loss must be a trusted CLI value.
    margin_loss_params = eval("losses.{}".format(args.loss))()

    train_model = StaticModel(
        main_program=train_program,
        startup_program=startup_program,
        backbone_class_name=args.backbone,
        embedding_size=args.embedding_size,
        classifier_class_name=args.classifier,
        num_classes=args.num_classes,
        sample_ratio=args.sample_ratio,
        lr_scheduler=lr_scheduler,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
        dropout=args.dropout,
        mode='train',
        fp16=args.fp16,
        fp16_configs={
            'init_loss_scaling': args.init_loss_scaling,
            'incr_every_n_steps': args.incr_every_n_steps,
            'decr_every_n_nan_or_inf': args.decr_every_n_nan_or_inf,
            'incr_ratio': args.incr_ratio,
            'decr_ratio': args.decr_ratio,
            'use_dynamic_loss_scaling': args.use_dynamic_loss_scaling,
            'use_pure_fp16': args.fp16,
            'custom_white_list': args.custom_white_list,
            'custom_black_list': args.custom_black_list,
        },
        margin_loss_params=margin_loss_params, )

    # Dump the built program for debugging (rank 0 only).
    if rank == 0:
        with open(os.path.join(args.output, 'main_program.txt'), 'w') as f:
            f.write(str(train_program))

    if rank == 0 and args.do_validation_while_train:
        # A separate test-mode model sharing the startup program, used by
        # the periodic verification callback.
        test_model = StaticModel(
            main_program=test_program,
            startup_program=startup_program,
            backbone_class_name=args.backbone,
            embedding_size=args.embedding_size,
            dropout=args.dropout,
            mode='test',
            fp16=args.fp16, )

        callback_verification = CallBackVerification(
            args.validation_interval_step, rank, args.batch_size, test_program,
            list(test_model.backbone.input_dict.values()),
            list(test_model.backbone.output_dict.values()), args.val_targets,
            args.data_dir)

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size, writer)

    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(program=train_program, for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # there last_epoch means last_step in for PiecewiseDecay
        # since we always use step style for lr_scheduler
        global_step = lr_state['last_epoch']
        train_model.lr_scheduler.set_state_dict(lr_state)

    train_loader = paddle.io.DataLoader(
        trainset,
        feed_list=list(train_model.backbone.input_dict.values()),
        places=place,
        return_list=False,
        num_workers=args.num_workers,
        batch_sampler=paddle.io.DistributedBatchSampler(
            dataset=trainset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True))

    max_loss_scaling = np.array([args.max_loss_scaling]).astype(np.float32)

    for epoch in range(start_epoch, total_epoch):
        for step, data in enumerate(train_loader):
            global_step += 1
            loss_v = exe.run(
                train_program,
                feed=data,
                fetch_list=[train_model.classifier.output_dict['loss']],
                use_program_cache=True)
            loss_avg.update(np.array(loss_v)[0], 1)
            lr_value = train_model.optimizer.get_lr()
            callback_logging(global_step, loss_avg, epoch, lr_value)
            if rank == 0 and args.do_validation_while_train:
                callback_verification(global_step)
            # Step-based LR schedule: advance once per batch.
            train_model.lr_scheduler.step()

            if global_step >= total_steps:
                break

            sys.stdout.flush()

        # One checkpoint per epoch (includes LR scheduler state for resume).
        checkpoint.save(
            train_program,
            lr_scheduler=train_model.lr_scheduler,
            epoch=epoch,
            for_train=True)
    writer.close()
def main(args):
    """Run ensemble inference for segmentation and log results to VisualDL.

    For every model name in the config, loads its weights, predicts over the
    inference dataset, and accumulates predictions (each previous sum is
    scaled by `scale` before adding the new model's output). The stitched
    predictions are saved as .tif labels plus gray/color .png previews and
    logged as VisualDL images.

    Args:
        args: namespace with `config` (config file path) and `save_dir`.
    """
    config = Config(args.config)
    cfg = config(vars(args), mode=['infer', 'init'])
    scale = cfg['infer']['scale']
    mdname = cfg['infer']['model']
    imgname = ''.join(mdname)  # + '/' + str(scale)
    #dirname = ''.join(mdname) + '_' + str(scale)
    sz = cfg['infer']['sz']
    infer_size = cfg['infer']['infer_size']
    #save_path = os.path.join(args.save_dir, cfg['init']['result'])
    # Output layout: save_dir/result/<models>/<scale>/{lab,color,gray}
    save_path = create_path(args.save_dir, cfg['init']['result'])
    save_path = create_path(save_path, imgname)
    save_path = create_path(save_path, str(scale))
    tif_path = create_path(save_path, cfg['infer']['lab'])
    color_path = create_path(save_path, cfg['infer']['color'])
    gray_path = create_path(save_path, cfg['infer']['gray'])
    vdl_dir = os.path.join(args.save_dir, cfg['init']['vdl_dir'])
    palette = cfg['infer']['palette']
    palette = np.array(palette, dtype=np.uint8)
    num_class = cfg['init']['num_classes']
    batchsz = cfg['infer']['batchsz']
    infer_path = os.path.join(cfg['infer']['root_path'], cfg['infer']['path'])
    tagname = imgname + '/' + str(scale)
    vdl_dir = os.path.join(vdl_dir, 'infer')
    writer = LogWriter(logdir=vdl_dir)

    infer_ds = TeDataset(
        path=cfg['infer']['root_path'], fl=cfg['infer']['path'], sz=sz)
    total = len(infer_ds)

    # select model
    #addresult = np.zeros((total//batchsz,batchsz,num_class,sz,sz))
    addresult = np.zeros((total, num_class, sz, sz))
    for mnet in mdname:
        net = modelset(mode=mnet, num_classes=cfg['init']['num_classes'])
        # load model
        in_spec = InputSpec([None, 3, 64, 64], 'float32', 'x')
        lb_spec = InputSpec([None, 1, 64, 64], 'int64', 'label')
        model = paddle.Model(net, in_spec, lb_spec)
        model.load(path=os.path.join(args.save_dir, mnet) + '/' + mnet)
        model.prepare()
        result = model.predict(
            infer_ds,
            batch_size=batchsz,
            num_workers=cfg['infer']['num_workers'],
            stack_outputs=True  # [160,2,64,64]
        )
        # Weighted running ensemble: previous accumulation is scaled before
        # adding the new model's stacked predictions.
        addresult = result[0] + scale * addresult

    pred = construct(addresult, infer_size, sz=sz)
    # pred = construct(addresult,infer_size,sz = sz)
    # # erosion / dilation post-processing (disabled)
    # read vdl
    file_list = os.listdir(infer_path)
    # NOTE(review): the key uses only the single character before the file
    # extension, so ordering breaks with >= 10 files — confirm the expected
    # naming scheme before changing.
    file_list.sort(key=lambda x: int(x[-5:-4]))
    step = 0
    for i, fl in enumerate(file_list):
        name, _ = fl.split(".")
        # save pred
        lab_img = Image.fromarray(pred[i].astype(np.uint8)).convert("L")
        saveimg(lab_img, tif_path, name=name, type='.tif')
        # gray_label
        label = colorize(pred[i], palette)
        writer.add_image(
            tag=tagname,
            img=saveimg(
                label, gray_path, name=name, type='.png', re_out=True),
            step=step,
            dataformats='HW')
        step += 1
        # color_label
        file = os.path.join(infer_path, fl)
        out = blend_image(file, label, alpha=0.25)
        writer.add_image(
            tag=tagname,
            img=saveimg(
                out, color_path, name=name, type='.png', re_out=True),
            step=step,
            dataformats='HWC')
        step += 1

    writer.close()
def train(model,
          train_dataset,
          places=None,
          eval_dataset=None,
          optimizer=None,
          save_dir='output',
          num_epochs=100,
          batch_size=2,
          pretrained_model=None,
          resume_model=None,
          save_interval_epochs=1,
          log_steps=10,
          num_classes=None,
          num_workers=8,
          use_vdl=False):
    """Train a dygraph segmentation model.

    Runs the epoch/step loop with reader/batch timing, periodic logging,
    epoch checkpointing, optional evaluation with best-model tracking, and
    optional VisualDL scalar logging (rank 0 only).

    Args:
        model: segmentation model returning a loss from (images, labels);
            must expose `ignore_index`.
        train_dataset: training dataset.
        places: device places for the DataLoader.
        eval_dataset: optional dataset evaluated after each checkpoint.
        optimizer: optimizer used via `minimize`.
        save_dir: root directory for checkpoints and logs.
        num_epochs: total number of epochs.
        batch_size: per-card batch size.
        pretrained_model: optional weights to initialize from.
        resume_model: optional checkpoint to resume from (overrides
            pretrained_model).
        save_interval_epochs: checkpoint/eval frequency in epochs.
        log_steps: logging frequency in steps.
        num_classes: number of classes, forwarded to evaluate().
        num_workers: DataLoader worker count.
        use_vdl: whether to write VisualDL scalars.
    """
    ignore_index = model.ignore_index
    nranks = ParallelEnv().nranks

    start_epoch = 0
    if resume_model is not None:
        start_epoch = resume(model, optimizer, resume_model)
    elif pretrained_model is not None:
        load_pretrained_model(model, pretrained_model)

    # If save_dir exists as a plain file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        strategy = fluid.dygraph.prepare_context()
        ddp_model = fluid.dygraph.DataParallel(model, strategy)

    batch_sampler = DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        places=places,
        num_workers=num_workers,
        return_list=True, )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    avg_loss = 0.0
    steps_per_epoch = len(batch_sampler)
    total_steps = steps_per_epoch * (num_epochs - start_epoch)
    num_steps = 0
    best_mean_iou = -1.0
    best_model_epoch = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    for epoch in range(start_epoch, num_epochs):
        timer.start()
        for step, data in enumerate(loader):
            # Time spent waiting on the reader for this batch.
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                loss = ddp_model(images, labels)
                # apply_collective_grads sum grads over multiple gpus.
                loss = ddp_model.scale_loss(loss)
                loss.backward()
                ddp_model.apply_collective_grads()
            else:
                loss = model(images, labels)
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
            num_steps += 1
            train_batch_cost += timer.elapsed_time()
            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
                avg_loss /= log_steps
                avg_train_reader_cost = train_reader_cost / log_steps
                avg_train_batch_cost = train_batch_cost / log_steps
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_steps = total_steps - num_steps
                eta = calculate_eta(remain_steps, avg_train_batch_cost)
                logging.info(
                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
                            avg_loss * nranks, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
                                          num_steps)
                    log_writer.add_scalar('Train/lr', lr, num_steps)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, num_steps)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, num_steps)
                avg_loss = 0.0
            timer.restart()

        # Checkpoint (and optionally evaluate) on the configured interval
        # and always on the final epoch; rank 0 only.
        if ((epoch + 1) % save_interval_epochs == 0 or
                epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
            current_save_dir = os.path.join(save_dir,
                                            "epoch_{}".format(epoch + 1))
            if not os.path.isdir(current_save_dir):
                os.makedirs(current_save_dir)
            fluid.save_dygraph(model.state_dict(),
                               os.path.join(current_save_dir, 'model'))
            fluid.save_dygraph(optimizer.state_dict(),
                               os.path.join(current_save_dir, 'model'))

            if eval_dataset is not None:
                mean_iou, avg_acc = evaluate(
                    model,
                    eval_dataset,
                    model_dir=current_save_dir,
                    num_classes=num_classes,
                    ignore_index=ignore_index,
                    epoch_id=epoch + 1)
                if mean_iou > best_mean_iou:
                    best_mean_iou = mean_iou
                    best_model_epoch = epoch + 1
                    best_model_dir = os.path.join(save_dir, "best_model")
                    fluid.save_dygraph(model.state_dict(),
                                       os.path.join(best_model_dir, 'model'))
                logging.info(
                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
                    .format(best_model_epoch, best_mean_iou))

                if use_vdl:
                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
                # evaluate() leaves the model in eval mode; switch back.
                model.train()
    if use_vdl:
        log_writer.close()
def main(args):
    """Train an image-classification model (dygraph).

    Per epoch: train, optionally validate (tracking/saving the best top-1
    model), and periodically save a persistable checkpoint. The VisualDL
    writer is closed in a finally block so the log file is flushed even on
    errors.

    Args:
        args: namespace with `config` (config path) and `override`
            (config overrides).
    """
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    # dp_net wraps net for multi-card training; validation below still uses
    # the unwrapped net.
    dp_net = net
    if config["use_data_parallel"]:
        find_unused_parameters = config.get("find_unused_parameters", False)
        dp_net = paddle.DataParallel(
            net, find_unused_parameters=find_unused_parameters)

    # load model from checkpoint or pretrained model
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()

    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id

    vdl_writer_path = config.get("vdl_dir", None)
    vdl_writer = None
    if vdl_writer_path:
        from visualdl import LogWriter
        vdl_writer = LogWriter(vdl_writer_path)
    # Ensure that the vdl log file can be closed normally
    try:
        for epoch_id in range(last_epoch_id + 1, config.epochs):
            net.train()
            # 1. train with train dataset
            program.run(train_dataloader, config, dp_net, optimizer,
                        lr_scheduler, epoch_id, 'train', vdl_writer)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
                with paddle.no_grad():
                    top1_acc = program.run(valid_dataloader, config, net,
                                           None, None, epoch_id, 'valid',
                                           vdl_writer)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    best_top1_epoch = epoch_id
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(net, optimizer, model_path, "best_model")
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, best_top1_epoch)
                logger.info(message)

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(net, optimizer, model_path, epoch_id)
    except Exception as e:
        logger.error(e)
    finally:
        vdl_writer.close() if vdl_writer else None
def main(args):
    """Train a face-recognition backbone with a PartialFC head.

    Single-process variant: world_size/rank are fixed to 1/0.  Reads the
    module-level ``cfg`` for dataset/optimizer settings and ``args`` for
    the network/loss choice.
    """
    world_size = 1
    rank = 0

    # Create the output directory once; the brief sleep preserves the
    # original behavior when the directory already exists (it was a
    # multi-process rendezvous pause in the upstream code).
    if not os.path.exists(cfg.output):
        os.makedirs(cfg.output)
    else:
        time.sleep(2)

    writer = LogWriter(logdir=cfg.logdir)

    trainset = MXFaceDataset(root_dir=cfg.rec)
    train_loader = DataLoader(dataset=trainset,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=0)

    # Fix: the computed dropout was previously ignored and 0.5 was
    # hard-coded; pass the dataset-dependent value through.
    dropout = 0.4 if cfg.dataset == "webface" else 0
    # NOTE(review): fp16=False here while cfg.fp16 controls the grad
    # scaler below — confirm whether the backbone should also get
    # fp16=cfg.fp16.
    backbone = eval("backbones.{}".format(args.network))(False,
                                                         dropout=dropout,
                                                         fp16=False)
    backbone.train()

    clip_by_norm = ClipGradByNorm(5.0)
    margin_softmax = eval("losses.{}".format(args.loss))()
    module_partial_fc = PartialFC(rank=0,
                                  local_rank=0,
                                  world_size=1,
                                  resume=0,
                                  batch_size=cfg.batch_size,
                                  margin_softmax=margin_softmax,
                                  num_classes=cfg.num_classes,
                                  sample_rate=cfg.sample_rate,
                                  embedding_size=cfg.embedding_size,
                                  prefix=cfg.output)

    # Base LR is scaled linearly with the global batch size (lr / 512 * bs).
    scheduler_backbone = paddle.optimizer.lr.LambdaDecay(
        learning_rate=cfg.lr / 512 * cfg.batch_size,
        lr_lambda=cfg.lr_func,
        verbose=True)
    opt_backbone = paddle.optimizer.SGD(parameters=backbone.parameters(),
                                        learning_rate=scheduler_backbone,
                                        weight_decay=cfg.weight_decay,
                                        grad_clip=clip_by_norm)

    scheduler_pfc = paddle.optimizer.lr.LambdaDecay(
        learning_rate=cfg.lr / 512 * cfg.batch_size,
        lr_lambda=cfg.lr_func,
        verbose=True)
    opt_pfc = paddle.optimizer.SGD(parameters=module_partial_fc.parameters(),
                                   learning_rate=scheduler_pfc,
                                   weight_decay=cfg.weight_decay,
                                   grad_clip=clip_by_norm)

    start_epoch = 0
    total_step = int(
        len(trainset) / cfg.batch_size / world_size * cfg.num_epoch)
    if rank == 0:
        print("Total Step is: %d" % total_step)

    callback_verification = CallBackVerification(2000, rank, cfg.val_targets,
                                                 cfg.rec)
    callback_logging = CallBackLogging(100, rank, total_step, cfg.batch_size,
                                       world_size, writer)
    callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output)

    loss = AverageMeter()
    global_step = 0
    grad_scaler = MaxClipGradScaler(
        cfg.batch_size, 128 * cfg.batch_size,
        growth_interval=100) if cfg.fp16 else None

    for epoch in range(start_epoch, cfg.num_epoch):
        for step, (img, label) in enumerate(train_loader):
            label = label.flatten()
            global_step += 1
            features = F.normalize(backbone(img))
            # PartialFC returns the gradient w.r.t. the features plus the
            # scalar loss value; the backbone backward pass is driven by
            # multiplying features with that gradient.
            x_grad, loss_v = module_partial_fc.forward_backward(
                label, features, opt_pfc)
            if cfg.fp16:
                scaled = grad_scaler.scale(x_grad)
                (features.multiply(scaled)).backward()
                grad_scaler._unscale(opt_backbone)
                grad_scaler.minimize(opt_backbone, scaled)
            else:
                (features.multiply(x_grad)).backward()
                opt_backbone.step()
            opt_pfc.step()
            module_partial_fc.update()
            opt_backbone.clear_gradients()
            opt_pfc.clear_gradients()
            loss.update(loss_v, 1)
            callback_logging(global_step, loss, epoch, cfg.fp16, grad_scaler)
            callback_verification(global_step, backbone)
        callback_checkpoint(global_step, backbone, module_partial_fc)
        scheduler_backbone.step()
        scheduler_pfc.step()
    writer.close()
def train(self, loaders):
    """Run the adversarial training loop.

    Alternates discriminator and generator updates each iteration; the
    generator is only trained after a 100-iteration discriminator warm-up
    (relative to ``resume_iter``).  ``loaders`` must expose ``src``,
    ``ref`` and ``val`` data loaders.
    """
    args = self.args
    nets = self.nets
    nets_ema = self.nets_ema
    optims = self.optims
    writer = LogWriter(logdir=self.args.checkpoint_dir + "/log/")

    # fetch random validation images for debugging
    fetcher = InputFetcher(loaders.src, loaders.ref, args.latent_dim, 'train')
    fetcher_val = InputFetcher(loaders.val, None, args.latent_dim, 'val')
    inputs_val = next(fetcher_val)

    # resume training if necessary
    if args.resume_iter > 0:
        self._load_checkpoint(args.resume_iter)

    # remember the initial value of ds weight (restored nowhere; it is
    # linearly decayed to 0 over ds_iter iterations below)
    initial_lambda_ds = args.lambda_ds

    print('Start training...')
    import tqdm
    start_time = time.time()
    tqdm_descriptor = tqdm.trange(args.resume_iter, args.total_iters)
    for i in tqdm_descriptor:
        # fetch images and labels
        inputs = next(fetcher)
        x_real, y_org = inputs.x_src, inputs.y_src
        x_ref, x_ref2, y_trg = inputs.x_ref, inputs.x_ref2, inputs.y_ref
        z_trg, z_trg2 = inputs.z_trg, inputs.z_trg2

        # Heatmap masks are only used when the high-pass filter weight
        # is enabled.
        masks = nets.fan.get_heatmap(x_real) if args.w_hpf > 0 else None

        # train the discriminator: once against latent-code fakes, once
        # against reference-image fakes.
        d_loss, d_losses_latent = compute_d_loss(nets, args, x_real, y_org,
                                                 y_trg, z_trg=z_trg,
                                                 masks=masks)
        self._reset_grad()
        d_loss.backward()
        optims.discriminator.minimize(d_loss)

        d_loss, d_losses_ref = compute_d_loss(nets, args, x_real, y_org,
                                              y_trg, x_ref=x_ref,
                                              masks=masks)
        self._reset_grad()
        d_loss.backward()
        optims.discriminator.minimize(d_loss)

        # train the generator
        if i - args.resume_iter > 100:  ##train discriminator first
            # Latent-guided pass updates generator, mapping network and
            # style encoder together.
            g_loss, g_losses_latent, sample_1 = compute_g_loss(
                nets, args, x_real, y_org, y_trg, z_trgs=[z_trg, z_trg2],
                masks=masks)
            self._reset_grad()
            g_loss.backward()
            optims.generator.minimize(g_loss)
            optims.mapping_network.minimize(g_loss)
            optims.style_encoder.minimize(g_loss)

            # Reference-guided pass updates only the generator.
            # NOTE(review): sample_2 is never used afterwards — only
            # sample_1 is logged below.
            g_loss, g_losses_ref, sample_2 = compute_g_loss(
                nets, args, x_real, y_org, y_trg, x_refs=[x_ref, x_ref2],
                masks=masks)
            self._reset_grad()
            g_loss.backward()
            optims.generator.minimize(g_loss)

            # compute moving average of network parameters
            moving_average(nets.generator, nets_ema.generator, beta=0.999)
            moving_average(nets.mapping_network, nets_ema.mapping_network,
                           beta=0.999)
            moving_average(nets.style_encoder, nets_ema.style_encoder,
                           beta=0.999)

            # decay weight for diversity sensitive loss
            if args.lambda_ds > 0:
                args.lambda_ds -= (initial_lambda_ds / args.ds_iter)

            # print out log info
            if (i + 1) % args.print_every == 0:
                elapsed = time.time() - start_time
                # Drop microseconds from the elapsed-time string.
                elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                log = "Elapsed time [%s], Iteration [%i/%i], " % (
                    elapsed, i + 1, args.total_iters)
                all_losses = dict()
                for loss, prefix in zip([
                        d_losses_latent, d_losses_ref, g_losses_latent,
                        g_losses_ref
                ], ['D/latent_', 'D/ref_', 'G/latent_', 'G/ref_']):
                    for key, value in loss.items():
                        all_losses[prefix + key] = value
                        writer.add_scalar(tag=prefix + key, step=i + 1,
                                          value=value)
                all_losses['G/lambda_ds'] = args.lambda_ds
                log += ' '.join([
                    '%s: [%.4f]' % (key, value)
                    for key, value in all_losses.items()
                ])
                tqdm_descriptor.set_description(log)
                # Log one denormalized fake sample (CHW -> HWC, uint8).
                writer.add_image(
                    "x_fake",
                    (utils.denormalize(sample_1) * 255).numpy().transpose(
                        [1, 2, 0]).astype(np.uint8), i + 1)

            # generate images for debugging
            if (i + 1) % args.sample_every == 0:
                os.makedirs(args.sample_dir, exist_ok=True)
                utils.debug_image(nets_ema, args, inputs=inputs_val,
                                  step=i + 1)

            # save model checkpoints
            if (i + 1) % args.save_every == 0:
                self._save_checkpoint(step=i + 1)

            # compute FID and LPIPS if necessary
            if (i + 1) % args.eval_every == 0:
                calculate_metrics(nets_ema, args, i + 1, mode='latent')
                calculate_metrics(nets_ema, args, i + 1, mode='reference')
        else:
            # Warm-up phase: only discriminator losses exist to log.
            if (i + 1) % args.print_every == 0:
                elapsed = time.time() - start_time
                elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                log = "Elapsed time [%s], Iteration [%i/%i], " % (
                    elapsed, i + 1, args.total_iters)
                all_losses = dict()
                for loss, prefix in zip([d_losses_latent, d_losses_ref],
                                        ['D/latent_', 'D/ref_']):
                    for key, value in loss.items():
                        all_losses[prefix + key] = value
                        writer.add_scalar(tag=prefix + key, step=i + 1,
                                          value=value)
                log += ' '.join([
                    '%s: [%.4f]' % (key, value)
                    for key, value in all_losses.items()
                ])
                tqdm_descriptor.set_description(log)
    writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          test_config=None,
          fp16=False,
          profiler_options=None):
    """
    Launch training.

    Args:
        model(nn.Layer): A sementic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How may iters to train the model. Defualt: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        test_config(dict, optional): Evaluation config.
        fp16 (bool, optional): Whether to use amp.
        profiler_options (str, optional): The option of train profiler.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists but is a plain file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        paddle.distributed.fleet.init(is_collective=True)
        optimizer = paddle.distributed.fleet.distributed_optimizer(
            optimizer)  # The return is Fleet object
        ddp_model = paddle.distributed.fleet.distributed_model(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
        worker_init_fn=worker_init_fn, )

    # use amp
    if fp16:
        logger.info('use amp to train')
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_acc = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    # NOTE: `iter` shadows the builtin; kept as-is to preserve the code
    # byte-for-byte.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                # Workaround: on paddle 2.1.2 breaking out of a DataLoader
                # loop early hangs, so drain it with `continue` instead.
                version = paddle.__version__
                if version == '2.1.2':
                    continue
                else:
                    break
            reader_cost_averager.record(time.time() - batch_start)
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                edges = data[2].astype('int64')
            if hasattr(model, 'data_format') and model.data_format == 'NHWC':
                images = images.transpose((0, 2, 3, 1))

            if fp16:
                with paddle.amp.auto_cast(
                        enable=True,
                        custom_white_list={
                            "elementwise_add", "batch_norm", "sync_batch_norm"
                        },
                        custom_black_list={'bilinear_interp_v2'}):
                    if nranks > 1:
                        logits_list = ddp_model(images)
                    else:
                        logits_list = model(images)
                    loss_list = loss_computation(logits_list=logits_list,
                                                 labels=labels,
                                                 losses=losses,
                                                 edges=edges)
                    loss = sum(loss_list)

                scaled = scaler.scale(loss)  # scale the loss
                scaled.backward()  # do backward
                # Fleet wraps the user optimizer; unwrap it for the scaler.
                if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                    scaler.minimize(optimizer.user_defined_optimizer, scaled)
                else:
                    scaler.minimize(optimizer, scaled)  # update parameters
            else:
                if nranks > 1:
                    logits_list = ddp_model(images)
                else:
                    logits_list = model(images)
                loss_list = loss_computation(logits_list=logits_list,
                                             labels=labels,
                                             losses=losses,
                                             edges=edges)
                loss = sum(loss_list)
                loss.backward()
                optimizer.step()

            lr = optimizer.get_lr()

            # update lr
            if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                lr_sche = optimizer.user_defined_optimizer._learning_rate
            else:
                lr_sche = optimizer._learning_rate
            if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
                lr_sche.step()

            train_profiler.add_profiler_step(profiler_options)

            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            # Accumulate per-loss components for logging.
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0
                    or iter == iters) and (val_dataset is not None):
                # Evaluation itself restricts workers to at most 1.
                num_workers = 1 if num_workers > 0 else 0
                if test_config is None:
                    test_config = {}
                acc, fp, fn = evaluate(model,
                                       val_dataset,
                                       num_workers=num_workers,
                                       save_dir=save_dir,
                                       **test_config)
                model.train()

            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Rotate old snapshots beyond keep_checkpoint_max.
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

                if val_dataset is not None:
                    if acc > best_acc:
                        best_acc = acc
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation Acc ({:.4f}) was saved at iter {}.'
                        .format(best_acc, best_model_iter))
                    if use_vdl:
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
                        log_writer.add_scalar('Evaluate/Fp', fp, iter)
                        log_writer.add_scalar('Evaluate/Fn', fn, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:
        # Uses the last batch's shape; assumes at least one batch ran.
        _, c, h, w = images.shape
        _ = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def main(args):
    """Train a face-recognition backbone with a PartialFC head
    (Momentum + warmup-LR variant).

    Single-process: world_size/rank are fixed to 1/0.  Reads module-level
    ``cfg`` for dataset/schedule settings and ``args`` for network, loss
    and hyperparameters.
    """
    world_size = 1
    rank = 0

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        # Preserved from the original multi-process code: brief pause when
        # the directory already exists.
        time.sleep(2)

    writer = LogWriter(logdir=args.logdir)

    trainset = CommonDataset(root_dir=cfg.data_dir,
                             label_file=cfg.file_list,
                             is_bin=args.is_bin)
    train_loader = DataLoader(
        dataset=trainset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=0)

    backbone = eval("backbones.{}".format(args.network))()
    backbone.train()

    clip_by_norm = ClipGradByNorm(5.0)
    margin_softmax = eval("losses.{}".format(args.loss))()
    module_partial_fc = PartialFC(
        rank=0,
        world_size=1,
        resume=0,
        batch_size=args.batch_size,
        margin_softmax=margin_softmax,
        num_classes=cfg.num_classes,
        sample_rate=cfg.sample_rate,
        embedding_size=args.embedding_size,
        prefix=args.output)

    # Warmup to lr / 512 * batch_size, then LambdaDecay takes over.
    scheduler_backbone_decay = paddle.optimizer.lr.LambdaDecay(
        learning_rate=args.lr, lr_lambda=cfg.lr_func, verbose=True)
    scheduler_backbone = paddle.optimizer.lr.LinearWarmup(
        learning_rate=scheduler_backbone_decay,
        warmup_steps=cfg.warmup_epoch,
        start_lr=0,
        end_lr=args.lr / 512 * args.batch_size,
        verbose=True)
    opt_backbone = paddle.optimizer.Momentum(
        parameters=backbone.parameters(),
        learning_rate=scheduler_backbone,
        momentum=0.9,
        weight_decay=args.weight_decay,
        grad_clip=clip_by_norm)

    scheduler_pfc_decay = paddle.optimizer.lr.LambdaDecay(
        learning_rate=args.lr, lr_lambda=cfg.lr_func, verbose=True)
    scheduler_pfc = paddle.optimizer.lr.LinearWarmup(
        learning_rate=scheduler_pfc_decay,
        warmup_steps=cfg.warmup_epoch,
        start_lr=0,
        end_lr=args.lr / 512 * args.batch_size,
        verbose=True)
    opt_pfc = paddle.optimizer.Momentum(
        parameters=module_partial_fc.parameters(),
        learning_rate=scheduler_pfc,
        momentum=0.9,
        weight_decay=args.weight_decay,
        grad_clip=clip_by_norm)

    start_epoch = 0
    total_step = int(
        len(trainset) / args.batch_size / world_size * cfg.num_epoch)
    if rank == 0:
        print("Total Step is: %d" % total_step)

    callback_verification = CallBackVerification(2000, rank, cfg.val_targets,
                                                 cfg.data_dir)
    callback_logging = CallBackLogging(10, rank, total_step, args.batch_size,
                                       world_size, writer)
    callback_checkpoint = CallBackModelCheckpoint(rank, args.output,
                                                  args.network)

    loss = AverageMeter()
    global_step = 0
    for epoch in range(start_epoch, cfg.num_epoch):
        for step, (img, label) in enumerate(train_loader):
            label = label.flatten()
            global_step += 1
            features = F.normalize(backbone(img))
            # PartialFC returns the gradient w.r.t. the features plus the
            # scalar loss; the backbone backward pass is driven by
            # multiplying features with that gradient.
            x_grad, loss_v = module_partial_fc.forward_backward(
                label, features, opt_pfc)
            (features.multiply(x_grad)).backward()
            opt_backbone.step()
            opt_pfc.step()
            module_partial_fc.update()
            opt_backbone.clear_gradients()
            opt_pfc.clear_gradients()
            lr_backbone_value = opt_backbone._global_learning_rate().numpy()[0]
            # Fix: was read from opt_backbone, logging the wrong LR for
            # the PartialFC head.
            lr_pfc_value = opt_pfc._global_learning_rate().numpy()[0]
            loss.update(loss_v, 1)
            callback_logging(global_step, loss, epoch, lr_backbone_value,
                             lr_pfc_value)
            # Single flush per iteration replaces the scattered debug
            # flushes; output is still flushed every step.
            sys.stdout.flush()
            callback_verification(global_step, backbone)
        callback_checkpoint(global_step, backbone, module_partial_fc)
        scheduler_backbone.step()
        scheduler_pfc.step()
    writer.close()
def train(cfg):
    """Static-graph (fluid) training loop for a segmentation model.

    Builds the train program, loads a checkpoint or pretrained weights,
    then runs the epoch/step loop with optional debug metrics, VisualDL
    logging, snapshotting and evaluation.  Relies on module-level ``args``
    for command-line flags.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True

    dataset = build_dataset(cfg.DATASET.DATASET_NAME,
                            file_list=cfg.DATASET.TRAIN_FILE_LIST,
                            mode=ModelPhase.TRAIN,
                            shuffle=True,
                            data_dir=cfg.DATASET.DATA_DIR,
                            base_size=cfg.DATAAUG.BASE_SIZE,
                            crop_size=cfg.DATAAUG.CROP_SIZE,
                            rand_scale=True)

    def data_generator():
        # Yields per-sample tuples; batching is done by the py_reader.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.TRAIN_BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#device count: {}".format(dev_count))
    cfg.TRAIN_BATCH_SIZE = dev_count * int(cfg.TRAIN_BATCH_SIZE_PER_GPU)
    print_info("#train_batch_size: {}".format(cfg.TRAIN_BATCH_SIZE))
    print_info("#batch_size_per_dev: {}".format(cfg.TRAIN_BATCH_SIZE_PER_GPU))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(
        data_generator,
        batch_size=cfg.TRAIN_BATCH_SIZE_PER_GPU,
        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR)
        load_vars = []
        load_fail_vars = []

        def var_shape_matched(var, shape):
            """
            Check whehter persitable variable shape is match with current network
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
                return var_shape == shape
            return False

        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
                else:
                    load_fail_vars.append(x)

        fluid.io.load_vars(
            exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars)
        for var in load_vars:
            print_info("Parameter[{}] loaded sucessfully!".format(var.name))
        for var in load_fail_vars:
            print_info(
                "Parameter[{}] don't exist or shape does not match current network, skip"
                " to load it.".format(var.name))
        print_info("{}/{} pretrained parameters loaded successfully!".format(
            len(load_vars),
            len(load_vars) + len(load_fail_vars)))
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)
        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.TRAIN_BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.TRAIN_BATCH_SIZE and drop_last != True:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)
    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={}/{} step={}/{} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, cfg.SOLVER.NUM_EPOCHS, step, all_step,
                                 lr[0], avg_loss, mean_acc, mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        # Fix: this branch previously referenced an
                        # undefined `global_step`, raising NameError the
                        # first time a log line was printed.
                        print((
                            "epoch={}/{} step={}/{} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, cfg.SOLVER.NUM_EPOCHS, step, all_step,
                                 lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss, step)
                            log_writer.add_scalar('Train/lr', lr[0], step)
                            log_writer.add_scalar('Train/speed', speed, step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()
            except fluid.core.EOFException:
                py_reader.reset()
                break
            except Exception as e:
                # NOTE(review): catching everything here and continuing can
                # loop forever if the same error repeats each step; kept to
                # preserve the original best-effort behavior.
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step)

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')

    if args.use_vdl:
        log_writer.close()
def train(self,
          train_dataset_src,
          train_dataset_tgt,
          val_dataset_tgt=None,
          val_dataset_src=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          keep_checkpoint_max=5,
          test_config=None):
    """
    Launch domain-adaptation training (source + target datasets).

    Trains self.model jointly on a labeled source dataset and an unlabeled
    (pseudo-labeled) target dataset, with optional edge-constraint,
    edge-pull-in and feature-pull-in auxiliary losses, EMA weight shadowing,
    periodic evaluation and checkpointing.

    Args:
        train_dataset_src (paddle.io.Dataset): Source-domain training dataset.
        train_dataset_tgt (paddle.io.Dataset): Target-domain training dataset.
        val_dataset_tgt (paddle.io.Dataset, optional): Target-domain validation dataset.
        val_dataset_src (paddle.io.Dataset, optional): Source-domain validation dataset.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How may iters to train the model. Defualt: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        test_config(dict, optional): Evaluation config.
    """
    start_iter = 0
    self.model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    # Resume weights, optimizer state and the EMA shadow if requested.
    if resume_model is not None:
        logger.info(resume_model)
        start_iter = resume(self.model, optimizer, resume_model)
        load_ema_model(self.model, self.resume_ema)

    # If save_dir exists as a regular file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        paddle.distributed.fleet.init(is_collective=True)
        optimizer = paddle.distributed.fleet.distributed_optimizer(
            optimizer)  # The return is Fleet object
        ddp_model = paddle.distributed.fleet.distributed_model(self.model)

    # Separate samplers/loaders for the two domains; batches are consumed
    # in lockstep via zip() below.
    batch_sampler_src = paddle.io.DistributedBatchSampler(
        train_dataset_src,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True)
    loader_src = paddle.io.DataLoader(
        train_dataset_src,
        batch_sampler=batch_sampler_src,
        num_workers=num_workers,
        return_list=True,
        worker_init_fn=worker_init_fn,
    )
    batch_sampler_tgt = paddle.io.DistributedBatchSampler(
        train_dataset_tgt,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True)
    loader_tgt = paddle.io.DataLoader(
        train_dataset_tgt,
        batch_sampler=batch_sampler_tgt,
        num_workers=num_workers,
        return_list=True,
        worker_init_fn=worker_init_fn,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Epoch length is defined by the target-domain sampler.
    iters_per_epoch = len(batch_sampler_tgt)
    best_mean_iou = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for _, (data_src, data_tgt) in enumerate(zip(loader_src, loader_tgt)):
            reader_cost_averager.record(time.time() - batch_start)
            loss_dict = {}

            #### training #####
            # Each batch is (image, label, edge); labels/edges cast to int64.
            images_tgt = data_tgt[0]
            labels_tgt = data_tgt[1].astype('int64')
            images_src = data_src[0]
            labels_src = data_src[1].astype('int64')
            edges_src = data_src[2].astype('int64')
            edges_tgt = data_tgt[2].astype('int64')

            if nranks > 1:
                logits_list_src = ddp_model(images_src)
            else:
                logits_list_src = self.model(images_src)

            ##### source seg & edge loss ####
            # Main head plus 0.1-weighted auxiliary head on source labels.
            loss_src_seg_main = self.celoss(logits_list_src[0], labels_src)
            loss_src_seg_aux = 0.1 * self.celoss(logits_list_src[1],
                                                 labels_src)
            loss_src_seg = loss_src_seg_main + loss_src_seg_aux
            loss_dict["source_main"] = loss_src_seg_main.numpy()[0]
            loss_dict["source_aux"] = loss_src_seg_aux.numpy()[0]
            loss = loss_src_seg
            del loss_src_seg, loss_src_seg_aux, loss_src_seg_main

            #### generate target pseudo label ####
            # Pseudo labels come from the current model, gradient-free.
            with paddle.no_grad():
                if nranks > 1:
                    logits_list_tgt = ddp_model(images_tgt)
                else:
                    logits_list_tgt = self.model(images_tgt)
                pred_P_1 = F.softmax(logits_list_tgt[0], axis=1)
                labels_tgt_psu = paddle.argmax(pred_P_1.detach(), axis=1)
                # aux label: argmax of the averaged main/aux softmax
                pred_P_2 = F.softmax(logits_list_tgt[1], axis=1)
                pred_c = (pred_P_1 + pred_P_2) / 2
                labels_tgt_psu_aux = paddle.argmax(pred_c.detach(), axis=1)

            if self.edgeconstrain:
                loss_src_edge = self.bceloss_src(logits_list_src[2],
                                                 edges_src)  # 1, 2 640, 1280
                src_edge = paddle.argmax(
                    logits_list_src[2].detach().clone(),
                    axis=1)  # 1, 1, 640,1280
                # Pixel accuracy of the predicted source edges, in percent.
                src_edge_acc = ((src_edge == edges_src).numpy().sum().astype('float32')\
                    /functools.reduce(lambda a, b: a * b, src_edge.shape))*100

                if (not self.src_only) and (iter > 200000):
                    #### target seg & edge loss ####
                    # Late in training, derive target edges from pseudo labels
                    # and add a target edge loss as well.
                    logger.info("Add target edege loss")
                    edges_tgt = Func.mask_to_binary_edge(
                        labels_tgt_psu.detach().clone().numpy(),
                        radius=2,
                        num_classes=train_dataset_tgt.NUM_CLASSES)
                    edges_tgt = paddle.to_tensor(edges_tgt, dtype='int64')
                    loss_tgt_edge = self.bceloss_tgt(logits_list_tgt[2],
                                                     edges_tgt)
                    loss_edge = loss_tgt_edge + loss_src_edge
                else:
                    loss_tgt_edge = paddle.zeros([1])
                    loss_edge = loss_src_edge

                loss += loss_edge
                loss_dict['target_edge'] = loss_tgt_edge.numpy()[0]
                loss_dict['source_edge'] = loss_src_edge.numpy()[0]
                del loss_edge, loss_tgt_edge, loss_src_edge

            #### target aug loss #######
            # Train on strongly-augmented target images against the (frozen)
            # pseudo labels; augmentation runs on CPU tensors.
            augs = augmentation.get_augmentation()
            images_tgt_aug, labels_tgt_aug = augmentation.augment(
                images=images_tgt.cpu(),
                labels=labels_tgt_psu.detach().cpu(),
                aug=augs,
                iters="{}_1".format(iter))
            images_tgt_aug = images_tgt_aug.cuda()
            labels_tgt_aug = labels_tgt_aug.cuda()

            _, labels_tgt_aug_aux = augmentation.augment(
                images=images_tgt.cpu(),
                labels=labels_tgt_psu_aux.detach().cpu(),
                aug=augs,
                iters="{}_2".format(iter))
            labels_tgt_aug_aux = labels_tgt_aug_aux.cuda()

            if nranks > 1:
                logits_list_tgt_aug = ddp_model(images_tgt_aug)
            else:
                logits_list_tgt_aug = self.model(images_tgt_aug)

            # Augmented-target losses are down-weighted (0.1 / 0.01).
            loss_tgt_aug_main = 0.1 * (self.celoss(logits_list_tgt_aug[0],
                                                   labels_tgt_aug))
            loss_tgt_aug_aux = 0.1 * (0.1 * self.celoss(
                logits_list_tgt_aug[1], labels_tgt_aug_aux))
            loss_tgt_aug = loss_tgt_aug_aux + loss_tgt_aug_main
            loss += loss_tgt_aug
            loss_dict['target_aug_main'] = loss_tgt_aug_main.numpy()[0]
            loss_dict['target_aug_aux'] = loss_tgt_aug_aux.numpy()[0]
            del images_tgt_aug, labels_tgt_aug_aux, images_tgt, \
                loss_tgt_aug, loss_tgt_aug_aux, loss_tgt_aug_main

            #### edge input seg; src & tgt edge pull in ######
            if self.edgepullin:
                # Fuse seg logits with edge logits and reconstruct labels
                # through the fusion head for both domains.
                src_edge_logit = logits_list_src[2]
                feat_src = paddle.concat(
                    [logits_list_src[0], src_edge_logit], axis=1).detach()
                out_src = self.model.fusion(feat_src)
                loss_src_edge_rec = self.celoss(out_src, labels_src)

                tgt_edge_logit = logits_list_tgt_aug[2]
                # tgt_edge_logit = paddle.to_tensor(
                #     Func.mask_to_onehot(edges_tgt.squeeze().numpy(), 2)
                # ).unsqueeze(0).astype('float32')
                feat_tgt = paddle.concat(
                    [logits_list_tgt[0], tgt_edge_logit], axis=1).detach()
                out_tgt = self.model.fusion(feat_tgt)
                loss_tgt_edge_rec = self.celoss(out_tgt, labels_tgt)
                loss_edge_rec = loss_tgt_edge_rec + loss_src_edge_rec
                loss += loss_edge_rec

                loss_dict['src_edge_rec'] = loss_src_edge_rec.numpy()[0]
                loss_dict['tgt_edge_rec'] = loss_tgt_edge_rec.numpy()[0]
                del loss_tgt_edge_rec, loss_src_edge_rec

            #### mask input feature & pullin ######
            if self.featurepullin:
                # inner-class loss: maintain per-class EMA feature centers
                # for both domains and align them once warmed up.
                feat_src = logits_list_src[0]
                feat_tgt = logits_list_tgt_aug[0]
                center_src_s, center_tgt_s = [], []
                total_pixs = logits_list_src[0].shape[2] * \
                    logits_list_src[0].shape[3]
                for i in range(train_dataset_tgt.NUM_CLASSES):
                    pred = paddle.argmax(
                        logits_list_src[0].detach().clone(),
                        axis=1).unsqueeze(0)  # 1, 1, 640, 1280
                    sel_num = paddle.sum((pred == i).astype('float32'))
                    # ignore tensor that do not have features in this img
                    if sel_num > 0:
                        feat_sel_src = paddle.where(
                            (pred == i).expand_as(feat_src), feat_src,
                            paddle.zeros(feat_src.shape))
                        center_src = paddle.mean(feat_sel_src, axis=[
                            2, 3
                        ]) / (sel_num / total_pixs)  # 1, C
                        # EMA update of the source class center (momentum 0.99).
                        self.src_centers[i] = 0.99 * self.src_centers[
                            i] + (1 - 0.99) * center_src

                    pred = labels_tgt_aug.unsqueeze(0)  # 1, 1, 512, 512
                    sel_num = paddle.sum((pred == i).astype('float32'))
                    if sel_num > 0:
                        feat_sel_tgt = paddle.where(
                            (pred == i).expand_as(feat_tgt), feat_tgt,
                            paddle.zeros(feat_tgt.shape))
                        center_tgt = paddle.mean(feat_sel_tgt, axis=[
                            2, 3
                        ]) / (sel_num / total_pixs)
                        self.tgt_centers[i] = 0.99 * self.tgt_centers[
                            i] + (1 - 0.99) * center_tgt
                    # NOTE(review): if class i is absent from the very first
                    # batch, center_src/center_tgt hold the previous class's
                    # value (or are unbound) — presumably rare; verify.
                    center_src_s.append(center_src)
                    center_tgt_s.append(center_tgt)

                if iter >= 3000:  # average center structure alignment
                    src_centers = paddle.concat(self.src_centers, axis=0)
                    tgt_centers = paddle.concat(self.tgt_centers,
                                                axis=0)  # 19, 2048
                    # Class-relation (Gram) matrices; KL pulls both toward
                    # their average.
                    relatmat_src = paddle.matmul(src_centers,
                                                 src_centers,
                                                 transpose_y=True)  # 19,19
                    relatmat_tgt = paddle.matmul(tgt_centers,
                                                 tgt_centers,
                                                 transpose_y=True)
                    loss_intra_relate = self.klloss(relatmat_src, (relatmat_tgt+relatmat_src)/2) \
                        + self.klloss(relatmat_tgt, (relatmat_tgt+relatmat_src)/2)
                    loss_pix_align_src = self.mseloss(
                        paddle.to_tensor(center_src_s),
                        paddle.to_tensor(
                            self.src_centers).detach().clone())
                    loss_pix_align_tgt = self.mseloss(
                        paddle.to_tensor(center_tgt_s),
                        paddle.to_tensor(
                            self.tgt_centers).detach().clone())
                    loss_feat_align = loss_pix_align_src + loss_pix_align_tgt + loss_intra_relate
                    loss += loss_feat_align

                    loss_dict['loss_pix_align_src'] = \
                        loss_pix_align_src.numpy()[0]
                    loss_dict['loss_pix_align_tgt'] = \
                        loss_pix_align_tgt.numpy()[0]
                    loss_dict['loss_intra_relate'] = \
                        loss_intra_relate.numpy()[0]
                    del loss_pix_align_tgt, loss_pix_align_src, loss_intra_relate,

                # Detach centers so the EMA state carries no graph history.
                self.tgt_centers = [
                    item.detach().clone() for item in self.tgt_centers
                ]
                self.src_centers = [
                    item.detach().clone() for item in self.src_centers
                ]

            loss.backward()
            del loss
            # From here on, `loss` is the float sum of logged components
            # (used only for logging below).
            loss = sum(loss_dict.values())

            optimizer.step()
            self.ema.update_params()

            with paddle.no_grad():
                ##### log & save #####
                lr = optimizer.get_lr()

                # update lr: Fleet wraps the user optimizer, so unwrap first.
                if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                    lr_sche = optimizer.user_defined_optimizer._learning_rate
                else:
                    lr_sche = optimizer._learning_rate
                if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
                    lr_sche.step()

                # Optionally dump edge predictions/ground truth for debugging.
                if self.cfg['save_edge']:
                    tgt_edge = paddle.argmax(
                        logits_list_tgt_aug[2].detach().clone(),
                        axis=1)  # 1, 1, 640,1280
                    src_feed_gt = paddle.argmax(
                        src_edge_logit.astype('float32'), axis=1)
                    tgt_feed_gt = paddle.argmax(
                        tgt_edge_logit.astype('float32'), axis=1)
                    logger.info('src_feed_gt_{}_{}_{}'.format(
                        src_feed_gt.shape, src_feed_gt.max(),
                        src_feed_gt.min()))
                    logger.info('tgt_feed_gt_{}_{}_{}'.format(
                        tgt_feed_gt.shape, max(tgt_feed_gt),
                        min(tgt_feed_gt)))
                    save_edge(src_feed_gt, 'src_feed_gt_{}'.format(iter))
                    save_edge(tgt_feed_gt, 'tgt_feed_gt_{}'.format(iter))
                    save_edge(tgt_edge, 'tgt_pred_{}'.format(iter))
                    save_edge(src_edge,
                              'src_pred_{}_{}'.format(iter, src_edge_acc))
                    save_edge(edges_src, 'src_gt_{}'.format(iter))
                    save_edge(edges_tgt, 'tgt_gt_{}'.format(iter))

                self.model.clear_gradients()
                batch_cost_averager.record(time.time() - batch_start,
                                           num_samples=batch_size)
                iter += 1

                if (iter) % log_iters == 0 and local_rank == 0:
                    # Agreement between real target labels and pseudo labels,
                    # in percent (diagnostic only; labels_tgt is never trained on).
                    label_tgt_acc = ((labels_tgt == labels_tgt_psu).numpy().sum().astype('float32')\
                        /functools.reduce(lambda a, b: a * b, labels_tgt_psu.shape))*100
                    remain_iters = iters - iter
                    avg_train_batch_cost = batch_cost_averager.get_average(
                    )
                    avg_train_reader_cost = reader_cost_averager.get_average(
                    )
                    eta = calculate_eta(remain_iters, avg_train_batch_cost)
                    logger.info(
                        "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, tgt_pix_acc: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                        .format(
                            (iter - 1) // iters_per_epoch + 1, iter, iters,
                            loss, label_tgt_acc, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                    if use_vdl:
                        log_writer.add_scalar('Train/loss', loss, iter)
                        # Record all losses if there are more than 2 losses.
                        if len(loss_dict) > 1:
                            for name, loss in loss_dict.items():
                                log_writer.add_scalar(
                                    'Train/loss_' + name, loss, iter)
                        log_writer.add_scalar('Train/lr', lr, iter)
                        log_writer.add_scalar('Train/batch_cost',
                                              avg_train_batch_cost, iter)
                        log_writer.add_scalar('Train/reader_cost',
                                              avg_train_reader_cost, iter)
                        log_writer.add_scalar('Train/tgt_label_acc',
                                              label_tgt_acc, iter)
                    reader_cost_averager.reset()
                    batch_cost_averager.reset()

                # Periodic evaluation on the target domain (EMA weights).
                if (iter % save_interval == 0
                        or iter == iters) and (val_dataset_tgt is not None):
                    num_workers = 4 if num_workers > 0 else 0  # adjust num_worker=4
                    if test_config is None:
                        test_config = {}
                    self.ema.apply_shadow()
                    self.ema.model.eval()

                    PA_tgt, _, MIoU_tgt, _ = val.evaluate(
                        self.model,
                        val_dataset_tgt,
                        num_workers=num_workers,
                        **test_config)
                    if (iter % (save_interval * 30)) == 0 \
                        and self.cfg['eval_src']:  # add evaluate on src
                        PA_src, _, MIoU_src, _ = val.evaluate(
                            self.model,
                            val_dataset_src,
                            num_workers=num_workers,
                            **test_config)
                        logger.info(
                            '[EVAL] The source mIoU is ({:.4f}) at iter {}.'
                            .format(MIoU_src, iter))

                    # Restore the raw (non-EMA) weights before resuming training.
                    self.ema.restore()
                    self.model.train()

                # Checkpointing on rank 0, with a bounded history of
                # keep_checkpoint_max snapshots.
                if (iter % save_interval == 0
                        or iter == iters) and local_rank == 0:
                    current_save_dir = os.path.join(
                        save_dir, "iter_{}".format(iter))
                    if not os.path.isdir(current_save_dir):
                        os.makedirs(current_save_dir)
                    paddle.save(
                        self.model.state_dict(),
                        os.path.join(current_save_dir, 'model.pdparams'))
                    paddle.save(
                        self.ema.shadow,
                        os.path.join(current_save_dir,
                                     'model_ema.pdparams'))
                    paddle.save(
                        optimizer.state_dict(),
                        os.path.join(current_save_dir, 'model.pdopt'))
                    save_models.append(current_save_dir)
                    if len(save_models) > keep_checkpoint_max > 0:
                        model_to_remove = save_models.popleft()
                        shutil.rmtree(model_to_remove)

                    if val_dataset_tgt is not None:
                        if MIoU_tgt > best_mean_iou:
                            best_mean_iou = MIoU_tgt
                            best_model_iter = iter
                            best_model_dir = os.path.join(
                                save_dir, "best_model")
                            paddle.save(
                                self.model.state_dict(),
                                os.path.join(best_model_dir,
                                             'model.pdparams'))
                        logger.info(
                            '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                            .format(best_mean_iou, best_model_iter))

                        if use_vdl:
                            log_writer.add_scalar('Evaluate/mIoU',
                                                  MIoU_tgt, iter)
                            log_writer.add_scalar('Evaluate/PA', PA_tgt,
                                                  iter)
                            if self.cfg['eval_src']:
                                log_writer.add_scalar(
                                    'Evaluate/mIoU_src', MIoU_src, iter)
                                log_writer.add_scalar(
                                    'Evaluate/PA_src', PA_src, iter)

                batch_start = time.time()
                self.ema.update_buffer()

    # # Calculate flops.
    if local_rank == 0:

        def count_syncbn(m, x, y):
            # Custom FLOPs rule: 2 ops per element for SyncBatchNorm.
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        _, c, h, w = images_src.shape
        flops = paddle.flops(
            self.model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def train(args):
    """Train a face-recognition backbone + margin classifier.

    Supports single-card and fleet collective multi-card training,
    epoch- or step-based schedules with piecewise decay + linear warmup,
    optional fp16 with dynamic loss scaling, periodic validation,
    and per-epoch checkpointing. Logs to VisualDL at ``args.logdir``.

    Args:
        args: Parsed command-line namespace; fields used include data/dataset
            options, batch_size, lr schedule options, fp16 flags, loss/backbone/
            classifier names, logging/validation intervals and output dirs.
    """
    writer = LogWriter(logdir=args.logdir)

    # Rank/world size come from the Paddle launcher environment.
    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    if world_size > 1:
        import paddle.distributed.fleet as fleet
        from .utils.data_parallel import sync_gradients, sync_params

        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = CommonDataset(root_dir=args.data_dir,
                                 label_file=args.label_file,
                                 fp16=args.fp16,
                                 is_bin=args.is_bin)

    num_image = len(trainset)
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    # Translate the schedule into steps whether the user specified it in
    # epochs or directly in steps.
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = list(args.decay_boundaries)
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    if rank == 0:
        logging.info('world_size: {}'.format(world_size))
        logging.info('total_batch_size: {}'.format(total_batch_size))
        logging.info('warmup_steps: {}'.format(warmup_steps))
        logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
        logging.info('total_steps: {}'.format(total_steps))
        logging.info('total_epoch: {}'.format(total_epoch))
        logging.info('decay_steps: {}'.format(decay_steps))

    # Linear-scaling rule: scale base lr with the global batch size.
    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    if args.fp16:
        paddle.set_default_dtype("float16")

    # NOTE: eval() on user-supplied names — acceptable only because args come
    # from a trusted CLI; do not expose these fields to untrusted input.
    margin_loss_params = eval("losses.{}".format(args.loss))()
    backbone = eval("backbones.{}".format(args.backbone))(
        num_features=args.embedding_size, dropout=args.dropout)
    classifier = eval("classifiers.{}".format(args.classifier))(
        rank=rank,
        world_size=world_size,
        num_classes=args.num_classes,
        margin1=margin_loss_params.margin1,
        margin2=margin_loss_params.margin2,
        margin3=margin_loss_params.margin3,
        scale=margin_loss_params.scale,
        sample_ratio=args.sample_ratio,
        embedding_size=args.embedding_size,
        fp16=args.fp16)
    backbone.train()
    classifier.train()

    optimizer = paddle.optimizer.Momentum(parameters=[{
        'params': backbone.parameters(),
    }, {
        'params': classifier.parameters(),
    }],
                                          learning_rate=lr_scheduler,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)
    if args.fp16:
        # Keep optimizer state in fp32 even when the model runs in fp16.
        optimizer._dtype = 'float32'

    if world_size > 1:
        # sync backbone params for data parallel
        sync_params(backbone.parameters())

    if args.do_validation_while_train:
        callback_verification = CallBackVerification(
            args.validation_interval_step,
            rank,
            args.batch_size,
            args.val_targets,
            args.data_dir,
            fp16=args.fp16,
        )
    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size, writer)

    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(backbone,
                                     classifier,
                                     optimizer,
                                     for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # there last_epoch means last_step in for PiecewiseDecay
        # since we always use step style for lr_scheduler
        global_step = lr_state['last_epoch']
        lr_scheduler.set_state_dict(lr_state)

    train_loader = paddle.io.DataLoader(
        trainset,
        places=place,
        num_workers=args.num_workers,
        batch_sampler=paddle.io.DistributedBatchSampler(
            dataset=trainset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True))

    scaler = LSCGradScaler(
        enable=args.fp16,
        init_loss_scaling=args.init_loss_scaling,
        incr_ratio=args.incr_ratio,
        decr_ratio=args.decr_ratio,
        incr_every_n_steps=args.incr_every_n_steps,
        decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)

    for epoch in range(start_epoch, total_epoch):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for step, (img, label) in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            global_step += 1
            train_start = time.time()

            with paddle.amp.auto_cast(enable=args.fp16):
                features = backbone(img)
                loss_v = classifier(features, label)
            scaler.scale(loss_v).backward()
            if world_size > 1:
                # data parallel sync backbone gradients
                sync_gradients(backbone.parameters())
            scaler.step(optimizer)
            # The classifier performs its own (possibly sharded) update.
            classifier.step(optimizer)
            optimizer.clear_grad()
            classifier.clear_grad()

            train_run_cost += time.time() - train_start
            total_samples += len(img)

            lr_value = optimizer.get_lr()
            loss_avg.update(loss_v.item(), 1)
            callback_logging(
                global_step,
                loss_avg,
                epoch,
                lr_value,
                avg_reader_cost=train_reader_cost / args.log_interval_step,
                avg_batch_cost=(train_reader_cost + train_run_cost) /
                args.log_interval_step,
                avg_samples=total_samples / args.log_interval_step,
                ips=total_samples / (train_reader_cost + train_run_cost))
            if args.do_validation_while_train:
                callback_verification(global_step, backbone)
            lr_scheduler.step()

            if global_step >= total_steps:
                break
            sys.stdout.flush()
            # BUGFIX: was `rank is 0` — identity comparison with an int
            # literal is unreliable (and a SyntaxWarning on Python >= 3.8);
            # use equality. Resets the per-interval cost counters on rank 0.
            if rank == 0 and global_step > 0 and global_step % args.log_interval_step == 0:
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()

        checkpoint.save(backbone,
                        classifier,
                        optimizer,
                        epoch=epoch,
                        for_train=True)
    writer.close()
def main(args):
    """Train a FastSpeech model (Paddle fluid dygraph).

    Loads config from ``args.config``, builds the model, optimizer with Noam
    decay and global-norm gradient clipping, streams LJSpeech batches, and
    runs the training loop with per-rank logging and rank-0 checkpointing.
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(
        dg.parallel.Env().dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes VisualDL logs; other ranks keep writer=None.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    # Noam schedule: lr grows for warm_up_step steps, then decays; clip by
    # global norm at grad_clip_thresh.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            # Restart the reader at epoch end and pull the next batch.
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        #Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        # Total loss = MSE on both mel decodes + L1 on predicted durations.
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            # Scale loss and all-reduce gradients for multi-card dygraph.
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None):
    """Launch iteration-based segmentation training (paddle 2.x dygraph).

    Args:
        model (paddle.nn.Layer): The segmentation model to train.
        train_dataset (paddle.io.Dataset): Training dataset.
        val_dataset (paddle.io.Dataset, optional): Validation dataset; when
            given, evaluation runs every save_interval iters and the best
            model (by mIoU) is kept in ``save_dir/best_model``.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): Directory for snapshots. Default: 'output'.
        iters (int, optional): Total training iterations. Default: 10000.
        batch_size (int, optional): Per-device batch size. Default: 2.
        resume_model (str, optional): Path of a checkpoint to resume from.
        save_interval (int, optional): Iters between snapshots. Default: 1000.
        log_iters (int, optional): Iters between log lines. Default: 10.
        num_workers (int, optional): DataLoader workers. Default: 0.
        use_vdl (bool, optional): Record scalars to VisualDL. Default: False.
        losses (dict, optional): Loss spec consumed by loss_computation.
    """
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists as a regular file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel training environment.
        paddle.distributed.init_parallel_env()
        strategy = paddle.distributed.prepare_context()
        ddp_model = paddle.DataParallel(model, strategy)

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    avg_loss = 0.0
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    timer.start()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            # Time spent waiting for the dataloader since last restart.
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                logits = ddp_model(images)
                loss = loss_computation(logits, labels, losses)
                loss.backward()
            else:
                logits = model(images)
                loss = loss_computation(logits, labels, losses)
                loss.backward()
            optimizer.step()
            lr = optimizer.get_lr()
            # Advance the lr schedule once per iteration (step-based schedule).
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            train_batch_cost += timer.elapsed_time()
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_train_reader_cost = train_reader_cost / log_iters
                avg_train_batch_cost = train_batch_cost / log_iters
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_iters = iters - iter
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0

            # Evaluation runs on every rank; saving below is rank-0 only.
            if (iter % save_interval == 0
                    or iter == iters) and (val_dataset is not None):
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc = evaluate(
                    model, val_dataset, num_workers=num_workers)
                model.train()

            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))

                if val_dataset is not None:
                    # mean_iou/acc were set by the evaluation branch above
                    # (same iter condition), so they are defined here.
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            timer.restart()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def train(model,
          train_dataset,
          places=None,
          eval_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval_iters=1000,
          log_iters=10,
          num_classes=None,
          num_workers=8,
          use_vdl=False):
    """Launch iteration-based segmentation training (fluid dygraph API).

    Args:
        model (fluid.dygraph.Layer): Segmentation model; forward takes
            (images, labels) and returns the loss. Exposes ``ignore_index``.
        train_dataset: Training dataset.
        places: Device places passed to the DataLoader.
        eval_dataset: Optional validation dataset; evaluated at every
            snapshot and the best model (by mIoU) kept in ``best_model``.
        optimizer: Optimizer; ``minimize`` is called per step.
        save_dir (str): Directory for snapshots. Default: 'output'.
        iters (int): Total training iterations. Default: 10000.
        batch_size (int): Per-device batch size. Default: 2.
        resume_model (str): Path of a checkpoint to resume from.
            NOTE(review): resuming sets start_iter but the loop counter
            below starts at 0, so resumed runs restart iteration count —
            looks intentional in this legacy code; verify.
        save_interval_iters (int): Iters between snapshots. Default: 1000.
        log_iters (int): Iters between log lines. Default: 10.
        num_classes (int): Number of classes, forwarded to evaluate().
        num_workers (int): DataLoader workers. Default: 8.
        use_vdl (bool): Record scalars to VisualDL. Default: False.
    """
    ignore_index = model.ignore_index
    nranks = ParallelEnv().nranks

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists as a regular file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        strategy = fluid.dygraph.prepare_context()
        ddp_model = fluid.dygraph.DataParallel(model, strategy)

    batch_sampler = DistributedBatchSampler(train_dataset,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            drop_last=True)
    loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        places=places,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    avg_loss = 0.0
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    timer.start()

    iter = 0
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            # Time spent waiting for the dataloader since last restart.
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                loss = ddp_model(images, labels)
                # apply_collective_grads sum grads over multiple gpus.
                loss = ddp_model.scale_loss(loss)
                loss.backward()
                ddp_model.apply_collective_grads()
            else:
                loss = model(images, labels)
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
            train_batch_cost += timer.elapsed_time()
            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
                avg_loss /= log_iters
                avg_train_reader_cost = train_reader_cost / log_iters
                avg_train_batch_cost = train_batch_cost / log_iters
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_iters = iters - iter
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                # avg_loss is multiplied by nranks to undo scale_loss above.
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss * nranks, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
                                          iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0

            if (iter % save_interval_iters == 0
                    or iter == iters) and ParallelEnv().local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                # Model and optimizer states share the same file prefix.
                fluid.save_dygraph(model.state_dict(),
                                   os.path.join(current_save_dir, 'model'))
                fluid.save_dygraph(optimizer.state_dict(),
                                   os.path.join(current_save_dir, 'model'))

                if eval_dataset is not None:
                    mean_iou, avg_acc = evaluate(model,
                                                 eval_dataset,
                                                 model_dir=current_save_dir,
                                                 num_classes=num_classes,
                                                 ignore_index=ignore_index,
                                                 iter_id=iter)
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        fluid.save_dygraph(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model'))
                    logger.info(
                        'Current evaluated best model in eval_dataset is iter_{}, miou={:4f}'
                        .format(best_model_iter, best_mean_iou))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
                    # evaluate() switches the model to eval mode; restore.
                    model.train()
            timer.restart()
    if use_vdl:
        log_writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          eval_begin_iters=None):
    """
    Launch training.

    Args:
        model(nn.Layer): A matting model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict, optional): A dict of loss, refer to the loss function of the model for details. Default: None.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        eval_begin_iters (int, optional): The iters to begin evaluation. It will evaluate
            from iters/2 if it is None. Default: None.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists but is a plain file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Per-key running sums of the losses over the current logging window.
    avg_loss = defaultdict(float)
    iters_per_epoch = len(batch_sampler)
    best_sad = np.inf
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)

            # Forward through the DataParallel wrapper on multi-card runs,
            # but always compute the loss via the raw model.
            if nranks > 1:
                logit_dict = ddp_model(data)
            else:
                logit_dict = model(data)

            loss_dict = model.loss(logit_dict, data, losses)

            loss_dict['all'].backward()
            optimizer.step()
            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()

            for key, value in loss_dict.items():
                avg_loss[key] += value.numpy()[0]
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                # Turn window sums into window averages.
                for key, value in avg_loss.items():
                    avg_loss[key] = value / log_iters
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss['all'], lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                # print loss
                loss_str = '[TRAIN] [LOSS] '
                loss_str = loss_str + 'all={:.4f}'.format(avg_loss['all'])
                for key, value in avg_loss.items():
                    if key != 'all':
                        loss_str = loss_str + ' ' + key + '={:.4f}'.format(
                            value)
                logger.info(loss_str)
                if use_vdl:
                    for key, value in avg_loss.items():
                        log_tag = 'Train/' + key
                        log_writer.add_scalar(log_tag, value, iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)

                # Reset accumulators for the next logging window.
                for key in avg_loss.keys():
                    avg_loss[key] = 0.
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # save model
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Keep at most keep_checkpoint_max snapshots (<=0 keeps all).
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            # eval model
            if eval_begin_iters is None:
                eval_begin_iters = iters // 2
            if (iter % save_interval == 0 or iter == iters) and (
                    val_dataset is not None
            ) and local_rank == 0 and iter >= eval_begin_iters:
                # Clamp eval workers to at most 1; the clamped value was
                # previously computed but evaluate() hard-coded num_workers=0,
                # which made the clamp dead code. Pass the computed value.
                num_workers = 1 if num_workers > 0 else 0
                sad, mse = evaluate(model,
                                    val_dataset,
                                    num_workers=num_workers,
                                    print_detail=True,
                                    save_results=False)
                model.train()

            # save best model and add evaluation results to vdl
            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                if val_dataset is not None and iter >= eval_begin_iters:
                    if sad < best_sad:
                        best_sad = sad
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation sad ({:.4f}) was saved at iter {}.'
                        .format(best_sad, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/SAD', sad, iter)
                        log_writer.add_scalar('Evaluate/MSE', mse, iter)

            batch_start = time.time()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          loss_computation=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          keep_checkpoint_max=5):
    """
    Launch training.

    Args:
        model(nn.Layer): A sementic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation
            datasets. NOTE(review): currently accepted but never used in this function.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        loss_computation (nn.Layer): A loss function.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists but is a plain file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    # VisualDL log
    log_writer = LogWriter(save_dir)

    avg_loss = 0.0
    # Per-key running sums of the individual losses over the logging window.
    avg_loss_dict = {}
    iters_per_epoch = len(batch_sampler)

    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)

            images = data[0]
            targets = data[1]

            # Forward through the DataParallel wrapper on multi-card runs.
            if nranks > 1:
                predictions = ddp_model(images)
            else:
                predictions = model(images)

            loss_dict = loss_computation(predictions, targets)
            # Total loss is the sum of all individual loss terms.
            loss = sum(loss_dict.values())
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()

            avg_loss += loss.numpy()[0]
            # get the value
            if len(avg_loss_dict) == 0:
                avg_loss_dict = {
                    k: v.numpy()[0]
                    for k, v in loss_dict.items()
                }
            else:
                for key, value in loss_dict.items():
                    avg_loss_dict[key] += value.numpy()[0]
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                for key, value in avg_loss_dict.items():
                    avg_loss_dict[key] /= log_iters
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))

                ######################### VisualDL Log ##########################
                log_writer.add_scalar('Train/loss', avg_loss, iter)
                # Record all losses if there are more than 2 losses.
                for key, value in avg_loss_dict.items():
                    log_tag = 'Train/' + key
                    log_writer.add_scalar(log_tag, value, iter)
                log_writer.add_scalar('Train/lr', lr, iter)
                log_writer.add_scalar('Train/batch_cost',
                                      avg_train_batch_cost, iter)
                log_writer.add_scalar('Train/reader_cost',
                                      avg_train_reader_cost, iter)
                #################################################################

                avg_loss = 0.0
                # BUG FIX: the reset previously assigned to an unused name
                # (avg_loss_list), so avg_loss_dict accumulated across logging
                # windows and the per-loss values logged above were inflated.
                avg_loss_dict = {}
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0 or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Keep at most keep_checkpoint_max snapshots (<=0 keeps all).
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            batch_start = time.time()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    log_writer.close()