def train(self):
    assert self.mode == 'train', "Model not in 'train' mode"

    # if no given weights loaded, load backbone pretrain weights as default
    if not self._weights_loaded:
        self.load_weights(self.cfg.pretrain_weights)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })
    self.status['batch_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)

        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            # model forward
            self.model.train()
            outputs = self.model(data)
            loss = outputs['loss']

            # model backward
            loss.backward()
            self.optimizer.step()
            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            # reset the timer so data_time/batch_time measure a single step
            iter_tic = time.time()

        self._compose_callback.on_epoch_end(self.status)

def train(self, validate=False):
    assert self.mode == 'train', "Model not in 'train' mode"
    Init_mark = False

    model = self.model
    if self.cfg.get('fleet', False):
        model = fleet.distributed_model(model)
        self.optimizer = fleet.distributed_optimizer(self.optimizer)
    elif self._nranks > 1:
        find_unused_parameters = self.cfg[
            'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
        model = paddle.DataParallel(
            self.model, find_unused_parameters=find_unused_parameters)

    # initial fp16
    if self.cfg.get('fp16', False):
        scaler = amp.GradScaler(
            enable=self.cfg.use_gpu, init_loss_scaling=1024)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    self.status['batch_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    if self.cfg.get('print_flops', False):
        self._flops(self.loader)
    profiler_options = self.cfg.get('profiler_options', None)

    self._compose_callback.on_train_begin(self.status)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            profiler.add_profiler_step(profiler_options)
            self._compose_callback.on_step_begin(self.status)
            data['epoch_id'] = epoch_id

            if self.cfg.get('fp16', False):
                with amp.auto_cast(enable=self.cfg.use_gpu):
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']

                # model backward
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # in dygraph mode, optimizer.minimize is equal to optimizer.step
                scaler.minimize(self.optimizer, scaled_loss)
            else:
                # model forward
                outputs = model(data)
                loss = outputs['loss']
                # model backward
                loss.backward()
                self.optimizer.step()

            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            if self.cfg.get('unstructured_prune'):
                self.pruner.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if self._nranks < 2 or self._local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            if self.use_ema:
                self.ema.update(self.model)
            iter_tic = time.time()

        # apply ema weight on model
        if self.use_ema:
            weight = copy.deepcopy(self.model.state_dict())
            self.model.set_dict(self.ema.apply())
        if self.cfg.get('unstructured_prune'):
            self.pruner.update_params()

        self._compose_callback.on_epoch_end(self.status)

        if validate and (self._nranks < 2 or self._local_rank == 0) \
                and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
                         or epoch_id == self.end_epoch - 1):
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = \
                    paddle.io.BatchSampler(
                        self._eval_dataset,
                        batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            # if validation in training is enabled, metrics should be re-init
            # Init_mark makes sure this code will only execute once
            if validate and Init_mark == False:
                Init_mark = True
                self._init_metrics(validate=validate)
                self._reset_metrics()
            with paddle.no_grad():
                self.status['save_best_model'] = True
                self._eval_with_loader(self._eval_loader)

        # restore origin weight on model
        if self.use_ema:
            self.model.set_dict(weight)

    self._compose_callback.on_train_end(self.status)

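# ---------------------------------------------------------------------------
# Minimal standalone sketch of the AMP (fp16) pattern used by the trainer
# above, assuming Paddle 2.x. The toy Linear model, SGD optimizer, and random
# input below are illustrative placeholders, not part of the trainer itself.
import paddle
from paddle import amp

toy_model = paddle.nn.Linear(4, 2)
toy_optimizer = paddle.optimizer.SGD(
    learning_rate=0.01, parameters=toy_model.parameters())
use_gpu = paddle.is_compiled_with_cuda()
toy_scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

x = paddle.rand([8, 4])
with amp.auto_cast(enable=use_gpu):
    # forward pass runs in float16 where safe when AMP is enabled
    toy_loss = toy_model(x).mean()

scaled = toy_scaler.scale(toy_loss)         # scale loss to avoid fp16 underflow
scaled.backward()                           # backward on the scaled loss
toy_scaler.minimize(toy_optimizer, scaled)  # unscale grads, then optimizer step
toy_optimizer.clear_grad()
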
def train(self, validate=False):
    assert self.mode == 'train', "Model not in 'train' mode"

    # if no given weights loaded, load backbone pretrain weights as default
    if not self._weights_loaded:
        self.load_weights(self.cfg.pretrain_weights)

    model = self.model
    if self._nranks > 1:
        model = paddle.DataParallel(self.model)
    else:
        model = self.model

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    self.status['batch_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            # model forward
            outputs = model(data)
            loss = outputs['loss']

            # model backward
            loss.backward()
            self.optimizer.step()
            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if self._nranks < 2 or self._local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            iter_tic = time.time()

        self._compose_callback.on_epoch_end(self.status)

        if validate and (self._nranks < 2 or self._local_rank == 0) \
                and (epoch_id % self.cfg.snapshot_epoch == 0 \
                         or epoch_id == self.end_epoch - 1):
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = \
                    paddle.io.BatchSampler(
                        self._eval_dataset,
                        batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            with paddle.no_grad():
                self._eval_with_loader(self._eval_loader)

def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    datasets = cfg.TrainDataset
    train_loader = create('TrainReader')(datasets, cfg['worker_num'])
    steps = len(train_loader)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(steps)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    start_epoch = 0
    if FLAGS.weight_type == 'resume':
        start_epoch = load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv(
        ).nranks > 1, 'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # The parameter filter is a temporary fix for training because of #28997
    # in Paddle.
    def no_grad(param):
        if param.name.startswith("conv1_") or param.name.startswith("res2a_") \
                or param.name.startswith("res2b_") or param.name.startswith("res2c_"):
            return True

    for param in filter(no_grad, model.parameters()):
        param.stop_gradient = True

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    # Run Train
    end_epoch = int(cfg.epoch)
    batch_size = int(cfg['TrainReader']['batch_size'])
    total_steps = (end_epoch - start_epoch) * steps
    step_id = 0

    train_stats = stats.TrainingStats(cfg.log_iter)
    batch_time = stats.SmoothedValue(fmt='{avg:.4f}')
    data_time = stats.SmoothedValue(fmt='{avg:.4f}')

    end_time = time.time()
    space_fmt = ':' + str(len(str(steps))) + 'd'
    for cur_eid in range(start_epoch, end_epoch):
        datasets.set_epoch(cur_eid)
        for iter_id, data in enumerate(train_loader):
            data_time.update(time.time() - end_time)

            # Model Forward
            model.train()
            outputs = model(data, mode='train')
            loss = outputs['loss']

            # Model Backward
            loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            batch_time.update(time.time() - end_time)
            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    eta_sec = (total_steps - step_id) * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(
                        cur_eid,
                        iter_id,
                        steps,
                        meters=logs,
                        eta=eta_str,
                        btime=str(batch_time),
                        dtime=str(data_time),
                        ips=ips)
                    logger.info(fmt)
            step_id += 1
            end_time = time.time()  # after copy outputs to CPU.

        # Save Stage
        if (ParallelEnv().local_rank == 0 and \
                (cur_eid % cfg.snapshot_epoch) == 0) or (cur_eid + 1) == end_epoch:
            save_name = str(
                cur_eid) if cur_eid + 1 != end_epoch else "model_final"
            save_model(model, optimizer, save_dir, save_name, cur_eid + 1)

def train(self, validate=False):
    assert self.mode == 'train', "Model not in 'train' mode"

    # if validation in training is enabled, metrics should be re-init
    if validate:
        self._init_metrics(validate=validate)
        self._reset_metrics()

    model = self.model
    if self.cfg.fleet:
        model = fleet.distributed_model(model)
        self.optimizer = fleet.distributed_optimizer(
            self.optimizer).user_defined_optimizer
    elif self._nranks > 1:
        model = paddle.DataParallel(self.model)

    # initial fp16
    if self.cfg.fp16:
        scaler = amp.GradScaler(
            enable=self.cfg.use_gpu, init_loss_scaling=1024)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    self.status['batch_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            if self.cfg.fp16:
                with amp.auto_cast(enable=self.cfg.use_gpu):
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']

                # model backward
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # in dygraph mode, optimizer.minimize is equal to optimizer.step
                scaler.minimize(self.optimizer, scaled_loss)
            else:
                # model forward
                outputs = model(data)
                loss = outputs['loss']
                # model backward
                loss.backward()
                self.optimizer.step()

            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if self._nranks < 2 or self._local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            if self.use_ema:
                self.ema.update(self.model)
            iter_tic = time.time()

        # apply ema weight on model
        if self.use_ema:
            weight = self.model.state_dict()
            self.model.set_dict(self.ema.apply())

        self._compose_callback.on_epoch_end(self.status)

        if validate and (self._nranks < 2 or self._local_rank == 0) \
                and (epoch_id % self.cfg.snapshot_epoch == 0 \
                         or epoch_id == self.end_epoch - 1):
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = \
                    paddle.io.BatchSampler(
                        self._eval_dataset,
                        batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            with paddle.no_grad():
                self.status['save_best_model'] = True
                self._eval_with_loader(self._eval_loader)

        # restore origin weight on model
        if self.use_ema:
            self.model.set_dict(weight)
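
# ---------------------------------------------------------------------------
# Hedged usage sketch: how a trainer exposing the train() method above might
# be driven from a training script. The Trainer class, the load_config helper,
# and the example config path follow PaddleDetection-style tooling and are
# assumptions; none of them are defined in this section.
from ppdet.core.workspace import load_config
from ppdet.engine import Trainer

cfg = load_config('configs/yolov3/yolov3_darknet53_270e_coco.yml')  # example path
trainer = Trainer(cfg, mode='train')
trainer.load_weights(cfg.pretrain_weights)
# validate=True exercises the in-training evaluation branch every
# cfg.snapshot_epoch epochs and on the final epoch
trainer.train(validate=True)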