def load_weights(self, weights, weight_type='pretrain'):
    assert weight_type in ['pretrain', 'resume', 'finetune'], \
        "weight_type can only be 'pretrain', 'resume', 'finetune'"
    if weight_type == 'resume':
        self.start_epoch = load_weight(self.model, weights, self.optimizer)
        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
    else:
        self.start_epoch = 0
        load_pretrain_weight(self.model, weights,
                             self.cfg.get('load_static_weights', False),
                             weight_type)
        logger.debug("Load {} weights {} to start training".format(
            weight_type, weights))
    self._weights_loaded = True
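# Usage sketch (illustrative; `trainer` is an assumed instance of the class
# defining load_weights above, and the weight paths are placeholders):
def _load_weights_example(trainer):
    # 'resume' also restores the optimizer state and the epoch to continue from.
    trainer.load_weights('output/model_final', weight_type='resume')
    # 'pretrain' (the default) only initializes weights and resets start_epoch.
    trainer.load_weights('pretrained.pdparams', weight_type='pretrain')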
def __init__(self, cfg, slim_cfg):
    super(DistillModel, self).__init__()
    self.student_model = create(cfg.architecture)
    logger.debug('Load student model pretrain_weights:{}'.format(
        cfg.pretrain_weights))
    load_pretrain_weight(self.student_model, cfg.pretrain_weights)

    slim_cfg = load_config(slim_cfg)
    self.teacher_model = create(slim_cfg.architecture)
    self.distill_loss = create(slim_cfg.distill_loss)
    logger.debug('Load teacher model pretrain_weights:{}'.format(
        slim_cfg.pretrain_weights))
    load_pretrain_weight(self.teacher_model, slim_cfg.pretrain_weights)

    # Freeze the teacher so only the student is updated during distillation.
    for param in self.teacher_model.parameters():
        param.trainable = False
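# A minimal sketch of the forward pass that usually accompanies this __init__
# (an assumption based on the attributes created above, not this repo's
# verbatim code): run the frozen teacher to populate the features consumed by
# distill_loss, then add the distillation term to the student loss.
def _distill_forward_sketch(self, inputs):
    if self.training:
        _ = self.teacher_model(inputs)  # frozen; only provides features
        student_out = self.student_model(inputs)
        distill_loss = self.distill_loss(self.teacher_model, self.student_model)
        student_out['distill_loss'] = distill_loss
        student_out['loss'] += distill_loss
        return student_out
    # At eval/export time only the student is used.
    return self.student_model(inputs)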
def build_slim_model(cfg, slim_cfg, mode='train'):
    with open(slim_cfg) as f:
        slim_load_cfg = yaml.load(f, Loader=yaml.Loader)
    if mode != 'train' and slim_load_cfg['slim'] == 'Distill':
        return cfg

    if slim_load_cfg['slim'] == 'Distill':
        model = DistillModel(cfg, slim_cfg)
        cfg['model'] = model
    elif slim_load_cfg['slim'] == 'DistillPrune':
        if mode == 'train':
            model = DistillModel(cfg, slim_cfg)
            pruner = create(cfg.pruner)
            pruner(model.student_model)
        else:
            model = create(cfg.architecture)
            weights = cfg.weights
            load_config(slim_cfg)
            pruner = create(cfg.pruner)
            model = pruner(model)
            load_pretrain_weight(model, weights)
        cfg['model'] = model
        cfg['slim_type'] = cfg.slim
    elif slim_load_cfg['slim'] == 'PTQ':
        model = create(cfg.architecture)
        load_config(slim_cfg)
        load_pretrain_weight(model, cfg.weights)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        cfg['model'] = slim(model)
        cfg['slim'] = slim
    elif slim_load_cfg['slim'] == 'UnstructuredPruner':
        load_config(slim_cfg)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        cfg['slim'] = slim
        cfg['unstructured_prune'] = True
    else:
        load_config(slim_cfg)
        model = create(cfg.architecture)
        if mode == 'train':
            load_pretrain_weight(model, cfg.pretrain_weights)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        # TODO: fix quant export model in framework.
        if mode == 'test' and slim_load_cfg['slim'] == 'QAT':
            slim.quant_config['activation_preprocess_type'] = None
        cfg['model'] = slim(model)
        cfg['slim'] = slim
        if mode != 'train':
            load_pretrain_weight(cfg['model'], cfg.weights)

    return cfg
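# Usage sketch (config paths are placeholders; load_config is ppdet's):
def _build_slim_example():
    cfg = load_config('configs/yolov3/yolov3_mobilenet_v1_270e_coco.yml')
    cfg = build_slim_model(
        cfg, 'configs/slim/quant/yolov3_mobilenet_v1_qat.yml', mode='train')
    # cfg now carries the wrapped model plus slim bookkeeping entries.
    return cfg['model'], cfg.get('slim_type', None)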
def build_slim_model(cfg, slim_cfg, mode='train'):
    with open(slim_cfg) as f:
        slim_load_cfg = yaml.load(f, Loader=yaml.Loader)
    if mode != 'train' and slim_load_cfg['slim'] == 'Distill':
        return cfg

    if slim_load_cfg['slim'] == 'Distill':
        model = DistillModel(cfg, slim_cfg)
        cfg['model'] = model
    elif slim_load_cfg['slim'] == 'DistillPrune':
        if mode == 'train':
            model = DistillModel(cfg, slim_cfg)
            pruner = create(cfg.pruner)
            pruner(model.student_model)
        else:
            model = create(cfg.architecture)
            weights = cfg.weights
            load_config(slim_cfg)
            pruner = create(cfg.pruner)
            model = pruner(model)
            load_pretrain_weight(model, weights)
        cfg['model'] = model
    else:
        load_config(slim_cfg)
        model = create(cfg.architecture)
        if mode == 'train':
            load_pretrain_weight(model, cfg.pretrain_weights)
        slim = create(cfg.slim)
        cfg['model'] = slim(model)
        cfg['slim'] = slim
        if mode != 'train':
            load_pretrain_weight(cfg['model'], cfg.weights)

    return cfg
def load_weights(self, weights):
    if self.is_loaded_weights:
        return
    self.start_epoch = 0
    if hasattr(self.model, 'detector'):
        if self.model.__class__.__name__ == 'FairMOT':
            # FairMOT loads weights for the whole model, not only the detector.
            load_pretrain_weight(self.model, weights)
        else:
            load_pretrain_weight(self.model.detector, weights)
    else:
        load_pretrain_weight(self.model, weights)
    logger.debug("Load weights {} to start training".format(weights))
def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    dataset = cfg.TrainDataset
    train_loader, step_per_epoch = create('TrainReader')(dataset,
                                                         cfg['worker_num'],
                                                         place)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(step_per_epoch)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    if FLAGS.weight_type == 'resume':
        load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv().nranks > 1, \
            'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    fields = train_loader.collate_fn.output_fields

    # Run Train
    time_stat = deque(maxlen=cfg.log_iter)
    start_time = time.time()
    end_time = time.time()
    start_epoch = optimizer.state_dict()['LR_Scheduler']['last_epoch']
    for epoch_id in range(int(cfg.epoch)):
        cur_eid = epoch_id + start_epoch
        train_loader.dataset.epoch = cur_eid
        for iter_id, data in enumerate(train_loader):
            start_time = end_time
            end_time = time.time()
            time_stat.append(end_time - start_time)
            time_cost = np.mean(time_stat)
            eta_sec = ((cfg.epoch - cur_eid) * step_per_epoch - iter_id
                       ) * time_cost
            eta = str(datetime.timedelta(seconds=int(eta_sec)))

            # Model Forward
            model.train()
            outputs = model(data, fields, 'train')

            # Model Backward
            loss = outputs['loss']
            if ParallelEnv().nranks > 1:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                # Log state
                if epoch_id == 0 and iter_id == 0:
                    train_stats = TrainingStats(cfg.log_iter, outputs.keys())
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    ips = float(cfg['TrainReader']['batch_size']) / time_cost
                    strs = 'Epoch:{}: iter: {}, lr: {:.6f}, {}, eta: {}, batch_cost: {:.5f} sec, ips: {:.5f} images/sec'.format(
                        cur_eid, iter_id, curr_lr, logs, eta, time_cost, ips)
                    logger.info(strs)

        # Save Stage
        if ParallelEnv().local_rank == 0 and (
                cur_eid % cfg.snapshot_epoch == 0 or
                (cur_eid + 1) == int(cfg.epoch)):
            cfg_name = os.path.basename(FLAGS.config).split('.')[0]
            save_name = str(cur_eid) if cur_eid + 1 != int(
                cfg.epoch) else "model_final"
            save_dir = os.path.join(cfg.save_dir, cfg_name)
            save_model(model, optimizer, save_dir, save_name)
def load_weights(self, weights):
    if self.is_loaded_weights:
        return
    self.start_epoch = 0
    load_pretrain_weight(self.model, weights)
    logger.debug("Load weights {} to start training".format(weights))
def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    datasets = cfg.TrainDataset
    train_loader = create('TrainReader')(datasets, cfg['worker_num'])
    steps = len(train_loader)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(steps)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    start_epoch = 0
    if FLAGS.weight_type == 'resume':
        start_epoch = load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv().nranks > 1, \
            'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # The parameter filter is a temporary fix for training because of #28997
    # in Paddle: freeze the stem and res2 stages of the backbone.
    def no_grad(param):
        if param.name.startswith("conv1_") or param.name.startswith("res2a_") \
                or param.name.startswith("res2b_") or param.name.startswith("res2c_"):
            return True

    for param in filter(no_grad, model.parameters()):
        param.stop_gradient = True

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    # Run Train
    end_epoch = int(cfg.epoch)
    batch_size = int(cfg['TrainReader']['batch_size'])
    total_steps = (end_epoch - start_epoch) * steps
    step_id = 0

    train_stats = stats.TrainingStats(cfg.log_iter)
    batch_time = stats.SmoothedValue(fmt='{avg:.4f}')
    data_time = stats.SmoothedValue(fmt='{avg:.4f}')

    end_time = time.time()
    space_fmt = ':' + str(len(str(steps))) + 'd'
    for cur_eid in range(start_epoch, end_epoch):
        datasets.set_epoch(cur_eid)
        for iter_id, data in enumerate(train_loader):
            data_time.update(time.time() - end_time)

            # Model Forward
            model.train()
            outputs = model(data, mode='train')
            loss = outputs['loss']

            # Model Backward
            loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            batch_time.update(time.time() - end_time)
            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    eta_sec = (total_steps - step_id) * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(
                        cur_eid,
                        iter_id,
                        steps,
                        meters=logs,
                        eta=eta_str,
                        btime=str(batch_time),
                        dtime=str(data_time),
                        ips=ips)
                    logger.info(fmt)
            step_id += 1
            end_time = time.time()  # after copy outputs to CPU.

        # Save Stage: only the rank-0 worker writes checkpoints. The original
        # condition put `or (cur_eid + 1) == end_epoch` outside the rank check,
        # which made every rank save at the final epoch.
        if ParallelEnv().local_rank == 0 and (
                cur_eid % cfg.snapshot_epoch == 0 or
                (cur_eid + 1) == end_epoch):
            save_name = str(
                cur_eid) if cur_eid + 1 != end_epoch else "model_final"
            save_model(model, optimizer, save_dir, save_name, cur_eid + 1)
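# A minimal launch sketch for run() (parse_args and load_config are assumed
# glue following ppdet conventions; they are not defined in this excerpt):
if __name__ == '__main__':
    FLAGS = parse_args()
    cfg = load_config(FLAGS.config)
    place = paddle.set_device('gpu' if cfg.use_gpu else 'cpu')
    run(FLAGS, cfg, place)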