import logging
import sys
import traceback
from time import sleep

import yaml


class DlEval:

    def __init__(self, config_path, log_level='info'):
        """
        :param str config_path: path to config file
        :param str log_level: logging verbosity, one of the keys of LOG_LEVELS
        """
        self.__logger = logging.getLogger()
        self.__logger.setLevel(LOG_LEVELS[log_level])
        ch = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s [%(levelname)s]: %(message)s '
            '(%(module)s:%(funcName)s:%(lineno)d)')
        ch.setFormatter(formatter)
        self.__logger.addHandler(ch)

        with open(config_path, 'r') as ymlfile:
            self.__cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

        self.__client = Client(
            self.__cfg['moodle'].get('data_path', DEFAULT_MOODLE_DATA_PATH),
            self.__logger,
            self.__cfg['moodle'].get('timeout', DEFAULT_TIMEOUT),
            self.__cfg['moodle'].get('max_retries', DEFAULT_MAX_RETRIES))

        eval_data_path = DEFAULT_EVAL_DATA_PATH
        if 'eval' in self.__cfg and 'data_path' in self.__cfg['eval']:
            eval_data_path = self.__cfg['eval']['data_path']
        self.__evaluator = Evaluator(eval_data_path, self.__logger)

    def run(self):
        # Poll Moodle forever: log in, fetch new submissions for the configured
        # course, evaluate them, push feedback, then sleep for the poll interval.
        while True:
            try:
                ok = self.__client.login(self.__cfg['moodle']['username'],
                                         self.__cfg['moodle']['password'])
                if not ok:
                    self.__logger.critical('login failed')
                else:
                    allowed_assignments = \
                        self.__evaluator.get_allowed_assignments()
                    course_data = self.__client.download_new_course_data(
                        self.__cfg['moodle']['course_id'], allowed_assignments)
                    self.__evaluator.evaluate(course_data)
                    self.__client.send_feedback(course_data)
            except Exception:
                tback = ''.join(traceback.format_exception(*sys.exc_info()))
                self.__logger.critical('an exception occurred!\n' + tback)
            sleep(self.__cfg.get('interval', DEFAULT_INTERVAL))
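# Hypothetical entry point, not part of the original module: a minimal sketch
# of how DlEval could be wired up, assuming a config.yml whose keys match the
# ones read in __init__ above. All file names and values below are placeholders.
#
#   moodle:
#     username: grader-bot
#     password: secret
#     course_id: 42
#     data_path: /var/lib/dleval/moodle   # optional, falls back to DEFAULT_MOODLE_DATA_PATH
#     timeout: 30                         # optional, falls back to DEFAULT_TIMEOUT
#     max_retries: 3                      # optional, falls back to DEFAULT_MAX_RETRIES
#   eval:
#     data_path: /var/lib/dleval/eval     # optional, falls back to DEFAULT_EVAL_DATA_PATH
#   interval: 300                         # optional poll interval in seconds
if __name__ == '__main__':
    DlEval('config.yml', log_level='info').run()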
def evaluate(config, args):
    if args.onnx:
        model = onnx_model_for_eval(args.onnx, args.device == 'cuda')
    else:
        model = torch_model_for_eval(args.cfg, args.weight, device=args.device)
    eval_dataset = EvalDataset(config)
    evaluator = Evaluator(model, eval_dataset, config)
    AP = evaluator.evaluate()
    tools.print_metric(AP)
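# Hypothetical CLI wiring, not in the original file: evaluate() only touches
# args.onnx, args.cfg, args.weight and args.device, so a minimal parser could
# look like this. `load_config` is a made-up placeholder; how the project
# actually builds the EvalDataset configuration is not shown in the source.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='run detection evaluation')
    parser.add_argument('--onnx', default='',
                        help='ONNX model path; if set, it is used instead of the torch model')
    parser.add_argument('--cfg', default='', help='model cfg path for the torch model')
    parser.add_argument('--weight', default='', help='torch weight file')
    parser.add_argument('--device', default='cuda', choices=['cuda', 'cpu'])
    args = parser.parse_args()

    config = load_config()  # placeholder: load the project configuration here
    evaluate(config, args)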
def test(self):
    eval_dataset = EvalDataset(self.cfg)
    # EvalDataset yields complete batches itself, so automatic batching is
    # disabled (batch_size=None) and the collate_fn is the identity.
    dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=None,
        shuffle=False,
        num_workers=self._num_workers,
        pin_memory=True,
        collate_fn=lambda x: x,
    )
    evaluator = Evaluator(self.new_model, dataloader, self.cfg)
    self.new_model.eval()
    AP = evaluator.evaluate()
    # print the evaluation metric
    tools.print_metric(AP)
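# Minimal standalone sketch of the DataLoader pattern used above: with
# batch_size=None automatic batching is disabled and the identity collate_fn
# passes each item through untouched, so the dataset itself is expected to
# yield fully formed batches. `ReadyBatchDataset` is a made-up stand-in for
# EvalDataset; the tensor shapes are arbitrary example values.
import torch

class ReadyBatchDataset(torch.utils.data.Dataset):
    def __len__(self):
        return 4

    def __getitem__(self, idx):
        # return a ready-made "batch" (a whole tensor), not a single sample
        return torch.full((2, 3), float(idx))

loader = torch.utils.data.DataLoader(
    ReadyBatchDataset(), batch_size=None, shuffle=False,
    num_workers=0, collate_fn=lambda x: x)

for batch in loader:
    print(batch.shape)  # each item arrives unchanged: torch.Size([2, 3])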
class Trainer:

    def __init__(self, config):
        # metric
        self.AP = None
        # model
        self._cfg_path = config.model.cfg_path
        # train
        self._train_batch_size = config.train.batch_size
        self._scheduler_type = config.train.scheduler
        self._mile_stones = config.train.mile_stones
        self._gamma = config.train.gamma
        self._init_lr = config.train.learning_rate_init
        self._end_lr = config.train.learning_rate_end
        self._weight_decay = config.train.weight_decay
        self._warmup_epochs = config.train.warmup_epochs
        self._max_epochs = config.train.max_epochs
        # weights
        self._backbone_weight = config.weight.backbone
        self._weights_dir = os.path.join(config.weight.dir,
                                         config.experiment_name)
        self._resume_weight = config.weight.resume
        self._clear_history = config.weight.clear_history
        self._weight_base_name = 'model'
        # eval
        self._eval_after = config.eval.after
        # sparse
        self._sparse_train = config.sparse.switch
        self._sparse_ratio = config.sparse.ratio
        # prune
        self._prune_ratio = config.prune.ratio
        # quant
        self._quant_train = config.quant.switch
        self._quant_backend = config.quant.backend
        self._disable_observer_after = config.quant.disable_observer_after
        self._freeze_bn_after = config.quant.freeze_bn_after
        # system
        self._gpus = fix_gpus(config.system.gpus)
        self._num_workers = config.system.num_workers
        self._device = get_device(self._gpus)

        self.init_epoch = 0
        self.global_step = 0
        self.config = config

        self.dataload_tt = TicToc()
        self.model_tt = TicToc()
        self.epoch_tt = TicToc()

        self.scheduler = {
            'cosine': self.scheduler_cosine,
            'step': self.scheduler_step,
        }[self._scheduler_type]

    def scheduler_cosine(self, steps: int):
        '''Compute the cosine-annealed learning rate from the current training
        step and write it into every parameter group of the optimizer.

        Args:
            steps: current training step.

        Returns:
            float: the learning rate.
        '''
        # warmup steps
        warmup_steps = self._warmup_epochs * self._steps_per_epoch
        # maximum number of training steps
        max_steps = self._max_epochs * self._steps_per_epoch
        if steps < warmup_steps:
            lr = steps / warmup_steps * self._init_lr
        else:
            lr = self._end_lr + 0.5 * (self._init_lr - self._end_lr) *\
                (1 + math.cos((steps - warmup_steps) / (max_steps - warmup_steps) * math.pi))
        # assign the new learning rate to every parameter group
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

    def scheduler_step(self, steps: int):
        '''Compute the multi-step (milestone) learning rate from the current
        training step and write it into every parameter group of the optimizer.

        Args:
            steps: current training step.

        Returns:
            float: the learning rate.
        '''
        # warmup steps
        warmup_steps = self._warmup_epochs * self._steps_per_epoch
        if steps < warmup_steps:
            lr = steps / warmup_steps * self._init_lr
        else:
            for i, m in enumerate(chain(self._mile_stones, [self._max_epochs])):
                if steps < m * self._steps_per_epoch:
                    lr = self._init_lr * self._gamma**i
                    break
        # assign the new learning rate to every parameter group
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

    def init_cfg(self):
        with open(self._cfg_path, 'r') as fr:
            self.cfg = fr.read()

    # build the datasets
    def init_dataset(self):
        train_dataset = TrainDataset(self.config)
        eval_dataset = EvalDataset(self.config)
        self.train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._train_batch_size,
            shuffle=False,
            num_workers=self._num_workers,
            pin_memory=True,
            collate_fn=collate_batch,
        )
        # the eval dataset builds its batches internally, so batch_size=None here
        self.eval_dataloader = torch.utils.data.DataLoader(
            eval_dataset,
            batch_size=None,
            shuffle=False,
            num_workers=self._num_workers,
            pin_memory=True,
            collate_fn=lambda x: x,
        )
        print(f'{train_dataset.length} images for train.')
        print(f'{eval_dataset.length} images for evaluate.')

    # build the YOLOv3 model
    def init_model(self):
        if self._quant_train:
            print('quantization aware training')
        self.model, model_info = tools.build_model(
            self._cfg_path,
            self._resume_weight,
            self._backbone_weight,
            device=self._device,
            clear_history=self._clear_history,
            dataparallel=not self._quant_train,
            device_ids=self._gpus,
            qat=self._quant_train,
            backend=self._quant_backend)
        self.global_step = model_info.get('step', 0)
        # recover the epoch to resume from
        self.init_epoch = self.global_step // self._steps_per_epoch

    # prepare the class that evaluates the model
    def init_evaluator(self):
        self.evaluator = Evaluator(self.model, self.eval_dataloader,
                                   self.config)

    # Adam optimizer
    def init_optimizer(self):
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self._init_lr,
                                    weight_decay=self._weight_decay)

    def init_losses(self):
        # Five losses: total loss, localization loss, confidence loss, class
        # loss and the per-branch losses. The total loss is the sum of the
        # localization, confidence and class losses; every value is a running
        # average over a period of time. The per-branch losses are, in order,
        # those of the stride-32, stride-16 and stride-8 heads.
        self.losses = {
            'loss': AverageMeter(),
            'giou_loss': AverageMeter(),
            'conf_loss': AverageMeter(),
            'class_loss': AverageMeter(),
            'loss_per_branch': [AverageMeter() for _ in range(3)],
        }

    # evaluate and compute the mAP metric
    def eval(self):
        self.AP = ap = self.evaluator.evaluate()
        # print the metric
        tools.print_metric(ap, verbose=False)
        return ap

    def _clear_ap(self):
        self.AP = None

    def save(self, epoch):
        # If mAP was evaluated, encode both the epoch and the mAP value in the
        # file name; otherwise only the epoch.
        base_name = self._weight_base_name
        model_name = f'{base_name}-{epoch}.pt' if self.AP is None\
            else f'{base_name}-{epoch}-{self.AP.AP:.4f}.pt'
        model_path = os.path.join(self._weights_dir, model_name)
        # save the model parameters
        status = {
            'step': self.global_step,
            'AP': self.AP,
            'model': self.model.state_dict(),
            'cfg': self.cfg,
            'type': 'qat' if self._quant_train else 'normal',
            'backend': self._quant_backend if self._quant_train else 'none',
        }
        torch.save(status, model_path)

    def train_epoch(self, epoch):
        # run through the dataset once
        self.dataload_tt.tic()
        for data in self.train_dataloader:
            self.global_step += 1
            # Each element of data (with the leading batch_size=1 dimension
            # removed) is, in order:
            #   the preprocessed, normalized images of the given size (h, w)
            #   gridded labels of the small objects in the batch   (batch_size, h/8,  w/8,  3, 6 + num_classes)
            #   gridded labels of the medium objects in the batch  (batch_size, h/16, w/16, 3, 6 + num_classes)
            #   gridded labels of the large objects in the batch   (batch_size, h/32, w/32, 3, 6 + num_classes)
            #   raw labels of the small objects in the batch       (batch_size, ?, 4)
            #   raw labels of the medium objects in the batch      (batch_size, ?, 4)
            #   raw labels of the large objects in the batch       (batch_size, ?, 4)
            if self._quant_train:
                data = [item.cuda() for item in data]
            image, label_sbbox, label_mbbox, label_lbbox,\
                sbbox, mbbox, lbbox = data
            self.dataload_tt.toc()

            # adjust the learning rate
            lr = self.scheduler(self.global_step)

            self.model_tt.tic()
            # forward pass: compute the total loss and the partial losses
            losses_dict = self.model(
                image,
                (label_sbbox, label_mbbox, label_lbbox, sbbox, mbbox, lbbox))
            # clear the gradients
            self.optimizer.zero_grad()
            # backward pass to update the parameters
            losses_dict['loss'].mean().backward()
            # if sparse training is enabled
            if self._sparse_train:
                # apply an L1 penalty to every collected BN module
                for m in self.bns:
                    m.weight.grad.data.add_(self._sparse_ratio *
                                            torch.sign(m.weight.data))
            self.optimizer.step()
            self.model_tt.toc()

            # update the running value of every loss
            for name, loss in losses_dict.items():
                if isinstance(loss, torch.Tensor):
                    self.losses[name].update(loss.mean().item())
                else:
                    for i, l in enumerate(loss):
                        self.losses[name][i].update(l.mean().item())

            # if the print interval has been reached
            if self.global_step % self._loss_print_interval == 0:
                # take the average of every loss and reset it
                loss_values = {}
                for name, loss in self.losses.items():
                    if tools.is_sequence(loss):
                        loss_values.update({
                            f'{name}_{i}': l.get_avg_reset()
                            for i, l in enumerate(loss)
                        })
                    else:
                        loss_values[name] = loss.get_avg_reset()
                # print, in order: learning rate (lr); current/max epoch (epoch);
                # step count (step); average total train loss (train_loss),
                # which equals the sum of the per-head (0-2) average losses;
                # average localization loss (xy); average confidence loss (conf);
                # average class loss (cls)
                print(f'lr: {lr:.6f}\tepoch: {epoch}/{self._max_epochs}\tstep: {self.global_step}\t' +
                      'train_loss: {loss:.2f}={loss_per_branch_0:.2f}+{loss_per_branch_1:.2f}+'
                      '{loss_per_branch_2:.2f}(xy: {giou_loss:.2f}, conf: {conf_loss:.2f}, '
                      'cls: {class_loss:.2f})'.format(**loss_values))
            self.dataload_tt.tic()

        self.train_dataloader.dataset.init_shuffle()
        # if sparse training is enabled
        if self._sparse_train:
            # Sort the BN gammas by magnitude and report the values at the 20%,
            # 40%, 60%, 80% and 100% positions, which indicates the sparsity level.
            bn_vals = np.concatenate(
                [m.weight.data.abs().clone().cpu().numpy() for m in self.bns])
            bn_vals.sort()
            bn_num = len(bn_vals)
            bn_indexes = [round(i / 5 * bn_num) - 1 for i in range(1, 6)]
            print('sparse level: {}'.format(bn_vals[bn_indexes].tolist()))
        print('data load time: {:.3f}s, model train time: {:.3f}s'.format(
            self.dataload_tt.sum_reset() / 1e9,
            self.model_tt.sum_reset() / 1e9))

    def train(self):
        # one pass over the dataset per epoch
        for epoch in range(self.init_epoch, self._max_epochs):
            self.model.train()
            self._clear_ap()
            if self._quant_train:
                if epoch >= self._disable_observer_after:
                    # freeze quantizer parameters
                    self.model.apply(torch.quantization.disable_observer)
                if epoch >= self._freeze_bn_after:
                    # freeze batch norm mean and variance estimates
                    self.model.apply(torch.nn.intrinsic.qat.freeze_bn_stats)
            self.epoch_tt.tic()
            self.train_epoch(epoch)
            self.epoch_tt.toc()
            print('{:.3f}s per epoch'.format(self.epoch_tt.sum_reset() / 1e9))
            # evaluate once the configured epoch has been reached
            if epoch >= self._eval_after:
                if self._quant_train:
                    self.evaluator.model = tools.quantized_model(self.model)
                # switch the model to evaluation mode
                self.model.eval()
                self.eval()
                # switch back to training mode
                self.model.train()
            self.save(epoch)

    def run_nas(self, model):
        self._warmup_epochs = 0.5
        self.model = model
        self.init_dataset()
        self._steps_per_epoch = len(self.train_dataloader)
        self._loss_print_interval = self._steps_per_epoch // 5
        self.init_evaluator()
        self.init_optimizer()
        self.init_losses()
        self.model.train()
        for epoch in range(0, self._eval_after + 1):
            self.epoch_tt.tic()
            self.train_epoch(epoch)
            self.epoch_tt.toc()
            print('{:.3f}s per epoch'.format(self.epoch_tt.sum_reset() / 1e9))
            if epoch >= self._eval_after:
                self.model.eval()
        return self.evaluator.evaluate().AP

    def run(self):
        tools.ensure_dir(self._weights_dir)
        self.init_cfg()
        self.init_dataset()
        # number of training steps per epoch
        self._steps_per_epoch = len(self.train_dataloader)
        # interval so that the losses are printed 5 times per epoch
        self._loss_print_interval = self._steps_per_epoch // 5
        self.init_model()
        self.init_evaluator()
        self.init_optimizer()
        self.init_losses()
        if self._sparse_train:
            self.bns = tools.get_bn_layers(self.model)
        self.train()

    def run_prune(self, prune_weight: str):
        self._cfg_path = self.config.prune.new_cfg
        self._init_lr *= 0.2
        self._warmup_epochs = 0
        self._max_epochs = 20
        self._backbone_weight = ''
        self._resume_weight = prune_weight
        self._clear_history = True
        self._eval_after = 0
        self._sparse_train = False
        self._weight_base_name = f'pruned-{round(self._prune_ratio*100)}-model'
        self.run()
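# Standalone sketch, not part of the Trainer: the same linear-warmup plus
# cosine-annealing schedule implemented in Trainer.scheduler_cosine, written as
# a pure function so the shape of the curve can be inspected without building a
# model or optimizer. `cosine_lr` and all numbers below are made-up example values.
import math

def cosine_lr(step, warmup_steps, max_steps, init_lr, end_lr):
    # linear warmup from 0 to init_lr, then cosine decay from init_lr to end_lr
    if step < warmup_steps:
        return step / warmup_steps * init_lr
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return end_lr + 0.5 * (init_lr - end_lr) * (1 + math.cos(progress * math.pi))

if __name__ == '__main__':
    # lr rises to init_lr at step 1000, then decays toward end_lr at step 10000
    for s in (0, 500, 1000, 5000, 10000):
        print(s, round(cosine_lr(s, warmup_steps=1000, max_steps=10000,
                                 init_lr=1e-3, end_lr=1e-6), 8))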