# imports used by the training-loop variants below
import time

import torch
from tqdm import tqdm


def step_epoch(self):
    ''' train one epoch '''
    lr = -1
    for i, (imgs, boxes, labels, frame_labels, locs, scales) in enumerate(self.loader):
        if self.lr_func is not None:
            lr = self.lr_func(self.step)
            for param_group in self.opt.param_groups:
                param_group['lr'] = lr
        if i == 0:
            batch_size = int(imgs.shape[0])
        time_start = time.time()
        self.opt.zero_grad()
        temp, loss_frame = self.net(imgs, locs, labels, boxes, frame_labels)
        # average the detection loss and the frame-level classification loss
        loss = (get_loss(temp) + get_loss(loss_frame)) / 2
        loss.backward()
        if self.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip)
        self.opt.step()
        maxmem = int(torch.cuda.max_memory_allocated(device=self.device[0]) / 1024 / 1024)
        time_end = time.time()
        totaltime = int((time_end - time_start) * 1000)
        print('total_step:%d: epoch:%d, step:%d/%d, loss:%f, maxMem:%dMB, time:%dms, lr:%f' %
              (self.step, self.epoch, i * batch_size, len(self.dataset), loss, maxmem, totaltime, lr))
        self.step += 1
    self.epoch += 1
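# get_loss is not defined in this snippet. A minimal sketch of what it likely
# does, assuming self.net returns per-GPU loss tensors (e.g. under
# nn.DataParallel): reduce the gathered losses to one scalar. The body below
# is a hypothetical placeholder, not the repository's actual implementation.
def get_loss(temp):
    ''' reduce a tensor of per-GPU losses to a single scalar (sketch) '''
    return temp.mean()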
def step_epoch(self):
    ''' train one epoch '''
    lr = -1
    progressbar = tqdm(enumerate(self.loader), total=len(self.loader))
    for i, (imgs, boxes, labels, locs, scales) in progressbar:
        if self.lr_func is not None:
            lr = self.lr_func(self.step)
            for param_group in self.opt.param_groups:
                param_group['lr'] = lr
        if i == 0:
            batch_size = int(imgs.shape[0])
        time_start = time.time()
        self.opt.zero_grad()
        temp = self.net(imgs, locs, labels, boxes)
        loss = get_loss(temp)
        loss.backward()
        if self.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip)
        self.opt.step()
        maxmem = int(torch.cuda.max_memory_allocated(device=self.device[0]) / 1024 / 1024)
        time_end = time.time()
        totaltime = int((time_end - time_start) * 1000)
        # descriptionStr = ("total_step:%d: epoch:%d, step:%d/%d, loss:%f, maxMem:%dMB, time:%dms, lr:%f"
        #                   % (self.step, self.epoch, i*batch_size, len(self.dataset), loss, maxmem, totaltime, lr))
        progressbar.set_description("epoch: %d, loss: %f, lr: %f" % (self.epoch, loss, lr))
        # writing log to tensorboard
        if self.tb_writer and i % 10 == 0:
            totalStep = self.epoch * len(self.dataset) + i * batch_size
            self.tb_writer.add_scalar('training/loss', loss, totalStep)
            self.tb_writer.add_scalar('training/learning rate', lr, totalStep)
        self.step += 1
    self.epoch += 1
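# self.lr_func is injected from outside the trainer and is not shown here.
# A minimal sketch of a compatible schedule, assuming the common linear-warmup
# plus step-decay pattern; make_lr_func, base_lr, warmup_iters, milestones and
# gamma are hypothetical names chosen for illustration.
def make_lr_func(base_lr, warmup_iters=500, milestones=(60000, 80000), gamma=0.1):
    def lr_func(step):
        if step < warmup_iters:
            # ramp from base_lr/3 up to base_lr, mirroring the warmup loop below
            alpha = float(step) / warmup_iters
            return base_lr * ((1.0 / 3.0) * (1.0 - alpha) + alpha)
        # after warmup, multiply by gamma at each milestone passed
        decay = sum(1 for m in milestones if step >= m)
        return base_lr * (gamma ** decay)
    return lr_func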
def step_epoch(self, writer):
    ''' train one epoch '''
    lr = -1
    for i, (img, bbox, label, loc, scale) in enumerate(self.loader):
        if self.lr_func is not None:
            lr = self.lr_func(self.step)
            for param_group in self.opt.param_groups:
                param_group['lr'] = lr
        if i == 0:
            batch_size = int(img.shape[0])
        time_start = time.time()
        self.opt.zero_grad()
        temp = self.net(img, loc, label, bbox)
        loss = get_loss(temp)
        loss.backward()
        if self.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_clip)
        self.opt.step()
        # self.sch.step()
        # lr = self.sch.get_lr()[-1]
        maxmem = int(torch.cuda.max_memory_allocated(device=self.device[0]) / 1024 / 1024)
        time_end = time.time()
        totaltime = int((time_end - time_start) * 1000)
        writer.add_scalar('loss', loss, self.step)
        print('total_step:%d: epoch:%d, step:%d/%d, loss:%f, maxMem:%dMB, time:%dms, lr:%f' %
              (self.step, self.epoch, i * batch_size, len(self.dataset), loss, maxmem, totaltime, lr))
        self.step += 1
    self.epoch += 1
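# A minimal sketch of how this variant might be driven; Trainer, cfg and the
# training wiring are hypothetical names, only SummaryWriter is real PyTorch API.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='log')   # 'loss' scalars from step_epoch land here
trainer = Trainer(...)                  # elided: dataset / net / opt wiring
for _ in range(cfg['epoches']):
    trainer.step_epoch(writer)
writer.close()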
# Run warmup
WARM_UP_ITERS = 500
WARM_UP_FACTOR = 1.0 / 3.0
if cfg['freeze_bn']:
    net.module.backbone.freeze_bn()
for i, (img, bbox, label, scale, oh, ow) in enumerate(loader_train):
    # linearly ramp the learning rate from lr/3 up to lr over WARM_UP_ITERS steps
    alpha = float(i) / WARM_UP_ITERS
    warmup_factor = WARM_UP_FACTOR * (1.0 - alpha) + alpha
    for param_group in opt.param_groups:
        param_group['lr'] = lr * warmup_factor
    time_start = time.time()
    opt.zero_grad()
    temp = net(img, label, bbox)
    loss = get_loss(temp)
    loss.backward()
    clip = cfg['grad_clip']
    torch.nn.utils.clip_grad_norm_(net.parameters(), clip)
    opt.step()
    maxmem = int(torch.cuda.max_memory_allocated(device=cfg['device'][0]) / 1024 / 1024)
    time_end = time.time()
    totaltime = int((time_end - time_start) * 1000)
    print('warmup: step:%d/%d, lr:%f, loss:%f, maxMem:%dMB, time:%dms' %
          (i, WARM_UP_ITERS, lr * warmup_factor, loss, maxmem, totaltime))
    if i >= WARM_UP_ITERS:
        break

# Run epoch
epoch = 0
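# A sketch of how the epoch loop that follows is likely structured, reusing the
# step_epoch variants above; trainer and cfg['epoches'] are hypothetical names
# not shown in this snippet.
while epoch < cfg['epoches']:
    trainer.step_epoch()
    epoch += 1
    # elided: checkpointing / evaluation between epochs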