def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    print_freq=50, warmup=False):
    """Train a detection model for one epoch.

    Args:
        model: detection model that, in train mode, returns a dict of losses.
        optimizer: optimizer updating ``model``'s parameters.
        data_loader: iterable yielding ``(images, targets)`` batches, where
            ``images`` is a sequence of tensors and ``targets`` a list of dicts.
        device: ``torch.device`` the model lives on.
        epoch: current epoch index (0-based).
        print_freq: log metrics every ``print_freq`` iterations.
        warmup: if True and ``epoch == 0``, apply linear LR warmup.

    Returns:
        tuple: ``(mloss, now_lr)`` — running mean of the reduced total loss
        (1-element tensor on ``device``) and the last learning rate used.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup is True:
        # First epoch (epoch=0): ramp the LR up from a small factor ("warmup").
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    mloss = torch.zeros(1).to(device)  # mean losses
    # Fix: `now_lr` was previously only assigned inside the loop, raising
    # NameError at the final `return` when the data loader is empty.
    now_lr = optimizer.param_groups[0]["lr"]
    enable_amp = True if "cuda" in device.type else False
    # Fix: the original ran autocast forward passes but called backward()
    # without a GradScaler, risking fp16 gradient underflow. With
    # enabled=False (CPU) the scaler is a transparent no-op, so CPU
    # behavior is unchanged. Matches the other AMP loops in this file.
    scaler = torch.cuda.amp.GradScaler(enabled=enable_amp)

    for i, [images, targets] in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Mixed-precision context manager; has no effect on CPU.
        with torch.cuda.amp.autocast(enabled=enable_amp):
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purpose
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()
        # Update the running mean of the training loss.
        mloss = (mloss * i + loss_value) / (i + 1)

        if not math.isfinite(loss_value):
            # Stop training when the computed loss is non-finite.
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        scaler.scale(losses).backward()
        scaler.step(optimizer)
        scaler.update()

        if lr_scheduler is not None:
            # Warmup schedule steps per-iteration during the first epoch.
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)

    return mloss, now_lr
def train_one_epoch(model, optimizer, data_loader, device, epoch, num_classes,
                    lr_scheduler, print_freq=10, scaler=None):
    """Run one training epoch for a segmentation model.

    Args:
        model: segmentation network; called as ``model(image)``.
        optimizer: optimizer for the model parameters.
        data_loader: iterable of ``(image, target)`` tensor batches.
        device: ``torch.device`` used for model and data.
        epoch: current epoch index, used only for log headers.
        num_classes: number of segmentation classes.
        lr_scheduler: per-batch LR scheduler, stepped every iteration.
        print_freq: metric logging interval in iterations.
        scaler: optional ``GradScaler``; when given, AMP is enabled.

    Returns:
        tuple: global-average loss over the epoch and the last learning rate.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # For binary (background/foreground) tasks, weight the cross-entropy
    # classes; tune these weights for your own dataset.
    loss_weight = (torch.as_tensor([1.0, 2.0], device=device)
                   if num_classes == 2 else None)

    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image = image.to(device)
        target = target.to(device)

        with torch.cuda.amp.autocast(enabled=scaler is not None):
            output = model(image)
            loss = criterion(output, target, loss_weight,
                             num_classes=num_classes, ignore_index=255)

        optimizer.zero_grad()
        if scaler is None:
            loss.backward()
            optimizer.step()
        else:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        lr_scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=lr)

    return metric_logger.meters["loss"].global_avg, lr
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    warmup=True, print_freq=10):
    """Train ``model`` for a single epoch, optionally warming up the LR.

    Args:
        model: network called as ``model(image)``.
        optimizer: optimizer for the model parameters.
        data_loader: iterable of ``(image, target)`` tensor batches.
        device: ``torch.device`` used for model and data.
        epoch: current epoch index (0-based).
        warmup: when True and ``epoch == 0``, use a per-iteration warmup
            LR schedule for the first epoch.
        print_freq: metric logging interval in iterations.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup is True:
        # First epoch: ramp the learning rate up from a small factor.
        factor = 1.0 / 1000
        iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, iters, factor)

    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image = image.to(device)
        target = target.to(device)

        loss = criterion(model(image), target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        current_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=round(current_lr, 5))
def train_one_epoch(model, optimizer, data_loader, device, epoch, lr_scheduler,
                    print_freq=10, scaler=None):
    """Train for one epoch with optional AMP; LR scheduler steps per batch.

    Args:
        model: network called as ``model(image)``.
        optimizer: optimizer for the model parameters.
        data_loader: iterable of ``(image, target)`` tensor batches.
        device: ``torch.device`` used for model and data.
        epoch: current epoch index, used only for log headers.
        lr_scheduler: per-batch LR scheduler, stepped every iteration.
        print_freq: metric logging interval in iterations.
        scaler: optional ``GradScaler``; when given, AMP is enabled.

    Returns:
        tuple: global-average loss over the epoch and the last learning rate.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    use_amp = scaler is not None
    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image = image.to(device)
        target = target.to(device)

        with torch.cuda.amp.autocast(enabled=use_amp):
            loss = criterion(model(image), target)

        optimizer.zero_grad()
        if use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        lr_scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=lr)

    return metric_logger.meters["loss"].global_avg, lr
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    train_loss=None, train_lr=None, warmup=False):
    """Train a detection model whose targets are batched into stacked tensors.

    Unlike the list-of-dicts detection loops in this file, this variant stacks
    per-image boxes/labels into single tensors before calling the model, and
    expects the model to return a loss dict containing "total_losses".

    Args:
        model: detection model; called as ``model(images, targets)`` and
            returning a dict with a "total_losses" entry.
        optimizer: optimizer for the model parameters.
        data_loader: iterable of ``(images, targets)`` batches; ``targets`` is
            a list of dicts with "boxes", "labels" and "image_id" keys.
        device: torch.device used for model and data.
        epoch: current epoch index (0-based).
        print_freq: metric logging interval in iterations.
        train_loss: optional list; per-iteration loss values are appended
            in place (side-effect output channel).
        train_lr: optional list; per-iteration learning rates are appended
            in place (side-effect output channel).
        warmup: if True and ``epoch == 0``, use per-iteration LR warmup.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup is True:
        # First epoch (epoch=0): ramp the LR up from a small factor ("warmup").
        # NOTE: this variant uses 5e-4 rather than the 1e-3 used elsewhere
        # in this file.
        warmup_factor = 5.0 / 10000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # Batch inputs: stack per-image tensors into single batch tensors.
        # NOTE(review): torch.stack requires every image in the batch to have
        # the same number of boxes/labels — presumably guaranteed upstream
        # (e.g. SSD-style fixed-size targets); confirm against the dataset.
        images = torch.stack(images, dim=0)

        boxes = []
        labels = []
        img_id = []
        for t in targets:
            boxes.append(t['boxes'])
            labels.append(t['labels'])
            img_id.append(t["image_id"])
        targets = {"boxes": torch.stack(boxes, dim=0),
                   "labels": torch.stack(labels, dim=0),
                   "image_id": torch.as_tensor(img_id)}

        images = images.to(device)
        targets = {k: v.to(device) for k, v in targets.items()}

        losses_dict = model(images, targets)
        losses = losses_dict["total_losses"]

        # reduce losses over all GPUs for logging purpose
        losses_dict_reduced = utils.reduce_dict(losses_dict)
        losses_reduce = losses_dict_reduced["total_losses"]
        loss_value = losses_reduce.item()

        if isinstance(train_loss, list):
            # Record the training loss for the caller (in-place append).
            train_loss.append(loss_value)

        if not math.isfinite(loss_value):
            # Stop training when the computed loss is non-finite.
            print("Loss is {}, stopping training".format(loss_value))
            print(losses_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            # Warmup schedule steps per-iteration during the first epoch.
            lr_scheduler.step()

        # metric_logger.update(loss=losses, **loss_dict_reduced)
        metric_logger.update(**losses_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)
        if isinstance(train_lr, list):
            # Record the learning rate for the caller (in-place append).
            train_lr.append(now_lr)
def _train_one_epoch(self, train_loader, batch_size=0, epoch=0, print_freq=1,
                     multi_scale=False, img_size=(512, 512), grid_min=None,
                     grid_max=None, grid_size=32, random_size=64,
                     device=torch.device('cuda'), warmup=False):
    """Train ``self.model`` for one epoch (YOLO-style, with AMP, gradient
    accumulation every ``random_size`` batches, and optional multi-scale input).

    Args:
        train_loader: yields ``(images, targets, paths, _, _)`` tuples, where
            ``images`` is a uint8 batch tensor and ``paths`` the image paths.
        batch_size: unused as an input — immediately overwritten below with
            ``len(train_loader)`` (number of batches per epoch).
        epoch: current epoch index (0-based).
        print_freq: metric logging interval in iterations.
        multi_scale: when True, randomly resize the input every
            ``random_size`` batches via ``self.random_size``.
        img_size: current input size, updated by multi-scale resizing.
        grid_min, grid_max, grid_size: multi-scale grid bounds/stride passed
            through to ``self.random_size``.
        random_size: gradient-accumulation period in batches (weights are
            updated once every ``random_size`` batches).
        device: torch.device used for model and data.
        warmup: if True and ``epoch == 0``, use per-iteration LR warmup.

    Returns:
        tuple: ``(loss_mean, lr_now)`` — 4-element tensor of mean
        (box, obj, class, total) losses and the last learning rate.
    """
    self.model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:
        # First epoch (epoch=0): ramp the LR up from a small factor ("warmup").
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(train_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters,
                                                 warmup_factor)
        # Disable gradient accumulation during warmup: update every batch.
        random_size = 1

    enable_amp = 'cuda' in device.type
    scale = amp.GradScaler(enabled=enable_amp)

    lr_now = 0.
    loss_mean = torch.zeros(4).to(device)  # mean losses (box, obj, class, total)
    batch_size = len(train_loader)  # number of batches per epoch (repurposed)
    for i, (images, targets, paths, _, _) in enumerate(
            metric_logger.log_every(train_loader, print_freq, header)):
        # count_batch counts every batch since epoch 0
        count_batch = i + batch_size * epoch  # number integrated batches (since train start)

        images = images.to(device).float(
        ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)

        # Multi-Scale
        # Labels are stored as relative coordinates, so rescaling the images
        # does not change the label values. The input size is randomly
        # changed once every ``random_size`` batches (64 images by default).
        if multi_scale:
            images, img_size = self.random_size(
                images, img_size, count_batch % random_size == 0,
                grid_min, grid_max, grid_size)

        # Mixed-precision context manager; has no effect on CPU.
        with amp.autocast(enabled=enable_amp):
            # loss: compute_loss
            loss_dict = self.loss(self.model(images), targets)
            losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purpose
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_items = torch.cat((loss_dict_reduced["box_loss"],
                                loss_dict_reduced["obj_loss"],
                                loss_dict_reduced["class_loss"],
                                losses_reduced)).detach()
        loss_mean = (loss_mean * i + loss_items) / (i + 1)  # update mean losses

        if not torch.isfinite(losses_reduced):
            print('WARNING: non-finite loss, ending training ',
                  loss_dict_reduced)
            print("training image path: {}".format(",".join(paths)))
            sys.exit(1)

        # Normalize the accumulated gradient by the accumulation period.
        losses *= 1. / random_size  # scale loss

        # backward
        scale.scale(losses).backward()
        # optimize
        # Update the weights once every ``random_size`` batches
        # (gradient accumulation).
        if count_batch % random_size == 0:
            scale.step(self.optimizer)
            scale.update()
            self.optimizer.zero_grad()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        lr_now = self.optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=lr_now)

        if count_batch % random_size == 0 and lr_scheduler is not None:
            # Warmup (first epoch only) steps the scheduler per update.
            # NOTE(review): `scale.step(self.optimizer)` above already
            # performed the optimizer step for this batch; this extra
            # unscaled `self.optimizer.step()` looks like it double-applies
            # the (scaled) gradients during warmup — confirm intent.
            self.optimizer.step()
            lr_scheduler.step()

    return loss_mean, lr_now