Example #1
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=50,
                    warmup=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:  # enable warmup for the first epoch (epoch=0), i.e. a learning-rate warm-up phase
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    mloss = torch.zeros(1).to(device)  # mean losses
    enable_amp = "cuda" in device.type  # use mixed precision only on GPU
    for i, [images, targets] in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # autocast context manager for mixed-precision training; it is a no-op on CPU
        with torch.cuda.amp.autocast(enabled=enable_amp):
            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purpose
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()
            # record the training loss
            mloss = (mloss * i + loss_value) / (i + 1)  # update mean losses

            if not math.isfinite(loss_value):  # stop training if the loss becomes non-finite
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:  # step the warmup scheduler during the first epoch
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)

    return mloss, now_lr
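All of these examples call helpers from a project-specific `utils` module that is not shown on this page. In particular, `utils.warmup_lr_scheduler` is never defined here; a minimal sketch of what such a helper usually looks like (assuming, as in the torchvision reference training scripts these snippets resemble, a linear ramp built on `LambdaLR`):

import torch

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # Ramp the learning rate linearly from lr * warmup_factor up to lr
    # over the first warmup_iters iterations, then hold it constant.
    def f(x):
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        # interpolate between warmup_factor (at x = 0) and 1.0 (at x = warmup_iters)
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)

With warmup_factor = 1.0 / 1000, the first optimizer step uses one thousandth of the configured learning rate, which is what the epoch == 0 branch above relies on.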
Example #2
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    num_classes,
                    lr_scheduler,
                    print_freq=10,
                    scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    if num_classes == 2:
        # weights for background and foreground in the cross-entropy loss (tune these for your own dataset)
        loss_weight = torch.as_tensor([1.0, 2.0], device=device)
    else:
        loss_weight = None

    for image, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        image, target = image.to(device), target.to(device)
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            output = model(image)
            loss = criterion(output,
                             target,
                             loss_weight,
                             num_classes=num_classes,
                             ignore_index=255)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        lr_scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=lr)

    return metric_logger.meters["loss"].global_avg, lr
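Examples #2 and #4 take an optional `scaler` and enable autocast only when one is passed. A sketch of how a caller might drive this version (the model, optimizer, scheduler, loader, and epoch-count names below are placeholders, not taken from the original):

import torch

# create a GradScaler only when training on GPU; passing None keeps full precision
scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

for epoch in range(num_epochs):
    mean_loss, lr = train_one_epoch(model, optimizer, train_loader, device,
                                    epoch, num_classes=num_classes,
                                    lr_scheduler=lr_scheduler, scaler=scaler)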
Example #3
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    warmup=True,
                    print_freq=10):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr',
                            utils.SmoothedValue(window_size=1, fmt='{value}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:  # enable warmup for the first epoch (epoch=0), i.e. a learning-rate warm-up phase
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for image, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        image, target = image.to(device), target.to(device)
        output = model(image)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=round(lr, 5))
Example #4
def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    lr_scheduler,
                    print_freq=10,
                    scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    for image, target in metric_logger.log_every(data_loader, print_freq,
                                                 header):
        image, target = image.to(device), target.to(device)
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            output = model(image)
            loss = criterion(output, target)

        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        lr_scheduler.step()

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(loss=loss.item(), lr=lr)

    return metric_logger.meters["loss"].global_avg, lr
Example #5
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    train_loss=None, train_lr=None, warmup=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:  # enable warmup for the first epoch (epoch=0), i.e. a learning-rate warm-up phase
        warmup_factor = 5.0 / 10000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        # batch inputs information
        images = torch.stack(images, dim=0)

        boxes = []
        labels = []
        img_id = []
        for t in targets:
            boxes.append(t['boxes'])
            labels.append(t['labels'])
            img_id.append(t["image_id"])
        targets = {"boxes": torch.stack(boxes, dim=0),
                   "labels": torch.stack(labels, dim=0),
                   "image_id": torch.as_tensor(img_id)}

        images = images.to(device)

        targets = {k: v.to(device) for k, v in targets.items()}
        losses_dict = model(images, targets)
        losses = losses_dict["total_losses"]

        # reduce losses over all GPUs for logging purpose
        losses_dict_reduced = utils.reduce_dict(losses_dict)
        losses_reduce = losses_dict_reduced["total_losses"]

        loss_value = losses_reduce.item()
        if isinstance(train_loss, list):
            # record the training loss
            train_loss.append(loss_value)

        if not math.isfinite(loss_value):  # stop training if the loss becomes non-finite
            print("Loss is {}, stopping training".format(loss_value))
            print(losses_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:  # step the warmup scheduler during the first epoch
            lr_scheduler.step()

        # metric_logger.update(loss=losses, **loss_dict_reduced)
        metric_logger.update(**losses_dict_reduced)
        now_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=now_lr)
        if isinstance(train_lr, list):
            train_lr.append(now_lr)
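Example #5 stacks the per-image tensors itself, so it expects each batch from the DataLoader to arrive as a sequence of image tensors plus a sequence of target dicts. That shape typically comes from a detection-style collate function such as the sketch below (the dataset and batch size are assumed placeholders, not part of the original):

from torch.utils.data import DataLoader

def collate_fn(batch):
    # batch is a list of (image, target) pairs; regroup it into
    # (tuple_of_images, tuple_of_targets) so the training loop can
    # stack the images and rebuild the target dict itself.
    return tuple(zip(*batch))

data_loader = DataLoader(dataset, batch_size=8, shuffle=True,
                         collate_fn=collate_fn)

Note that the torch.stack(boxes, dim=0) call in the loop only works if every image in a batch carries the same number of boxes.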
Example #6
    def _train_one_epoch(self,
                         train_loader,
                         batch_size=0,
                         epoch=0,
                         print_freq=1,
                         multi_scale=False,
                         img_size=(512, 512),
                         grid_min=None,
                         grid_max=None,
                         grid_size=32,
                         random_size=64,
                         device=torch.device('cuda'),
                         warmup=False):
        self.model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        lr_scheduler = None
        if epoch == 0 and warmup:  # enable warmup for the first epoch (epoch=0), i.e. a learning-rate warm-up phase
            warmup_factor = 1.0 / 1000
            warmup_iters = min(1000, len(train_loader) - 1)

            lr_scheduler = utils.warmup_lr_scheduler(self.optimizer,
                                                     warmup_iters,
                                                     warmup_factor)
            random_size = 1

        enable_amp = 'cuda' in device.type
        scale = amp.GradScaler(enabled=enable_amp)

        lr_now = 0.
        loss_mean = torch.zeros(4).to(device)  # mean losses
        batch_size = len(train_loader)  # number of batches
        for i, (images, targets, paths, _, _) in enumerate(
                metric_logger.log_every(train_loader, print_freq, header)):
            # count_batch counts every batch seen since epoch 0
            count_batch = i + batch_size * epoch  # number integrated batches (since train start)
            images = images.to(device).float() / 255.0  # uint8 to float32, 0-255 -> 0.0-1.0
            targets = targets.to(device)

            # Multi-Scale
            # labels are already in relative coordinates, so rescaling the image does not change them
            # randomly change the input image size once every 64 training images
            if multi_scale:
                images, img_size = self.random_size(
                    images, img_size, count_batch % random_size == 0, grid_min,
                    grid_max, grid_size)

            # autocast context manager for mixed-precision training; it is a no-op on CPU
            with amp.autocast(enabled=enable_amp):
                # loss: compute_loss
                loss_dict = self.loss(self.model(images), targets)

                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purpose
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                loss_items = torch.cat((loss_dict_reduced["box_loss"],
                                        loss_dict_reduced["obj_loss"],
                                        loss_dict_reduced["class_loss"],
                                        losses_reduced)).detach()
                loss_mean = (loss_mean * i + loss_items) / (i + 1)  # update mean losses

                if not torch.isfinite(losses_reduced):
                    print('WARNING: non-finite loss, ending training ',
                          loss_dict_reduced)
                    print("training image path: {}".format(",".join(paths)))
                    sys.exit(1)

                losses *= 1. / random_size  # scale loss

            # backward
            scale.scale(losses).backward()
            # optimize
            # update the weights once every 64 training images (gradient accumulation)
            if count_batch % random_size == 0:
                scale.step(self.optimizer)
                scale.update()
                self.optimizer.zero_grad()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            lr_now = self.optimizer.param_groups[0]["lr"]
            metric_logger.update(lr=lr_now)

            if count_batch % random_size == 0 and lr_scheduler is not None:  # step the warmup scheduler during the first epoch
                self.optimizer.step()
                lr_scheduler.step()

        return loss_mean, lr_now
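The last example folds gradient accumulation into the mixed-precision loop: the loss is scaled by 1 / random_size and the optimizer only steps every random_size batches. Stripped of the multi-scale and logging machinery, the pattern looks roughly like this (batch_size, compute_loss, and the other names are illustrative, not from the original):

import torch

accumulate = 64 // batch_size  # take an optimizer step once every `accumulate` batches
scaler = torch.cuda.amp.GradScaler()

optimizer.zero_grad()
for i, (images, targets) in enumerate(data_loader):
    with torch.cuda.amp.autocast():
        loss = compute_loss(model(images), targets) / accumulate  # scale for accumulation
    scaler.scale(loss).backward()  # gradients accumulate across iterations
    if (i + 1) % accumulate == 0:
        scaler.step(optimizer)     # unscales gradients, then optimizer.step()
        scaler.update()
        optimizer.zero_grad()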