Example No. 1
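Note: most of the snippets below follow the torchvision detection reference scripts and are shown without their import blocks. As an assumption about the surrounding files, they typically rely on roughly the following imports (utils being the detection reference utils.py with MetricLogger, SmoothedValue, reduce_dict, warmup_lr_scheduler and collate_fn):

import math
import sys

import numpy as np
import torch
import torchvision

import utils  # torchvision detection reference helpers (assumed)
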
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets, _ in metric_logger.log_every(data_loader, print_freq, header):

        images = list(image.to(device) for image in images)
        targets_ = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets_)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return loss_value
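
Several of these examples warm the learning rate up with utils.warmup_lr_scheduler during epoch 0. In the torchvision reference code this is a thin LambdaLR wrapper that ramps the rate linearly from warmup_factor * lr to lr over warmup_iters steps; a sketch consistent with that behavior:

def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor):
    # Linearly scale the LR from warmup_factor -> 1.0 over warmup_iters optimizer steps.
    def f(x):
        if x >= warmup_iters:
            return 1.0
        alpha = float(x) / warmup_iters
        return warmup_factor * (1 - alpha) + alpha

    return torch.optim.lr_scheduler.LambdaLR(optimizer, f)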
Example No. 2
def evaluate(model, data_loader, device, epoch,
             print_freq):  # test overfitting
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Validation: [{}]'.format(epoch)
    sum_loss = []

    with torch.no_grad():
        for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                       header):
            # for images, targets in data_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            # loss in the original paper
            # losses_reduced = loss_dict_reduced['loss_classifier'] + loss_dict_reduced['loss_box_reg']
            # losses = loss_dict['loss_classifier'] + loss_dict['loss_box_reg']
            if math.isfinite(losses.item()):
                sum_loss.append(losses.item())

            loss_value = losses_reduced.item()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)

            if device == 'cuda':
                torch.cuda.empty_cache()
                del images
                del targets
                del losses_reduced
                del losses
                del loss_dict
                del loss_dict_reduced
            # break
    sum_loss = np.sum(sum_loss)
    return sum_loss
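
The recurring "reduce losses over all GPUs for logging purposes" comment refers to utils.reduce_dict, which averages each loss term across distributed workers and is a no-op for single-process runs. A sketch along the lines of the torchvision reference implementation:

import torch
import torch.distributed as dist

def reduce_dict(input_dict, average=True):
    # All-reduce the values of a dict of scalar tensors across processes.
    if not (dist.is_available() and dist.is_initialized()):
        return input_dict
    world_size = dist.get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[k] for k in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        return {k: v for k, v in zip(names, values)}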
Example No. 3
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        # for images, targets in data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        # loss in original paper
        # losses = loss_dict['loss_classifier'] + loss_dict['loss_box_reg']
        # losses_reduced = loss_dict_reduced['loss_classifier'] + loss_dict_reduced['loss_box_reg']

        loss_value = losses_reduced.item()

        optimizer.zero_grad()
        losses.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if device == 'cuda':
            torch.cuda.empty_cache()
            del images
            del targets
            del losses_reduced
            del losses
            del loss_dict
            del loss_dict_reduced
def trainer(train, model, optimizer):
    print("---------- Start Training ----------")

    trainloader = torch.utils.data.DataLoader(train,
                                              batch_size=2,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    try:
        with tqdm(trainloader, ncols=100) as pbar:
            train_loss = 0.0
            for images, targets in pbar:
                images = list(image.to(device) for image in images)
                targets = [{k: v.to(device)
                            for k, v in t.items()} for t in targets]

                loss_dict = model(images, targets)

                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())

                loss_value = losses_reduced.item()

                if not math.isfinite(loss_value):
                    print("Loss is {}, stopping training".format(loss_value))
                    print(loss_dict_reduced)
                    sys.exit(1)

                optimizer.zero_grad()
                losses.backward()
                optimizer.step()

                train_loss += loss_value
        return train_loss
    except ValueError:
        pass
Example No. 5
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in data_loader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

    return losses
# setup dataloader
dataset = VOCDetection2007(root=voc_base_dir, image_set='train')
dataloader = DataLoader(dataset,
                        3,
                        shuffle=True,
                        num_workers=0,
                        collate_fn=lambda x: tuple(zip(*x)))

# model.train() --> model(imgs, targets) --> loss breakdown
images, targets = next(iter(dataloader))
print('images shape: {} \n\n'.format(images[0].shape))

images = list(image.to(device) for image in images)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
loss_dict = model(images, targets)
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
# scalar sum of all loss terms
loss_value = losses_reduced.item()

print('loss_dict.keys: {}'.format(loss_dict.keys()))
print('loss_dict: {}'.format(loss_dict))
print('loss classifier: {}'.format(
    loss_dict['loss_classifier'].cpu().tolist()))
print('losses_reduced: {}'.format(losses_reduced))
print('loss_value: {}\n\n'.format(loss_value))

# model.eval() --> model(imgs) --> model post-processed bbox predictions
model.eval()
preds = model(images)
print('preds[0].keys(): {}'.format(preds[0].keys()))
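
Since the eval-mode output is a list of dicts with boxes, labels and scores, a natural follow-up (not part of the original snippet; the 0.5 threshold is an arbitrary choice) is to keep only confident detections:

score_threshold = 0.5  # assumed value, tune per task
keep = preds[0]['scores'] > score_threshold
filtered = {k: v[keep] for k, v in preds[0].items() if k in ('boxes', 'labels', 'scores')}
print('kept {} of {} detections'.format(int(keep.sum()), len(keep)))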
Example No. 7
def evaluate(model, criterion, postprocessor, data_loader, base_ds, device, eval_bbox, eval_masks):
    model.eval()
    criterion.eval()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("class_error", utils.SmoothedValue(window_size=1, fmt="{value:.2f}"))
    header = "Test:"

    iou_types = []
    if eval_masks:
        iou_types += ["segm"]
    if eval_bbox:
        iou_types += ["bbox"]
    iou_types = tuple(iou_types)
    if isinstance(base_ds, LVIS):
        coco_evaluator = LvisEvaluator(base_ds, iou_types) if eval_bbox or eval_masks else None
    else:
        coco_evaluator = CocoEvaluator(base_ds, iou_types) if eval_bbox or eval_masks else None
    # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]
    for samples, targets in metric_logger.log_every(data_loader, 10, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_scaled = {
            k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict
        }
        loss_dict_reduced_unscaled = {f"{k}_unscaled": v for k, v in loss_dict_reduced.items()}
        metric_logger.update(
            loss=sum(loss_dict_reduced_scaled.values()),
            **loss_dict_reduced_scaled,
            **loss_dict_reduced_unscaled,
        )
        metric_logger.update(class_error=loss_dict_reduced["class_error"])

        results = postprocessor(outputs, targets)
        res = {target["image_id"].item(): output for target, output in zip(targets, results)}
        if coco_evaluator is not None:
            coco_evaluator.update(res)
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    if coco_evaluator is not None:
        coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    if coco_evaluator is not None:
        coco_evaluator.accumulate()
        coco_evaluator.summarize()
    stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
    if coco_evaluator is not None:
        if eval_bbox:
            stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist()
        if eval_masks:
            stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist()
    return stats, coco_evaluator
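
Note that criterion.weight_dict carries a per-term coefficient and only the scaled sum is what the corresponding DETR-style training step would backpropagate; roughly:

# Hedged sketch of how the weighted total loss is typically assembled in the matching train loop.
losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)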
Example No. 8
def main(args):
    torch.cuda.set_device(args.local_rank)

    utils.init_distributed_mode(args)
    hook = smd.Hook.create_from_json_file()

    device = torch.device('cuda')

    # Data loading code
    print("Loading data")

    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    indices = torch.randperm(len(dataset)).tolist()
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])
    dataset = torch.utils.data.Subset(dataset, indices[:-50])

    num_classes = 2

    print("Creating data loaders")
    if args.world_size > 1:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_sampler=train_batch_sampler,
        #num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=args.batch_size,
                                                   sampler=test_sampler,
                                                   num_workers=args.workers,
                                                   collate_fn=utils.collate_fn)

    print("Creating model")
    model = torchvision.models.detection.__dict__[args.model](
        num_classes=num_classes,
        pretrained=False,
        rpn_nms_thresh=1,
        rpn_pre_nms_top_n_train=5000)

    model.to('cuda')
    #hook.register_module(model)
    model_without_ddp = model
    if args.world_size > 1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.epochs):
        if args.world_size > 1:
            train_sampler.set_epoch(epoch)

        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        # Use a separate warmup schedule for the first epoch so the MultiStepLR above is preserved
        warmup_scheduler = None
        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)

            warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                         warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda')
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()

            optimizer.step()

            if warmup_scheduler is not None:
                warmup_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % args.checkpoint_freq == 0:
                utils.save_on_master(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, 'model_{}.pth'.format(epoch))

        lr_scheduler.step()

        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
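
main() above reads a fairly specific set of attributes from args (args.gpu and args.distributed are normally filled in by utils.init_distributed_mode). A minimal argument parser that would satisfy it, with names inferred from the usage above and placeholder defaults, might look like:

import argparse

def get_args():
    parser = argparse.ArgumentParser(description='PennFudan detection training (assumed CLI)')
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument('--world-size', dest='world_size', type=int, default=1)
    parser.add_argument('--model', default='fasterrcnn_resnet50_fpn')
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=2)
    parser.add_argument('--workers', type=int, default=4)
    parser.add_argument('--lr', type=float, default=0.02)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--weight-decay', dest='weight_decay', type=float, default=1e-4)
    parser.add_argument('--lr-steps', dest='lr_steps', nargs='+', type=int, default=[8, 11])
    parser.add_argument('--lr-gamma', dest='lr_gamma', type=float, default=0.1)
    parser.add_argument('--epochs', type=int, default=13)
    parser.add_argument('--checkpoint-freq', dest='checkpoint_freq', type=int, default=100)
    parser.add_argument('--resume', default='')
    parser.add_argument('--test-only', dest='test_only', action='store_true')
    return parser.parse_args()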
Example No. 9
    def train(self):
        LOSSES_NAME = self.args.LOSSES_NAME
        task_dict = {
            'Mask_LM': 'word_mask',
            'Matched': 'matched',
            'Mask_Obj': 'vis_mask',
            'Mask_Attr': 'vis_mask',
            'Mask_Feat': 'vis_mask',
            'QA': 'qa'
        }

        if self.args.dry:
            results = self.evaluate_epoch(epoch=0)

        self.optim.zero_grad()

        if self.verbose:
            loss_meters = [LossMeter() for _ in range(len(LOSSES_NAME))]
            best_eval_loss = 9595.

            from torch.utils.tensorboard import SummaryWriter
            self.writer = SummaryWriter(log_dir=self.args.log_dir)
            print('logging at', str(self.args.log_dir))
            self.logger.info('logging at' + str(self.args.log_dir))

            hparam_dict = {}
            for k, v in self.args.__dict__.items():
                if type(v) in [int, float, str, bool, torch.Tensor]:
                    hparam_dict[k] = v
            metric_dict = {}

            self.writer.add_hparams(hparam_dict, metric_dict)

        dist.barrier()

        n_update = 0
        global_step = 0
        for epoch in range(self.args.epochs):
            if self.start_epoch is not None:
                epoch += self.start_epoch
            if self.args.distributed:
                self.train_loader.sampler.set_epoch(epoch)

            # Train
            self.model.train()
            loss_counts = [0 for _ in range(len(LOSSES_NAME))]

            if self.verbose:
                pbar = tqdm(total=len(self.train_loader), ncols=240)

            epoch_results = {
                'lm_loss': 0,
                'vis_loss': 0,
                'matched_loss': 0,
                'qa_loss': 0,
                'obj_loss': 0,
                'feat_loss': 0,
                'attr_loss': 0,
            }
            for k in list(epoch_results.keys()):
                if k[-4:] == 'loss':
                    epoch_results[f'{k}_count'] = 0

            if self.args.task_qa:
                uid2ans = {}

            for step_i, batch in enumerate(self.train_loader):
                # task = random.choice(self.args.MASK_MODALITY)
                task_i = step_i % len(self.args.MASK_MODALITY)
                task = self.args.MASK_MODALITY[task_i]

                # with torch.autograd.set_detect_anomaly(True):

                if self.args.fp16 and _use_native_amp:
                    with autocast():
                        results = self.model(batch, task)
                else:
                    results = self.model(batch, task)

                if task == 'vis_mask':
                    if 'Mask_Obj' in LOSSES_NAME:
                        epoch_results['obj_loss_count'] += 1
                    if 'Mask_Feat' in LOSSES_NAME:
                        epoch_results['feat_loss_count'] += 1
                    if 'Mask_Attr' in LOSSES_NAME:
                        epoch_results['attr_loss_count'] += 1
                    epoch_results['vis_loss_count'] += 1
                elif task == 'word_mask':
                    epoch_results['lm_loss_count'] += 1
                elif task == 'matched':
                    epoch_results['matched_loss_count'] += 1

                if self.args.task_qa:
                    epoch_results['qa_loss_count'] += 1
                    qa_pred = results['qa_pred']
                    for uid, ans_id in zip(batch['uid'],
                                           qa_pred.cpu().numpy()):
                        ans = self.train_loader.dataset.answer_table.id2ans(
                            ans_id)
                        uid2ans[uid] = ans

                loss = results['total_loss']

                #===== Update =====#
                if self.args.fp16 and _use_native_amp:
                    self.scaler.scale(loss).backward()
                elif self.args.fp16 and _use_apex:
                    with amp.scale_loss(loss, self.optim) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                loss = loss.detach()

                # Update Parameters
                if self.args.clip_grad_norm > 0:
                    if self.args.fp16 and _use_native_amp:
                        self.scaler.unscale_(self.optim)
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.args.clip_grad_norm)
                    elif self.args.fp16 and _use_apex:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self.optim),
                            self.args.clip_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.args.clip_grad_norm)

                if self.args.fp16 and _use_native_amp:
                    self.scaler.step(self.optim)
                    self.scaler.update()
                else:
                    self.optim.step()

                if self.lr_scheduler:
                    self.lr_scheduler.step()
                for param in self.model.parameters():
                    param.grad = None

                global_step += 1
                #====================#

                try:
                    lr = self.lr_scheduler.get_last_lr()[0]
                except AttributeError:
                    lr = self.args.lr

                if self.verbose:
                    desc_str = f'Epoch {epoch} | LR {lr:.6f} | '
                    if self.args.word_mask_predict:
                        desc_str += f'Word Mask: Uniform (MP) | '
                    elif self.args.word_mask_rate > 0:
                        desc_str += f'Word Mask: {self.args.word_mask_rate:.2f} | '

                    if self.args.vis_mask_predict:
                        desc_str += f'Vis Mask: Uniform (MP) |'
                    else:
                        desc_str += f'Vis Mask: {self.args.obj_mask_rate:.2f} |'

                    if self.args.task_qa:
                        loss_meter = loss_meters[-1]
                        loss_meter.update(results['qa_loss'].item())
                        loss_counts[-1] += 1

                    for i, (loss_name, loss_meter) in enumerate(
                            zip(LOSSES_NAME, loss_meters)):
                        if task_dict[loss_name] == task:
                            if task == 'vis_mask':
                                if loss_name == 'Mask_Obj':
                                    loss_meter.update(
                                        results['obj_loss'].item())
                                elif loss_name == 'Mask_Attr':
                                    loss_meter.update(
                                        results['attr_loss'].item())
                                elif loss_name == 'Mask_Feat':
                                    loss_meter.update(
                                        results['feat_loss'].item())
                            elif task == 'word_mask':
                                loss_meter.update(results['lm_loss'].item())
                            elif task == 'matched':
                                loss_meter.update(
                                    results['matched_loss'].item())
                            # elif task == 'qa':
                            #     loss_meter.update(results['qa_loss'].item())

                            loss_counts[i] += 1
                        if len(loss_meter) > 0:
                            loss_count = loss_counts[i]
                            if loss_name in [
                                    'Mask_LM', 'Matched', 'Mask_Obj',
                                    'Mask_Attr', 'Mask_Feat', 'QA'
                            ]:
                                desc_str += f' {loss_name} ({loss_count}) {loss_meter.val:.3f}'
                            else:
                                desc_str += f' {loss_name} {loss_meter.val:.3f}'

                            if step_i % 10 == 0:
                                self.writer.add_scalar(
                                    f'Train_steps/{loss_name}', loss_meter.val,
                                    global_step)

                    # if update:
                    n_update += 1
                    desc_str += f' | Total Update: {n_update}'

                    pbar.set_description(desc_str)
                    pbar.update(1)

            if self.verbose:
                pbar.close()

            dist.barrier()

            results = reduce_dict(epoch_results, self.args.gpu)
            if self.args.gpu == 0:
                total_loss = results['lm_loss'] + results[
                    'vis_loss'] + results['matched_loss'] + results['qa_loss']
                total_count = results['lm_loss_count'] + results[
                    'vis_loss_count'] + results['matched_loss_count']
                # + results['qa_loss_count']

                avg_train_loss = total_loss / total_count
                losses_str = f"Train Loss: {avg_train_loss:.4f}\n"

                for name, loss in results.items():
                    if name[-4:] == 'loss':
                        loss_count = int(results[name + '_count'])
                        if loss_count > 0:
                            avg_loss = loss / loss_count
                            if name == 'lm_loss':
                                name = 'Mask_LM'
                            elif name == 'matched_loss':
                                name = 'Matched'
                            elif name == 'obj_loss':
                                name = 'Mask_Obj'
                            elif name == 'attr_loss':
                                name = 'Mask_Attr'
                            elif name == 'feat_loss':
                                name = 'Mask_Feat'
                            elif name == 'qa_loss':
                                name = 'QA'
                            losses_str += f"{name} ({loss_count}): {avg_loss:.4f} "
                            self.writer.add_scalar(f'Train Loss/{name}',
                                                   avg_loss, epoch)
                losses_str += '\n'
                print(losses_str)
                self.logger.info(losses_str)

            if self.args.task_qa:
                dset2score, dset2cnt, score, cnt = self.train_loader.dataset.evaluator.evaluate(
                    uid2ans)

                dset2score = reduce_dict(dset2score, self.args.gpu)
                dset2cnt = reduce_dict(dset2cnt, self.args.gpu)
                score_cnt_dict = reduce_dict({
                    'score': score,
                    'cnt': cnt
                }, self.args.gpu)

                if self.args.gpu == 0:
                    score = score_cnt_dict['score']
                    cnt = score_cnt_dict['cnt']
                    accu = score / cnt
                    dset2accu = {}
                    for dset in dset2cnt:
                        dset2accu[dset] = dset2score[dset] / dset2cnt[dset]
                    accu_str = "Overall Accu %0.4f, " % (accu)
                    sorted_keys = sorted(dset2accu.keys())
                    for key in sorted_keys:
                        accu_str += "%s Accu %0.4f, " % (key, dset2accu[key])
                    print(accu_str)
                    self.logger.info(accu_str)

            dist.barrier()

            # Validation
            valid_results, valid_uid2ans = self.evaluate_epoch(epoch=epoch)

            valid_results = reduce_dict(valid_results, self.args.gpu)
            if self.args.gpu == 0:
                valid_total_loss = valid_results['lm_loss'] + valid_results[
                    'vis_loss'] + valid_results[
                        'matched_loss'] + valid_results['qa_loss']
                valid_total_count = valid_results[
                    'lm_loss_count'] + valid_results[
                        'vis_loss_count'] + valid_results['matched_loss_count']
                #  + valid_results['qa_loss_count']

                avg_valid_loss = valid_total_loss / valid_total_count
                losses_str = f"Valid Loss: {avg_valid_loss:.4f}\n"

                for name, loss in valid_results.items():
                    if name[-4:] == 'loss':
                        loss_count = int(valid_results[name + '_count'])
                        if loss_count > 0:
                            avg_loss = loss / loss_count
                            if name == 'lm_loss':
                                name = 'Mask_LM'
                            elif name == 'matched_loss':
                                name = 'Matched'
                            elif name == 'obj_loss':
                                name = 'Mask_Obj'
                            elif name == 'attr_loss':
                                name = 'Mask_Attr'
                            elif name == 'feat_loss':
                                name = 'Mask_Feat'
                            elif name == 'qa_loss':
                                name = 'QA'
                            losses_str += f"{name} ({loss_count}): {avg_loss:.4f} "
                            self.writer.add_scalar(f'Valid Loss/{name}',
                                                   avg_loss, epoch)

                losses_str += '\n'
                print(losses_str)
                self.logger.info(losses_str)

            if self.args.task_qa:
                dset2score, dset2cnt, score, cnt = self.val_loader.dataset.evaluator.evaluate(
                    valid_uid2ans)

                dset2score = reduce_dict(dset2score, self.args.gpu)
                dset2cnt = reduce_dict(dset2cnt, self.args.gpu)
                score_cnt_dict = reduce_dict({
                    'score': score,
                    'cnt': cnt
                }, self.args.gpu)

                if self.args.gpu == 0:
                    score = score_cnt_dict['score']
                    cnt = score_cnt_dict['cnt']
                    accu = score / cnt
                    dset2accu = {}
                    for dset in dset2cnt:
                        dset2accu[dset] = dset2score[dset] / dset2cnt[dset]
                    accu_str = "Overall Accu %0.4f, " % (accu)
                    sorted_keys = sorted(dset2accu.keys())
                    for key in sorted_keys:
                        accu_str += "%s Accu %0.4f, " % (key, dset2accu[key])
                    print(accu_str)
                    self.logger.info(accu_str)

            dist.barrier()

            if self.verbose:
                # Save
                if avg_valid_loss < best_eval_loss:
                    best_eval_loss = avg_valid_loss
                #     self.save("BEST_EVAL_LOSS")
                self.save("Epoch%02d" % (epoch + 1))

            dist.barrier()
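
The trainer above only needs a small part of LossMeter's interface: update(), len() and .val. A minimal stand-in that satisfies that usage (a sketch, not the project's actual class):

import collections

class LossMeter:
    # Keeps a sliding window of loss values; len() gives the count, .val a smoothed average.
    def __init__(self, maxlen=100):
        self.vals = collections.deque(maxlen=maxlen)

    def update(self, new_val):
        self.vals.append(new_val)

    def __len__(self):
        return len(self.vals)

    @property
    def val(self):
        return sum(self.vals) / len(self.vals) if self.vals else 0.0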
Example No. 10
def train_one_epoch(model: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    data_loader: torch.utils.data.DataLoader,
                    master_progress_bar: master_bar,
                    device: str = "cpu"):
    """Train model in one epoch
    
    Parameters
    ----------
    model: torch.nn.Module
        model to train
    optimizer: torch.optim.Optimizer
        optimize function
    data_loader: torch.utils.data.DataLoader
        dataset loader in batch
    device: str (default: "cpu")
        "cpu" or "cuda", device to train
    master_progress_bar: fastprogress.master_bar
        progress bar to update trainning information
    
    Returns
    -------
    float
        loss of current training epoch
    """

    # Switch model to training mode
    model.train()
    training_loss = 0  # accumulates the summed loss over the epoch
    epoch_loss_dict = {}  # accumulates each individual loss term over the epoch

    # For each batch
    train_progress_bar = progress_bar(data_loader, parent=master_progress_bar)
    for batch, (images, targets) in enumerate(train_progress_bar):

        # Move images and targets to device
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        # Back propagation
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        # Log loss: accumulate the reduced per-term losses for the epoch averages
        loss_dict_reduced = reduce_dict(loss_dict)
        epoch_loss_dict = {
            k: v + epoch_loss_dict.get(k, 0)
            for k, v in loss_dict_reduced.items()
        }
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        training_loss += losses_reduced.item()

        mean_loss = training_loss / (batch + 1)
        log = "Loss: %.2f" % (mean_loss)
        master_progress_bar.child.comment = log

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Return the mean training loss and the mean of each loss term
    return training_loss / len(data_loader), {
        k: v / len(data_loader)
        for k, v in epoch_loss_dict.items()
    }
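
Per the docstring, the function expects a fastprogress master_bar and returns both the mean loss and the per-term means, so a driving loop (a sketch; model, optimizer and train_loader are assumed to be built elsewhere) could be:

from fastprogress.fastprogress import master_bar

num_epochs = 10
mb = master_bar(range(num_epochs))
for epoch in mb:
    epoch_loss, avg_loss_dict = train_one_epoch(
        model, optimizer, train_loader, master_progress_bar=mb, device="cuda")
    mb.write("epoch {}: loss {:.3f}".format(epoch, epoch_loss))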
Example No. 11
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    tb_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        "lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        # Drop images whose targets contain no boxes, keeping images and targets aligned;
        # skip the batch if filtering or the forward pass fails.
        try:
            keep = [t["boxes"].shape[0] > 0 for t in targets]
            images = [image.to(device)
                      for image, flag in zip(images, keep) if flag]
            targets = [{k: v.to(device) for k, v in t.items()}
                       for t, flag in zip(targets, keep) if flag]
        except (KeyError, IndexError) as err:
            print("Skipping batch: could not filter empty targets ({})".format(err))
            continue

        if len(images) == 0:
            continue

        try:
            loss_dict = model(images, targets)
        except RuntimeError as err:
            print("Skipping batch: forward pass failed ({})".format(err))
            continue

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)

        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced,
                             epoch=tb_writer["step"],
                             tb_writer=tb_writer["writer"],
                             **loss_dict_reduced)
        metric_logger.update(
            lr=optimizer.param_groups[0]["lr"],
            epoch=tb_writer["step"],
            tb_writer=tb_writer["writer"],
        )
        tb_writer["step"] += 1
Example No. 12
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    writer, ckpt_path):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for batch_idx, (images, targets) in enumerate(
            metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # Log per-batch losses and gradient norms to TensorBoard
        global_step = epoch * len(data_loader) + batch_idx
        writer.add_scalar('Training Loss', loss_value, global_step)
        writer.add_scalar('loss_classifier',
                          loss_dict_reduced['loss_classifier'].item(),
                          global_step)
        writer.add_scalar('loss_box_reg',
                          loss_dict_reduced['loss_box_reg'].item(), global_step)
        writer.add_scalar('loss_objectness',
                          loss_dict_reduced['loss_objectness'].item(),
                          global_step)
        writer.add_scalar('loss_rpn_box_reg',
                          loss_dict_reduced['loss_rpn_box_reg'].item(),
                          global_step)
        for name, param in model.named_parameters():
            if param.grad is not None:
                param_norm = param.grad.data.norm(2).cpu().item()
                writer.add_histogram(name + '_grad', param_norm, epoch)
            # else:
            #     print("{} has no grad".format(name))

        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

    #   Save model
    print("Saving model at training epoch: {}".format(epoch + 1))
    ckpt_dict = {
        'epoch': epoch + 1,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }
    torch.save(
        ckpt_dict,
        os.path.join(
            ckpt_path, 'ckpt_epoch-' + str(epoch + 1) + 'loss' +
            str(loss_value) + '.pth'))
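
The checkpoint written above stores 'epoch', 'model' and 'optimizer', so resuming (assuming the same model and optimizer objects are reconstructed first; the filename below is a placeholder) is just:

ckpt = torch.load('ckpt_epoch-1loss0.5.pth', map_location='cpu')
model.load_state_dict(ckpt['model'])
optimizer.load_state_dict(ckpt['optimizer'])
start_epoch = ckpt['epoch']  # next epoch to run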
Example No. 13
    def train_one_epoch(
        model,
        optimizer,
        data_loader,
        device,
        epoch,
        metric_logger,
        print_freq,
        mq_logger=None,
    ):
        model.train()
        # metric_logger = utils.MetricLogger(delimiter="  ")
        # metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1,
        # fmt='{value:.6f}'))
        header = "Epoch: [{}]".format(epoch)

        metric_logger.clear()

        losses_summed = 0.0
        cnt = 0

        warm_up_lr_scheduler = None
        if epoch == 0:
            warmup_factor = 1.0 / 1000
            warmup_iters = min(1000, len(data_loader) - 1)

            warm_up_lr_scheduler = utils.warmup_lr_scheduler(
                optimizer, warmup_iters, warmup_factor)

        for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                       header):
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            # The RPN / box regressor is not trained here, so drop those loss terms;
            # different models use different names, hence the None defaults.
            loss_dict.pop("loss_rpn_box_reg", None)
            loss_dict.pop("loss_box_reg", None)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            losses_summed += losses_reduced.detach().cpu().numpy()
            cnt += 1

            if warm_up_lr_scheduler is not None:  # only for epoch 0, warm up
                warm_up_lr_scheduler.step()

            if mq_logger is not None:  # log running sums to help diagnose runaway / NaN losses
                mq_logger.debug(
                    f"losses summed is {losses_summed}, cnt is {cnt}")
                print(
                    f"losses summed is {losses_summed}, cnt is {cnt}, loss_dict_reduced is {loss_dict_reduced}"
                )
            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        return losses_summed / cnt
Example No. 14
    def train_one_epoch(self, lr_schedule='cyclic'):
        self.model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(self.epoch)

        lr_scheduler = None
        if (self.epoch == 0):
            if lr_schedule == 'warmup':
                warmup_factor = 1. / 1000
                warmup_iters = min(1000, len(self.data_loader) - 1)

                lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters, warmup_factor)
            elif lr_schedule == 'cyclic':
                lr_scheduler = torch.optim.lr_scheduler.CyclicLR(self.optimizer, 1e-6, 1e-2)

        for iteration, (images, targets) in enumerate(metric_logger.log_every(self.data_loader, self.print_freq, header)):
            with torch.autograd.detect_anomaly():
                images = list(image.to(self.device) for image in images)
                targets = [{k: v.to(self.device) for k, v in t.items()} for t in targets]

                loss_dict = self.model(images, targets)

                losses = sum(loss for loss in loss_dict.values())

                # reduce losses over all GPUs for logging purposes
                loss_dict_reduced = utils.reduce_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                loss_value = losses_reduced.item()

                if self.emergency is True:
                    if not math.isfinite(loss_value):
                        print()
                        print("Loss is {}, stopping training".format(loss_value))
                        print(loss_dict_reduced)
                        sys.exit(1)

                self.optimizer.zero_grad()
                losses.backward()
                grad_norm = clip_grad_norm_(self.model.parameters(), grad_clip_norm_value)
                self.optimizer.step()

                if lr_scheduler is not None:
                    lr_scheduler.step()

                metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
                metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])

                if self.logger is not None:
                    if iteration % 50 == 0:
                        # 1. Log scalar values (scalar summary)
                        info = {'loss': losses_reduced, **loss_dict_reduced}

                        for tag, value in info.items():
                            self.logger.scalar_summary(tag, value, iteration+1)

                        # 2. Log values and gradients of the parameters (histogram summary)
                        for tag, value in self.model.named_parameters():
                            tag = tag.replace('.', '/')
                            self.logger.histo_summary(tag, value.data.cpu().numpy(), iteration+1)
                       
        self.epoch += 1         
Example No. 15
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        # Move both images and targets to the training device
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        '''
    During training, the model expects both the input tensors and targets (a list of dictionaries),
    containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box
        - masks (UInt8Tensor[N, H, W]): the segmentation binary masks for each instance
    The model returns a Dict[Tensor] during training, containing the classification and regression
    losses for both the RPN and the R-CNN, and the mask loss.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each image
        - scores (Tensor[N]): the scores of each prediction
        - masks (UInt8Tensor[N, 1, H, W]): the predicted masks for each instance, in the 0-1 range. In order
          to obtain the final segmentation masks, the soft masks can be thresholded, generally
          with a value of 0.5 (mask >= 0.5)
          '''

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
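
The docstring embedded above spells out the target format the model expects in training mode. For reference, a single synthetic target that satisfies it (purely illustrative values) can be built like this:

import torch

H, W = 480, 640  # image size the boxes and masks refer to
target = {
    'boxes': torch.tensor([[10., 20., 100., 200.],
                           [150., 60., 300., 240.]]),   # [N, 4] in x1, y1, x2, y2
    'labels': torch.tensor([1, 1], dtype=torch.int64),  # [N] class ids
    'masks': torch.zeros((2, H, W), dtype=torch.uint8), # [N, H, W] binary instance masks
}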
Example No. 16
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        images = list(image.to(device) for image in images)

        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        ts = copy.deepcopy(targets)
        # print(f"targets before model: {targets[0]['boxes']}")
        # print(f"n images: {len(images)}\nn boxes: {targets[0]['boxes'].shape}\nn labels: {targets[0]['labels'].shape}\nn masks: {targets[0]['masks'].shape}\n")
        loss_dict = model(images, targets)
        print(loss_dict)
        # print(f"targets after model: {targets[0]['boxes']}")
        losses = sum(loss for loss in loss_dict.values())
        # print(losses)
        # if losses.item() > 1:
        #     single_image = np.transpose(images[0].cpu().detach().numpy(),(1,2,0)).squeeze()
        #     fig = plt.figure()
        #     ax = fig.add_subplot(111, aspect='equal')
        #     ax.imshow(single_image)
        #     # print(np.unique(single_image))
        #     # cvimg = cv2.imread(img_path)
        #     # print(single_image.shape)
        #     # plt.imshow(single_image)
        #     # plt.show()
        #     # cvimg = np.uint8(single_image*255)
        #     # print(cvimg.shape)
        #     # cvimg = cvimg.astype(int)
            
        #     # r,g,b = cv2.split(cvimg)
        #     # cvimg = cv2.merge([b,g,r])
        #     # print(cvimg)
        #     # print(targets[0]['boxes'])
        #     # for box in ts[0]['boxes']:
        #     for box in targets[0]['boxes']:
        #         # print(f"dict: {dict}")
        #         # box = dict['boxes']
        #         # print(f"box: {box}")
        #         # box = box.item()

        #         x1 = box[0].item()
        #         y1 = box[1].item()
        #         x2 = box[2].item()
        #         y2 = box[3].item()
        #         # print(box)
        #         # print(f"x1:{x1} y1:{y1} x2:{x2} y2:{y2}")
                
        #         rect = patches.Rectangle((x1,y1),x2-x1,y2-y1,fill=False,edgecolor='r')
        #         ax.add_patch(rect)
                # cv2.rectangle(cvimg,(x1,y1),(x2,y2),(255,255,0))
            # plt.show()
            
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        # print(loss_dict_reduced)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        # print(losses_reduced)
        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            # visualize_bboxes(images,targets)
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def train_one_epoch_FastRCNN(model,
                             optimizer,
                             data_loader,
                             device,
                             epoch,
                             print_freq,
                             mode="sew6",
                             encoder=None,
                             train_encoder=False):
    # data_loader is the already-constructed loader
    # mode can be "sew6", "panorm", or "autoencode"
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    #     if epoch == 0:
    #         warmup_factor = 1. / 1000
    #         warmup_iters = min(1000, len(data_loader) - 1)

    #         lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
    if mode in ('panorm', 'sew6'):  # 'sew6' also uses this transform in its branch below
        tt = transforms.Compose(
            [transforms.Resize((800, 800)),
             transforms.ToTensor(), normalize])  # applied to the sewn-together 6-image input
    for sample, old_targets, road_image, extra in metric_logger.log_every(
            data_loader, print_freq, header):

        #images = sample[0]

        targets = trans_target(old_targets)
        #print("images len {}, targets len {}".format(len(images), len(targets)))
        #print("len(sample) {}, sample [0] shape {}".format(len(sample), sample[0].shape)) # [6, 3, 256, 306]
        #images = list(image.to(device) for image in images)
        if mode == "panorm":
            images = [
                tt(s).to(device)
                for s in sew_images_panorm(sample, to_img=True)
            ]

        elif mode == "autoencode":
            encoder.cuda()
            samp_pan = sew_images_panorm(sample)  #convert to panoramic tensor
            samp_pan = [normalize(i) for i in samp_pan]
            samp_pan_t = torch.stack(samp_pan, dim=0)  #stack
            if train_encoder:
                images = encoder.return_image_tensor(
                    samp_pan_t.to(device), train_encoder
                )  #see if it will take it or it needs to take a list
            else:
                images = encoder.return_image_tensor(samp_pan_t.cuda(),
                                                     train_encoder).to(device)

        else:  #mode is sew6
            images = [tt(sew_images(s)).to(device) for s in sample
                      ]  #list of [3, 800, 800], should be 1 per patch
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        #print(loss_dict)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
Example No. 18
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq,
                    transfer_learning):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    loss_epoch = {
        'loss_classifier': 0,
        'loss_box_reg': 0,
        'loss_objectness': 0,
        'loss_rpn_box_reg': 0,
        'loss_total': 0,
    }
    counter = 0
    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        counter += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        _, loss_dict = model(images, targets)

        # if transfer_learning:
        #     losses = sum(loss_dict[key] if key == 'loss_box_reg' or key == 'loss_classifier' else torch.zeros_like(
        #         loss_dict[key]) for key in loss_dict.keys())
        # else:
        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # save the epoch loss
        for key in loss_dict_reduced.keys():
            loss_epoch[key] += loss_dict_reduced[key].item()

        loss_epoch['loss_total'] += loss_value

    print('Epoch: [{}]'.format(epoch))
    for key in loss_epoch.keys():
        loss_epoch[key] = loss_epoch[key] / counter
        print('{}: {}.'.format(key, loss_epoch[key]))

    return metric_logger
Ejemplo n.º 19
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    for i in metric_logger.log_every(data_loader, print_freq, header):

        try:
            images, targets = i
            '''This part will be changed'''
            targets["boxes"] = targets["boxes"].to(device)
            targets["labels"] = targets["labels"].to(device)
            targets["boxes"].squeeze_()
            targets["labels"].squeeze_()
            targets1 = [{k: v for k, v in targets.items()}]
            
            images = images.to(device)
            targets = targets1
            # zero the parameter gradients

            # forward
            # track history if only in train
            #images = list(image.to(device) for image in images)
            #targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()
            #print(targets[0]["boxes"])
            if not math.isfinite(loss_value):
                print(images.size())
                print(targets[0]["boxes"])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        except ValueError:
            continue
            
    return metric_logger
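This example squeezes the batched target tensors because its DataLoader yields a single image and a dict of stacked tensors. A minimal sketch of the alternative used by the torchvision detection reference code: a collate function (hypothetical name) that keeps images and per-image target dicts as tuples, so no squeezing is needed in the loop.

def detection_collate_fn(batch):
    # Keep images and target dicts as tuples instead of stacking them,
    # matching what the torchvision detection reference code does.
    return tuple(zip(*batch))

# loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
#                                      collate_fn=detection_collate_fn)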
Ejemplo n.º 20
0
def train_one_epoch(model, optimizer, data_loader, device, epoch,
                    gradient_accumulation_steps, print_freq, box_threshold):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    optimizer.zero_grad()  # gradient_accumulation
    steps = 0  # gradient_accumulation
    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        # print("target: {}".format(targets))

        steps += 1  # gradient_accumulation
        # images = list(image.to(device) for image in images)
        # images = torch.stack(images).to(device)
        # images = images.to(device)
        # targets = [{k: v.to(device) if torch.is_tensor(v) else v for k, v in t.items()} for t in targets]
        # targets = {k: v.to(device) if torch.is_tensor(v) else v for k, v in targets.items()}

        # vis = visualize.Visualize('.', targets['img_size'][0][0])
        # num_of_detections = len(torch.where(targets['cls'][0] > -1)[0])
        # vis.show_image_data(images[0], targets['cls'][0,:num_of_detections].int(), None, targets['bbox'][0,:num_of_detections,[1,0,3,2]])

        if box_threshold is None:
            loss_dict = model(images, targets)
        else:
            # loss_dict = model(images, box_threshold, targets)
            loss_dict = model(images, targets)

        # losses = sum(loss / gradient_accumulation_steps for loss in loss_dict.values())  # gradient_accumulation
        losses = loss_dict['loss'] / gradient_accumulation_steps

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        # optimizer.zero_grad()
        losses.backward()

        # ofekp: we add grad clipping here to avoid instabilities in training
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        # gradient_accumulation
        if steps % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
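The gradient-accumulation pattern used above, shown in isolation: losses are scaled by the number of accumulation steps and the optimizer only steps every N iterations, simulating a larger effective batch size. A minimal sketch under the assumption of a generic model and loss function (not this repository's code):

import torch

def accumulate_gradients(model, optimizer, data_loader, loss_fn,
                         gradient_accumulation_steps):
    # Hypothetical helper that shows only the accumulation pattern.
    optimizer.zero_grad()
    for step, (inputs, labels) in enumerate(data_loader, start=1):
        loss = loss_fn(model(inputs), labels) / gradient_accumulation_steps
        loss.backward()  # gradients add up across iterations
        if step % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)
            optimizer.step()
            optimizer.zero_grad()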
Ejemplo n.º 21
0
def train(batch_size, checkpoint_freq, num_epochs):

    num_classes = 2
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=True, rpn_nms_thresh=1, rpn_pre_nms_top_n_train=5000)

    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256

    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    model = torch.nn.DataParallel(model)
    model.to('cuda')

    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=4,
                                              collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=4,
                                                   collate_fn=utils.collate_fn)

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    hook = smd.Hook.create_from_json_file()

    for epoch in range(num_epochs):

        hook.set_mode(modes.TRAIN)
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        if epoch == 0:
            warmup_factor = 1. / 1000
            warmup_iters = min(1000, len(data_loader) - 1)

            lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                     warmup_factor)

        for iteration, (images, targets) in enumerate(data_loader):
            images = list(image.to('cuda') for image in images)
            targets = [{k: v.to('cuda')
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            optimizer.zero_grad()
            losses.backward()

            optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if iteration % checkpoint_freq == 0:
                utils.save_on_master(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, 'model_{}.pth'.format(iteration))  # fill the placeholder so checkpoints get distinct names

        lr_scheduler.step()

        hook.set_mode(modes.EVAL)
        evaluate(model, data_loader_test, device='cuda')
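Because the model above is wrapped in torch.nn.DataParallel, the saved state dict carries 'module.' key prefixes. A small sketch of a checkpointing helper (hypothetical name, plain torch.save instead of utils.save_on_master) that unwraps the model so the file later loads into an unwrapped network:

import torch

def save_checkpoint(model, optimizer, path):
    # Hypothetical helper: strip the DataParallel wrapper so the checkpoint
    # loads into a plain model without 'module.' key prefixes.
    to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
    torch.save({'model': to_save.state_dict(),
                'optimizer': optimizer.state_dict()}, path)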
Ejemplo n.º 22
0
            coarse_optim, warmup_iters, warmup_factor)

    for i, (fine_train, coarse_train) in enumerate(
            zip(fine_train_loader, coarse_train_loader)):
        # train
        fine_model.train()
        coarse_model.train()
        #### fine train ###
        # Label matching
        fine_imgs, fine_labels = label_matching(fine_train, device)
        fine_imgs = fine_imgs.to(device) / 255.

        ## train: image normalization is skipped here; it caused a ZeroDivisionError
        fine_loss_dict = fine_model(fine_imgs, copy.deepcopy(fine_labels))
        fine_losses = sum(loss for loss in fine_loss_dict.values())
        fine_loss_dict_reduced = reduce_dict(fine_loss_dict)
        fine_loss_reduced = sum(loss
                                for loss in fine_loss_dict_reduced.values())
        fine_loss_val = fine_loss_reduced.item()

        # optimizer
        fine_optim.zero_grad()
        fine_losses.backward()
        fine_optim.step()

        if fine_lr_scheduler is not None:
            fine_lr_scheduler.step()

        fine_metric_logger.update(loss=fine_loss_reduced,
                                  **fine_loss_dict_reduced)
        fine_metric_logger.update(lr=fine_optim.param_groups[0]["lr"])
Ejemplo n.º 23
0
        hists[cat]["data_obs"] = OrderedDict()
        hists[cat]["data_obs"]["shapes_prefit"] = nd_tot[
            category_mapper[cat]]["data_obs"]["nominal"]
        for samp in hists[cat].keys():
            for syst in hists[cat][samp].keys():
                old_d = hists[cat][samp][syst]
                new_d = OrderedDict()
                for ibin in range(
                        1,
                        len(hists[cat]["data_obs"]["shapes_prefit"]) + 1):
                    label = "bin_{0}".format(ibin)
                    new_d[label] = old_d[label]
                hists[cat][samp][syst] = new_d

    bins = bins_to_category(hists)
    bins_sob = reduce_dict(bins, dcard_repr.calculate_signal_over_background)
    bins_sorted = sorted(bins_sob.keys(),
                         key=lambda x: bins_sob[x],
                         reverse=False)
    sob_data = [bins_sob[b] for b in bins_sorted]
    print("Best bins by SoB are")
    for bs in bins_sorted[-10:]:
        print(" {0} {1:.4f}".format(bs, bins_sob[bs]))

    nd3 = OrderedDict()
    for samp in ["total_signal", "total_background", "data_obs"]:
        nd3[samp] = OrderedDict()
        for syst in ["shapes_prefit", "shapes_fit_s", "shapes_fit_b"]:
            nd3[samp][syst] = OrderedDict()
            if syst not in hists[cat][samp]:
                continue
Ejemplo n.º 24
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = list(image.to(device) for image in images)

        # for boxNum, box in enumerate(targets[0]['boxes']):
        #     image = copy.deepcopy(images[0].cpu().numpy())
        #     image = image.transpose(1,2,0)
        #     image = image*255
        #     image = np.ascontiguousarray(image, dtype=np.uint16)
        #     # print (image.shape, box[0].item(),box[1].item(),box[2].item(),box[3].item(),  targets[0]['names'][boxNum])
        #     image = cv2.rectangle(image, (int(box[0].item()),int(box[1].item())), (int(box[2].item()),int(box[3].item())), (0,0,255), 2)
        #     # cv2.imwrite('checker/'+targets[0]['names'][boxNum]+'.jpg',image)
        #     cv2.imwrite('checker/'+str(boxNum)+'.jpg',image)
        # import pdb;pdb.set_trace()
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # for target in targets:
        #     for boxNum, box in enumerate(target['boxes']):
        #         if ((box[2]-box[0])<=0) or ((box[3]-box[1])<=0):
        #             import pdb;pdb.set_trace()
        # print(images[0].size(), targets[0]['boxes'][0],targets[0]['names'][0])
        # import pdb;pdb.set_trace()
        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()
        # for target in targets:
        #     for box in target['boxes']:
        #         print (box), print (loss_value)
        #         if ((box[2]-box[0])<=0) or ((box[3]-box[1])<=0):
        #             import pdb;pdb.set_trace()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
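The commented-out checks in this example guard against degenerate boxes (zero or negative width or height), which commonly make the detection losses non-finite. A small sketch of filtering such boxes up front instead of debugging them mid-training (hypothetical helper, assuming the usual 'boxes'/'labels' target layout):

def filter_degenerate_boxes(target):
    # Hypothetical helper: keep only boxes with positive width and height.
    boxes = target['boxes']
    keep = (boxes[:, 2] > boxes[:, 0]) & (boxes[:, 3] > boxes[:, 1])
    target['boxes'] = boxes[keep]
    target['labels'] = target['labels'][keep]
    return target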
Ejemplo n.º 25
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    loss_plt = []
    for images, ann in metric_logger.log_every(data_loader, print_freq,
                                               header):
        targets = []
        for data1 in ann:  # this for loop could be dropped
            boxes = []
            target = {}
            labels = []
            for d in data1:
                box = d['bbox']
                box = [box[0], box[1], box[0] + box[2], box[1] + box[3]]
                boxes.append(box)
                labels.append(d['category_id'])
                # convert everything into a torch.Tensor
            boxes = torch.as_tensor(boxes, dtype=torch.float32)
            # there is only one class
            labels = torch.as_tensor(labels, dtype=torch.int64)
            image_id = torch.tensor([data1[0]['image_id']])
            area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
            #print(area)
            #return
            iscrowd = torch.zeros((len(data1), ), dtype=torch.int64)
            # suppose all instances are not crowd
            target["boxes"] = boxes
            target["labels"] = labels
            target["image_id"] = image_id
            target["area"] = area
            target["iscrowd"] = iscrowd
            targets.append(target)
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()}
                   for t in targets]  # assuming the targets were not already moved to the device??

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        loss_plt.append(losses.item())  # store a scalar so the computation graph is not retained
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
        #break

    return metric_logger, loss_plt
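The target construction above converts COCO-style [x, y, width, height] annotations into the [x1, y1, x2, y2] format expected by torchvision detection models. A standalone sketch of just that conversion (hypothetical helper name):

import torch

def coco_xywh_to_xyxy(boxes_xywh):
    # [x, y, w, h] per box -> [x1, y1, x2, y2] tensor
    boxes = torch.as_tensor(boxes_xywh, dtype=torch.float32)
    boxes[:, 2] += boxes[:, 0]
    boxes[:, 3] += boxes[:, 1]
    return boxes

# coco_xywh_to_xyxy([[10, 20, 30, 40]])  ->  tensor([[10., 20., 40., 60.]])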
Ejemplo n.º 26
0
def train_one_epoch(
        model,
        arch,
        optimizer,
        lr_scheduler,
        data_loader,
        device,
        epoch,
        print_freq,
        ngpus_per_node,
        model_without_ddp,
        args
    ):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    # header = "Epoch: [{}]".format(epoch)

    for images, targets in metric_logger.log_every(
            iterable=data_loader,
            print_freq=print_freq,
            # header=header,
            iter_num=args.iter_num
        ):

        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        """
        [{"boxes": tensor([], device="cuda:0"), "labels": tensor([], device="cuda:0", dtype=torch.int64), "masks": tensor([], device="cuda:0", dtype=torch.uint8), "iscrowd": tensor([], device="cuda:0", dtype=torch.int64)}]
        """

        try:
            loss_dict = model(images, targets) 
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                logger.fatal("Loss is {}, stopping training".format(loss_value))
                logger.fatal(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        except Exception as e:
            logger.warning(e, exc_info=True)
            # logger.info("print target for debug")
            # print(targets)

        args.iter_num += 1

        # save checkpoint here
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            if args.iter_num % 1000 == 0:
                utils.save_on_master({
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "iter_num": args.iter_num,
                        "args": args,
                    },
                    "{}/{}_{}.pth".format(checkpoint_dir, arch.__name__, args.iter_num)
                )

                os.makedirs("{}/debug_image/".format(checkpoint_dir), exist_ok=True)

                if args.iter_num < 5000:
                    continue

                model.eval()

                from barez import overlay_ann	
                debug_image = None
                debug_image_list = []
                cnt = 0
                for image_path in glob.glob("./table_test/*"):
                    cnt += 1
                    image_name = os.path.basename(image_path)
                    # print(image_name)
                    image = cv2.imread(image_path)
                    rat = 1300 / image.shape[0]
                    image = cv2.resize(image, None, fx=rat, fy=rat)

                    transform = transforms.Compose([transforms.ToTensor()])
                    image = transform(image)

                    # put the model in evaluation mode
                    with torch.no_grad():
                        tensor = [image.to(device)]
                        prediction = model(tensor)
                        
                    image = torch.squeeze(image, 0).permute(1, 2, 0).mul(255).numpy().astype(np.uint8)

                    for pred in prediction:
                        for idx, mask in enumerate(pred['masks']):
                            if pred['scores'][idx].item() < 0.5:
                                continue
                        
                            m =  mask[0].mul(255).byte().cpu().numpy()
                            box = list(map(int, pred["boxes"][idx].tolist())) 
                            score = pred["scores"][idx].item()
                            image = overlay_ann(image, m, box, "", score)

                    if debug_image is None:
                        debug_image = image
                    else:
                        debug_image = np.concatenate((debug_image, image), axis=1)

                    if cnt == 10:
                        cnt = 0
                        debug_image_list.append(debug_image)
                        debug_image = None
                    
                avg_length = np.mean([i.shape[1] for i in debug_image_list])

                di = None
                for debug_image in debug_image_list:
                    rat = avg_length / debug_image.shape[1]
                    debug_image = cv2.resize(debug_image, None, fx=rat, fy=rat)

                    if di is None:
                        di = debug_image
                    else:
                        di = np.concatenate((di, debug_image), axis=0)

                di = cv2.resize(di, None, fx=0.4, fy=0.4)
                cv2.imwrite("{}/debug_image/{}.jpg".format(checkpoint_dir, args.iter_num), di)

                model.train()

        # hard stop
        if args.iter_num == 50000:
            logger.info("ITER NUM == 50k, training successfully!")
            raise SystemExit
Ejemplo n.º 27
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, log_writer):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # lr_scheduler = None
    milestones = [len(data_loader)//2]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.8)
    # if epoch == 0:
    #     warmup_factor = 1. / 1000
    #     warmup_iters = min(1000, len(data_loader) - 1)
    #
    #     lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    count = 0
    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("count {}".format(count))
            print(">>>>>>>>>>>>>>>>>> bboxes")
            print(targets[0]["boxes"])
            print(">>>>>>>>>>>>>>>>>> labels")
            print(targets[0]["labels"])
            print(">>>>>>>>>>>>>>>>>> image_id")
            print(targets[0]["image_id"])
            print(">>>>>>>>>>>>>>>>>> area")
            print(targets[0]["area"])
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        # ================================================================== #
        #                        Tensorboard Logging                         #
        # ================================================================== #
        if count % 100 == 0:
            n_iter = count + epoch * len(data_loader) / len(images)
            log_writer.add_scalar('Loss/total', loss_value, n_iter/100)
            log_writer.add_scalar('Loss/class', loss_dict['loss_classifier'], n_iter/100)
            log_writer.add_scalar('Loss/bbox', loss_dict['loss_box_reg'], n_iter/100)
            log_writer.add_scalar('Loss/mask', loss_dict['loss_mask'], n_iter/100)
            log_writer.add_scalar('Loss/objectness', loss_dict['loss_objectness'], n_iter/100)
            log_writer.add_scalar('Loss/rpn_box', loss_dict['loss_rpn_box_reg'], n_iter/100)
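The `log_writer` passed into this function is presumably a TensorBoard SummaryWriter. A minimal sketch of creating one and handing it to the training loop (the log directory name is an assumption):

from torch.utils.tensorboard import SummaryWriter

log_writer = SummaryWriter(log_dir='runs/maskrcnn_experiment')  # hypothetical log directory
# train_one_epoch(model, optimizer, data_loader, device, epoch,
#                 print_freq=10, log_writer=log_writer)
# log_writer.close()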
Ejemplo n.º 28
0
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer=None):
    count = 0

    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    flag = False

    for images, targets in metric_logger.log_every(data_loader, print_freq, header):
        count += 1
        
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        if 1 not in targets[0]['labels']:
            continue

        # print(len(targets))
        flag = 0
        for i in range(len(targets)) :
            if len(targets[i]['boxes'])==0:
                flag = 1    
                break

        if flag == 1:
            continue

        loss_dict = model(images, targets)

        # losses = sum(loss for loss in loss_dict.values())
        losses = 0
        for i in loss_dict:
            if i == 'loss_keypoint':
                losses += loss_dict[i] * 0.5
            else:
                losses += loss_dict[i]

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
#           sys.exit(1)
            continue

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        if writer and count % 100 == 0:
            writer.add_scalar('loss_box_reg', loss_dict_reduced['loss_box_reg'], epoch * len(data_loader) + count)
            writer.add_scalar('loss_classifier', loss_dict_reduced['loss_classifier'], epoch * len(data_loader) + count)
            if 'loss_mask' in loss_dict.keys():
                writer.add_scalar('loss_mask', loss_dict_reduced['loss_mask'], epoch * len(data_loader) + count)
            if 'loss_keypoint' in loss_dict.keys():
                writer.add_scalar('loss_keypoint', loss_dict_reduced['loss_keypoint'], epoch * len(data_loader) + count)
Ejemplo n.º 29
0
def evaluate(model, data_loader, device, epoch, writer=None):
    global best_mAp
    n_threads = torch.get_num_threads()
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = 'Test:'

    coco = get_coco_api_from_dataset(data_loader.dataset)
    print("get coco dataset completed!")
    iou_types = _get_iou_types(model)
    coco_evaluator = CocoEvaluator(coco, iou_types)

    running_loss = 0
    running_num = 0

    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = list(img.to(device) for img in image)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        outputs, loss_dict = model(image, targets)

        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        # print('losses_reduced type:', type(losses_reduced))
        loss_value = losses_reduced.item()  # accumulate a plain float, not a tensor

        running_loss += loss_value
        running_num += len(image)

        outputs = [{k: v.to(cpu_device)
                    for k, v in t.items()} for t in outputs]
        model_time = time.time() - model_time

        res = {
            target["image_id"].item(): output
            for target, output in zip(targets, outputs)
        }
        evaluator_time = time.time()
        coco_evaluator.update(res)
        evaluator_time = time.time() - evaluator_time
        metric_logger.update(model_time=model_time,
                             evaluator_time=evaluator_time)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    coco_evaluator.synchronize_between_processes()

    # accumulate predictions from all images
    coco_evaluator.accumulate()
    stats_dic = coco_evaluator.summarize()
    print('stats_dic', stats_dic)
    bbox_mAp = stats_dic['bbox'][0]
    torch.set_num_threads(n_threads)

    if writer is not None:
        writer.add_scalar('running_loss', running_loss / running_num, epoch)
        writer.add_scalar('test_mAP', bbox_mAp, epoch)
    return coco_evaluator, bbox_mAp
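This evaluate function declares `global best_mAp` but never updates it here, and it returns the bbox mAP to the caller. A sketch of how the returned value might drive best-model checkpointing in the surrounding training script (helper and file names are assumptions):

import torch

def maybe_save_best(model, bbox_mAp, best_mAp, path='best_model.pth'):
    # Hypothetical helper: keep the checkpoint with the highest bbox mAP so far.
    if bbox_mAp > best_mAp:
        torch.save(model.state_dict(), path)
        return bbox_mAp
    return best_mAp

# _, bbox_mAp = evaluate(model, data_loader_test, device, epoch, writer)
# best_mAp = maybe_save_best(model, bbox_mAp, best_mAp)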