# Shared imports for the examples below. Project-local helpers and constants
# such as MetricLogger, SmoothedValue, AverageMeter, warmup_lr_scheduler,
# reduce_dict, evaluate, save_checkpoint, get_dataset, get_transform,
# get_model_detection, MultiBoxLoss, DatasetsEnsemble, EnsembleBatchSampler,
# CustomYoloAnnotatedDataset and TENSORBOARD_RESULT_FILE_NAME are assumed to
# come from the surrounding repositories and are not shown here.
import math
import os
import sys
import warnings
from collections import defaultdict
from shutil import copyfile

import torch
import yaml
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


def train_one_epoch(model,
                    optimizer,
                    data_loader,
                    device,
                    epoch,
                    print_freq=1000):
    """
    Train model (JointDetector) for 1 epoch from data in data_loader (images, obj_targets, part_targets) 
    """
    model.train()
    metric_logger = MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr',
                            SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
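    # Warm up only during the first epoch: the LR ramps linearly from
    # lr/1000 up to the base LR over the first (at most) 1000 iterations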
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = warmup_lr_scheduler(optimizer, warmup_iters,
                                           warmup_factor)

    for images, obj_targets, part_targets in metric_logger.log_every(
            data_loader, print_freq, header, device):
        images = list(image.to(device) for image in images)
        obj_targets = [{k: v.to(device)
                        for k, v in t.items()} for t in obj_targets]
        part_targets = [{k: v.to(device)
                         for k, v in t.items()} for t in part_targets]

        # In train mode, the detector returns a dict of named loss terms
        # rather than predictions
        loss_dict = model(images, obj_targets, part_targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss_value = losses_reduced.item()

        if not math.isfinite(loss_value):
            print('Loss is {}, stopping training'.format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
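

# A minimal usage sketch for train_one_epoch (illustrative only: the
# JointDetector constructor, build_detection_loader and num_epochs are
# assumptions, not part of the original code):
#
#   model = JointDetector(...).to(device)
#   params = [p for p in model.parameters() if p.requires_grad]
#   optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9)
#   data_loader = build_detection_loader(...)
#   for epoch in range(num_epochs):
#       train_one_epoch(model, optimizer, data_loader, device, epoch)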

# Example #2

def train_one_epoch_SSD(model, loss_func, optimizer, data_loader, encoder,
                        epoch, print_freq, mean, std, device):

    model.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    losses = AverageMeter()
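    # As above, the LR warm-up is only created for the first epoch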
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)

        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                 warmup_factor)

    # The loss_func/encoder/mean/std arguments are unused here; the loss is
    # computed by MultiBoxLoss (localization + confidence loss over the
    # model's prior boxes, as in the SSD paper)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)
    for images, targets in metric_logger.log_every(data_loader, print_freq,
                                                   header):
        images = torch.stack(images).to(device)
        bboxes = [t["boxes"].to(device) for t in targets]
        labels = [t["labels"].to(device) for t in targets]

        # SSD forward pass: per-prior box offsets (ploc) and class scores (pscores)
        ploc, pscores = model(images)
        ploc, pscores = ploc.float(), pscores.float()

        # loss
        loss = criterion(ploc, pscores, bboxes, labels)

        # Backward prop.
        optimizer.zero_grad()
        loss.backward()

        # Update model
        optimizer.step()

        losses.update(loss.item(), images.size(0))

        # Step the warm-up scheduler (only created for the first epoch)
        if lr_scheduler is not None:
            lr_scheduler.step()

        metric_logger.update(loss=losses.val)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Creating tensorboard writer
    if not args.resume:
        writer = SummaryWriter(comment=TENSORBOARD_RESULT_FILE_NAME)
    else:
        writer = SummaryWriter("")

    ######################
    # Creating test data #
    ######################
    print("Loading test data")

    viped_dataset_test = get_dataset("viped",
                                     get_transform(train=False, aug=args.aug),
                                     percentage=5,
                                     val=True)
    mot19_dataset_test = get_dataset("mot19",
                                     get_transform(train=False),
                                     val=True)
    mot17_dataset_test = get_dataset("mot17",
                                     get_transform(train=False),
                                     val=True)
    crowd_human_dataset_test = get_dataset("crowd_human",
                                           get_transform(train=False),
                                           val=True)
    city_persons_dataset_test = get_dataset("city_persons",
                                            get_transform(train=False),
                                            val=True)
    coco_persons_dataset_test = get_dataset("COCO_persons",
                                            get_transform(train=False),
                                            val=True)

    ##########################
    # Creating training data #
    ##########################
    print("Loading training data")
    train_datasets_dict = {
        'viped':
        lambda: get_dataset("viped", get_transform(train=True, aug=args.aug)),
        'mot19':
        lambda: get_dataset("mot19", get_transform(train=True)),
        'mot17':
        lambda: get_dataset("mot17", get_transform(train=True)),
        'crowd_human':
        lambda: get_dataset("crowd_human", get_transform(train=True)),
        'city_persons':
        lambda: get_dataset("city_persons", get_transform(train=True)),
        'COCO_persons':
        lambda: get_dataset("COCO_persons", get_transform(train=True)),
    }

    #################################
    # Preparing training dataloader #
    #################################
    if args.train_on in train_datasets_dict:
        # the train dataset is a normal single dataset
        train_dataset = train_datasets_dict[args.train_on]()
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            collate_fn=train_dataset.standard_collate_fn)
        print('Using training dataset: {}'.format(args.train_on))
    elif ',' in args.train_on:
        assert args.tgt_images_in_batch > 0, "Using mixed training. " \
                                             "You need to specify the args.tgt_images_in_batch parameter!"
        # the train dataset is an ensemble of datasets
        source_dataset_name, target_dataset_name = args.train_on.split(',')
        train_dataset = DatasetsEnsemble(
            train_datasets_dict[source_dataset_name](),
            train_datasets_dict[target_dataset_name]())
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=args.workers,
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=args.batch_size,
                shuffle=True,
                tgt_imgs_in_batch=args.tgt_images_in_batch))
        print(
            'Using mixed training datasets. Source: {}, Target: {}. In every batch, {}/{} are from {}'
            .format(source_dataset_name, target_dataset_name,
                    args.tgt_images_in_batch, args.batch_size,
                    target_dataset_name))
    else:
        raise ValueError('Dataset not known!')

    ##############################
    # Preparing test dataloaders #
    ##############################

    data_loader_viped_test = DataLoader(
        viped_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=viped_dataset_test.standard_collate_fn)

    data_loader_mot19_test = DataLoader(
        mot19_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot19_dataset_test.standard_collate_fn)

    data_loader_mot17_test = DataLoader(
        mot17_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=mot17_dataset_test.standard_collate_fn)

    data_loader_crowd_human_test = DataLoader(
        crowd_human_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=crowd_human_dataset_test.standard_collate_fn)

    data_loader_city_persons_test = DataLoader(
        city_persons_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=city_persons_dataset_test.standard_collate_fn)

    data_loader_coco_persons_test = DataLoader(
        coco_persons_dataset_test,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        collate_fn=coco_persons_dataset_test.standard_collate_fn)

    # Creating model
    print("Creating model")
    model, backbone = get_model_detection(num_classes=1,
                                          model=args.model,
                                          pretrained=args.pretrained)

    # Putting the model on the device and setting train mode
    model.to(device)
    model.train()

    # freeze the backbone parameters, if needed
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == "sgd":
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer == "adam":
        optimizer = torch.optim.Adam(
            params=params,
            lr=args.lr,
        )
    else:
        print("Optimizer not available")
        sys.exit(1)

    # and a learning rate scheduler
    if args.lr_scheduler == "step_lr":
        lr_scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    elif args.lr_scheduler == "plateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=args.lr_patience, verbose=True)
    else:
        print("LR scheduler not available")
        sys.exit(1)

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    # Loading checkpoint
    start_epoch = 0
    train_step = -1
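    # train_step is incremented at the top of the batch loop, so the first
    # processed batch is step 0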
    best_viped_ap, best_mot19_ap, best_mot17_ap, best_crowdhuman_ap, best_citypersons_ap, best_cocopersons_ap \
        = 0, 0, 0, 0, 0, 0
    if args.resume:
        print("Resuming from checkpoint")
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        best_viped_ap = checkpoint['best_viped_ap']
        best_mot19_ap = checkpoint['best_mot19_ap']
        best_mot17_ap = checkpoint['best_mot17_ap']
        best_crowdhuman_ap = checkpoint['best_crowdhuman_ap']
        best_citypersons_ap = checkpoint['best_citypersons_ap']
        best_cocopersons_ap = checkpoint['best_cocopersons_ap']

    # Double-check that the backbone has really been frozen
    if backbone is not None and args.freeze_backbone:
        for param in backbone.parameters():
            assert not param.requires_grad, "Backbone does not seem to be frozen correctly!"

    # Train
    print("Start training")
    for epoch in range(start_epoch, args.epochs):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader, print_freq=args.print_freq, header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip the global gradient norm to 50 to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % args.log_loss == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict_reduced,
                                   train_step)

            if (train_step % args.save_freq == 0 and train_step != 0) or \
               (args.pretrained and train_step < 5*args.save_freq and train_step % 200 == 0 and train_step != 0) \
                    or train_step == 100:
                # evaluate on the test datasets
                print("Validation viped Dataset")
                viped_coco_evaluator = evaluate(model,
                                                data_loader_viped_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot19 Dataset")
                mot19_coco_evaluator = evaluate(model,
                                                data_loader_mot19_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation mot17 Dataset")
                mot17_coco_evaluator = evaluate(model,
                                                data_loader_mot17_test,
                                                device=device,
                                                max_dets=args.max_dets)
                print("Validation crowdhuman Dataset")
                crowdhuman_coco_evaluator = evaluate(
                    model,
                    data_loader_crowd_human_test,
                    device=device,
                    max_dets=args.max_dets)
                print("Validation citypersons Dataset")
                citypersons_coco_evaluator = evaluate(
                    model,
                    data_loader_city_persons_test,
                    device=device,
                    max_dets=args.max_dets)
                print("Validation COCO Persons Dataset")
                cocopersons_coco_evaluator = evaluate(
                    model,
                    data_loader_coco_persons_test,
                    device=device,
                    max_dets=args.max_dets)

                # Extracting the APs and logging them to tensorboard
                viped_ap, mot19_ap, mot17_ap, crowdhuman_ap, citypersons_ap, cocopersons_ap = \
                    None, None, None, None, None, None
                # coco_eval.stats[1] is the AP at IoU=0.50; each loop keeps
                # the value from the last (typically only) iou_type
                for iou_type, coco_eval in viped_coco_evaluator.coco_eval.items():
                    viped_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot19_coco_evaluator.coco_eval.items():
                    mot19_ap = coco_eval.stats[1]
                for iou_type, coco_eval in mot17_coco_evaluator.coco_eval.items():
                    mot17_ap = coco_eval.stats[1]
                for iou_type, coco_eval in crowdhuman_coco_evaluator.coco_eval.items():
                    crowdhuman_ap = coco_eval.stats[1]
                for iou_type, coco_eval in citypersons_coco_evaluator.coco_eval.items():
                    citypersons_ap = coco_eval.stats[1]
                for iou_type, coco_eval in cocopersons_coco_evaluator.coco_eval.items():
                    cocopersons_ap = coco_eval.stats[1]
                writer.add_scalar('COCO mAP Validation/ViPeD', viped_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/MOT19', mot19_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/MOT17', mot17_ap,
                                  train_step)
                writer.add_scalar('COCO mAP Validation/CrowdHuman',
                                  crowdhuman_ap, train_step)
                writer.add_scalar('COCO mAP Validation/CityPersons',
                                  citypersons_ap, train_step)
                writer.add_scalar('COCO mAP Validation/COCOPersons',
                                  cocopersons_ap, train_step)

                # Saving the best models, if improved
                if viped_ap > best_viped_ap:
                    best_viped_ap = viped_ap
                    save_checkpoint(_checkpoint_state(),
                                    writer.get_logdir(),
                                    best_model="viped")
                if mot19_ap > best_mot19_ap:
                    best_mot19_ap = mot19_ap
                    save_checkpoint(_checkpoint_state(),
                                    writer.get_logdir(),
                                    best_model="mot19")
                if mot17_ap > best_mot17_ap:
                    best_mot17_ap = mot17_ap
                    save_checkpoint(_checkpoint_state(),
                                    writer.get_logdir(),
                                    best_model="mot17")
                if crowdhuman_ap > best_crowdhuman_ap:
                    best_crowdhuman_ap = crowdhuman_ap
                    save_checkpoint(_checkpoint_state(),
                                    writer.get_logdir(),
                                    best_model="crowdhuman")
                if citypersons_ap > best_citypersons_ap:
                    best_citypersons_ap = citypersons_ap
                    save_checkpoint(_checkpoint_state(),
                                    writer.get_logdir(),
                                    best_model="citypersons")

                # Saving the latest model
                save_checkpoint(_checkpoint_state(), writer.get_logdir())

                # Setting the model back to train mode after evaluation
                model.train()

            # StepLR can be stepped once per iteration (its step_size is
            # expressed in iterations); ReduceLROnPlateau requires a metric,
            # so it must instead be stepped with a validation score
            if args.lr_scheduler == "step_lr":
                lr_scheduler.step()
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    # Opening the YAML config file
    with open(args.cfg_file, 'r') as stream:
        try:
            cfg_file = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            sys.exit(1)

    # Retrieving cfg
    train_cfg = cfg_file['training']
    model_cfg = cfg_file['model']
    data_cfg = cfg_file['dataset']

    # Setting device
    device = torch.device(model_cfg['device'])

    # It is not possible to set a checkpoint and a pre-trained model at the same time
    if train_cfg['checkpoint'] and train_cfg['pretrained_model']:
        print("You can't set a checkpoint and a pre-trained model at the same time")
        sys.exit(1)

    # Creating tensorboard writer
    if train_cfg['checkpoint']:
        checkpoint = torch.load(train_cfg['checkpoint'])
        writer = SummaryWriter(log_dir=checkpoint['tensorboard_working_dir'])
    else:
        writer = SummaryWriter(comment="_" + train_cfg['tensorboard_filename'])

    # Saving cfg file in the same folder
    copyfile(
        args.cfg_file,
        os.path.join(writer.get_logdir(), os.path.basename(args.cfg_file)))

    #######################
    # Creating model
    #######################
    print("Creating model")
    load_custom_model = False
    if train_cfg['checkpoint'] or train_cfg['pretrained_model']:
        load_custom_model = True
    model, backbone = get_model_detection(num_classes=1,
                                          cfg=model_cfg,
                                          load_custom_model=load_custom_model)

    # Putting the model on the device and setting train mode
    model.to(device)
    model.train()

    # Freeze the backbone parameters, if needed
    if backbone is not None and model_cfg['freeze_backbone']:
        for param in backbone.parameters():
            param.requires_grad = False
        print('Backbone is frozen!')

    #####################################
    # Creating datasets and dataloaders
    #####################################
    data_root = data_cfg['root']

    ################################
    # Creating training datasets and dataloaders
    print("Loading training data")
    train_datasets_names = data_cfg['train']

    if train_cfg['mixed_batch']:
        assert train_cfg['tgt_images_in_batch'] > 0, \
            "Using mixed training. You need to specify the tgt_images_in_batch parameter!"
        assert len(train_datasets_names) == 2, \
            "Using mixed training, you need to specify two datasets: " \
            "the first one is the source and the second one is the target"
        # The first cfg entry is the source dataset, the second one is the
        # target dataset (dicts preserve insertion order)
        (source_name, source_cfg), (target_name, target_cfg) = \
            train_datasets_names.items()
        source_dataset = CustomYoloAnnotatedDataset(
            data_root, {source_name: source_cfg},
            transforms=get_transform(train=True),
            phase='train')
        target_dataset = CustomYoloAnnotatedDataset(
            data_root, {target_name: target_cfg},
            transforms=get_transform(train=True),
            phase='train')
        train_dataset = DatasetsEnsemble(source_dataset=source_dataset,
                                         target_dataset=target_dataset)
        train_dataloader = DataLoader(
            train_dataset,
            collate_fn=train_dataset.source_dataset.standard_collate_fn,
            num_workers=train_cfg['num_workers'],
            batch_sampler=EnsembleBatchSampler(
                train_dataset,
                batch_size=train_cfg['batch_size'],
                shuffle=True,
                tgt_imgs_in_batch=train_cfg['tgt_images_in_batch']))
        print('Using mixed training datasets. Source: {}, Target: {}. '
              'In every batch, {}/{} images are from {}'.format(
                  source_name, target_name, train_cfg['tgt_images_in_batch'],
                  train_cfg['batch_size'], target_name))
    else:
        train_dataset = CustomYoloAnnotatedDataset(
            data_root,
            train_datasets_names,
            transforms=get_transform(train=True),
            phase='train')
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=train_cfg['batch_size'],
            shuffle=False,
            num_workers=train_cfg['num_workers'],
            collate_fn=train_dataset.standard_collate_fn)

    ###############################
    # Creating validation datasets
    print("Loading validation data")
    val_datasets_names = data_cfg['val']

    # Creating dataset(s) and dataloader(s)
    val_dataloaders = dict()
    best_validation_ap = defaultdict(float)
    for dataset_name, dataset_cfg in val_datasets_names.items():
        val_dataset = CustomYoloAnnotatedDataset(
            data_root, {dataset_name: dataset_cfg},
            transforms=get_transform(),
            phase="val",
            percentage=train_cfg["percentage_val"])
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=train_cfg['batch_size'],
                                    shuffle=False,
                                    num_workers=train_cfg['num_workers'],
                                    collate_fn=val_dataset.standard_collate_fn)
        # Adding created dataloader
        val_dataloaders[dataset_name] = val_dataloader
        # Initializing best validation ap value
        best_validation_ap[dataset_name] = 0.0

    #######################################
    # Defining optimizer and LR scheduler
    #######################################
    ##########################
    # Constructing an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=train_cfg['lr'],
        momentum=train_cfg['momentum'],
        weight_decay=train_cfg['weight_decay'],
    )

    # and a learning rate scheduler
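    # (the decay step is expressed in iterations; it is shorter when starting
    # from a COCO-pretrained model)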
    if model_cfg['coco_model_pretrained']:
        lr_step_size = min(25000, len(train_dataset))
    else:
        lr_step_size = min(40000, 2 * len(train_dataset))
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=lr_step_size,
                                                   gamma=train_cfg['lr_gamma'])

    # Defining a warm-up lr scheduler
    warmup_iters = min(1000, len(train_dataloader) - 1)
    warmup_factor = 1. / 1000
    warmup_lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                    warmup_factor)

    #############################
    # Resuming a model
    #############################
    start_epoch = 0
    train_step = -1
    # Optionally resuming from a pre-trained model
    if train_cfg['pretrained_model']:
        print("Resuming pre-trained model")
        if train_cfg['pretrained_model'].startswith('http://') or train_cfg[
                'pretrained_model'].startswith('https://'):
            pre_trained_model = torch.hub.load_state_dict_from_url(
                train_cfg['pretrained_model'],
                map_location='cpu',
                model_dir=model_cfg["cache_folder"])
        else:
            pre_trained_model = torch.load(train_cfg['pretrained_model'],
                                           map_location='cpu')
        model.load_state_dict(pre_trained_model['model'])

    # Optionally resuming from a saved checkpoint
    if train_cfg['checkpoint']:
        print("Resuming from a checkpoint")
        checkpoint = torch.load(train_cfg['checkpoint'])
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        warmup_lr_scheduler.load_state_dict(checkpoint['warmup_lr_scheduler'])
        start_epoch = checkpoint['epoch']
        train_step = checkpoint['iteration']
        for elem_name, elem in checkpoint.items():
            if elem_name.startswith("best_") and elem_name.endswith("_ap"):
                # Strip the "best_" prefix and "_ap" suffix; splitting on "_"
                # would break dataset names that contain underscores
                # (e.g., crowd_human)
                d_name = elem_name[len("best_"):-len("_ap")]
                if d_name in best_validation_ap:
                    best_validation_ap[d_name] = elem
                else:
                    warnings.warn(
                        "The dataset {} was not used in the previous training".
                        format(d_name))
                    best_validation_ap[d_name] = 0.0

    ################
    # Training
    ################
    print("Start training")
    for epoch in range(start_epoch, train_cfg['epochs']):
        model.train()
        metric_logger = utils.MetricLogger(delimiter="  ")
        metric_logger.add_meter(
            'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
        header = 'Epoch: [{}]'.format(epoch)

        for images, targets in metric_logger.log_every(
                train_dataloader,
                print_freq=train_cfg['print_freq'],
                header=header):
            train_step += 1
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device)
                        for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)

            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                for target in targets:
                    image_id = target['image_id'].item()
                    print(train_dataset.images[image_id])
                print("Loss is {}, stopping training".format(loss_value))
                print(loss_dict_reduced)
                sys.exit(1)

            optimizer.zero_grad()
            losses.backward()
            # clip norm
            torch.nn.utils.clip_grad_norm_(model.parameters(), 50)
            optimizer.step()

            if epoch == 0 and train_step < warmup_iters:
                warmup_lr_scheduler.step()

            metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
            metric_logger.update(lr=optimizer.param_groups[0]["lr"])

            if train_step % train_cfg['log_loss'] == 0:
                writer.add_scalar('Training/Learning Rate',
                                  optimizer.param_groups[0]["lr"], train_step)
                writer.add_scalar('Training/Reduced Sum Losses',
                                  losses_reduced, train_step)
                writer.add_scalars('Training/All Losses', loss_dict_reduced,
                                   train_step)

            if (train_step % train_cfg['save_freq'] == 0 and train_step != 0) \
                    or ((train_cfg['pretrained_model'] or model_cfg['coco_model_pretrained']) and
                        train_step < 6 * train_cfg['save_freq'] and train_step % 200 == 0 and train_step != 0):
                # Validation
                for val_name, val_dataloader in val_dataloaders.items():
                    print("Validation on {}".format(val_name))
                    coco_evaluator = evaluate(
                        model,
                        val_dataloader,
                        device=device,
                        max_dets=model_cfg["max_dets_per_image"])
                    # coco_eval.stats[1] is the AP at IoU=0.50
                    ap = None
                    for iou_type, coco_eval in coco_evaluator.coco_eval.items():
                        ap = coco_eval.stats[1]
                    writer.add_scalar(
                        'COCO mAP Validation/{}'.format(val_name), ap,
                        train_step)

                    # Saving the best model for this dataset, if improved
                    if ap > best_validation_ap[val_name]:
                        best_validation_ap[val_name] = ap
                        save_checkpoint(
                            {
                                'model': model.state_dict(),
                                'optimizer': optimizer.state_dict(),
                                'lr_scheduler': lr_scheduler.state_dict(),
                                'warmup_lr_scheduler':
                                    warmup_lr_scheduler.state_dict()
                                    if warmup_lr_scheduler is not None else None,
                                'epoch': epoch,
                                'iteration': train_step,
                                'best_{}_ap'.format(val_name):
                                    best_validation_ap[val_name],
                            },
                            writer.get_logdir(),
                            best_model=val_name)

                # Saving the latest model
                checkpoint_dict = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'warmup_lr_scheduler':
                        warmup_lr_scheduler.state_dict()
                        if warmup_lr_scheduler is not None else None,
                    'epoch': epoch,
                    'iteration': train_step,
                    'tensorboard_working_dir': writer.get_logdir(),
                }
                for d_name in val_dataloaders:
                    checkpoint_dict["best_{}_ap".format(d_name)] = \
                        best_validation_ap[d_name]
                save_checkpoint(checkpoint_dict, writer.get_logdir())

                # Setting the model back to train mode after evaluation
                model.train()

            # Updating lr scheduler
            lr_scheduler.step()