def build_optimizer(params, cfg, logger=None):
    if cfg['name'] == 'sgd':
        optimizer = torch.optim.SGD(params=params,
                                    lr=cfg['lr']['base_lr'],
                                    momentum=cfg['momentum'],
                                    weight_decay=cfg['weight_decay'],
                                    nesterov=cfg['nesterov'])

    elif cfg['name'] == 'adam':
        optimizer = torch.optim.Adam(params=params,
                                     lr=cfg['lr']['base_lr'],
                                     weight_decay=cfg['weight_decay'])

    else:
        raise ValueError('Unknown optimizer.')

    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg['lr']['milestones'],
        gamma=cfg['lr']['gamma'])
    if 'warmup' in cfg:
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=cfg['warmup']['multiplier'],
            total_epoch=cfg['warmup']['epochs'],
            after_scheduler=scheduler)

    logger.add_line("=" * 30 + "   Optimizer   " + "=" * 30)
    logger.add_line(str(optimizer))
    return optimizer, scheduler
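A minimal sketch of driving the pair returned by build_optimizer above, assuming a cfg dict shaped like the keys it reads and the GradualWarmupScheduler from the pytorch-gradual-warmup-lr package; the model and the inner training loop are placeholders:

import torch
from warmup_scheduler import GradualWarmupScheduler  # assumed source of the warmup scheduler

model = torch.nn.Linear(10, 2)  # placeholder model
cfg = {
    'name': 'sgd', 'momentum': 0.9, 'weight_decay': 1e-4, 'nesterov': True,
    'lr': {'base_lr': 0.1, 'milestones': [30, 60], 'gamma': 0.1},
    'warmup': {'multiplier': 1.0, 'epochs': 5},
}
optimizer, scheduler = build_optimizer(model.parameters(), cfg)
for epoch in range(90):
    # ... run one training epoch with optimizer ...
    scheduler.step()  # warmup for the first 5 epochs, then MultiStepLR takes over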
Example #2
def main():
    model = ResNet(ResidualBlock).to(device)
    model.reload()
    model.train()
    logger.info("Train: Init model")
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler_after = torch.optim.lr_scheduler.StepLR(optimizer,
                                                      step_size=30,
                                                      gamma=0.5)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=8,
                                       total_epoch=10,
                                       after_scheduler=scheduler_after)

    train_dataloader = get_train_data_loader()
    loss_best = 1
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(tqdm(train_dataloader)):
            images = images.to(device)
            labels = labels.to(device)
            labels = labels.long()
            label1, label2 = labels[:, 0], labels[:, 1]

            optimizer.zero_grad()
            y1, y2 = model(images)
            loss1, loss2 = criterion(y1, label1), criterion(y2, label2)
            loss = loss1 + loss2
            # outputs = model(images)
            # loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        scheduler.step()
        logger.info(f"epoch: {epoch}, step: {i}, loss: {loss.item()}")
        model.save()
        if loss_best > loss.item():
            loss_best = loss.item()
            torch.save(model.state_dict(), "model/best.pkl")
            logger.info("Train: Saved best model")
    torch.save(model.state_dict(), "model/final.pkl")
    logger.info("Train: Saved last model")
Example #3
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.args.learning_rate,
            weight_decay=self.args.weight_decay,
        )
        T_0 = self.args.num_train_steps
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0, T_mult=1, eta_min=1e-8)
        scheduler_warmup = GradualWarmupScheduler(
            optimizer,
            multiplier=1,
            total_epoch=self.args.warmup_steps,
            after_scheduler=scheduler)
        self.scheduler = scheduler_warmup
        return [optimizer], [{
            'scheduler': scheduler_warmup,
            'interval': 'step',
        }]
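With 'interval': 'step', Lightning advances scheduler_warmup once per optimizer step instead of once per epoch. A rough stand-alone equivalent of that schedule, assuming the pytorch-gradual-warmup-lr package; the module, data and step counts are placeholders, not the Lightning internals:

import torch
from warmup_scheduler import GradualWarmupScheduler  # assumed source of the warmup scheduler

model = torch.nn.Linear(4, 2)  # placeholder module
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
cosine = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=1000, T_mult=1, eta_min=1e-8)
warmup = GradualWarmupScheduler(
    optimizer, multiplier=1, total_epoch=100, after_scheduler=cosine)
for step in range(1000):
    optimizer.zero_grad()
    loss = model(torch.randn(8, 4)).sum()  # placeholder loss
    loss.backward()
    optimizer.step()
    warmup.step()  # stepped every batch, mirroring interval='step'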
Example #4
def main():
    if not torch.cuda.is_available():
        print('no gpu device available')
        sys.exit(1)

    writer = None
    num_gpus = torch.cuda.device_count()
    np.random.seed(args.seed)
    args.gpu = args.local_rank % num_gpus
    args.nprocs = num_gpus
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.deterministic = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)

    if args.local_rank == 0:
        args.exp = datetime.datetime.now().strftime("%YY_%mM_%dD_%HH") + "_" + \
            "{:04d}".format(random.randint(0, 1000))

    print('gpu device = %d' % args.gpu)
    print("args = %s", args)

    if args.model_type == "dynamic":
        model = dynamic_resnet20()
    elif args.model_type == "independent":
        model = Independent_resnet20()
    elif args.model_type == "slimmable":
        model = mutableResNet20()
    elif args.model_type == "original":
        model = resnet20()
    else:
        print("Not Implement")

    # model = resnet20()
    model = model.cuda(args.gpu)

    if num_gpus > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

        args.world_size = torch.distributed.get_world_size()
        args.batch_size = args.batch_size // args.world_size

    # criterion_smooth = CrossEntropyLabelSmooth(args.classes, args.label_smooth)
    # criterion_smooth = criterion_smooth.cuda()
    criterion = torch.nn.CrossEntropyLoss().cuda(args.gpu)
    soft_criterion = CrossEntropyLossSoft()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # scheduler = torch.optim.lr_scheduler.LambdaLR(
    #     optimizer, lambda step: (1.0-step/args.total_iters), last_epoch=-1)
    # a_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    #     optimizer, T_0=5)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    # a_scheduler = torch.optim.lr_scheduler.LambdaLR(
    #     optimizer, lambda epoch: 1 - (epoch / args.epochs))
    a_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[60, 120, 160], last_epoch=-1)  # !!
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=1,
                                       total_epoch=5,
                                       after_scheduler=a_scheduler)

    if args.local_rank == 0:
        writer = SummaryWriter(
            "./runs/%s-%05d" %
            (time.strftime("%m-%d", time.localtime()), random.randint(0, 100)))

    # Prepare data
    train_loader = get_train_loader(args.batch_size, args.local_rank,
                                    args.num_workers)
    # originally the same as the train batch size; reduced a bit here
    val_loader = get_val_loader(args.batch_size, args.num_workers)

    archloader = ArchLoader("data/Track1_final_archs.json")

    for epoch in range(args.epochs):
        train(train_loader, val_loader, optimizer, scheduler, model,
              archloader, criterion, soft_criterion, args, args.seed, epoch,
              writer)

        scheduler.step()
        if (epoch + 1) % args.report_freq == 0:
            top1_val, top5_val, objs_val = infer(train_loader, val_loader,
                                                 model, criterion, archloader,
                                                 args, epoch)

            if args.local_rank == 0:
                # model
                if writer is not None:
                    writer.add_scalar("Val/loss", objs_val, epoch)
                    writer.add_scalar("Val/acc1", top1_val, epoch)
                    writer.add_scalar("Val/acc5", top5_val, epoch)

                save_checkpoint({
                    'state_dict': model.state_dict(),
                }, epoch, args.exp)
Example #5
def train(save_path, save_every, img_size, resume, epochs, opt=None):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    model_name = 'flowNet'
    weights_path = osp.join(save_path, model_name)
    loss_log_path = osp.join(weights_path, 'loss.json')
    mkdir_if_missing(weights_path)
    cfg = {}
    cfg['lr'] = opt.lr
    cfg['height'] = img_size[1]
    cfg['width'] = img_size[0]

    if resume:
        latest_resume = osp.join(weights_path, 'latest.pt')

    torch.backends.cudnn.benchmark = True
    # root = '/home/hunter/Document/torch'
    root = '/data/dgw'

    paths_trainset = './data/flow/MOT16.txt'
    transforms = T.Compose([T.ToTensor()])

    trainset = LoadImagesAndLabels_2(root=root,
                                     path=paths_trainset,
                                     img_size=img_size,
                                     augment=False,
                                     transforms=transforms)

    dataloader_trainset = torch.utils.data.DataLoader(trainset,
                                                      batch_size=1,
                                                      shuffle=True)

    model = flowTracker(img_size)
    # model.train()
    model.cuda().train()

    start_epoch = 0

    optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                       model.parameters()),
                                lr=opt.lr,
                                momentum=.9,
                                weight_decay=5e-4)
    after_scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=10,
                                       total_epoch=10,
                                       after_scheduler=after_scheduler)

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        print(model.load_state_dict(checkpoint['model'], strict=False))

        start_epoch = checkpoint['epoch'] + 1

        del checkpoint  # current, saved

    else:
        with open(osp.join(weights_path, 'model.yaml'), 'w+') as f:
            yaml.dump(cfg, f)

    for epoch in range(epochs):
        epoch = epoch + start_epoch
        print('lr: ', optimizer.param_groups[0]['lr'])
        scheduler.step(epoch)
        loss_epoch_log = 0
        for i, (imgs, labels, img_path,
                _) in enumerate(tqdm(dataloader_trainset)):
            imgs = torch.cat(imgs, dim=0)
            imgs = imgs.permute(1, 0, 2, 3).unsqueeze(0).cuda()
            boxes, target = labels[0][0].cuda(), labels[1][0].cuda()
            loss = model(imgs, boxes, target, img_path)
            if loss is None:
                continue
            optimizer.zero_grad()  # clear gradients left over from the previous step
            loss.backward()
            optimizer.step()
            ## print and log the loss
            if i % 50 == 0:
                print(loss.item())
            loss_epoch_log += loss.item()  # accumulate a float, not the autograd graph

        loss_epoch_log = loss_epoch_log / (i + 1)
        print("loss in epoch %d: " % (epoch))
        print(loss_epoch_log)

        checkpoint = {'epoch': epoch, 'model': model.state_dict()}
        latest = osp.join(weights_path, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            torch.save(
                checkpoint,
                osp.join(weights_path, "weights_epoch_" + str(epoch) + ".pt"))
        with open(loss_log_path, 'a+') as f:
            f.write('epoch:' + str(epoch) + '\n')
            json.dump(float(loss_epoch_log), f)
            f.write('\n')
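One way to sanity-check the warmup ramp configured above before a long run: rebuild the same schedule on a throwaway optimizer and print the learning rate per epoch (a sketch; the base lr and epoch count are placeholders, and the exact values depend on the GradualWarmupScheduler implementation in use):

import torch
from torch.optim.lr_scheduler import StepLR
from warmup_scheduler import GradualWarmupScheduler  # assumed source of the warmup scheduler

probe_params = [torch.nn.Parameter(torch.zeros(1))]  # throwaway parameter
probe_opt = torch.optim.SGD(probe_params, lr=0.001)  # placeholder base lr
probe_after = StepLR(probe_opt, step_size=10, gamma=0.1)
probe_sched = GradualWarmupScheduler(probe_opt, multiplier=10, total_epoch=10,
                                     after_scheduler=probe_after)
for e in range(30):  # placeholder epoch count
    probe_sched.step(e)
    print('epoch', e, 'lr', probe_opt.param_groups[0]['lr'])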
Example #6
    def __init__(self, experiment):
        self.experiment = str(experiment)
        config_file = os.path.join(CONFIG_DIR, experiment + '.json')
        with open(config_file, 'r') as f:
            self.config = json.load(f)

        model = self.config['model']
        optimizer = self.config['optimizer']
        scheduler = self.config['scheduler']
        dataset_name = self.config['dataset']

        self.clip_len = int(self.config['clip-length'])
        self.lr = float(self.config['lr'])
        self.weight_decay = float(self.config['weight-decay'])

        self.max_epochs = int(self.config['max-epochs'])
        self.epoch = 0
        self.iteration = 0
        self.train_batch_size = int(self.config['train-batch-size'])
        self.eval_batch_size = int(self.config['test-batch-size'])
        self.warmup_scheduler = bool(self.config['warmup-scheduler'])

        if dataset_name == 'breakfast':
            import data.breakfast_data as dataset_utils
            self.n_classes = dataset_utils.N_CLASSES
        else:
            raise ValueError('no such dataset')

        if model == 'ir-csn':
            import utils.csnet_utils as train_utils
            self.model = IrCsn152(n_classes=dataset_utils.N_CLASSES,
                                  clip_len=self.clip_len,
                                  crop_size=train_utils.CROP_SIZE)
        else:
            raise ValueError('no such model')
        train_video_files, train_labels, train_video_len_files = dataset_utils.get_train_data()
        test_video_files, test_labels, test_video_len_files = dataset_utils.get_test_data()

        self.train_batch_size = self.train_batch_size * torch.cuda.device_count()
        device_ids = list(range(torch.cuda.device_count()))
        self.model = nn.DataParallel(self.model, device_ids=device_ids)
        self.model = self.model.cuda()

        self.loss_fn = nn.CrossEntropyLoss()

        if optimizer == 'adam':
            self.optimizer = Adam(self.model.parameters(),
                                  lr=self.lr,
                                  weight_decay=self.weight_decay)
        elif optimizer == 'sgd':
            self.optimizer = SGD(self.model.parameters(),
                                 lr=self.lr,
                                 weight_decay=self.weight_decay,
                                 momentum=0)
        else:
            raise ValueError('no such optimizer')

        if scheduler == 'step':
            step_size = self.config['lr-decay-step-size']
            lr_decay = self.config['lr-decay-rate']
            self.scheduler = StepLR(self.optimizer,
                                    gamma=lr_decay,
                                    step_size=step_size)
        elif scheduler == 'half-cosine':
            self.scheduler = CosineAnnealingWarmRestarts(
                self.optimizer, T_0=len(train_video_files))
        else:
            raise ValueError('no such scheduler')

        if self.warmup_scheduler:
            if scheduler == 'half-cosine':
                self.scheduler = GradualWarmupScheduler(
                    self.optimizer,
                    multiplier=8,
                    total_epoch=10,
                    after_scheduler=self.scheduler)
            else:
                raise ValueError('no such scheduler')

        self.checkpoint_dir = os.path.join(CHECKPOINT_DIR, self.experiment)
        self.load_checkpoint(ckpt_name='model')
        self.log_dir = os.path.join(LOG_DIR, self.experiment)
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        self.tensorboard_logger = SummaryWriter(self.log_dir)

        if model == 'ir-csn':
            self.train_dataset = train_utils.TrainDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN)

            # evaluation datasets
            self.train_eval_dataset = train_utils.EvalDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
            self.test_eval_dataset = train_utils.EvalDataset(
                video_files=test_video_files,
                labels=test_labels,
                video_len_files=test_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
        else:
            raise ValueError('no such model... how did you even get here...')
Example #7
class Trainer:
    def __init__(self, experiment):
        self.experiment = str(experiment)
        config_file = os.path.join(CONFIG_DIR, experiment + '.json')
        with open(config_file, 'r') as f:
            self.config = json.load(f)

        model = self.config['model']
        optimizer = self.config['optimizer']
        scheduler = self.config['scheduler']
        dataset_name = self.config['dataset']

        self.clip_len = int(self.config['clip-length'])
        self.lr = float(self.config['lr'])
        self.weight_decay = float(self.config['weight-decay'])

        self.max_epochs = int(self.config['max-epochs'])
        self.epoch = 0
        self.iteration = 0
        self.train_batch_size = int(self.config['train-batch-size'])
        self.eval_batch_size = int(self.config['test-batch-size'])
        self.warmup_scheduler = bool(self.config['warmup-scheduler'])

        if dataset_name == 'breakfast':
            import data.breakfast_data as dataset_utils
            self.n_classes = dataset_utils.N_CLASSES
        else:
            raise ValueError('no such dataset')

        if model == 'ir-csn':
            import utils.csnet_utils as train_utils
            self.model = IrCsn152(n_classes=dataset_utils.N_CLASSES,
                                  clip_len=self.clip_len,
                                  crop_size=train_utils.CROP_SIZE)
        else:
            raise ValueError('no such model')
        train_video_files, train_labels, train_video_len_files = dataset_utils.get_train_data()
        test_video_files, test_labels, test_video_len_files = dataset_utils.get_test_data()

        self.train_batch_size = self.train_batch_size * torch.cuda.device_count()
        device_ids = list(range(torch.cuda.device_count()))
        self.model = nn.DataParallel(self.model, device_ids=device_ids)
        self.model = self.model.cuda()

        self.loss_fn = nn.CrossEntropyLoss()

        if optimizer == 'adam':
            self.optimizer = Adam(self.model.parameters(),
                                  lr=self.lr,
                                  weight_decay=self.weight_decay)
        elif optimizer == 'sgd':
            self.optimizer = SGD(self.model.parameters(),
                                 lr=self.lr,
                                 weight_decay=self.weight_decay,
                                 momentum=0)
        else:
            raise ValueError('no such optimizer')

        if scheduler == 'step':
            step_size = self.config['lr-decay-step-size']
            lr_decay = self.config['lr-decay-rate']
            self.scheduler = StepLR(self.optimizer,
                                    gamma=lr_decay,
                                    step_size=step_size)
        elif scheduler == 'half-cosine':
            self.scheduler = CosineAnnealingWarmRestarts(
                self.optimizer, T_0=len(train_video_files))
        else:
            raise ValueError('no such scheduler')

        if self.warmup_scheduler:
            if scheduler == 'half-cosine':
                self.scheduler = GradualWarmupScheduler(
                    self.optimizer,
                    multiplier=8,
                    total_epoch=10,
                    after_scheduler=self.scheduler)
            else:
                raise ValueError('no such scheduler')

        self.checkpoint_dir = os.path.join(CHECKPOINT_DIR, self.experiment)
        self.load_checkpoint(ckpt_name='model')
        self.log_dir = os.path.join(LOG_DIR, self.experiment)
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        self.tensorboard_logger = SummaryWriter(self.log_dir)

        if model == 'ir-csn':
            self.train_dataset = train_utils.TrainDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN)

            # evaluation datasets
            self.train_eval_dataset = train_utils.EvalDataset(
                video_files=train_video_files,
                labels=train_labels,
                video_len_files=train_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
            self.test_eval_dataset = train_utils.EvalDataset(
                video_files=test_video_files,
                labels=test_labels,
                video_len_files=test_video_len_files,
                resize=train_utils.RESIZE,
                crop_size=train_utils.CROP_SIZE,
                clip_len=train_utils.CLIP_LEN,
                n_clips=train_utils.N_EVAL_CLIPS)
        else:
            raise ValueError('no such model... how did you even get here...')

    def train(self):
        start_epoch = self.epoch
        for i in range(start_epoch, self.max_epochs):
            print('INFO: epoch {0}/{1}'.format(i + 1, self.max_epochs))
            self.epoch += 1
            self.train_step()
            self.eval_step()
            self.save_checkpoint(ckpt_name='model')
            self.save_checkpoint(ckpt_name='model-{}'.format(self.epoch))

    def train_step(self):
        print('INFO: training at epoch {}'.format(self.epoch))
        dataloader = tdata.DataLoader(self.train_dataset,
                                      batch_size=self.train_batch_size,
                                      collate_fn=self.train_dataset.collate_fn,
                                      shuffle=True,
                                      num_workers=12,
                                      drop_last=True)

        i = 0
        epoch_losses = []
        self.model.train()
        pbar = tqdm(dataloader)
        for frames, labels in pbar:

            frames = frames.cuda()
            labels = labels.cuda()
            self.optimizer.zero_grad()
            logits = self.model(frames)

            loss = self.loss_fn(logits, labels)
            loss.backward()
            self.optimizer.step()
            i += 1

            loss = loss.item()
            pbar.set_postfix({'loss': loss})
            epoch_losses.append(loss)
        epoch_loss = np.mean(epoch_losses)
        print('INFO: training loss: {}'.format(epoch_loss))

        for loss in epoch_losses:
            train_log = {'loss': loss}
            self.iteration += 1
            self.tensorboard_logger.add_scalars(
                '{}:train'.format(self.experiment), train_log, self.iteration)

    def eval_step(self, evaluate_train=True):
        print('INFO: evaluating...')
        self.model.eval()
        test_accuracy = self.eval_test()
        eval_log = {'test-accuracy': test_accuracy}

        if evaluate_train:
            eval_log['train-accuracy'] = self.eval_train()
        self.tensorboard_logger.add_scalars(
            '{}:evaluation'.format(self.experiment), eval_log, self.epoch)

    def eval_train(self):
        print('INFO: evaluating train dataset...')
        prediction_file = os.path.join(
            LOG_DIR, 'epoch-{0}-train-prediction.json'.format(self.epoch))
        train_accuracy = self.eval_dataset(self.train_eval_dataset,
                                           prediction_file)
        print('INFO: epoch {0} train accuracy: {1}'.format(
            self.epoch, train_accuracy))
        return train_accuracy

    def eval_test(self):
        print('INFO: evaluating test dataset...')
        prediction_file = os.path.join(
            LOG_DIR, 'epoch-{0}-test-prediction.json'.format(self.epoch))
        test_accuracy = self.eval_dataset(self.test_eval_dataset,
                                          prediction_file)
        print('INFO: epoch {0} test accuracy: {1}'.format(
            self.epoch, test_accuracy))
        return test_accuracy

    def eval_dataset(self, dataset, prediction_file):
        dataloader = tdata.DataLoader(dataset=dataset,
                                      batch_size=self.eval_batch_size,
                                      shuffle=False,
                                      num_workers=12,
                                      pin_memory=True,
                                      collate_fn=dataset.collate_fn)
        prediction_dict = dict()
        for i, video in enumerate(dataset.video_files):
            video_prediction = {
                'label': int(dataset.labels[i]),
                'n_clips': 0,
                'logit': None
            }
            prediction_dict[video] = video_prediction

        with torch.no_grad():
            for clips, clip_files in tqdm(dataloader):
                n_clips = clips.shape[0]
                clips = clips.cuda()
                logits = self.model(clips).detach().cpu()

                # update prediction dict.
                for i in range(n_clips):
                    logit = logits[i]
                    clip_file = clip_files[i]
                    video_prediction = prediction_dict[clip_file]

                    if video_prediction['n_clips'] == 0:
                        video_prediction['logit'] = logit
                        video_prediction['n_clips'] = 1
                    else:  # keep a running average of logits
                        video_prediction['n_clips'] += 1
                        n_clips = video_prediction['n_clips']
                        video_prediction['logit'] = video_prediction['logit'] * ((n_clips - 1) / n_clips) + \
                                                    logit / n_clips
        n_correct = 0
        n_videos = len(dataset.video_files)

        for video, video_dict in prediction_dict.items():
            logit = video_dict['logit']
            label = int(video_dict['label'])
            prediction = int(torch.argmax(logit).item())
            video_dict['prediction'] = prediction
            if label == prediction:
                n_correct += 1
            del video_dict['logit']
        accuracy = n_correct / n_videos
        with open(prediction_file, 'w') as f:
            json.dump(prediction_dict, f)
        return accuracy

    def save_checkpoint(self, ckpt_name):
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        checkpoint_file = os.path.join(self.checkpoint_dir, ckpt_name + '.pth')

        checkpoint_dict = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'scheduler': self.scheduler.state_dict(),
            'epoch': self.epoch,
            'iteration': self.iteration
        }
        torch.save(checkpoint_dict, checkpoint_file)
        print('INFO: saved checkpoint {}'.format(checkpoint_file))

    def load_checkpoint(self, ckpt_name='model'):
        checkpoint_file = os.path.join(self.checkpoint_dir, ckpt_name + '.pth')
        if not os.path.exists(checkpoint_file):
            print('WARNING: checkpoint does not exist. Continuing...')
        else:
            checkpoint_dict = torch.load(checkpoint_file,
                                         map_location='cuda:{}'.format(0))
            self.model.load_state_dict(checkpoint_dict['model'])
            self.optimizer.load_state_dict(checkpoint_dict['optimizer'])
            self.scheduler.load_state_dict(checkpoint_dict['scheduler'])
            self.epoch = checkpoint_dict['epoch']
            self.iteration = checkpoint_dict['iteration']

    def __del__(self):
        self.tensorboard_logger.close()
        del self.model
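For reference, instantiating this Trainer only requires an experiment name that resolves to a JSON config under CONFIG_DIR containing the keys read in __init__. A hypothetical config and the corresponding calls (the file name and all values are illustrative only):

# CONFIG_DIR/my_experiment.json -- hypothetical values for the keys read above:
# {
#   "model": "ir-csn", "optimizer": "sgd", "scheduler": "half-cosine",
#   "dataset": "breakfast", "clip-length": 32, "lr": 0.01,
#   "weight-decay": 1e-4, "max-epochs": 45, "train-batch-size": 8,
#   "test-batch-size": 8, "warmup-scheduler": true
# }
trainer = Trainer('my_experiment')
trainer.train()  # trains, evaluates and checkpoints every epoch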
Example #8
def train(
        save_path,
        save_every,
        img_size,
        resume,
        epochs,
        batch_size,
        accumulated_batches,
        opt=None
):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    model_name = opt.backbone_name + '_img_size' + str(img_size[0]) + '_' + str(img_size[1]) 
    weights_path = osp.join(save_path, model_name)
    loss_log_path = osp.join(weights_path, 'loss.json')
    mkdir_if_missing(weights_path)
    cfg = {}
    cfg['width'] = img_size[0]
    cfg['height'] = img_size[1]
    cfg['backbone_name'] = opt.backbone_name
    cfg['lr'] = opt.lr
    
    if resume:
        latest_resume = osp.join(weights_path, 'latest.pt')

    torch.backends.cudnn.benchmark = True
    # root = '/home/hunter/Document/torch'
    root = '/data/dgw'

    if opt.all_datasets:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt',
                          'CT': './data/detect/CT_train.txt',
                          'ETH': './data/detect/ETH.txt',
                          'PRW': './data/detect/PRW_train.txt',
                          'CP': './data/detect/cp_train.txt',
                          'CS': './data/detect/CUHK_train.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt',
                        'CP': './data/detect/cp_val.txt',
                        'PRW': './data/detect/PRW_val.txt',
                        'CT': './data/detect/CT_val.txt',
                        'CS': './data/detect/CUHK_val.txt'}
    else:
        paths_trainset = {'02': './data/track/train/MOT16-02.txt',
                          '04': './data/track/train/MOT16-04.txt',
                          '05': './data/track/train/MOT16-05.txt',
                          '09': './data/track/train/MOT16-09.txt',
                          '10': './data/track/train/MOT16-10.txt',
                          '11': './data/track/train/MOT16-11.txt',
                          '13': './data/track/train/MOT16-13.txt'}
        paths_valset = {'02': './data/track/val/MOT16-02.txt',
                        '04': './data/track/val/MOT16-04.txt',
                        '05': './data/track/val/MOT16-05.txt',
                        '09': './data/track/val/MOT16-09.txt',
                        '10': './data/track/val/MOT16-10.txt',
                        '11': './data/track/val/MOT16-11.txt',
                        '13': './data/track/val/MOT16-13.txt'}
    transforms = T.Compose([T.ToTensor()])
    trainset = JointDataset(root=root, paths=paths_trainset, img_size=img_size, augment=True, transforms=transforms)
    valset = JointDataset(root=root, paths=paths_valset, img_size=img_size, augment=False, transforms=transforms)

    dataloader_trainset = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                                num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn)
    dataloader_valset = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=True,
                                                num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate_fn)                                       
    
    cfg['num_ID'] = trainset.nID
    backbone = resnet_fpn_backbone(opt.backbone_name, True)
    backbone.out_channels = 256

    model = Jde_RCNN(backbone, num_ID=trainset.nID, min_size=img_size[1], max_size=img_size[0], version=opt.model_version, len_embeddings=opt.len_embed)
    model.cuda().train()

    # model = torch.nn.DataParallel(model)
    start_epoch = 0

    optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, model.parameters()), lr=opt.lr, momentum=.9, weight_decay=5e-4)
    after_scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10, after_scheduler=after_scheduler)

    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        print(model.load_state_dict(checkpoint['model'], strict=False))
        
        start_epoch = checkpoint['epoch_det']

        del checkpoint  # current, saved
        
    else:
        with open(osp.join(weights_path,'model.yaml'), 'w+') as f:
            yaml.dump(cfg, f)
        
    for epoch in range(epochs):
        model.cuda().eval()
        with torch.no_grad():
            if epoch % 3 == 0:
                test_emb(model, dataloader_valset, print_interval=50)[-1]
                test(model, dataloader_valset, conf_thres=0.5, iou_thres=0.2, print_interval=50)

            scheduler.step(epoch + start_epoch)

        model.cuda().train()
        print('lr: ', optimizer.param_groups[0]['lr'])
        loss_epoch_log = dict(loss_total=0, loss_classifier=0, loss_box_reg=0, loss_reid=0, loss_objectness=0, loss_rpn_box_reg=0)
        for i, (imgs, labels, _, _, targets_len) in enumerate(tqdm(dataloader_trainset)):
            targets = []
            imgs = imgs.cuda()
            labels = labels.cuda()
            flag = False
            for target_len, label in zip(targets_len.view(-1,), labels):
                ## convert the input to demanded format
                target = {}
                if target_len == 0:
                    flag = True
                if torch.all(label[0:int(target_len), 1] == -1):
                    flag = True
                target['boxes'] = label[0:int(target_len), 2:6]
                target['ids'] = (label[0:int(target_len), 1]).long()
                target['labels'] = torch.ones_like(target['ids'])
                targets.append(target)
            if flag:
                continue
            losses = model(imgs, targets)
            loss = losses['loss_classifier'] + losses['loss_box_reg'] + losses['loss_objectness'] + losses['loss_rpn_box_reg'] + 0.4*losses['loss_reid']
            loss.backward()

            if ((i + 1) % accumulated_batches == 0) or (i == len(dataloader_trainset) - 1):
                optimizer.step()
                optimizer.zero_grad()
            ## print and log the loss
            for key, val in losses.items():
                loss_epoch_log[key] = float(val) + loss_epoch_log[key]
        
        for key, val in loss_epoch_log.items():
            loss_epoch_log[key] = loss_epoch_log[key] / (i + 1)
        print("loss in epoch %d: " % (epoch))
        print(loss_epoch_log)
                
        epoch_det = epoch + start_epoch
        epoch_reid = epoch + start_epoch

        checkpoint = {'epoch_det': epoch_det,
                      'epoch_reid': epoch_reid,
                      'model': model.state_dict()
                    }
        latest = osp.join(weights_path, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            torch.save(checkpoint, osp.join(weights_path, "weights_epoch_" + str(epoch_det) + '_' + str(epoch_reid) + ".pt"))
        with open(loss_log_path, 'a+') as f:
            f.write('epoch_det:'+str(epoch_det)+',epoch_reid:'+str(epoch_reid)+'\n')
            json.dump(loss_epoch_log, f) 
            f.write('\n')