# Example 1
    def experiment(self):
        r"""Return the underlying Comet experiment, creating it lazily.

        To use comet features do the following.

        Example::

            self.logger.experiment.some_comet_function()

        """
        if self._experiment is None:
            # Shared constructor arguments for every experiment flavor.
            common = dict(workspace=self.workspace,
                          project_name=self.project_name)
            if self.mode == "online":
                if self.experiment_key is None:
                    # First online run: create a fresh experiment and
                    # remember its key for later resumption.
                    self._experiment = CometExperiment(
                        api_key=self.api_key, **common, **self._kwargs)
                    self.experiment_key = self._experiment.get_key()
                else:
                    # Resume the previously created online experiment.
                    self._experiment = CometExistingExperiment(
                        api_key=self.api_key,
                        previous_experiment=self.experiment_key,
                        **common, **self._kwargs)
            else:
                # Offline mode: log into a local directory instead.
                self._experiment = CometOfflineExperiment(
                    offline_directory=self.save_dir, **common, **self._kwargs)
        return self._experiment
# Example 2
    def experiment(self) -> CometBaseExperiment:
        r"""
        Actual Comet object. To use Comet features in your
        :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.

        Example::

            self.logger.experiment.some_comet_function()

        """
        # Memoized: only build the experiment on first access.
        if self._experiment is not None:
            return self._experiment

        if self.mode != "online":
            # Offline mode writes experiment data to a local directory.
            exp = CometOfflineExperiment(
                offline_directory=self.save_dir,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)
        elif self.experiment_key is not None:
            # Resume an existing online experiment by key.
            exp = CometExistingExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                previous_experiment=self.experiment_key,
                **self._kwargs)
        else:
            # Fresh online experiment; stash the key for future resumes.
            exp = CometExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)
            self.experiment_key = exp.get_key()

        self._experiment = exp
        return exp
 def __init__(self, *args, **kwargs):
     """Create the logger and eagerly build the underlying Comet experiment.

     All positional and keyword arguments are forwarded unchanged to
     ``CometExperiment``.
     """
     super(CometLogger, self).__init__()
     self.experiment = CometExperiment(*args, **kwargs)
best_loss = 0  # NOTE(review): original comment said "best test accuracy", but the name says loss -- confirm intent
 
# Optionally set up a Comet experiment: offline (logs to a local directory)
# when `local_comet_dir` is configured, otherwise a live online experiment.
if not args.no_log_to_comet:
    if params['local_comet_dir']:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="supercyclecons",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=params['local_comet_dir'])
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="supercyclecons",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    # Record all CLI arguments and a human-readable run name with the run.
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(params['name'])


def partial_load(pretrained_dict, model):
    """Prepare ``pretrained_dict`` for loading into ``model``, keeping only
    the keys that exist in the model's own state dict.

    NOTE(review): the body appears truncated at this point in the file --
    after the filtering step there is no ``model_dict.update(...)`` /
    ``model.load_state_dict(...)`` call, so as shown this function has no
    effect on ``model``. Confirm against the original source.
    """
    model_dict = model.state_dict()

    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }
    # 2. overwrite entries in the existing state dict
def BSN_Train_PEM(opt):
    """Train BSN's Proposal Evaluation Module (PEM).

    Builds the model, optimizer, samplers and dataloaders from the ``opt``
    configuration dict, optionally attaches a Comet experiment (online or
    offline), runs one initial test pass, then alternates train/test for
    ``opt['pem_epoch']`` epochs under a MultiStepLR schedule.

    Args:
        opt (dict): experiment configuration (learning rates, milestones,
            dataset selection, comet settings, worker counts, ...).
    """
    model = PEM(opt)
    model = torch.nn.DataParallel(model).cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["pem_training_lr"],
                           weight_decay=opt["pem_weight_decay"])

    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Concatenate per-sample tensors into flat (data, iou) batch tensors.
    def collate_fn(batch):
        batch_data = torch.cat([x[0] for x in batch])
        batch_iou = torch.cat([x[1] for x in batch])
        return batch_data, batch_iou

    train_dataset = ProposalDataSet(opt, subset="train")
    train_sampler = ProposalSampler(train_dataset.proposals,
                                    train_dataset.indices,
                                    max_zero_weight=opt['pem_max_zero_weight'])

    global_step = 0
    # shuffle=False because the custom sampler controls the ordering.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=model.module.batch_size,
        shuffle=False,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    # ActivityNet has no test annotations, so evaluate on its validation set.
    subset = "validation" if opt['dataset'] == 'activitynet' else "test"
    test_loader = torch.utils.data.DataLoader(
        ProposalDataSet(opt, subset=subset),
        batch_size=model.module.batch_size,
        shuffle=True,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    # Comma-separated epoch milestones, e.g. "5,10" -> [5, 10].
    milestones = [int(k) for k in opt['pem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['pem_step_gamma'])

    # Comet: online experiment, offline directory, or disabled.
    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsnpem",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="bsnpem",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    # Baseline evaluation before any training (epoch/step sentinel -1).
    test_PEM(test_loader, model, -1, -1, comet_exp, opt)
    for epoch in range(opt["pem_epoch"]):
        global_step = train_PEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_PEM(test_loader, model, epoch, global_step, comet_exp, opt)
        scheduler.step()
def BSN_Train_TEM(opt):
    """Train BSN's Temporal Evaluation Module (TEM).

    Builds the TEM model (optionally on top of a pretrained, frozen
    representation backbone), restores any existing checkpoint, constructs
    dataset-specific train/test loaders, optionally attaches a Comet
    experiment, and runs the train/test loop for ``opt['tem_epoch']`` epochs
    under a MultiStepLR schedule.

    Args:
        opt (dict): experiment configuration (representation flags, dataset
            selection, learning rates, milestones, comet settings, ...).
    """
    global_step = 0
    epoch = 0
    if opt['do_representation']:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        # Resume from the latest checkpoint in this run's directory, if any.
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))
        if opt['representation_checkpoint']:
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
            # NOTE(review): the two messages below contain typos
            # ("MDOEL", "Pretrianed modelll") -- left as-is since they are
            # runtime output.
            if opt['do_random_model']:
                print('DOING RANDOM MDOEL!!!')
            else:
                print('DOING Pretrianed modelll!!!')
                partial_load(opt['representation_checkpoint'], model)
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
        # Freeze the representation backbone unless explicitly disabled.
        if not opt['no_freeze']:
            for param in model.representation_model.parameters():
                param.requires_grad = False
        print(len([p for p in model.representation_model.parameters()]))
    else:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))

    model = torch.nn.DataParallel(model).cuda()
    # summary(model, (2, 3, 224, 224))

    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Dataset-specific construction of train/test datasets and sampler.
    # NOTE(review): there is no final else branch -- an unrecognized
    # opt['dataset'] value leaves train_data_set/test_data_set undefined
    # and raises NameError further down.
    if opt['dataset'] == 'gymnastics':
        # default image_dir is '/checkpoint/cinjon/spaceofmotion/sep052019/rawframes.426x240.12'
        img_loading_func = get_img_loader(opt)
        train_data_set = GymnasticsImages(opt,
                                          subset='Train',
                                          img_loading_func=img_loading_func,
                                          image_dir=opt['gym_image_dir'],
                                          video_info_path=os.path.join(
                                              opt['video_info'],
                                              'Train_Annotation.csv'))
        train_sampler = GymnasticsSampler(train_data_set, opt['sampler_mode'])
        test_data_set = GymnasticsImages(opt,
                                         subset="Val",
                                         img_loading_func=img_loading_func,
                                         image_dir=opt['gym_image_dir'],
                                         video_info_path=os.path.join(
                                             opt['video_info'],
                                             'Val_Annotation.csv'))
    elif opt['dataset'] == 'gymnasticsfeatures':
        # feature_dirs should roughly look like:
        # /checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/rgb,/checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/flow
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = GymnasticsFeatures(opt,
                                            subset='Train',
                                            feature_dirs=feature_dirs,
                                            video_info_path=os.path.join(
                                                opt['video_info'],
                                                'Train_Annotation.csv'))
        test_data_set = GymnasticsFeatures(opt,
                                           subset='Val',
                                           feature_dirs=feature_dirs,
                                           video_info_path=os.path.join(
                                               opt['video_info'],
                                               'Val_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'thumosfeatures':
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = ThumosFeatures(opt,
                                        subset='Val',
                                        feature_dirs=feature_dirs)
        test_data_set = ThumosFeatures(opt,
                                       subset="Test",
                                       feature_dirs=feature_dirs)
        train_sampler = None
    elif opt['dataset'] == 'thumosimages':
        img_loading_func = get_img_loader(opt)
        train_data_set = ThumosImages(
            opt,
            subset='Val',
            img_loading_func=img_loading_func,
            image_dir=
            '/checkpoint/cinjon/thumos/rawframes.TH14_validation_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Val_Annotation.csv'))
        test_data_set = ThumosImages(
            opt,
            subset='Test',
            img_loading_func=img_loading_func,
            image_dir='/checkpoint/cinjon/thumos/rawframes.TH14_test_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Test_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'activitynet':
        train_sampler = None
        representation_module = opt['representation_module']
        train_transforms = get_video_transforms(representation_module,
                                                opt['do_augment'])
        test_transforms = get_video_transforms(representation_module, False)
        # Only a 0.3 fraction of the dataset is used per epoch; the subsets
        # are re-drawn at the end of every epoch below.
        train_data_set = VideoDataset(opt,
                                      train_transforms,
                                      subset='train',
                                      fraction=0.3)
        # We use val because we don't have annotations for test.
        test_data_set = VideoDataset(opt,
                                     test_transforms,
                                     subset='val',
                                     fraction=0.3)

    print('train_loader / val_loader sizes: ', len(train_data_set),
          len(test_data_set))
    # shuffle must be off when a custom sampler controls the ordering.
    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=model.module.batch_size,
        shuffle=False if train_sampler else True,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)

    test_loader = torch.utils.data.DataLoader(
        test_data_set,
        batch_size=model.module.batch_size,
        shuffle=False,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)
    # test_loader = None

    # Comma-separated epoch milestones, e.g. "5,10" -> [5, 10].
    milestones = [int(k) for k in opt['tem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['tem_step_gamma'])

    # Comet: online experiment, offline directory, or disabled.
    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsn",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="bsn",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    # test_TEM(test_loader, model, optimizer, 0, 0, comet_exp, opt)
    # Resume from the epoch after the restored checkpoint (epoch is 0 when
    # starting fresh).
    for epoch in range(epoch + 1, opt["tem_epoch"] + 1):
        global_step = train_TEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_TEM(test_loader, model, optimizer, epoch, global_step, comet_exp,
                 opt)
        if opt['dataset'] == 'activitynet':
            # Re-draw the 30% subsets so each epoch sees different videos.
            test_loader.dataset._subset_dataset(.3)
            train_loader.dataset._subset_dataset(.3)
        scheduler.step()
# Example 7
def main(args):
    """Train a linear probe on top of a frozen self-supervised backbone.

    Supports AMDIM, CCC (TimeCycle), CorrFlow, and ResNet-50 backbones on
    CIFAR-10/100. The backbone is frozen; only the (non-)linear classifier
    head is optimized. Checkpoints are written whenever validation accuracy
    improves, and the job exits cleanly shortly before its time limit.

    Fixes vs. the original:
    - ``linaer_cls`` -> ``linear_cls`` in the "ccc" branch (was a
      guaranteed NameError on that path).
    - "checkpoing" -> "checkpoint" typo in the CorrFlow debug message.
    - Removed the unused ``now`` variable, builtin-shadowing loop names
      (``iter``/``input``), and large blocks of commented-out dead code.
    """
    print('Pretrain? ', not args.not_pretrain)
    print(args.model)
    start_time = time.time()

    # NOTE(review): `opt` is not a parameter of main(); it must exist as a
    # module-level global at call time -- confirm, or thread the comet
    # directory through `args` instead.
    if opt['local_comet_dir']:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="selfcifar",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="selfcifar",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(args.name)

    # Build model: frozen backbone + trainable classifier head.
    linear_cls = NonLinearModel if args.do_nonlinear else LinearModel

    if args.model == "amdim":
        hparams = load_hparams_from_tags_csv(
            '/checkpoint/cinjon/amdim/meta_tags.csv')
        model = AMDIMModel(hparams)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/amdim/_ckpt_epoch_434.ckpt'
            model.load_state_dict(torch.load(_path)["state_dict"])
        else:
            print("AMDIM not loading checkpoint")  # Debug
        linear_model = linear_cls(AMDIM_OUTPUT_DIM, args.num_classes)
    elif args.model == "ccc":
        model = CCCModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/bsn/TimeCycleCkpt14.pth'
            checkpoint = torch.load(_path)
            # Strip the leading "module." (DataParallel) prefix from keys.
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CCC not loading checkpoint")  # Debug
        # BUG FIX: original read `linaer_cls`, a NameError on this branch.
        linear_model = linear_cls(CCC_OUTPUT_DIM, args.num_classes)
    elif args.model == "corrflow":
        model = CORRFLOWModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/supercons/corrflow.kineticsmodel.pth'
            checkpoint = torch.load(_path)
            # Strip the leading "module." (DataParallel) prefix from keys.
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CorrFlow not loading checkpoint")  # Debug (typo fixed)
        linear_model = linear_cls(CORRFLOW_OUTPUT_DIM, args.num_classes)
    elif args.model == "resnet":
        resnet = torchvision.models.resnet50(pretrained=not args.not_pretrain)
        if args.not_pretrain:
            print("ResNet not loading checkpoint")  # Debug
        # Drop the final fc layer; the probe replaces it.
        modules = list(resnet.children())[:-1]
        model = nn.Sequential(*modules)
        linear_model = linear_cls(RESNET_OUTPUT_DIM, args.num_classes)
    else:
        raise Exception("model type has to be amdim, ccc, corrflow or resnet")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)
        linear_model = nn.DataParallel(linear_model).to(device)
    else:
        model = model.to(device)
        linear_model = linear_model.to(device)

    # Freeze the backbone: only the head receives gradients.
    for p in model.parameters():
        p.requires_grad = False
    model.eval()

    if args.optimizer == "Adam":
        optimizer = optim.Adam(linear_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        print("Optimizer: Adam with weight decay: {}".format(
            args.weight_decay))
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(linear_model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
        print("Optimizer: SGD with weight decay: {} momentum: {}".format(
            args.weight_decay, args.momentum))
    else:
        raise Exception("optimizer should be Adam or SGD")
    optimizer.zero_grad()

    # Set up the checkpoint directory.
    log_dir = '/checkpoint/cinjon/spaceofmotion/bsn/cifar-%d-weights/%s/%s' % (
        args.num_classes, args.model, args.name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    print("Saving to {}".format(log_dir))

    batch_size = args.batch_size * torch.cuda.device_count()
    if args.num_classes == 10:
        # CIFAR-10
        data_path = ("/private/home/cinjon/cifar-data/cifar-10-batches-py")
        _train_dataset = CIFAR_dataset(glob(os.path.join(data_path, "data*")),
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size,
                                           num_workers=args.num_workers)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test_batch")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size,
                                         num_workers=args.num_workers)
    elif args.num_classes == 100:
        # CIFAR-100
        data_path = ("/private/home/cinjon/cifar-data/cifar-100-python")
        _train_dataset = CIFAR_dataset([os.path.join(data_path, "train")],
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size)
    else:
        raise Exception("num_classes should be 10 or 100")

    best_acc = 0.0
    best_epoch = 0

    # Training
    for epoch in range(1, args.epochs + 1):
        # Step-decay schedule: halve every lr_interval epochs, floor 3e-4.
        current_lr = max(
            3e-4,
            args.lr * math.pow(0.5, math.floor(epoch / args.lr_interval)))
        linear_model.train()
        # The optimizer is rebuilt each epoch to apply the new LR.
        if args.optimizer == "Adam":
            optimizer = optim.Adam(linear_model.parameters(),
                                   lr=current_lr,
                                   weight_decay=args.weight_decay)
        elif args.optimizer == "SGD":
            optimizer = optim.SGD(
                linear_model.parameters(),
                lr=current_lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
            )

        ####################################################
        # Train
        t = time.time()
        train_acc = 0
        train_loss_sum = 0.0
        for batch_idx, batch in enumerate(train_dataloader):
            # Exit (flushing comet) ~5 minutes before the job's time limit.
            if (time.time() - start_time > args.time * 3600 - 300
                    and comet_exp is not None):
                comet_exp.end()
                sys.exit(-1)

            imgs = batch[0].to(device)
            if args.model != "resnet":
                # Non-resnet backbones expect an extra (clip) dimension.
                imgs = imgs.unsqueeze(1)
            lbls = batch[1].flatten().to(device)

            output = linear_model(model(imgs))
            loss = F.cross_entropy(output, lbls)
            train_loss_sum += float(loss.data)
            train_acc += int(sum(torch.argmax(output, dim=1) == lbls))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 1500 == 0:
                log_text = "train epoch {}/{}\titer {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, batch_idx + 1,
                                      len(train_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
                t = time.time()

        train_acc /= len(_train_dataset)
        train_loss_sum /= len(train_dataloader)
        with comet_exp.train():
            comet_exp.log_metrics({
                'acc': train_acc,
                'loss': train_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)
        print("train acc epoch {}/{} loss:{} train_acc:{}".format(
            epoch, args.epochs, train_loss_sum, train_acc),
              flush=True)

        #######################################################################
        # Val
        linear_model.eval()
        val_acc = 0
        val_loss_sum = 0.0
        for batch_idx, batch in enumerate(val_dataloader):
            if (time.time() - start_time > args.time * 3600 - 300
                    and comet_exp is not None):
                comet_exp.end()
                sys.exit(-1)

            imgs = batch[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = batch[1].flatten().to(device)

            output = model(imgs)
            output = linear_model(output)
            loss = F.cross_entropy(output, lbls)
            val_loss_sum += float(loss.data)
            val_acc += int(sum(torch.argmax(output, dim=1) == lbls))

            if batch_idx % 1500 == 0:
                log_text = "val epoch {}/{} iter {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, batch_idx + 1,
                                      len(val_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
                t = time.time()

        val_acc /= len(_val_dataset)
        val_loss_sum /= len(val_dataloader)
        print("val epoch {}/{} loss:{} val_acc:{}".format(
            epoch, args.epochs, val_loss_sum, val_acc))
        with comet_exp.test():
            comet_exp.log_metrics({
                'acc': val_acc,
                'loss': val_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)

        # Checkpoint on best validation accuracy so far.
        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch
            linear_save_path = os.path.join(log_dir,
                                            "{}.linear.pth".format(epoch))
            model_save_path = os.path.join(log_dir,
                                           "{}.model.pth".format(epoch))
            torch.save(linear_model.state_dict(), linear_save_path)
            torch.save(model.state_dict(), model_save_path)

        # Check bias and variance
        print(
            "Epoch {} lr {} total: train_loss:{} train_acc:{} val_loss:{} val_acc:{}"
            .format(epoch, current_lr, train_loss_sum, train_acc, val_loss_sum,
                    val_acc),
            flush=True)

    print("The best epoch: {} acc: {}".format(best_epoch, best_acc))
# Example 8
from polyaxon_client.tracking import Experiment

import logging
import json

"""
Initialize Parser and define arguments
"""
parser, metadata = get_parser_with_args()
opt = parser.parse_args()

"""
Initialize experiments for polyaxon and comet.ml
"""
# NOTE(review): the Comet API key is hard-coded; prefer reading it from an
# environment variable or config file.
comet = CometExperiment('QQFXdJ5M7GZRGri7CWxwGxPDN',
                        project_name=opt.project_name,
                        auto_param_logging=False,
                        parse_args=False, disabled=False)
comet.log_other('status', 'started')
# Polyaxon tracking experiment, created alongside the Comet one.
experiment = Experiment()
logging.basicConfig(level=logging.INFO)
# Record all parsed CLI options with the Comet run.
comet.log_parameters(vars(opt))


"""
Set up environment: define paths, download data, and set device
"""
# Use the first GPU when available, otherwise fall back to CPU.
dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
logging.info('GPU AVAILABLE? ' + str(torch.cuda.is_available()))
download_dataset(opt.dataset_name, comet)
train_loader, val_loader = get_loaders(opt)