Example #1
    def __init__(self,
                 logger,
                 project_name: Optional[str] = None,
                 experiment_name: Optional[str] = None,
                 api_key: Optional[str] = None,
                 log_dir: Optional[str] = None,
                 offline: bool = False,
                 **kwargs):
        if not _COMET_AVAILABLE:
            raise ImportError(
                "You want to use `comet_ml` logger which is not installed yet,"
                " install it with `pip install comet-ml`.")

        self.project_name = project_name
        self.experiment_name = experiment_name
        self.kwargs = kwargs

        self.timer = Timer()

        if (api_key is not None) and (log_dir is not None):
            self.mode = "offline" if offline else "online"
            self.api_key = api_key
            self.log_dir = log_dir

        elif api_key is not None:
            self.mode = "online"
            self.api_key = api_key
            self.log_dir = None
        elif log_dir is not None:
            self.mode = "offline"
            self.log_dir = log_dir
        else:
            # Neither credential was given; self.mode would be undefined below,
            # so fail fast with a clear error instead of an AttributeError.
            raise ValueError(
                "CometLogger requires either api_key or log_dir during initialization."
            )

        if self.mode == "online":
            self.experiment = CometExperiment(
                api_key=self.api_key,
                project_name=self.project_name,
                **self.kwargs,
            )
        else:
            self.experiment = CometOfflineExperiment(
                offline_directory=self.log_dir,
                project_name=self.project_name,
                **self.kwargs,
            )

        if self.experiment_name:
            self.experiment.set_name(self.experiment_name)
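
A minimal usage sketch for Example #1, assuming the class is named `CometLogger` and takes a standard `logging.Logger` as its first argument (both assumptions, since the snippet only shows `__init__`):

import logging

logger = logging.getLogger(__name__)

# Online mode: pass an API key; metrics stream to comet.ml.
comet_logger = CometLogger(logger,
                           project_name="my-project",
                           experiment_name="baseline-run",
                           api_key="<YOUR_COMET_API_KEY>")

# Offline mode: pass log_dir; experiments are written to disk as archives
# that can be uploaded to comet.ml later.
offline_logger = CometLogger(logger,
                             project_name="my-project",
                             log_dir="./comet_logs",
                             offline=True)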
Example #2
    def experiment(
        self
    ) -> Union[CometExperiment, CometExistingExperiment,
               CometOfflineExperiment]:
        r"""
        Actual Comet object. To use Comet features in your
        :class:`~pytorch_lightning.core.module.LightningModule` do the following.

        Example::

            self.logger.experiment.some_comet_function()

        """
        if self._experiment is not None:
            return self._experiment

        if self._future_experiment_key is not None:
            os.environ["COMET_EXPERIMENT_KEY"] = self._future_experiment_key

        try:
            if self.mode == "online":
                if self._experiment_key is None:
                    self._experiment = CometExperiment(
                        api_key=self.api_key,
                        project_name=self._project_name,
                        **self._kwargs)
                    self._experiment_key = self._experiment.get_key()
                else:
                    self._experiment = CometExistingExperiment(
                        api_key=self.api_key,
                        project_name=self._project_name,
                        previous_experiment=self._experiment_key,
                        **self._kwargs,
                    )
            else:
                self._experiment = CometOfflineExperiment(
                    offline_directory=self.save_dir,
                    project_name=self._project_name,
                    **self._kwargs)
        finally:
            if self._future_experiment_key is not None:
                os.environ.pop("COMET_EXPERIMENT_KEY")
                self._future_experiment_key = None

        if self._experiment_name:
            self._experiment.set_name(self._experiment_name)

        return self._experiment
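
In pytorch-lightning the method above is exposed as a `@property`; here is a condensed, self-contained sketch of the same create-on-first-access pattern (the class and attribute names are illustrative, not from the snippet):

from comet_ml import Experiment as CometExperiment

class LazyExperimentHolder:
    def __init__(self, api_key, project_name, **kwargs):
        self._api_key = api_key
        self._project_name = project_name
        self._kwargs = kwargs
        self._experiment = None  # created on first access, not in __init__

    @property
    def experiment(self):
        # First access creates the Comet experiment; later accesses reuse it.
        if self._experiment is None:
            self._experiment = CometExperiment(
                api_key=self._api_key,
                project_name=self._project_name,
                **self._kwargs)
        return self._experiment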
Example #3
    def experiment(self):
        if self._experiment is not None:
            return self._experiment

        if self.mode == "online":
            self._experiment = CometExperiment(api_key=self.api_key,
                                               workspace=self.workspace,
                                               project_name=self.project_name,
                                               **self._kwargs)
        else:
            self._experiment = CometOfflineExperiment(
                offline_directory=self.save_dir,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)

        return self._experiment
Example #4
    def __init__(self,
                 api_key,
                 workspace,
                 rest_api_key=None,
                 project_name=None,
                 experiment_name=None,
                 *args,
                 **kwargs):
        """
        Initialize a Comet.ml logger

        :param api_key: API key, found on Comet.ml
        :param workspace: Name of workspace for this user
        :param project_name: Optional. Send your experiment to a specific project.
        Otherwise will be sent to Uncategorized Experiments.
        If project name does not already exists Comet.ml will create a new project.
        :param rest_api_key: Optional. Rest API key found in Comet.ml settings. This is used to determine version number
        :param experiment_name: Optional. String representing the name for this particular experiment on Comet.ml
        """
        super(CometLogger, self).__init__()
        self.experiment = CometExperiment(api_key=api_key,
                                          workspace=workspace,
                                          project_name=project_name,
                                          *args,
                                          **kwargs)

        self.workspace = workspace
        self.project_name = project_name

        if rest_api_key is not None:
            # Comet.ml rest API, used to determine version number
            self.rest_api_key = rest_api_key
            self.comet_api = API(self.rest_api_key)
        else:
            self.rest_api_key = None
            self.comet_api = None

        if experiment_name:
            try:
                self._set_experiment_name(experiment_name)
            except TypeError as e:
                print("Failed to set experiment name for comet.ml logger: {}".format(e))
Example #5
    def __init__(self, root_dir, use_comet=False, use_wandb=False):
        self.start_time = time.time()

        # define dirs
        self.dir = get_nonexistant_path(root_dir)
        self.project_name = self.dir.split('/')[-2]
        self.exp_name = self.dir.split('/')[-1]
        self.ckpt_dir = join(self.dir, 'ckpt')
        self.code_dir = join(self.dir, 'code')
        self.hparams_file = join(self.dir, 'hparams.yaml')
        self.metrics = []

        # create dirs
        os.makedirs(self.dir, exist_ok=True)
        os.makedirs(self.ckpt_dir, exist_ok=True)
        os.makedirs(self.code_dir, exist_ok=True)
        copy_tree(os.path.abspath("."), self.code_dir)
        logger.info(f"experiment folder: {self.dir}")

        # create writers
        # tensorboard
        self.tb_writer = SummaryWriter(self.dir)

        # comet_ml
        self.comet_exp = None
        if EXTERNAL_LOGGING_AVAILABLE and use_comet:
            self.comet_exp = CometExperiment(api_key="XXX",
                                             project_name=self.project_name,
                                             workspace="YYY")
            self.comet_exp.set_name(self.exp_name)
            self.comet_exp.log_parameter("exp_name", self.exp_name)

        # wandb
        self.wandb_exp = False
        if EXTERNAL_LOGGING_AVAILABLE and use_wandb:
            self.wandb_exp = True
            wandb.init(name=self.exp_name,
                       project=self.project_name,
                       dir=self.dir)

        atexit.register(self.save)
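
A usage sketch for Example #5; the class name `Logger` is an assumption. Note that `project_name` and `exp_name` are derived from the last two components of `root_dir`:

# Writes to runs/my-project/run-001 (or a fresh sibling path if it exists),
# with project_name="my-project" and exp_name="run-001".
exp_logger = Logger("runs/my-project/run-001", use_comet=False, use_wandb=False)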
Example #6
    def experiment(self) -> CometBaseExperiment:
        r"""

        Actual comet object. To use comet features do the following.

        Example::

            self.logger.experiment.some_comet_function()

        """
        if self._experiment is not None:
            return self._experiment

        if self.mode == "online":
            if self.experiment_key is None:
                self._experiment = CometExperiment(
                    api_key=self.api_key,
                    workspace=self.workspace,
                    project_name=self.project_name,
                    **self._kwargs
                )
                self.experiment_key = self._experiment.get_key()
            else:
                self._experiment = CometExistingExperiment(
                    api_key=self.api_key,
                    workspace=self.workspace,
                    project_name=self.project_name,
                    previous_experiment=self.experiment_key,
                    **self._kwargs
                )
        else:
            self._experiment = CometOfflineExperiment(
                offline_directory=self.save_dir,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs
            )

        return self._experiment
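
The `experiment_key` branch above is what enables resuming: the key of a fresh experiment is stored, and a later call re-attaches with `CometExistingExperiment` instead of creating a new run. A standalone sketch of that round trip using the comet_ml names directly:

from comet_ml import Experiment, ExistingExperiment

# First run: create a fresh experiment and remember its key.
exp = Experiment(api_key="<YOUR_COMET_API_KEY>", project_name="my-project")
key = exp.get_key()
exp.end()

# Later run: re-attach to the same experiment instead of creating a new one.
resumed = ExistingExperiment(api_key="<YOUR_COMET_API_KEY>",
                             previous_experiment=key)
resumed.log_metric("resumed_metric", 1.0)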
Example #7
    def experiment(self) -> CometBaseExperiment:
        r"""
        Actual Comet object. To use Comet features in your
        :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.
        Example::
            self.logger.experiment.some_comet_function()
        """
        if self._experiment is not None:
            return self._experiment

        if self.mode == "online":
            if self.experiment_key is None:
                self._experiment = CometExperiment(
                    api_key=self.api_key,
                    workspace=self.workspace,
                    project_name=self.project_name,
                    **self._kwargs)
                self.experiment_key = self._experiment.get_key()
            else:
                self._experiment = CometExistingExperiment(
                    api_key=self.api_key,
                    workspace=self.workspace,
                    project_name=self.project_name,
                    previous_experiment=self.experiment_key,
                    **self._kwargs)
        else:
            save_dir = Path(self.save_dir) / self.name / self.version
            save_dir.mkdir(exist_ok=True, parents=True)
            self._experiment = CometOfflineExperiment(
                offline_directory=save_dir,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)

        if self.experiment_name is not None:
            self._experiment.set_name(self.experiment_name)
        return self._experiment
Example #8
    def _init_comet(self):
        """
        For more information on comet, see our doc/Getting Started
        """
        try:
            if self.comet_key:
                self.comet_exp = ExistingExperiment(
                    previous_experiment=self.comet_key)
            elif self.comet_workspace:
                # New experiment
                # Use trainset name as comet project name
                project_name = self.comet_project
                self.comet_exp = CometExperiment(
                    project_name=project_name,
                    workspace=self.comet_workspace,
                    log_code=False,
                    log_graph=True,
                    auto_param_logging=True,
                    auto_metric_logging=False,
                    parse_args=False,
                    auto_output_logging='native',
                    log_env_details=True,
                    log_env_gpu=True,
                    log_env_cpu=True,
                    log_env_host=False,
                    log_git_metadata=True,
                    log_git_patch=True,
                    display_summary=False)
                self.comet_exp.set_name(self.experiment_name)
                self.comet_exp.log_parameters(self.params)
                self.comet_key = self.comet_exp.get_key()
        except ConnectionError:
            self.logger.warning(
                "Could not connect to Comet.ml, metrics will not be logged "
                "online...")
            self.comet_exp = None
            self.comet_key = None
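
Since `_init_comet` leaves `comet_exp` as `None` when the connection fails, callers should guard on it before logging; a small sketch (the function name is illustrative):

def safe_log_metrics(comet_exp, metrics, step=None):
    # No-op when Comet could not be reached, so training is unaffected.
    if comet_exp is not None:
        comet_exp.log_metrics(metrics, step=step)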
Example #9
    def __init__(self, *args, **kwargs):
        super(CometLogger, self).__init__()
        self.experiment = CometExperiment(*args, **kwargs)
Example #10
best_loss = 0  # best test loss

if not args.no_log_to_comet:
    if params['local_comet_dir']:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="supercyclecons",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=params['local_comet_dir'])
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="supercyclecons",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(params['name'])


def partial_load(pretrained_dict, model):
    model_dict = model.state_dict()

    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the updated state dict back into the model
    model.load_state_dict(model_dict)
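
A usage sketch for `partial_load`, assuming `model` is an existing `torch.nn.Module` and the checkpoint was saved as a dict with a `state_dict` entry (note that the call inside `BSN_Train_TEM` below passes a checkpoint path instead, implying a variant that loads the file itself):

import torch

checkpoint = torch.load("pretrained.pth", map_location="cpu")
# Copies over only the weights whose names exist in the target model.
partial_load(checkpoint["state_dict"], model)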
Example #11
def BSN_Train_PEM(opt):
    model = PEM(opt)
    model = torch.nn.DataParallel(model).cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["pem_training_lr"],
                           weight_decay=opt["pem_weight_decay"])

    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    def collate_fn(batch):
        batch_data = torch.cat([x[0] for x in batch])
        batch_iou = torch.cat([x[1] for x in batch])
        return batch_data, batch_iou

    train_dataset = ProposalDataSet(opt, subset="train")
    train_sampler = ProposalSampler(train_dataset.proposals,
                                    train_dataset.indices,
                                    max_zero_weight=opt['pem_max_zero_weight'])

    global_step = 0
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=model.module.batch_size,
        shuffle=False,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    subset = "validation" if opt['dataset'] == 'activitynet' else "test"
    test_loader = torch.utils.data.DataLoader(
        ProposalDataSet(opt, subset=subset),
        batch_size=model.module.batch_size,
        shuffle=True,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    milestones = [int(k) for k in opt['pem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['pem_step_gamma'])

    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsnpem",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="bsnpem",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    test_PEM(test_loader, model, -1, -1, comet_exp, opt)
    for epoch in range(opt["pem_epoch"]):
        global_step = train_PEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_PEM(test_loader, model, epoch, global_step, comet_exp, opt)
        scheduler.step()
Example #12
def BSN_Train_TEM(opt):
    global_step = 0
    epoch = 0
    if opt['do_representation']:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))
        if opt['representation_checkpoint']:
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
            if opt['do_random_model']:
                print('DOING RANDOM MODEL!!!')
            else:
                print('DOING Pretrained model!!!')
                partial_load(opt['representation_checkpoint'], model)
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
        if not opt['no_freeze']:
            for param in model.representation_model.parameters():
                param.requires_grad = False
        print(len([p for p in model.representation_model.parameters()]))
    else:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))

    model = torch.nn.DataParallel(model).cuda()
    # summary(model, (2, 3, 224, 224))

    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if opt['dataset'] == 'gymnastics':
        # default image_dir is '/checkpoint/cinjon/spaceofmotion/sep052019/rawframes.426x240.12'
        img_loading_func = get_img_loader(opt)
        train_data_set = GymnasticsImages(opt,
                                          subset='Train',
                                          img_loading_func=img_loading_func,
                                          image_dir=opt['gym_image_dir'],
                                          video_info_path=os.path.join(
                                              opt['video_info'],
                                              'Train_Annotation.csv'))
        train_sampler = GymnasticsSampler(train_data_set, opt['sampler_mode'])
        test_data_set = GymnasticsImages(opt,
                                         subset="Val",
                                         img_loading_func=img_loading_func,
                                         image_dir=opt['gym_image_dir'],
                                         video_info_path=os.path.join(
                                             opt['video_info'],
                                             'Val_Annotation.csv'))
    elif opt['dataset'] == 'gymnasticsfeatures':
        # feature_dirs should roughly look like:
        # /checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/rgb,/checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/flow
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = GymnasticsFeatures(opt,
                                            subset='Train',
                                            feature_dirs=feature_dirs,
                                            video_info_path=os.path.join(
                                                opt['video_info'],
                                                'Train_Annotation.csv'))
        test_data_set = GymnasticsFeatures(opt,
                                           subset='Val',
                                           feature_dirs=feature_dirs,
                                           video_info_path=os.path.join(
                                               opt['video_info'],
                                               'Val_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'thumosfeatures':
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = ThumosFeatures(opt,
                                        subset='Val',
                                        feature_dirs=feature_dirs)
        test_data_set = ThumosFeatures(opt,
                                       subset="Test",
                                       feature_dirs=feature_dirs)
        train_sampler = None
    elif opt['dataset'] == 'thumosimages':
        img_loading_func = get_img_loader(opt)
        train_data_set = ThumosImages(
            opt,
            subset='Val',
            img_loading_func=img_loading_func,
            image_dir=
            '/checkpoint/cinjon/thumos/rawframes.TH14_validation_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Val_Annotation.csv'))
        test_data_set = ThumosImages(
            opt,
            subset='Test',
            img_loading_func=img_loading_func,
            image_dir='/checkpoint/cinjon/thumos/rawframes.TH14_test_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Test_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'activitynet':
        train_sampler = None
        representation_module = opt['representation_module']
        train_transforms = get_video_transforms(representation_module,
                                                opt['do_augment'])
        test_transforms = get_video_transforms(representation_module, False)
        train_data_set = VideoDataset(opt,
                                      train_transforms,
                                      subset='train',
                                      fraction=0.3)
        # We use val because we don't have annotations for test.
        test_data_set = VideoDataset(opt,
                                     test_transforms,
                                     subset='val',
                                     fraction=0.3)

    print('train_loader / val_loader sizes: ', len(train_data_set),
          len(test_data_set))
    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=model.module.batch_size,
        shuffle=False if train_sampler else True,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)

    test_loader = torch.utils.data.DataLoader(
        test_data_set,
        batch_size=model.module.batch_size,
        shuffle=False,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)
    # test_loader = None

    milestones = [int(k) for k in opt['tem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['tem_step_gamma'])

    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsn",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="bsn",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    # test_TEM(test_loader, model, optimizer, 0, 0, comet_exp, opt)
    for epoch in range(epoch + 1, opt["tem_epoch"] + 1):
        global_step = train_TEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_TEM(test_loader, model, optimizer, epoch, global_step, comet_exp,
                 opt)
        if opt['dataset'] == 'activitynet':
            test_loader.dataset._subset_dataset(.3)
            train_loader.dataset._subset_dataset(.3)
        scheduler.step()
Example #13
def main(args):
    print('Pretrain? ', not args.not_pretrain)
    print(args.model)
    start_time = time.time()

    if args.local_comet_dir:
        comet_exp = OfflineExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                      project_name="selfcifar",
                                      workspace="cinjon",
                                      auto_metric_logging=True,
                                      auto_output_logging=None,
                                      auto_param_logging=False,
                                      offline_directory=args.local_comet_dir)
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="selfcifar",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(args.name)

    # Build model
    # path = "/misc/kcgscratch1/ChoGroup/resnick/spaceofmotion/zeping/bsn"
    linear_cls = NonLinearModel if args.do_nonlinear else LinearModel

    if args.model == "amdim":
        hparams = load_hparams_from_tags_csv(
            '/checkpoint/cinjon/amdim/meta_tags.csv')
        # hparams = load_hparams_from_tags_csv(os.path.join(path, "meta_tags.csv"))
        model = AMDIMModel(hparams)
        if not args.not_pretrain:
            # _path = os.path.join(path, "_ckpt_epoch_434.ckpt")
            _path = '/checkpoint/cinjon/amdim/_ckpt_epoch_434.ckpt'
            model.load_state_dict(torch.load(_path)["state_dict"])
        else:
            print("AMDIM not loading checkpoint")  # Debug
        linear_model = linear_cls(AMDIM_OUTPUT_DIM, args.num_classes)
    elif args.model == "ccc":
        model = CCCModel(None)
        if not args.not_pretrain:
            # _path = os.path.join(path, "TimeCycleCkpt14.pth")
            _path = '/checkpoint/cinjon/spaceofmotion/bsn/TimeCycleCkpt14.pth'
            checkpoint = torch.load(_path)
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CCC not loading checkpoint")  # Debug
        linear_model = linear_cls(CCC_OUTPUT_DIM,
                                  args.num_classes)  #.to(device)
    elif args.model == "corrflow":
        model = CORRFLOWModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/supercons/corrflow.kineticsmodel.pth'
            # _path = os.path.join(path, "corrflow.kineticsmodel.pth")
            checkpoint = torch.load(_path)
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CorrFlow not loading checkpoing")  # Debug
        linear_model = linear_cls(CORRFLOW_OUTPUT_DIM, args.num_classes)
    elif args.model == "resnet":
        if not args.not_pretrain:
            resnet = torchvision.models.resnet50(pretrained=True)
        else:
            resnet = torchvision.models.resnet50(pretrained=False)
            print("ResNet not loading checkpoint")  # Debug
        modules = list(resnet.children())[:-1]
        model = nn.Sequential(*modules)
        linear_model = linear_cls(RESNET_OUTPUT_DIM, args.num_classes)
    else:
        raise Exception("model type has to be amdim, ccc, corrflow or resnet")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)
        linear_model = nn.DataParallel(linear_model).to(device)
    else:
        model = model.to(device)
        linear_model = linear_model.to(device)
    # model = model.to(device)
    # linear_model = linear_model.to(device)

    # Freeze model
    for p in model.parameters():
        p.requires_grad = False
    model.eval()

    if args.optimizer == "Adam":
        optimizer = optim.Adam(linear_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        print("Optimizer: Adam with weight decay: {}".format(
            args.weight_decay))
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(linear_model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
        print("Optimizer: SGD with weight decay: {} momentum: {}".format(
            args.weight_decay, args.momentum))
    else:
        raise Exception("optimizer should be Adam or SGD")
    optimizer.zero_grad()

    # Set up log dir
    now = datetime.datetime.now()
    log_dir = '/checkpoint/cinjon/spaceofmotion/bsn/cifar-%d-weights/%s/%s' % (
        args.num_classes, args.model, args.name)
    # log_dir = "{}{:%Y%m%dT%H%M}".format(args.model, now)
    # log_dir = os.path.join("weights", log_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    print("Saving to {}".format(log_dir))

    batch_size = args.batch_size * torch.cuda.device_count()
    # CIFAR-10
    if args.num_classes == 10:
        data_path = ("/private/home/cinjon/cifar-data/cifar-10-batches-py")
        _train_dataset = CIFAR_dataset(glob(os.path.join(data_path, "data*")),
                                       args.num_classes, args.model, True)
        # _train_acc_dataset = CIFAR_dataset(
        #     glob(os.path.join(data_path, "data*")),
        #     args.num_classes,
        #     args.model,
        #     False)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size,
                                           num_workers=args.num_workers)
        # train_split = int(len(_train_dataset) * 0.8)
        # train_dev_split = int(len(_train_dataset) - train_split)
        # train_dataset, train_dev_dataset = data.random_split(
        #     _train_dataset, [train_split, train_dev_split])
        # train_acc_dataloader = data.DataLoader(
        #     train_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dev_acc_dataloader = data.DataLoader(
        #     train_dev_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dataset = data.Subset(_train_dataset, list(range(train_split)))
        # train_dataloader = data.DataLoader(
        #     train_dataset, shuffle=True, batch_size=batch_size, num_workers=args.num_workers)
        # train_acc_dataset = data.Subset(
        #     _train_acc_dataset, list(range(train_split)))
        # train_acc_dataloader = data.DataLoader(
        #     train_acc_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dev_acc_dataset = data.Subset(
        #     _train_acc_dataset, list(range(train_split, len(_train_acc_dataset))))
        # train_dev_acc_dataloader = data.DataLoader(
        #     train_dev_acc_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)

        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test_batch")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size,
                                         num_workers=args.num_workers)
        # val_split = int(len(_val_dataset) * 0.8)
        # val_dev_split = int(len(_val_dataset) - val_split)
        # val_dataset, val_dev_dataset = data.random_split(
        #     _val_dataset, [val_split, val_dev_split])
        # val_dataloader = data.DataLoader(
        #     val_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # val_dev_dataloader = data.DataLoader(
        #     val_dev_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
    # CIFAR-100
    elif args.num_classes == 100:
        data_path = ("/private/home/cinjon/cifar-data/cifar-100-python")
        _train_dataset = CIFAR_dataset([os.path.join(data_path, "train")],
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size)
    else:
        raise Exception("num_classes should be 10 or 100")

    best_acc = 0.0
    best_epoch = 0

    # Training
    for epoch in range(1, args.epochs + 1):
        current_lr = max(3e-4, args.lr *\
            math.pow(0.5, math.floor(epoch / args.lr_interval)))
        linear_model.train()
        if args.optimizer == "Adam":
            optimizer = optim.Adam(linear_model.parameters(),
                                   lr=current_lr,
                                   weight_decay=args.weight_decay)
        elif args.optimizer == "SGD":
            optimizer = optim.SGD(
                linear_model.parameters(),
                lr=current_lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
            )

        ####################################################
        # Train
        t = time.time()
        train_acc = 0
        train_loss_sum = 0.0
        for iter, input in enumerate(train_dataloader):
            if time.time(
            ) - start_time > args.time * 3600 - 300 and comet_exp is not None:
                comet_exp.end()
                sys.exit(-1)

            imgs = input[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = input[1].flatten().to(device)

            # output = model(imgs)
            # output = linear_model(output)
            output = linear_model(model(imgs))
            loss = F.cross_entropy(output, lbls)
            train_loss_sum += float(loss.data)
            train_acc += int(sum(torch.argmax(output, dim=1) == lbls))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # log_text = "train epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter"
            if iter % 1500 == 0:
                log_text = "train epoch {}/{}\titer {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, iter + 1,
                                      len(train_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
                t = time.time()

        train_acc /= len(_train_dataset)
        train_loss_sum /= len(train_dataloader)
        with comet_exp.train():
            comet_exp.log_metrics({
                'acc': train_acc,
                'loss': train_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)
        print("train acc epoch {}/{} loss:{} train_acc:{}".format(
            epoch, args.epochs, train_loss_sum, train_acc),
              flush=True)

        #######################################################################
        # Train acc
        # linear_model.eval()
        # train_acc = 0
        # train_loss_sum = 0.0
        # for iter, input in enumerate(train_acc_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     # output = model(imgs)
        #     # output = linear_model(output)
        #     output = linear_model(model(imgs))
        #     loss = F.cross_entropy(output, lbls)
        #     train_loss_sum += float(loss.data)
        #     train_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("train acc epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(train_acc_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        #
        # train_acc /= len(train_acc_dataset)
        # train_loss_sum /= len(train_acc_dataloader)
        # print("train acc epoch {}/{} loss:{} train_acc:{}".format(
        #     epoch, args.epochs, train_loss_sum, train_acc), flush=True)

        #######################################################################
        # Train dev acc
        # # linear_model.eval()
        # train_dev_acc = 0
        # train_dev_loss_sum = 0.0
        # for iter, input in enumerate(train_dev_acc_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     output = model(imgs)
        #     output = linear_model(output)
        #     # output = linear_model(model(imgs))
        #     loss = F.cross_entropy(output, lbls)
        #     train_dev_loss_sum += float(loss.data)
        #     train_dev_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("train dev acc epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(train_dev_acc_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        # train_dev_acc /= len(train_dev_acc_dataset)
        # train_dev_loss_sum /= len(train_dev_acc_dataloader)
        # print("train dev epoch {}/{} loss:{} train_dev_acc:{}".format(
        #     epoch, args.epochs, train_dev_loss_sum, train_dev_acc), flush=True)

        #######################################################################
        # Val dev
        # # linear_model.eval()
        # val_dev_acc = 0
        # val_dev_loss_sum = 0.0
        # for iter, input in enumerate(val_dev_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     output = model(imgs)
        #     output = linear_model(output)
        #     loss = F.cross_entropy(output, lbls)
        #     val_dev_loss_sum += float(loss.data)
        #     val_dev_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("val dev epoch {}/{} iter {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(val_dev_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        # val_dev_acc /= len(val_dev_dataset)
        # val_dev_loss_sum /= len(val_dev_dataloader)
        # print("val dev epoch {}/{} loss:{} val_dev_acc:{}".format(
        #     epoch, args.epochs, val_dev_loss_sum, val_dev_acc), flush=True)

        #######################################################################
        # Val
        linear_model.eval()
        val_acc = 0
        val_loss_sum = 0.0
        for iter, input in enumerate(val_dataloader):
            if time.time(
            ) - start_time > args.time * 3600 - 300 and comet_exp is not None:
                comet_exp.end()
                sys.exit(-1)

            imgs = input[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = input[1].flatten().to(device)

            output = model(imgs)
            output = linear_model(output)
            loss = F.cross_entropy(output, lbls)
            val_loss_sum += float(loss.data)
            val_acc += int(sum(torch.argmax(output, dim=1) == lbls))

            # log_text = "val epoch {}/{} iter {}/{} loss:{} {:.3f}s/iter"
            if iter % 1500 == 0:
                log_text = "val epoch {}/{} iter {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, iter + 1,
                                      len(val_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
                t = time.time()

        val_acc /= len(_val_dataset)
        val_loss_sum /= len(val_dataloader)
        print("val epoch {}/{} loss:{} val_acc:{}".format(
            epoch, args.epochs, val_loss_sum, val_acc))
        with comet_exp.test():
            comet_exp.log_metrics({
                'acc': val_acc,
                'loss': val_loss_sum
            },
                                  step=(epoch + 1) * len(train_dataloader),
                                  epoch=epoch + 1)

        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch
            linear_save_path = os.path.join(log_dir,
                                            "{}.linear.pth".format(epoch))
            model_save_path = os.path.join(log_dir,
                                           "{}.model.pth".format(epoch))
            torch.save(linear_model.state_dict(), linear_save_path)
            torch.save(model.state_dict(), model_save_path)

        # Check bias and variance
        print(
            "Epoch {} lr {} total: train_loss:{} train_acc:{} val_loss:{} val_acc:{}"
            .format(epoch, current_lr, train_loss_sum, train_acc, val_loss_sum,
                    val_acc),
            flush=True)
        # print("Epoch {} lr {} total: train_acc:{} train_dev_acc:{} val_dev_acc:{} val_acc:{}".format(
        #     epoch, current_lr, train_acc, train_dev_acc, val_dev_acc, val_acc), flush=True)

    print("The best epoch: {} acc: {}".format(best_epoch, best_acc))
Example #14
from polyaxon_client.tracking import Experiment

import logging
import json

"""
Initialize Parser and define arguments
"""
parser, metadata = get_parser_with_args()
opt = parser.parse_args()

"""
Initialize experiments for polyaxon and comet.ml
"""
comet = CometExperiment('QQFXdJ5M7GZRGri7CWxwGxPDN',
                        project_name=opt.project_name,
                        auto_param_logging=False,
                        parse_args=False, disabled=False)
comet.log_other('status', 'started')
experiment = Experiment()
logging.basicConfig(level=logging.INFO)
comet.log_parameters(vars(opt))


"""
Set up environment: define paths, download data, and set device
"""
dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
logging.info('GPU AVAILABLE? ' + str(torch.cuda.is_available()))
download_dataset(opt.dataset_name, comet)
train_loader, val_loader = get_loaders(opt)