def __init__(self,
             logger,
             project_name: Optional[str] = None,
             experiment_name: Optional[str] = None,
             api_key: Optional[str] = None,
             log_dir: Optional[str] = None,
             offline: bool = False,
             **kwargs):
    if not _COMET_AVAILABLE:
        raise ImportError(
            "You want to use `comet_ml` logger which is not installed yet,"
            " install it with `pip install comet-ml`.")
    self.project_name = project_name
    self.experiment_name = experiment_name
    self.kwargs = kwargs
    self.timer = Timer()

    if (api_key is not None) and (log_dir is not None):
        self.mode = "offline" if offline else "online"
        self.api_key = api_key
        self.log_dir = log_dir
    elif api_key is not None:
        self.mode = "online"
        self.api_key = api_key
        self.log_dir = None
    elif log_dir is not None:
        self.mode = "offline"
        self.log_dir = log_dir
    else:
        # Without raising here, `self.mode` would be unbound below and the
        # constructor would fail with an AttributeError instead of a clear
        # message.
        logger.warning(
            "CometLogger requires either api_key or log_dir during initialization.")
        raise ValueError(
            "CometLogger requires either `api_key` or `log_dir`.")

    if self.mode == "online":
        self.experiment = CometExperiment(
            api_key=self.api_key,
            project_name=self.project_name,
            **self.kwargs,
        )
    else:
        self.experiment = CometOfflineExperiment(
            offline_directory=self.log_dir,
            project_name=self.project_name,
            **self.kwargs,
        )

    if self.experiment_name:
        self.experiment.set_name(self.experiment_name)
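# A minimal usage sketch for the constructor above, assuming the enclosing
# class is named `CometLogger` and takes a stdlib logger as its first
# argument; the api_key is a placeholder.
import logging

_log = logging.getLogger(__name__)

# Online mode: an api_key alone routes runs to the Comet servers.
online = CometLogger(_log, project_name="my-project", api_key="<COMET_API_KEY>")

# Offline mode: a log_dir alone buffers runs to disk; comet_ml ships a
# `comet upload` command for uploading the resulting archives later.
offline = CometLogger(_log, project_name="my-project", log_dir="./comet-logs")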
@property
def experiment(
    self
) -> Union[CometExperiment, CometExistingExperiment, CometOfflineExperiment]:
    r"""
    Actual Comet object. To use Comet features in your
    :class:`~pytorch_lightning.core.module.LightningModule` do the following.

    Example::

        self.logger.experiment.some_comet_function()

    """
    if self._experiment is not None:
        return self._experiment

    if self._future_experiment_key is not None:
        os.environ["COMET_EXPERIMENT_KEY"] = self._future_experiment_key

    try:
        if self.mode == "online":
            if self._experiment_key is None:
                self._experiment = CometExperiment(
                    api_key=self.api_key,
                    project_name=self._project_name,
                    **self._kwargs)
                self._experiment_key = self._experiment.get_key()
            else:
                self._experiment = CometExistingExperiment(
                    api_key=self.api_key,
                    project_name=self._project_name,
                    previous_experiment=self._experiment_key,
                    **self._kwargs,
                )
        else:
            self._experiment = CometOfflineExperiment(
                offline_directory=self.save_dir,
                project_name=self._project_name,
                **self._kwargs)
    finally:
        if self._future_experiment_key is not None:
            os.environ.pop("COMET_EXPERIMENT_KEY")
            self._future_experiment_key = None

    if self._experiment_name:
        self._experiment.set_name(self._experiment_name)

    return self._experiment
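# Hedged usage sketch for the lazy property above: inside a LightningModule,
# the first access builds the Comet experiment (new, resumed, or offline) and
# later accesses return the cached object. `log_other` is comet_ml public
# API; the module body is illustrative only.
import pytorch_lightning as pl

class LitModel(pl.LightningModule):
    def on_train_start(self):
        # First access creates the experiment; subsequent calls reuse it.
        self.logger.experiment.log_other("note", "training started")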
@property
def experiment(self):
    if self._experiment is not None:
        return self._experiment

    if self.mode == "online":
        self._experiment = CometExperiment(
            api_key=self.api_key,
            workspace=self.workspace,
            project_name=self.project_name,
            **self._kwargs)
    else:
        self._experiment = CometOfflineExperiment(
            offline_directory=self.save_dir,
            workspace=self.workspace,
            project_name=self.project_name,
            **self._kwargs)

    return self._experiment
def __init__(self,
             api_key,
             workspace,
             rest_api_key=None,
             project_name=None,
             experiment_name=None,
             *args,
             **kwargs):
    """
    Initialize a Comet.ml logger.

    :param api_key: API key, found on Comet.ml
    :param workspace: Name of workspace for this user
    :param project_name: Optional. Send your experiment to a specific
        project. Otherwise it will be sent to Uncategorized Experiments.
        If the project name does not already exist, Comet.ml will create
        a new project.
    :param rest_api_key: Optional. Rest API key found in Comet.ml settings.
        This is used to determine the version number.
    :param experiment_name: Optional. String representing the name for
        this particular experiment on Comet.ml.
    """
    super(CometLogger, self).__init__()
    self.experiment = CometExperiment(api_key=api_key,
                                      workspace=workspace,
                                      project_name=project_name,
                                      *args,
                                      **kwargs)
    self.workspace = workspace
    self.project_name = project_name

    if rest_api_key is not None:
        # Comet.ml REST API, used to determine the version number.
        self.rest_api_key = rest_api_key
        self.comet_api = API(self.rest_api_key)
    else:
        self.rest_api_key = None
        self.comet_api = None

    if experiment_name:
        try:
            self._set_experiment_name(experiment_name)
        except TypeError:
            print("Failed to set experiment name for comet.ml logger")
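# Hedged usage sketch for the logger above (workspace/project values are
# placeholders). The wrapped experiment is exposed directly as an attribute,
# so comet_ml's own logging methods can be called on it.
comet_logger = CometLogger(api_key="<COMET_API_KEY>",
                           workspace="<your-workspace>",
                           project_name="my-project",
                           experiment_name="baseline-run")
comet_logger.experiment.log_metric("loss", 0.42)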
def __init__(self, root_dir, use_comet=False, use_wandb=False):
    self.start_time = time.time()

    # define dirs
    self.dir = get_nonexistant_path(root_dir)
    self.project_name = self.dir.split('/')[-2]
    self.exp_name = self.dir.split('/')[-1]
    self.ckpt_dir = join(self.dir, 'ckpt')
    self.code_dir = join(self.dir, 'code')
    self.hparams_file = join(self.dir, 'hparams.yaml')
    self.metrics = []

    # create dirs
    os.makedirs(self.dir, exist_ok=True)
    os.makedirs(self.ckpt_dir, exist_ok=True)
    os.makedirs(self.code_dir, exist_ok=True)
    copy_tree(os.path.abspath("."), self.code_dir)
    logger.info(f"experiment folder: {self.dir}")

    # create writers
    # tensorboard
    self.tb_writer = SummaryWriter(self.dir)

    # comet_ml
    self.comet_exp = None
    if EXTERNAL_LOGGING_AVAILABLE and use_comet:
        self.comet_exp = CometExperiment(api_key="XXX",
                                         project_name=self.project_name,
                                         workspace="YYY")
        self.comet_exp.set_name(self.exp_name)
        self.comet_exp.log_parameter("exp_name", self.exp_name)

    # wandb
    self.wandb_exp = False
    if EXTERNAL_LOGGING_AVAILABLE and use_wandb:
        self.wandb_exp = True
        wandb.init(name=self.exp_name, project=self.project_name, dir=self.dir)

    atexit.register(self.save)
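# Hedged usage sketch: with a root_dir like 'runs/my-project/exp1', the
# constructor above derives project_name='my-project' and exp_name='exp1'
# from the last two path components. The class name `Logger` is an
# assumption about the enclosing class.
log = Logger('runs/my-project/exp1', use_comet=False, use_wandb=False)
# -> creates runs/my-project/exp1/{ckpt,code}, copies the working tree into
#    code/, and opens a TensorBoard writer on the experiment folder.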
@property
def experiment(self) -> CometBaseExperiment:
    r"""
    Actual comet object. To use comet features do the following.

    Example::

        self.logger.experiment.some_comet_function()

    """
    if self._experiment is not None:
        return self._experiment

    if self.mode == "online":
        if self.experiment_key is None:
            self._experiment = CometExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)
            self.experiment_key = self._experiment.get_key()
        else:
            self._experiment = CometExistingExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                previous_experiment=self.experiment_key,
                **self._kwargs)
    else:
        self._experiment = CometOfflineExperiment(
            offline_directory=self.save_dir,
            workspace=self.workspace,
            project_name=self.project_name,
            **self._kwargs)

    return self._experiment
@property
def experiment(self) -> CometBaseExperiment:
    r"""
    Actual Comet object. To use Comet features in your
    :class:`~pytorch_lightning.core.lightning.LightningModule` do the following.

    Example::

        self.logger.experiment.some_comet_function()

    """
    if self._experiment is not None:
        return self._experiment

    if self.mode == "online":
        if self.experiment_key is None:
            self._experiment = CometExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                **self._kwargs)
            self.experiment_key = self._experiment.get_key()
        else:
            self._experiment = CometExistingExperiment(
                api_key=self.api_key,
                workspace=self.workspace,
                project_name=self.project_name,
                previous_experiment=self.experiment_key,
                **self._kwargs)
    else:
        save_dir = Path(self.save_dir) / self.name / self.version
        save_dir.mkdir(exist_ok=True, parents=True)
        self._experiment = CometOfflineExperiment(
            offline_directory=save_dir,
            workspace=self.workspace,
            project_name=self.project_name,
            **self._kwargs)

    if self.experiment_name is not None:
        self._experiment.set_name(self.experiment_name)

    return self._experiment
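# Hedged illustration of the offline branch above: runs are nested under
# save_dir/name/version, so two versions of the same run never collide on
# disk (the values below are examples only).
from pathlib import Path

offline_dir = Path("comet-logs") / "default" / "version_0"
offline_dir.mkdir(exist_ok=True, parents=True)  # mirrors the property's mkdir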
def _init_comet(self):
    """ For more information on comet, see our doc/Getting Started """
    try:
        if self.comet_key:
            self.comet_exp = ExistingExperiment(
                previous_experiment=self.comet_key)
        elif self.comet_workspace:
            # New experiment
            # Use trainset name as comet project name
            project_name = self.comet_project
            self.comet_exp = CometExperiment(
                project_name=project_name,
                workspace=self.comet_workspace,
                log_code=False,
                log_graph=True,
                auto_param_logging=True,
                auto_metric_logging=False,
                parse_args=False,
                auto_output_logging='native',
                log_env_details=True,
                log_env_gpu=True,
                log_env_cpu=True,
                log_env_host=False,
                log_git_metadata=True,
                log_git_patch=True,
                display_summary=False)
            self.comet_exp.set_name(self.experiment_name)
            self.comet_exp.log_parameters(self.params)
            self.comet_key = self.comet_exp.get_key()
    except ConnectionError:
        self.logger.warning(
            "Could not connect to Comet.ml, metrics will not be logged "
            "online...")
        self.comet_exp = None
        self.comet_key = None
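# Hedged resume sketch for `_init_comet` above: persisting `comet_key`
# (returned by `get_key()`) lets a later process reattach to the same Comet
# experiment through ExistingExperiment. `trainer` is a hypothetical object
# exposing the attributes `_init_comet` uses.
trainer.comet_key = None
trainer._init_comet()          # creates a new experiment, fills comet_key
saved_key = trainer.comet_key  # e.g. store this alongside a checkpoint
# ... later, in a new process ...
trainer.comet_key = saved_key
trainer._init_comet()          # resumes logging to the same experiment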
def __init__(self, *args, **kwargs):
    super(CometLogger, self).__init__()
    self.experiment = CometExperiment(*args, **kwargs)
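# Hedged usage sketch: this thin wrapper forwards everything straight to
# comet_ml.Experiment, so it accepts the same keyword arguments
# (placeholder values below).
logger = CometLogger(api_key="<COMET_API_KEY>",
                     workspace="<your-workspace>",
                     project_name="my-project")
logger.experiment.log_parameter("lr", 1e-3)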
best_loss = 0  # best test loss

if not args.no_log_to_comet:
    if params['local_comet_dir']:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="supercyclecons",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=params['local_comet_dir'])
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="supercyclecons",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(params['name'])


def partial_load(pretrained_dict, model):
    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {
        k: v for k, v in pretrained_dict.items() if k in model_dict
    }
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the updated state dict back into the model
    model.load_state_dict(model_dict)
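# Hedged usage sketch for `partial_load`: load a checkpoint and copy over
# only the weights whose keys exist in the current model. The path, the
# 'state_dict' key, and `model` being an instantiated nn.Module are
# assumptions about the checkpoint format and calling context.
checkpoint = torch.load('pretrained.pth', map_location='cpu')
partial_load(checkpoint['state_dict'], model)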
def BSN_Train_PEM(opt):
    model = PEM(opt)
    model = torch.nn.DataParallel(model).cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["pem_training_lr"],
                           weight_decay=opt["pem_weight_decay"])
    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    def collate_fn(batch):
        batch_data = torch.cat([x[0] for x in batch])
        batch_iou = torch.cat([x[1] for x in batch])
        return batch_data, batch_iou

    train_dataset = ProposalDataSet(opt, subset="train")
    train_sampler = ProposalSampler(train_dataset.proposals,
                                    train_dataset.indices,
                                    max_zero_weight=opt['pem_max_zero_weight'])

    global_step = 0
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=model.module.batch_size,
        shuffle=False,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    subset = "validation" if opt['dataset'] == 'activitynet' else "test"
    test_loader = torch.utils.data.DataLoader(
        ProposalDataSet(opt, subset=subset),
        batch_size=model.module.batch_size,
        shuffle=True,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False,
        collate_fn=collate_fn if not opt['pem_do_index'] else None)

    milestones = [int(k) for k in opt['pem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['pem_step_gamma'])

    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsnpem",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="bsnpem",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    test_PEM(test_loader, model, -1, -1, comet_exp, opt)
    for epoch in range(opt["pem_epoch"]):
        global_step = train_PEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_PEM(test_loader, model, epoch, global_step, comet_exp, opt)
        scheduler.step()
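# The online/offline selection above recurs in several training entry
# points. A minimal sketch that factors it out; the helper name
# `make_comet_experiment` is hypothetical, not from the original code.
def make_comet_experiment(opt, project_name):
    common = dict(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                  project_name=project_name,
                  workspace="cinjon",
                  auto_metric_logging=True,
                  auto_output_logging=None,
                  auto_param_logging=False)
    if opt['log_to_comet']:
        exp = CometExperiment(**common)
    elif opt['local_comet_dir']:
        exp = OfflineExperiment(offline_directory=opt['local_comet_dir'],
                                **common)
    else:
        return None
    exp.log_parameters(opt)
    exp.set_name(opt['name'])
    return exp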
def BSN_Train_TEM(opt):
    global_step = 0
    epoch = 0
    if opt['do_representation']:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))
        if opt['representation_checkpoint']:
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
            if opt['do_random_model']:
                print('DOING RANDOM MODEL!!!')
            else:
                print('DOING PRETRAINED MODEL!!!')
                partial_load(opt['representation_checkpoint'], model)
            # print(model.representation_model.backbone.inception_5b_3x3.weight[0][0])
            if not opt['no_freeze']:
                for param in model.representation_model.parameters():
                    param.requires_grad = False
                print(len([p for p in model.representation_model.parameters()]))
    else:
        model = TEM(opt)
        optimizer = optim.Adam(model.parameters(),
                               lr=opt["tem_training_lr"],
                               weight_decay=opt["tem_weight_decay"])
        global_step, epoch = _maybe_load_checkpoint(
            model, optimizer, global_step, epoch,
            os.path.join(opt["checkpoint_path"], opt['name']))

    model = torch.nn.DataParallel(model).cuda()
    # summary(model, (2, 3, 224, 224))
    print(' Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if opt['dataset'] == 'gymnastics':
        # default image_dir is '/checkpoint/cinjon/spaceofmotion/sep052019/rawframes.426x240.12'
        img_loading_func = get_img_loader(opt)
        train_data_set = GymnasticsImages(opt,
                                          subset='Train',
                                          img_loading_func=img_loading_func,
                                          image_dir=opt['gym_image_dir'],
                                          video_info_path=os.path.join(
                                              opt['video_info'],
                                              'Train_Annotation.csv'))
        train_sampler = GymnasticsSampler(train_data_set, opt['sampler_mode'])
        test_data_set = GymnasticsImages(opt,
                                         subset="Val",
                                         img_loading_func=img_loading_func,
                                         image_dir=opt['gym_image_dir'],
                                         video_info_path=os.path.join(
                                             opt['video_info'],
                                             'Val_Annotation.csv'))
    elif opt['dataset'] == 'gymnasticsfeatures':
        # feature_dirs should roughly look like:
        # /checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/rgb,/checkpoint/cinjon/spaceofmotion/sep052019/tsn.1024.426x240.12.no-oversample/csv/flow
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = GymnasticsFeatures(opt,
                                            subset='Train',
                                            feature_dirs=feature_dirs,
                                            video_info_path=os.path.join(
                                                opt['video_info'],
                                                'Train_Annotation.csv'))
        test_data_set = GymnasticsFeatures(opt,
                                           subset='Val',
                                           feature_dirs=feature_dirs,
                                           video_info_path=os.path.join(
                                               opt['video_info'],
                                               'Val_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'thumosfeatures':
        feature_dirs = opt['feature_dirs'].split(',')
        train_data_set = ThumosFeatures(opt,
                                        subset='Val',
                                        feature_dirs=feature_dirs)
        test_data_set = ThumosFeatures(opt,
                                       subset="Test",
                                       feature_dirs=feature_dirs)
        train_sampler = None
    elif opt['dataset'] == 'thumosimages':
        img_loading_func = get_img_loader(opt)
        train_data_set = ThumosImages(
            opt,
            subset='Val',
            img_loading_func=img_loading_func,
            image_dir='/checkpoint/cinjon/thumos/rawframes.TH14_validation_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Val_Annotation.csv'))
        test_data_set = ThumosImages(
            opt,
            subset='Test',
            img_loading_func=img_loading_func,
            image_dir='/checkpoint/cinjon/thumos/rawframes.TH14_test_tal.30',
            video_info_path=os.path.join(opt['video_info'],
                                         'Test_Annotation.csv'))
        train_sampler = None
    elif opt['dataset'] == 'activitynet':
        train_sampler = None
        representation_module = opt['representation_module']
        train_transforms = get_video_transforms(representation_module,
                                                opt['do_augment'])
        test_transforms = get_video_transforms(representation_module, False)
        train_data_set = VideoDataset(opt,
                                      train_transforms,
                                      subset='train',
                                      fraction=0.3)
        # We use val because we don't have annotations for test.
        test_data_set = VideoDataset(opt,
                                     test_transforms,
                                     subset='val',
                                     fraction=0.3)

    print('train_loader / val_loader sizes: ', len(train_data_set),
          len(test_data_set))
    train_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=model.module.batch_size,
        shuffle=False if train_sampler else True,
        sampler=train_sampler,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)

    test_loader = torch.utils.data.DataLoader(
        test_data_set,
        batch_size=model.module.batch_size,
        shuffle=False,
        num_workers=opt['data_workers'],
        pin_memory=True,
        drop_last=False)
    # test_loader = None

    milestones = [int(k) for k in opt['tem_lr_milestones'].split(',')]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=opt['tem_step_gamma'])

    if opt['log_to_comet']:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="bsn",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    elif opt['local_comet_dir']:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="bsn",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=opt['local_comet_dir'])
    else:
        comet_exp = None

    if comet_exp:
        comet_exp.log_parameters(opt)
        comet_exp.set_name(opt['name'])

    # test_TEM(test_loader, model, optimizer, 0, 0, comet_exp, opt)
    for epoch in range(epoch + 1, opt["tem_epoch"] + 1):
        global_step = train_TEM(train_loader, model, optimizer, epoch,
                                global_step, comet_exp, opt)
        test_TEM(test_loader, model, optimizer, epoch, global_step, comet_exp,
                 opt)
        if opt['dataset'] == 'activitynet':
            test_loader.dataset._subset_dataset(.3)
            train_loader.dataset._subset_dataset(.3)
        scheduler.step()
def main(args):
    print('Pretrain? ', not args.not_pretrain)
    print(args.model)
    start_time = time.time()

    # NOTE: the original referenced an undefined `opt` here; `args` is the
    # argparse namespace actually in scope.
    if args.local_comet_dir:
        comet_exp = OfflineExperiment(
            api_key="hIXq6lDzWzz24zgKv7RYz6blo",
            project_name="selfcifar",
            workspace="cinjon",
            auto_metric_logging=True,
            auto_output_logging=None,
            auto_param_logging=False,
            offline_directory=args.local_comet_dir)
    else:
        comet_exp = CometExperiment(api_key="hIXq6lDzWzz24zgKv7RYz6blo",
                                    project_name="selfcifar",
                                    workspace="cinjon",
                                    auto_metric_logging=True,
                                    auto_output_logging=None,
                                    auto_param_logging=False)
    comet_exp.log_parameters(vars(args))
    comet_exp.set_name(args.name)

    # Build model
    # path = "/misc/kcgscratch1/ChoGroup/resnick/spaceofmotion/zeping/bsn"
    linear_cls = NonLinearModel if args.do_nonlinear else LinearModel

    if args.model == "amdim":
        hparams = load_hparams_from_tags_csv(
            '/checkpoint/cinjon/amdim/meta_tags.csv')
        # hparams = load_hparams_from_tags_csv(os.path.join(path, "meta_tags.csv"))
        model = AMDIMModel(hparams)
        if not args.not_pretrain:
            # _path = os.path.join(path, "_ckpt_epoch_434.ckpt")
            _path = '/checkpoint/cinjon/amdim/_ckpt_epoch_434.ckpt'
            model.load_state_dict(torch.load(_path)["state_dict"])
        else:
            print("AMDIM not loading checkpoint")  # Debug
        linear_model = linear_cls(AMDIM_OUTPUT_DIM, args.num_classes)
    elif args.model == "ccc":
        model = CCCModel(None)
        if not args.not_pretrain:
            # _path = os.path.join(path, "TimeCycleCkpt14.pth")
            _path = '/checkpoint/cinjon/spaceofmotion/bsn/TimeCycleCkpt14.pth'
            checkpoint = torch.load(_path)
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CCC not loading checkpoint")  # Debug
        linear_model = linear_cls(CCC_OUTPUT_DIM, args.num_classes)  # .to(device)
    elif args.model == "corrflow":
        model = CORRFLOWModel(None)
        if not args.not_pretrain:
            _path = '/checkpoint/cinjon/spaceofmotion/supercons/corrflow.kineticsmodel.pth'
            # _path = os.path.join(path, "corrflow.kineticsmodel.pth")
            checkpoint = torch.load(_path)
            base_dict = {
                '.'.join(k.split('.')[1:]): v
                for k, v in list(checkpoint['state_dict'].items())
            }
            model.load_state_dict(base_dict)
        else:
            print("CorrFlow not loading checkpoint")  # Debug
        linear_model = linear_cls(CORRFLOW_OUTPUT_DIM, args.num_classes)
    elif args.model == "resnet":
        if not args.not_pretrain:
            resnet = torchvision.models.resnet50(pretrained=True)
        else:
            resnet = torchvision.models.resnet50(pretrained=False)
            print("ResNet not loading checkpoint")  # Debug
        modules = list(resnet.children())[:-1]
        model = nn.Sequential(*modules)
        linear_model = linear_cls(RESNET_OUTPUT_DIM, args.num_classes)
    else:
        raise Exception("model type has to be amdim, ccc, corrflow or resnet")

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model).to(device)
        linear_model = nn.DataParallel(linear_model).to(device)
    else:
        model = model.to(device)
        linear_model = linear_model.to(device)
    # model = model.to(device)
    # linear_model = linear_model.to(device)

    # Freeze the backbone; only the linear probe is trained.
    for p in model.parameters():
        p.requires_grad = False
    model.eval()

    if args.optimizer == "Adam":
        optimizer = optim.Adam(linear_model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
        print("Optimizer: Adam with weight decay: {}".format(
            args.weight_decay))
    elif args.optimizer == "SGD":
        optimizer = optim.SGD(linear_model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
        print("Optimizer: SGD with weight decay: {} momentum: {}".format(
            args.weight_decay, args.momentum))
    else:
        raise Exception("optimizer should be Adam or SGD")
    optimizer.zero_grad()

    # Set up log dir
    now = datetime.datetime.now()
    log_dir = '/checkpoint/cinjon/spaceofmotion/bsn/cifar-%d-weights/%s/%s' % (
        args.num_classes, args.model, args.name)
    # log_dir = "{}{:%Y%m%dT%H%M}".format(args.model, now)
    # log_dir = os.path.join("weights", log_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    print("Saving to {}".format(log_dir))

    batch_size = args.batch_size * torch.cuda.device_count()

    # CIFAR-10
    if args.num_classes == 10:
        data_path = ("/private/home/cinjon/cifar-data/cifar-10-batches-py")
        _train_dataset = CIFAR_dataset(glob(os.path.join(data_path, "data*")),
                                       args.num_classes, args.model, True)
        # _train_acc_dataset = CIFAR_dataset(
        #     glob(os.path.join(data_path, "data*")),
        #     args.num_classes,
        #     args.model,
        #     False)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size,
                                           num_workers=args.num_workers)
        # train_split = int(len(_train_dataset) * 0.8)
        # train_dev_split = int(len(_train_dataset) - train_split)
        # train_dataset, train_dev_dataset = data.random_split(
        #     _train_dataset, [train_split, train_dev_split])
        # train_acc_dataloader = data.DataLoader(
        #     train_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dev_acc_dataloader = data.DataLoader(
        #     train_dev_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dataset = data.Subset(_train_dataset, list(range(train_split)))
        # train_dataloader = data.DataLoader(
        #     train_dataset, shuffle=True, batch_size=batch_size, num_workers=args.num_workers)
        # train_acc_dataset = data.Subset(
        #     _train_acc_dataset, list(range(train_split)))
        # train_acc_dataloader = data.DataLoader(
        #     train_acc_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # train_dev_acc_dataset = data.Subset(
        #     _train_acc_dataset, list(range(train_split, len(_train_acc_dataset))))
        # train_dev_acc_dataloader = data.DataLoader(
        #     train_dev_acc_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test_batch")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size,
                                         num_workers=args.num_workers)
        # val_split = int(len(_val_dataset) * 0.8)
        # val_dev_split = int(len(_val_dataset) - val_split)
        # val_dataset, val_dev_dataset = data.random_split(
        #     _val_dataset, [val_split, val_dev_split])
        # val_dataloader = data.DataLoader(
        #     val_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
        # val_dev_dataloader = data.DataLoader(
        #     val_dev_dataset, shuffle=False, batch_size=batch_size, num_workers=args.num_workers)
    # CIFAR-100
    elif args.num_classes == 100:
        data_path = ("/private/home/cinjon/cifar-data/cifar-100-python")
        _train_dataset = CIFAR_dataset([os.path.join(data_path, "train")],
                                       args.num_classes, args.model, True)
        train_dataloader = data.DataLoader(_train_dataset,
                                           shuffle=True,
                                           batch_size=batch_size)
        _val_dataset = CIFAR_dataset([os.path.join(data_path, "test")],
                                     args.num_classes, args.model, False)
        val_dataloader = data.DataLoader(_val_dataset,
                                         shuffle=False,
                                         batch_size=batch_size)
    else:
        raise Exception("num_classes should be 10 or 100")

    best_acc = 0.0
    best_epoch = 0

    # Training
    for epoch in range(1, args.epochs + 1):
        current_lr = max(3e-4, args.lr *
                         math.pow(0.5, math.floor(epoch / args.lr_interval)))
        linear_model.train()
        if args.optimizer == "Adam":
            optimizer = optim.Adam(linear_model.parameters(),
                                   lr=current_lr,
                                   weight_decay=args.weight_decay)
        elif args.optimizer == "SGD":
            optimizer = optim.SGD(
                linear_model.parameters(),
                lr=current_lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
            )

        ####################################################
        # Train
        t = time.time()
        train_acc = 0
        train_loss_sum = 0.0
        for iter, input in enumerate(train_dataloader):
            if time.time() - start_time > args.time * 3600 - 300 \
                    and comet_exp is not None:
                comet_exp.end()
                sys.exit(-1)
            imgs = input[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = input[1].flatten().to(device)

            # output = model(imgs)
            # output = linear_model(output)
            output = linear_model(model(imgs))
            loss = F.cross_entropy(output, lbls)
            train_loss_sum += float(loss.data)
            train_acc += int(sum(torch.argmax(output, dim=1) == lbls))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # log_text = "train epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter"
            if iter % 1500 == 0:
                log_text = "train epoch {}/{}\titer {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, iter + 1,
                                      len(train_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
            t = time.time()

        train_acc /= len(_train_dataset)
        train_loss_sum /= len(train_dataloader)
        with comet_exp.train():
            comet_exp.log_metrics(
                {
                    'acc': train_acc,
                    'loss': train_loss_sum
                },
                step=(epoch + 1) * len(train_dataloader),
                epoch=epoch + 1)
        print("train acc epoch {}/{} loss:{} train_acc:{}".format(
            epoch, args.epochs, train_loss_sum, train_acc),
              flush=True)

        #######################################################################
        # Train acc
        # linear_model.eval()
        # train_acc = 0
        # train_loss_sum = 0.0
        # for iter, input in enumerate(train_acc_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     # output = model(imgs)
        #     # output = linear_model(output)
        #     output = linear_model(model(imgs))
        #     loss = F.cross_entropy(output, lbls)
        #     train_loss_sum += float(loss.data)
        #     train_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("train acc epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(train_acc_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        # train_acc /= len(train_acc_dataset)
        # train_loss_sum /= len(train_acc_dataloader)
        # print("train acc epoch {}/{} loss:{} train_acc:{}".format(
        #     epoch, args.epochs, train_loss_sum, train_acc), flush=True)

        #######################################################################
        # Train dev acc
        #
        # linear_model.eval()
        # train_dev_acc = 0
        # train_dev_loss_sum = 0.0
        # for iter, input in enumerate(train_dev_acc_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     output = model(imgs)
        #     output = linear_model(output)
        #     # output = linear_model(model(imgs))
        #     loss = F.cross_entropy(output, lbls)
        #     train_dev_loss_sum += float(loss.data)
        #     train_dev_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("train dev acc epoch {}/{}\titer {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(train_dev_acc_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        # train_dev_acc /= len(train_dev_acc_dataset)
        # train_dev_loss_sum /= len(train_dev_acc_dataloader)
        # print("train dev epoch {}/{} loss:{} train_dev_acc:{}".format(
        #     epoch, args.epochs, train_dev_loss_sum, train_dev_acc), flush=True)

        #######################################################################
        # Val dev
        #
        # linear_model.eval()
        # val_dev_acc = 0
        # val_dev_loss_sum = 0.0
        # for iter, input in enumerate(val_dev_dataloader):
        #     imgs = input[0].to(device)
        #     if args.model != "resnet":
        #         imgs = imgs.unsqueeze(1)
        #     lbls = input[1].flatten().to(device)
        #
        #     output = model(imgs)
        #     output = linear_model(output)
        #     loss = F.cross_entropy(output, lbls)
        #     val_dev_loss_sum += float(loss.data)
        #     val_dev_acc += int(sum(torch.argmax(output, dim=1) == lbls))
        #
        #     print("val dev epoch {}/{} iter {}/{} loss:{} {:.3f}s/iter".format(
        #         epoch,
        #         args.epochs,
        #         iter+1,
        #         len(val_dev_dataloader),
        #         loss.data,
        #         time.time() - t),
        #         flush=True)
        #     t = time.time()
        #
        # val_dev_acc /= len(val_dev_dataset)
        # val_dev_loss_sum /= len(val_dev_dataloader)
        # print("val dev epoch {}/{} loss:{} val_dev_acc:{}".format(
        #     epoch, args.epochs, val_dev_loss_sum, val_dev_acc), flush=True)

        #######################################################################
        # Val
        linear_model.eval()
        val_acc = 0
        val_loss_sum = 0.0
        for iter, input in enumerate(val_dataloader):
            if time.time() - start_time > args.time * 3600 - 300 \
                    and comet_exp is not None:
                comet_exp.end()
                sys.exit(-1)
            imgs = input[0].to(device)
            if args.model != "resnet":
                imgs = imgs.unsqueeze(1)
            lbls = input[1].flatten().to(device)

            output = model(imgs)
            output = linear_model(output)
            loss = F.cross_entropy(output, lbls)
            val_loss_sum += float(loss.data)
            val_acc += int(sum(torch.argmax(output, dim=1) == lbls))

            # log_text = "val epoch {}/{} iter {}/{} loss:{} {:.3f}s/iter"
            if iter % 1500 == 0:
                log_text = "val epoch {}/{} iter {}/{} loss:{}"
                print(log_text.format(epoch, args.epochs, iter + 1,
                                      len(val_dataloader), loss.data,
                                      time.time() - t),
                      flush=False)
            t = time.time()

        val_acc /= len(_val_dataset)
        val_loss_sum /= len(val_dataloader)
        print("val epoch {}/{} loss:{} val_acc:{}".format(
            epoch, args.epochs, val_loss_sum, val_acc))
        with comet_exp.test():
            comet_exp.log_metrics(
                {
                    'acc': val_acc,
                    'loss': val_loss_sum
                },
                step=(epoch + 1) * len(train_dataloader),
                epoch=epoch + 1)

        if val_acc > best_acc:
            best_acc = val_acc
            best_epoch = epoch
            linear_save_path = os.path.join(log_dir,
                                            "{}.linear.pth".format(epoch))
            model_save_path = os.path.join(log_dir,
                                           "{}.model.pth".format(epoch))
            torch.save(linear_model.state_dict(), linear_save_path)
            torch.save(model.state_dict(), model_save_path)

        # Check bias and variance
        print(
            "Epoch {} lr {} total: train_loss:{} train_acc:{} val_loss:{} val_acc:{}"
            .format(epoch, current_lr, train_loss_sum, train_acc, val_loss_sum,
                    val_acc),
            flush=True)
        # print("Epoch {} lr {} total: train_acc:{} train_dev_acc:{} val_dev_acc:{} val_acc:{}".format(
        #     epoch, current_lr, train_acc, train_dev_acc, val_dev_acc, val_acc), flush=True)

    print("The best epoch: {} acc: {}".format(best_epoch, best_acc))
from polyaxon_client.tracking import Experiment
import logging
import json

"""
Initialize Parser and define arguments
"""
parser, metadata = get_parser_with_args()
opt = parser.parse_args()

"""
Initialize experiments for polyaxon and comet.ml
"""
comet = CometExperiment('QQFXdJ5M7GZRGri7CWxwGxPDN',
                        project_name=opt.project_name,
                        auto_param_logging=False,
                        parse_args=False,
                        disabled=False)
comet.log_other('status', 'started')
experiment = Experiment()
logging.basicConfig(level=logging.INFO)
comet.log_parameters(vars(opt))

"""
Set up environment: define paths, download data, and set device
"""
dev = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
logging.info('GPU AVAILABLE? ' + str(torch.cuda.is_available()))
download_dataset(opt.dataset_name, comet)
train_loader, val_loader = get_loaders(opt)
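# Hedged companion sketch: marking the run finished at interpreter exit,
# mirroring the 'status' key logged above. `log_other` is comet_ml public
# API; the atexit hook itself is an assumption, not part of the original
# script.
import atexit
atexit.register(lambda: comet.log_other('status', 'finished'))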