def __init__(
    self,
    log_path: str = None,
    writer: SummaryWriter = None,
    name: str = "tensorboard",
    enabled: bool = True,
):
    """
    Logger that forwards values to a TensorBoard SummaryWriter.

    Exactly one of log_path / writer may be provided: pass a writer to reuse
    it, or a log_path to create a new writer there. If neither is given, a
    writer is created under ./tensorboard.
    """
    # tensorboard is an optional dependency; surface the import failure here
    if tensorboard_import_error:
        raise tensorboard_import_error

    if writer and log_path:
        message = (
            "log_path given:{} and writer object passed in, "
            "to create a writer at the log path set writer=None"
        ).format(log_path)
        raise ValueError(message)

    if not writer and not log_path:
        # default location when caller supplied neither
        log_path = os.path.join(".", "tensorboard")

    if log_path:
        create_dirs(log_path)

    if writer is not None:
        self._writer = writer
    else:
        self._writer = SummaryWriter(log_path)
    super().__init__(lambda_func=self._log_lambda, name=name, enabled=enabled)
def _download_and_extract(self):
    """
    Download the imagewoof archive matching the configured dataset size and
    extract it into the download root.

    :raises ValueError: if the configured dataset size is not recognized
    """
    size_to_url = {
        ImagenetteSize.full: "https://s3.amazonaws.com/fast-ai-imageclas/imagewoof.tgz",
        ImagenetteSize.s320: "https://s3.amazonaws.com/fast-ai-imageclas/imagewoof-320.tgz",
        ImagenetteSize.s160: "https://s3.amazonaws.com/fast-ai-imageclas/imagewoof-160.tgz",
    }
    if self._dataset_size not in size_to_url:
        raise ValueError("unknown imagenette size given of {}".format(
            self._dataset_size))
    url = size_to_url[self._dataset_size]

    create_dirs(self._extracted_root)
    file_path = "{}.tar".format(self._extracted_root)

    if os.path.exists(file_path):
        # NOTE(review): an existing archive also skips extraction — assumes a
        # prior run already extracted it; confirm that is the intent
        print("already downloaded imagewoof {}".format(self._dataset_size))
        return

    download_file(
        url,
        file_path,
        overwrite=False,
        progress_title="downloading imagewoof {}".format(self._dataset_size),
    )

    # archive is gzip-compressed despite the .tar suffix on file_path
    with tarfile.open(file_path, "r:gz") as tar:
        tar.extractall(path=self.download_root)
def setup_save_and_log_dirs(args) -> Tuple[str, Optional[str]]:
    """
    Create the model save directory and, for the train command, the logging
    directory.

    A unique model id is derived from arch_key/dataset (with a numeric suffix
    on collision) unless args.model_tag is set.

    :param args: parsed CLI arguments with save_dir, model_tag, arch_key,
        dataset, command, and log_dir attributes
    :return: tuple of (save_dir, log_dir); log_dir is None unless training
    """
    base_save = os.path.abspath(os.path.expanduser(args.save_dir))

    if args.model_tag:
        model_id = args.model_tag
    else:
        # auto-generate a unique tag, bumping a counter on name collisions
        base_tag = "{}@{}".format(args.arch_key, args.dataset)
        model_id = base_tag
        bump = 0
        while os.path.exists(os.path.join(base_save, model_id)):
            bump += 1
            model_id = "{}__{:02d}".format(base_tag, bump)

    save_dir = os.path.join(base_save, model_id)
    create_dirs(save_dir)
    LOGGER.info("Model directory is set to {}".format(save_dir))

    # log dir only applies to training runs
    log_dir = None
    if args.command == TRAIN_COMMAND:
        log_dir = os.path.abspath(os.path.expanduser(args.log_dir))
        log_dir = os.path.join(log_dir, model_id)
        create_dirs(log_dir)
        LOGGER.info("Logging directory is set to {}".format(log_dir))

    return save_dir, log_dir
def _setup_save_dirs(args) -> Tuple[str, Optional[str]]:
    """
    Create the model save directory and, when training, the logs directory.

    A unique model id is derived from arch_key/dataset (with a numeric suffix
    on collision) unless args.model_tag is set.

    :param args: parsed CLI arguments with save_dir, logs_dir, model_tag,
        arch_key, dataset, and command attributes
    :return: tuple of (save_dir, logs_dir); logs_dir is None unless training
    """
    is_training = args.command == TRAIN_COMMAND
    save_root = os.path.abspath(os.path.expanduser(args.save_dir))
    logs_root = (
        os.path.abspath(os.path.expanduser(args.logs_dir)) if is_training else None
    )

    if args.model_tag:
        model_id = args.model_tag
    else:
        base_tag = "{}_{}".format(args.arch_key.replace("/", "."), args.dataset)
        model_id = base_tag
        bump = 0
        # dedupe against the logs dir when training, otherwise the save dir
        collision_root = logs_root or save_root
        while os.path.exists(os.path.join(collision_root, model_id)):
            bump += 1
            model_id = "{}__{:02d}".format(base_tag, bump)

    save_dir = os.path.join(save_root, model_id)
    create_dirs(save_dir)

    # logs dir setup only applies to training runs
    logs_dir = None
    if is_training:
        logs_dir = os.path.join(logs_root, model_id)
        create_dirs(logs_dir)

    LOGGER.info("Model id is set to {}".format(model_id))

    return save_dir, logs_dir
def tensors_export(
    tensors: Union[Tensor, Iterable[Tensor]],
    export_dir: str,
    name_prefix: str,
    counter: int = 0,
    break_batch: bool = False,
) -> List[str]:
    """
    Export the given tensors to saved numpy array files under export_dir.

    :param tensors: the tensors to export to a saved numpy array file
    :param name_prefix: prefix for the saved file names; positional info for
        tensors nested in lists/dicts is appended, plus the .npy extension
    :param counter: the current counter to save the tensor at
    :param break_batch: treat the tensor as a batch and break apart into
        multiple tensors
    :return: the exported paths
    """
    create_dirs(export_dir)
    exported_paths = []
    # choose between per-sample export (batch broken apart) and recursive
    # export of the (possibly nested) tensor structure
    exporter = _tensors_export_batch if break_batch else _tensors_export_recursive
    exporter(tensors, export_dir, name_prefix, counter, exported_paths)
    return exported_paths
def __init__(
    self,
    root: str,
    train: bool = True,
    image_size: int = 32,
    pre_resize_transforms: Union[SplitsTransforms, None] = SplitsTransforms(
        train=(preprocess_for_train, ),
        val=(preprocess_for_eval, )),
    post_resize_transforms: Union[SplitsTransforms, None] = SplitsTransforms(
        train=None,
        val=None,
    ),
    download: bool = True,
):
    """
    Create the dataset rooted at ``root``, downloading and unpacking it into
    per-split image folders when needed.

    :param root: base directory for download/extract/train/test subfolders
    :param train: True to load the training split, False for the test split
    :param image_size: edge size images are resized to (default 32)
    :param pre_resize_transforms: transforms applied before resizing, per split
    :param post_resize_transforms: transforms applied after resizing, per split
    :param download: True to download and extract the data if the download
        directory does not yet exist
    """
    # NOTE(review): the SplitsTransforms defaults are shared instances across
    # calls — safe only if SplitsTransforms is immutable; confirm
    create_dirs(root)
    self._download_dir = os.path.join(root, "download")
    self._extract_dir = os.path.join(root, "extract")
    self._train_dir = os.path.join(root, "train")
    self._test_dir = os.path.join(root, "test")
    # only the download dir's existence gates the download; a partial prior
    # download would be skipped here
    if download and not os.path.exists(self._download_dir):
        self._download_and_extract()
    self._create_image_folders()
    # lazily computed elsewhere; None until first use
    self._per_pixel_mean = None
    super().__init__(
        root,
        train,
        image_size=image_size,
        pre_resize_transforms=pre_resize_transforms,
        post_resize_transforms=post_resize_transforms,
    )
def _save_checkpoint(args, sess, save_dir, checkpoint_name) -> str:
    """
    Save the current TF session under save_dir/checkpoint_name/model.

    :param args: parsed CLI arguments carrying arch_key for the model saver
    :param sess: the TF session holding the variables to save
    :param save_dir: base directory for checkpoints
    :param checkpoint_name: subdirectory name for this checkpoint
    :return: the full path of the saved checkpoint
    """
    checkpoint_dir = os.path.join(save_dir, checkpoint_name, "model")
    create_dirs(checkpoint_dir)
    saver = ModelRegistry.saver(args.arch_key)
    saved_name = saver.save(sess, checkpoint_dir)
    saved_path = os.path.join(checkpoint_dir, saved_name)
    LOGGER.info("Checkpoint saved to {}".format(saved_path))
    return saved_path
def _download_and_extract(self):
    """
    Download and extract the dataset into root
    """
    create_dirs(self._download_dir)
    archive_path = os.path.join(self._download_dir, "cifar-100-python.tar.gz")
    download_file(
        "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz",
        archive_path,
        overwrite=False,
        progress_title="downloading CIFAR-100",
    )

    create_dirs(self._extract_dir)
    # NOTE(review): extractall trusts member paths in the archive; acceptable
    # here since the source is a fixed, well-known URL
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=self._extract_dir)
def export_named_samples(
    self,
    inp_dict: Dict[Union[str, tf_compat.Tensor], numpy.ndarray],
    out_dict: Dict[Union[str, tf_compat.Tensor], numpy.ndarray],
):
    """
    Export sample inputs and outputs for the model to the local system.

    :param inp_dict: the inputs to save
    :param out_dict: the outputs to save
    """

    def _keyed_by_name(samples):
        # normalize tensor keys to their string names, preserving order
        return OrderedDict(
            (key if isinstance(key, str) else key.name, value)
            for key, value in samples.items()
        )

    inp_dict = _keyed_by_name(inp_dict)
    out_dict = _keyed_by_name(out_dict)

    create_dirs(self.sample_inputs_path)
    create_dirs(self.sample_outputs_path)

    # continue numbering from any previously exported input samples
    exp_counter = path_file_count(self.sample_inputs_path, "inp*.npz")
    tensors_export(
        inp_dict,
        self.sample_inputs_path,
        name_prefix="inp",
        counter=exp_counter,
        break_batch=True,
    )
    tensors_export(
        out_dict,
        self.sample_outputs_path,
        name_prefix="out",
        counter=exp_counter,
        break_batch=True,
    )
def get_save_dir_and_loggers(
        args: Any, task: Optional[Tasks] = None) -> Tuple[Union[str, None], List]:
    """
    Create the model save directory and build the logger list for this run.

    Only the main process saves and logs; other processes get (None, []).
    For training runs a TensorBoard logger writing under logs_dir is added.

    :param args: parsed CLI arguments with is_main_process, save_dir,
        logs_dir, model_tag, arch_key, dataset, and dataset_kwargs attributes
    :param task: the current task; Tasks.TRAIN enables the logs dir and
        TensorBoard logging
    :return: tuple of (save_dir or None, list of loggers)
    """
    if not args.is_main_process:
        # do not log for non main processes
        return None, []

    save_dir = os.path.abspath(os.path.expanduser(args.save_dir))
    logs_dir = (os.path.abspath(
        os.path.expanduser(os.path.join(args.logs_dir)))
        if task == Tasks.TRAIN else None)

    if args.model_tag:
        model_id = args.model_tag
    else:
        # include the dataset year in the tag when supplied
        dataset_name = (f"{args.dataset}-{args.dataset_kwargs['year']}"
                        if "year" in args.dataset_kwargs else args.dataset)
        base_tag = f"{args.arch_key.replace('/', '.')}_{dataset_name}"
        model_id = base_tag
        bump = 0
        # dedupe against the logs dir when training, otherwise the save dir
        collision_root = logs_dir or save_dir
        while os.path.exists(os.path.join(collision_root, model_id)):
            bump += 1
            model_id = f"{base_tag}__{bump:02d}"

    save_dir = os.path.join(save_dir, model_id)
    create_dirs(save_dir)

    # loggers setup
    loggers = [PythonLogger()]
    if task == Tasks.TRAIN:
        logs_dir = os.path.join(logs_dir, model_id)
        create_dirs(logs_dir)
        loggers.append(TensorBoardLogger(log_path=logs_dir))
    print(f"Model id is set to {model_id}")
    return save_dir, loggers
def _create_image_folders(self):
    """
    Unpack the extracted CIFAR-10 pickle batches into per-label image
    folders for the train and test splits, and save the per-pixel mean
    image of the training set next to the train directory.

    :raises ValueError: if an expected data batch file is not found
    """
    create_dirs(self._train_dir)
    create_dirs(self._test_dir)
    batches_dir = os.path.join(self._extract_dir, "cifar-10-batches-py")

    # Train: one folder per label (CIFAR-10 labels are 0-9).
    # Plain loops replace list comprehensions that were used purely for
    # their side effects.
    image_tensors = []
    for label in range(10):
        create_dirs(os.path.join(self._train_dir, str(label)))
    batch_files = ["data_batch_{}".format(i) for i in range(1, 6)]
    for fname in batch_files:
        fpath = os.path.join(batches_dir, fname)
        print("Processing {}...".format(fpath))
        if not os.path.exists(fpath):
            raise ValueError("Train data batch {} not found".format(fpath))
        with open(fpath, "rb") as fo:
            # CIFAR batches are pickled with byte-string keys
            batch_dict = pickle.load(fo, encoding="bytes")
        image_tensors.append(
            self._save_images(
                batch_dict[b"labels"],
                batch_dict[b"data"],
                batch_dict[b"filenames"],
                self._train_dir,
            ))
    image_tensors = np.concatenate(image_tensors)
    per_pixel_mean = np.mean(image_tensors, axis=0)
    np.save(
        os.path.join(self._train_dir, os.pardir, "per_pixel_mean_image.npy"),
        per_pixel_mean,
    )
    # free the full training set before processing the test batch
    del image_tensors

    # Test
    for label in range(10):
        create_dirs(os.path.join(self._test_dir, str(label)))
    fpath = os.path.join(batches_dir, "test_batch")
    print("Processing {}...".format(fpath))
    if not os.path.exists(fpath):
        raise ValueError("Test data batch {} not found".format(fpath))
    with open(fpath, "rb") as fo:
        batch_dict = pickle.load(fo, encoding="bytes")
    self._save_images(
        batch_dict[b"labels"],
        batch_dict[b"data"],
        batch_dict[b"filenames"],
        self._test_dir,
    )
def main(args):
    """
    Train a torchvision model with a sparseml recipe applied, then export
    the trained model to ONNX and PyTorch weight files under a unique
    model directory.

    :param args: parsed CLI arguments (save_dir, model_tag, model,
        pretrained, checkpoint_path, recipe_path, dataset options)
    """
    ############################
    # logging and saving setup #
    ############################
    save_dir = os.path.abspath(os.path.expanduser(args.save_dir))
    # get unique model tag, defaults to '{model_name}'
    if not args.model_tag:
        model_tag = args.model.replace("/", ".")
        model_id = model_tag
        model_inc = 0
        # FIX: collision check now uses the expanded save_dir (was
        # args.save_dir), so paths containing "~" or relative segments are
        # checked where the directory is actually created
        while os.path.exists(os.path.join(save_dir, model_id)):
            model_inc += 1
            model_id = "{}__{:02d}".format(model_tag, model_inc)
    else:
        model_id = args.model_tag
    save_dir = os.path.join(save_dir, model_id)
    create_dirs(save_dir)
    print("Model id is set to {}".format(model_id))

    ###########################
    # standard training setup #
    ###########################
    # create data loaders
    train_loader, _, _ = _create_imagefolder_dataloader(args, train=True)
    val_loader, num_classes, image_shape = _create_imagefolder_dataloader(
        args, train=False
    )
    dataloaders = {"train": train_loader, "val": val_loader}

    # create model
    model = _get_torchvision_model(
        args.model,
        num_classes,
        args.pretrained,
        args.checkpoint_path,
    )
    print("created model: {}".format(model))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("using device: {}".format(device))

    # create standard SGD optimizer and cross entropy loss function
    criterion = CrossEntropyLoss()
    optimizer = SGD(
        model.parameters(), lr=0.001, momentum=0.9
    )  # lr will be overridden by recipe

    ##########################
    # add sparseml modifiers #
    ##########################
    manager = ScheduledModifierManager.from_yaml(args.recipe_path)
    optimizer = ScheduledOptimizer(
        optimizer,
        model,
        manager,
        steps_per_epoch=len(train_loader),
        loggers=[PythonLogger()],
    )

    ########################
    # torchvision training #
    ########################
    model, val_acc_history = train_model(
        model,
        dataloaders,
        criterion,
        optimizer,
        device,
        num_epochs=manager.max_epochs,
        is_inception="inception" in args.model,
    )

    ########################
    # export trained model #
    ########################
    exporter = ModuleExporter(model, save_dir)
    sample_input = torch.randn(image_shape).unsqueeze(0)  # sample batch for ONNX export
    exporter.export_onnx(sample_input)
    exporter.export_pytorch()
    print("Model ONNX export and PyTorch weights saved to {}".format(save_dir))
def setup_filesystem(self):
    """
    Setup the local file system so that it can be used with the data
    """
    # ensure self.dir_path exists before any data is read from or written
    # under it
    create_dirs(self.dir_path)
def __init__(self, save_dir: str):
    """
    :param save_dir: directory exported files are written into; created
        (including parents) if it does not already exist
    """
    create_dirs(save_dir)
    self._save_dir = save_dir  # root output directory for exports
    self._idx = 0  # running counter used for the next export