def load_checkpoint(
    checkpoint_folder, device=CPU_DEVICE, checkpoint_file=CHECKPOINT_FILE
):
    """
    Load the checkpoint stored as ``checkpoint_file`` inside ``checkpoint_folder``.

    Args:
        checkpoint_folder: folder expected to hold the checkpoint. A falsy
            value makes the function return ``None`` immediately.
        device: device whose ``type`` must be ``"cpu"`` or ``"cuda"``; it is
            passed to ``torch.load`` as ``map_location``.
        checkpoint_file: name of the checkpoint file inside the folder.

    Returns:
        The object produced by ``torch.load``, or ``None`` when the folder is
        not given, or when the folder or the file does not exist.
    """
    if not checkpoint_folder:
        return None

    assert device is not None, "Please specify what device to load checkpoint on"
    assert device.type in ["cpu", "cuda"], f"Unknown device: {device}"
    # A CUDA target is only acceptable when CUDA is actually usable.
    assert device.type != "cuda" or torch.cuda.is_available()

    if not PathManager.exists(checkpoint_folder):
        logging.warning("Checkpoint folder '%s' not found", checkpoint_folder)
        return None
    logging.info("Attempting to load checkpoint from '%s'", checkpoint_folder)

    # Resolve the checkpoint file inside the folder.
    checkpoint_path = f"{checkpoint_folder}/{checkpoint_file}"
    if not PathManager.exists(checkpoint_path):
        logging.warning("Checkpoint file %s not found.", checkpoint_path)
        return None

    # map_location forces the tensors onto the requested device rather than
    # the device they were saved from.
    with PathManager.open(checkpoint_path, "rb") as stream:
        state = torch.load(stream, map_location=device)
    logging.info(f"Loaded checkpoint from {checkpoint_path}")
    return state
def main():
    """
    Command-line entry point that builds the iNaturalist2018 data files.

    Parses the input/output directories and the optional download flag,
    optionally downloads the dataset, extracts image and label information
    for the train and val splits, and saves four ``.npy`` files in the
    output directory.
    """
    parser = argparse.ArgumentParser(
        description="Create the iNaturalist2018 data information file."
    )
    parser.add_argument(
        "-i",
        "--input_dir_path",
        type=str,
        help="Path to the parent directory of the iNaturalist2018 data set",
    )
    parser.add_argument(
        "-o",
        "--output_dir_path",
        type=str,
        help="Folder where the classification dataset will be written",
    )
    parser.add_argument(
        "-d",
        "--download",
        action="store_true",
        help="To download the original dataset and decompress it in the input folder",
    )
    args = parser.parse_args()
    input_dir = args.input_dir_path
    output_dir = args.output_dir_path

    # Both directories must already exist; nothing is created here.
    assert PathManager.exists(
        input_dir
    ), "Data input directory not found! Please create the directory"
    assert PathManager.exists(
        output_dir
    ), "Data output directory not found! Please create the directory"

    # Fetch the raw dataset into the input directory when requested.
    if args.download:
        download_dataset(input_dir)

    # Gather image/label information for each split.
    logger.info("========Preparing train data files========")
    train_images, train_labels = get_images_labels_info("/train2018.json", input_dir)
    logger.info("========Preparing val data files========")
    val_images, val_labels = get_images_labels_info("/val2018.json", input_dir)

    # Persist the results, images before labels for each split.
    logger.info("=================Saving train data files=======================")
    save_file(train_images, f"{output_dir}/train_images.npy")
    save_file(train_labels, f"{output_dir}/train_labels.npy")

    logger.info("=================Saving val data files=======================")
    save_file(val_images, f"{output_dir}/val_images.npy")
    save_file(val_labels, f"{output_dir}/val_labels.npy")
def test_bad_args(self) -> None:
    """
    Check how PathManager reacts to an unexpected keyword argument.

    With strict kwargs checking enabled every operation must raise
    (``ValueError`` for ``get_local_path``/``open``, ``NotImplementedError``
    for the others). With strict checking disabled, ``get_local_path`` and
    ``open`` must accept the extra keyword.
    """
    with self.assertRaises(NotImplementedError):
        PathManager.copy(
            self._remote_uri, self._remote_uri, foo="foo"  # type: ignore
        )
    with self.assertRaises(NotImplementedError):
        PathManager.exists(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.get_local_path(
            self._remote_uri, foo="foo"  # type: ignore
        )
    with self.assertRaises(NotImplementedError):
        PathManager.isdir(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.isfile(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.ls(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.mkdirs(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(ValueError):
        PathManager.open(self._remote_uri, foo="foo")  # type: ignore
    with self.assertRaises(NotImplementedError):
        PathManager.rm(self._remote_uri, foo="foo")  # type: ignore

    PathManager.set_strict_kwargs_checking(False)
    try:
        PathManager.get_local_path(self._remote_uri, foo="foo")  # type: ignore
        f = PathManager.open(self._remote_uri, foo="foo")  # type: ignore
        f.close()
    finally:
        # Strict checking is global state: restore it even when a call above
        # fails, so a failure here cannot leak into other tests.
        PathManager.set_strict_kwargs_checking(True)
def _build_model(self):
    """
    Build and return the model used by this task.

    - The returned model is not copied to gpu yet (if using gpu) and is not
      wrapped with DDP yet. This is done later by self.prepare().
    - BatchNorm layers are converted to SyncBatchNorm when the config asks
      for it and the device is "gpu". Both PyTorch and Apex SyncBatchNorm
      are supported.
    - When evaluation mode is on, the trunk (or the trunk and head) is
      frozen according to the config.
    - When no checkpoint path is set and a PARAMS_FILE is configured, the
      model is initialized from that weights file.
    """
    logging.info("Building model....")
    model_cfg = self.config["MODEL"]

    # Instantiate the raw model as specified
    model = build_model(model_cfg, self.config["OPTIMIZER"])

    # Both Apex and Pytorch SyncBatchNorms are GPU only, hence the device check.
    wants_sync_bn = model_cfg["SYNC_BN_CONFIG"]["CONVERT_BN_TO_SYNC_BN"]
    if wants_sync_bn and self.config["MACHINE"]["DEVICE"] == "gpu":
        model = convert_sync_bn(self.config, model)

    # Enforce eval mode, no matter what the prior transforms have done.
    # For instance apex converts batch-norms and sets `requires_grad` to True
    eval_cfg = model_cfg["FEATURE_EVAL_SETTINGS"]
    if eval_cfg["EVAL_MODE_ON"]:
        if eval_cfg["FREEZE_TRUNK_ONLY"]:
            logging.info(
                "config.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_ONLY=True, "
                "will freeze trunk..."
            )
            model.freeze_trunk()
        elif eval_cfg["FREEZE_TRUNK_AND_HEAD"]:
            logging.info(
                "config.MODEL.FEATURE_EVAL_SETTINGS.FREEZE_TRUNK_AND_HEAD=True, will "
                "freeze trunk and head..."
            )
            model.freeze_head_and_trunk()

    # Weight initialization only applies when there is no checkpoint to
    # resume from. This matters when a training run died and is restarted.
    init_from_params_file = (
        self.checkpoint_path is None
        and model_cfg["WEIGHTS_INIT"]["PARAMS_FILE"]
    )

    # If the user set the PARAMS_FILE, it must exist.
    if init_from_params_file:
        assert PathManager.exists(
            model_cfg["WEIGHTS_INIT"]["PARAMS_FILE"]
        ), "Specified PARAMS_FILE does NOT exist"

    # Initialize the model for finetuning or evaluation.
    if init_from_params_file and PathManager.exists(
        model_cfg["WEIGHTS_INIT"]["PARAMS_FILE"]
    ):
        model = self._restore_model_weights(model)

    return model
def check_data_exists(data_files):
    """
    Check that the input data files exist.

    Args:
        data_files: a single path, or a list/tuple of paths.

    Returns:
        bool: for a list/tuple, True only when every entry exists (True for
        an empty sequence); otherwise the result of ``PathManager.exists``
        on the single path.
    """
    if isinstance(data_files, (list, tuple)):
        # Builtin all() stops at the first missing file and returns a plain
        # bool instead of a numpy scalar.
        return all(PathManager.exists(item) for item in data_files)
    return PathManager.exists(data_files)
def test_rm(self):
    """A file removed through PathManager.rm must no longer exist."""
    rm_file = os.path.join(self._tmpdir, "test_rm.txt")
    with open(rm_file, "w") as handle:
        handle.write(self._tmpfile_contents)
        handle.flush()

    # The file is visible before removal ...
    self.assertTrue(PathManager.exists(rm_file))
    self.assertTrue(PathManager.isfile(rm_file))

    PathManager.rm(rm_file)

    # ... and gone afterwards.
    self.assertFalse(PathManager.exists(rm_file))
    self.assertFalse(PathManager.isfile(rm_file))
def on_start(self, task) -> None:
    """
    Verify that the torchscript folder exists before the task starts.

    The check runs only on the primary process and is skipped when the task
    has a truthy ``test_only`` attribute.

    Raises:
        FileNotFoundError: if ``self.torchscript_folder`` does not exist.
    """
    must_check = is_primary() and not getattr(task, "test_only", False)
    if must_check and not PathManager.exists(self.torchscript_folder):
        raise FileNotFoundError(
            "Torchscript folder '{}' does not exist.".format(
                self.torchscript_folder
            )
        )
def get_images_labels_info(split, args):
    """
    Collect image paths and class labels for one split of a dataset laid out
    as ``<data_source_dir>/<split>/<class_name>/<image>``.

    Args:
        split: name of the split sub-directory.
        args: object providing ``data_source_dir`` and ``generate_json``.

    Returns:
        tuple: ``(image_paths, image_classes, output_dict)``. The first two
        are parallel lists. ``output_dict`` maps ``"<class_name>/<image>"``
        to the class label when ``args.generate_json`` is truthy and is
        empty otherwise.
    """
    assert PathManager.exists(
        args.data_source_dir), "Data source NOT found. Abort!"
    data_dir = f"{args.data_source_dir}/{split}"
    class_idx = get_all_classes(data_dir)
    logger.info("Number of classes in {} data: {}".format(
        split, len(class_idx)))

    image_paths, image_classes, img_ids = [], [], []
    for class_name, class_label in class_idx.items():
        class_dir = f"{data_dir}/{class_name}"
        # os.listdir never returns "." or "..", so no filtering is needed.
        for item in os.listdir(class_dir):
            image_paths.append(f"{class_dir}/{item}")
            img_ids.append(f"{class_name}/{item}")
            image_classes.append(class_label)

    output_dict = dict(zip(img_ids, image_classes)) if args.generate_json else {}
    return image_paths, image_classes, output_dict
def get_eval_dataset(cfg, root_dataset_path, eval_dataset_name, eval_binary_path):
    """
    Build the evaluation dataset object that matches ``eval_dataset_name``.

    The dataset path ``<root_dataset_path>/<eval_dataset_name>`` must exist.
    ``cfg.IMG_RETRIEVAL.NUM_DATABASE_SAMPLES == -1`` is translated to
    ``num_samples=None``.
    """
    eval_data_path = f"{root_dataset_path}/{eval_dataset_name}"
    assert PathManager.exists(eval_data_path), f"Unknown path: {eval_data_path}"

    num_samples = cfg.IMG_RETRIEVAL.NUM_DATABASE_SAMPLES
    if num_samples == -1:
        num_samples = None

    # Dispatch on the dataset family; the generic dataset is the fallback.
    if is_revisited_dataset(eval_dataset_name):
        return RevisitedInstanceRetrievalDataset(
            eval_dataset_name, root_dataset_path, num_samples=num_samples
        )
    if is_instre_dataset(eval_dataset_name):
        return InstreDataset(eval_data_path, num_samples=num_samples)
    if is_copdays_dataset(eval_dataset_name):
        return CopyDaysDataset(
            data_path=eval_data_path,
            num_samples=num_samples,
            use_distractors=cfg.IMG_RETRIEVAL.USE_DISTRACTORS,
        )
    return InstanceRetrievalDataset(
        eval_data_path, eval_binary_path, num_samples=num_samples
    )
def step(self, iteration: int, **kwargs: Any) -> None: """ Perform the appropriate action at the given iteration. Args: iteration (int): the current iteration, ranged in [0, max_iter-1]. kwargs (Any): extra data to save, same as in :meth:`Checkpointer.save`. """ iteration = int(iteration) additional_state = {"iteration": iteration} additional_state.update(kwargs) if (iteration + 1) % self.period == 0: self.checkpointer.save("model_{:07d}".format(iteration), **additional_state) if self.max_to_keep is not None: self.recent_checkpoints.append(self.checkpointer.get_checkpoint_file()) # pyre-fixme[6]: Expected `int` for 1st param but got `Optional[int]`. # pyre-fixme[6]: Expected `int` for 1st param but got `Optional[int]`. if len(self.recent_checkpoints) > self.max_to_keep: file_to_delete = self.recent_checkpoints.pop(0) if PathManager.exists( file_to_delete ) and not file_to_delete.endswith("model_final.pth"): PathManager.rm(file_to_delete) if iteration >= self.max_iter - 1: # pyre-ignore self.checkpointer.save("model_final", **additional_state)
def build_retrieval_model(cfg):
    """
    Builds the model on 1-gpu and initializes from the weight.

    When ``cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE`` exists, the weights are
    loaded onto the "cuda" device and applied with
    ``init_model_from_consolidated_weights``. Otherwise the model keeps its
    random initialization, which is supported on purpose so random models
    can be benchmarked too.

    Returns:
        The built (and possibly initialized) model.
    """
    logging.info("Building model....")
    model = build_model(cfg.MODEL, cfg.OPTIMIZER)

    weights_cfg = cfg.MODEL.WEIGHTS_INIT
    init_weights_path = weights_cfg.PARAMS_FILE
    if PathManager.exists(init_weights_path):
        logging.info(f"Initializing model from: {init_weights_path}")
        # Open through PathManager so the load goes through the same path
        # handling as the existence check above.
        with PathManager.open(init_weights_path, "rb") as fopen:
            weights = torch.load(fopen, map_location=torch.device("cuda"))
        init_model_from_consolidated_weights(
            cfg,
            model,
            weights,
            state_dict_key_name=weights_cfg.get("STATE_DICT_KEY_NAME", None),
            skip_layers=weights_cfg.get("SKIP_LAYERS", []),
            replace_prefix=weights_cfg.get("REMOVE_PREFIX", None),
            append_prefix=weights_cfg.get("APPEND_PREFIX", None),
        )
    else:
        # A configured but missing weights file is reported explicitly, so
        # it is not mistaken for an intentional random initialization.
        if init_weights_path:
            logging.warning(f"PARAMS_FILE not found: {init_weights_path}")
        logging.warning("Model is randomly initialized....")
    logging.info(f"Model is:\n {model}")
    return model
def load_input_data(self, data_file, targets_file):
    """
    Load the features and targets (labels) from the given files.

    The features are converted to a float64 array. The number of rows in
    the features must equal the number of targets.

    Returns:
        tuple: ``(features, targets)``.
    """
    required_files = (
        (data_file, "Data file not found. Abort!"),
        (targets_file, "Targets file not found. Abort!"),
    )
    for path, error_message in required_files:
        assert PathManager.exists(path), error_message

    # load the features and the targets
    logging.info("loading features and targets...")
    targets = load_file(targets_file)
    raw_features = load_file(data_file)
    features = np.array(raw_features).astype(np.float64)

    num_features, num_targets = features.shape[0], targets.shape[0]
    assert num_features == num_targets, "Mismatched #images"
    logging.info(
        f"Loaded features: {features.shape} and targets: {targets.shape}")
    return features, targets
def step(self, iteration: int, **kwargs: Any):
    """
    Perform the appropriate action at the given iteration.

    Every ``self.period`` iterations a checkpoint named ``model_<iteration>``
    is saved. When ``self.max_to_keep`` is set, all checkpoint files except
    ``model_final.pth`` are sorted by name and only the last ``max_to_keep``
    are kept. A ``model_final`` checkpoint is saved once ``iteration``
    reaches ``self.max_iter - 1``.

    Args:
        iteration (int): the current iteration, ranged in [0, max_iter-1].
        kwargs (Any): extra data to save, same as in
            :meth:`Checkpointer.save`.
    """
    iteration = int(iteration)
    additional_state = {"iteration": iteration}
    additional_state.update(kwargs)

    if (iteration + 1) % self.period == 0:
        self.checkpointer.save("model_{:07d}".format(iteration),
                               **additional_state)

        if self.max_to_keep is not None:
            periodic_files = sorted(
                item
                for item in self.checkpointer.get_all_checkpoint_files()
                if not item.endswith("model_final.pth")
            )
            # Count the stale files explicitly. The slice `[:-max_to_keep]`
            # is empty for max_to_keep == 0 (because -0 == 0) and picks the
            # wrong files for negative values. A value <= 0 keeps no
            # periodic checkpoint.
            num_to_keep = max(self.max_to_keep, 0)
            num_stale = max(len(periodic_files) - num_to_keep, 0)
            for stale_file in periodic_files[:num_stale]:
                if PathManager.exists(stale_file):
                    PathManager.rm(stale_file)

    if iteration >= self.max_iter - 1:
        self.checkpointer.save("model_final", **additional_state)
def on_start(self, task) -> None:
    """
    Verify that the checkpoint folder exists before the task starts.

    The check runs only on the master process and is skipped when the task
    has a truthy ``test_only`` attribute.

    Raises:
        FileNotFoundError: if ``self.checkpoint_folder`` does not exist.
    """
    must_check = is_master() and not getattr(task, "test_only", False)
    if must_check and not PathManager.exists(self.checkpoint_folder):
        raise FileNotFoundError(
            "Checkpoint folder '{}' does not exist.".format(
                self.checkpoint_folder
            )
        )
def get_data_files(split, args):
    """
    Return the per-class list files for ``split`` found under
    ``<args.data_source_dir>/ImageSets/Main``.

    Args:
        split: "train" or "test"; with separate partitions any other value
            selects the val files.
        args: object providing ``data_source_dir`` and
            ``separate_partitions``.

    Returns:
        The validated list of data files for the requested split.
    """
    data_dir = f"{args.data_source_dir}/ImageSets/Main"
    assert PathManager.exists(data_dir), "Data: {} doesn't exist".format(
        data_dir)
    # The test files are looked up first; an empty result later triggers the
    # train/val fallback in the non-separate branch.
    test_data_files = glob(os.path.join(data_dir, "*_test.txt"))
    test_data_files = validate_files(test_data_files)
    if args.separate_partitions > 0:
        # Separate partitions: train, val and test each come from their own files.
        train_data_files = glob(os.path.join(data_dir, "*_train.txt"))
        val_data_files = glob(os.path.join(data_dir, "*_val.txt"))
        train_data_files = validate_files(train_data_files)
        val_data_files = validate_files(val_data_files)
        assert len(train_data_files) == len(val_data_files)
        if split == "train":
            data_files = train_data_files
        elif split == "test":
            data_files = test_data_files
        else:
            data_files = val_data_files
    else:
        # Merged partitions: train on trainval, evaluate on test.
        train_data_files = glob(os.path.join(data_dir, "*_trainval.txt"))
        if len(test_data_files) == 0:
            # For VOC2012 dataset, we have trainval, val and train data.
            train_data_files = glob(os.path.join(data_dir, "*_train.txt"))
            test_data_files = glob(os.path.join(data_dir, "*_val.txt"))
            test_data_files = validate_files(test_data_files)
        train_data_files = validate_files(train_data_files)
        data_files = train_data_files if (split == "train") else test_data_files
    # Equal counts are required; the message indicates one file per class is
    # expected on both sides.
    assert len(train_data_files) == len(test_data_files), "Missing classes"
    return data_files
def get_coco_imgs_labels_info(split, data_source_dir, args):
    """
    Build multi-label classification targets from the COCO 2014 instance
    annotations of one split and save them as numpy files.

    Args:
        split: "train" selects the ``train2014`` image folder; any other
            value selects ``val2014``. It is also used in the annotation and
            output file names.
        data_source_dir: root directory holding ``annotations/`` and the
            image folders.
        args: not used by this function.

    Returns:
        list: ``[img_info_out_path, label_info_out_path]``, the paths of the
        saved image-path array and label array.
    """
    # pycocotools is an optional dependency for VISSL
    from pycocotools.coco import COCO

    json_file = f"{data_source_dir}/annotations/instances_{split}2014.json"
    assert PathManager.exists(
        json_file), "Annotations file does not exist. Abort"
    # Close the annotation file deterministically instead of leaking the handle.
    with PathManager.open(json_file, "r") as fopen:
        json_data = json.load(fopen)
    image_index = [x["id"] for x in json_data["images"]]
    coco = COCO(json_file)

    num_cats = len(json_data["categories"])
    logging.info("partition: {} num_cats: {} num_images: {}".format(
        split, num_cats, len(image_index)))

    # Map the COCO category ids onto contiguous indices 0..num_cats-1.
    cat_ids = [x["id"] for x in json_data["categories"]]
    coco_to_me = {val: ind for ind, val in enumerate(cat_ids)}
    cat_names = [str(x["name"]) for x in json_data["categories"]]
    cat_id_to_name = dict(enumerate(cat_names))
    class_ids = cat_id_to_name.keys()
    assert len(list(class_ids)) == num_cats
    assert min(class_ids) == 0
    assert max(class_ids) == len(class_ids) - 1
    assert len(set(class_ids)) == len(class_ids)

    img_labels_map = {}
    num_classes = len(cat_names)
    for im_id in image_index:
        ann_ids = coco.getAnnIds(imgIds=im_id)
        entry = coco.imgs[im_id]
        img_name = entry["file_name"]
        objs = coco.loadAnns(ann_ids)
        valid_objs = get_valid_objs(entry, objs)
        if img_name not in img_labels_map:
            img_labels_map[img_name] = np.zeros(num_classes, dtype=np.int32)
        for obj in valid_objs:
            # label = 1 (present), 0 (not present)
            img_labels_map[img_name][coco_to_me[obj["category_id"]]] = 1

    train_imgs_path = f"{data_source_dir}/train2014"
    val_imgs_path = f"{data_source_dir}/val2014"
    prefix = train_imgs_path if split == "train" else val_imgs_path
    img_paths, img_labels = [], []
    for item in sorted(img_labels_map):
        img_paths.append(f"{prefix}/{item}")
        img_labels.append(img_labels_map[item])

    # save to the datasets folder and return the path
    output_dir = get_output_dir()
    img_info_out_path = f"{output_dir}/{split}_images.npy"
    label_info_out_path = f"{output_dir}/{split}_labels.npy"
    save_file(np.array(img_paths), img_info_out_path)
    save_file(np.array(img_labels), label_info_out_path)
    return [img_info_out_path, label_info_out_path]
def merge(self):
    """
    Merge all clip features of a video into one fixed-size matrix.

    For every entry of ``self.save_dir`` whose number of clip files lies in
    ``[self.min_length, self.max_length]``, the clips ``0.pth`` ..
    ``<n-1>.pth`` are stacked into a ``(n, self.dim)`` float32 tensor that
    is saved as ``<self.merge_dir>/<video_name>.pth``. Other videos only
    have their name printed.
    """
    if not PathManager.exists(self.merge_dir):
        PathManager.mkdirs(self.merge_dir)

    for video_name in PathManager.ls(self.save_dir):
        video_dir = os.path.join(self.save_dir, video_name)
        num_feats = len(PathManager.ls(video_dir))

        if not (self.min_length <= num_feats <= self.max_length):
            # TODO
            print(video_name)
            continue

        merged_feat = torch.zeros((num_feats, self.dim),
                                  dtype=torch.float32)
        for clip_idx in range(num_feats):
            clip_path = os.path.join(video_dir, f'{clip_idx}.pth')
            feat = torch.load(clip_path)
            merged_feat[clip_idx, :] = torch.from_numpy(feat)

        merged_path = os.path.join(self.merge_dir, f'{video_name}.pth')
        torch.save(merged_feat, merged_path)
def process_train_image(i, out_dir):
    """
    Compute (or reload) the descriptor of train image ``i`` and append it to
    ``train_features``.

    A descriptor cached at ``<out_dir>/<i>.npy`` is reused. Otherwise the
    image is loaded, run through ``model`` on the GPU, optionally pooled
    with RMAC, saved to that path and appended.
    """
    if i % LOG_FREQUENCY == 0:
        logging.info(f"Train Image: {i}")

    fname_out = f"{out_dir}/{i}.npy"
    if PathManager.exists(fname_out):
        # Reuse the cached descriptor.
        train_features.append(load_file(fname_out))
        return

    fname_in = train_dataset.get_filename(i)
    if is_revisited_dataset(train_dataset_name):
        img = image_helper.load_and_prepare_revisited_image(fname_in)
    elif is_whiten_dataset(train_dataset_name):
        img = image_helper.load_and_prepare_whitening_image(fname_in)
    else:
        img = image_helper.load_and_prepare_image(fname_in, roi=None)

    # Add a batch dimension and move to the GPU. The deprecated
    # torch.autograd.Variable wrapper is unnecessary for tensors.
    batch = img.unsqueeze(0).cuda()
    # the model output is a list always.
    activation_map = model(batch)[0].cpu()

    # once we have the features,
    # we can perform: rmac | gem pooling | l2 norm
    if cfg.IMG_RETRIEVAL.FEATS_PROCESSING_TYPE == "rmac":
        descriptors = get_rmac_descriptors(activation_map, spatial_levels)
    else:
        descriptors = activation_map

    # Convert once and use the same array for the cache and the result list.
    features = descriptors.data.numpy()
    save_file(features, fname_out)
    train_features.append(features)
def get_train_dataset(cfg, root_dataset_path, train_dataset_name,
                      eval_binary_path):
    """
    Build the train dataset used for PCA / whitening training.

    Returns ``None`` when ``cfg.IMG_RETRIEVAL.SHOULD_TRAIN_PCA_OR_WHITENING``
    is falsy. Otherwise the dataset path must exist and the dataset class is
    chosen from ``train_dataset_name``. In debug mode only 10 samples are
    requested from the datasets that accept ``num_samples``.
    """
    # We only create the train dataset if we need PCA or whitening training.
    if not cfg.IMG_RETRIEVAL.SHOULD_TRAIN_PCA_OR_WHITENING:
        return None

    train_data_path = f"{root_dataset_path}/{train_dataset_name}"
    assert PathManager.exists(
        train_data_path), f"Unknown path: {train_data_path}"

    num_samples = None
    if cfg.IMG_RETRIEVAL.DEBUG_MODE:
        num_samples = 10

    if is_revisited_dataset(train_dataset_name):
        return RevisitedInstanceRetrievalDataset(train_dataset_name,
                                                 root_dataset_path)
    if is_whiten_dataset(train_dataset_name):
        return WhiteningTrainingImageDataset(
            train_data_path,
            cfg.IMG_RETRIEVAL.WHITEN_IMG_LIST,
            num_samples=num_samples,
        )
    return InstanceRetrievalDataset(train_data_path,
                                    eval_binary_path,
                                    num_samples=num_samples)
def convert_and_save_model(args, replace_prefix):
    """
    Load a model state dict, rewrite the trunk parameter names and save the
    result as ``<args.output_dir>/converted_vissl_<args.output_name>.torch``.

    Args:
        args: object providing ``output_dir``, ``model_url_or_file`` and
            ``output_name``.
        replace_prefix: prefix handed to ``replace_module_prefix``. ``None``
            falls back to ``"_feature_blocks."``, the value that used to be
            hard-coded here.
    """
    assert PathManager.exists(
        args.output_dir), "Output directory does NOT exist"

    # load the model
    model_path = args.model_url_or_file
    if is_url(model_path):
        logger.info(f"Loading from url: {model_path}")
        model = load_state_dict_from_url(model_path)
    else:
        model = torch.load(model_path, map_location=torch.device("cpu"))

    # get the model trunk to rename
    if "classy_state_dict" in model.keys():
        model_trunk = model["classy_state_dict"]["base_model"]["model"][
            "trunk"]
    elif "model_state_dict" in model.keys():
        model_trunk = model["model_state_dict"]
    else:
        model_trunk = model
    logger.info(
        f"Input model loaded. Number of params: {len(model_trunk.keys())}")

    # convert the trunk, honoring the caller-supplied prefix instead of
    # silently ignoring the parameter.
    prefix = "_feature_blocks." if replace_prefix is None else replace_prefix
    converted_model = replace_module_prefix(model_trunk, prefix)
    logger.info(
        f"Converted model. Number of params: {len(converted_model.keys())}")

    # save the state
    output_filename = f"converted_vissl_{args.output_name}.torch"
    output_model_filepath = f"{args.output_dir}/{output_filename}"
    logger.info(f"Saving model: {output_model_filepath}")
    torch.save(converted_model, output_model_filepath)
    logger.info("DONE!")
def _restore_model_weights(self, model): """ If using a weights file to initialize the model, we load the weights and initialize the model. Since the weights file specified by user might not be VISSL trained weights, we expose several config options like APPEND_PREFIX, etc to allow successful loading of the weights. See MODEL.WEIGHTS_INIT description in vissl/config/defaults.yaml for details. """ params_from_file = self.config["MODEL"]["WEIGHTS_INIT"] init_weights_path = params_from_file["PARAMS_FILE"] assert init_weights_path, "Shouldn't call this when init_weight_path is empty" logging.info(f"Initializing model from: {init_weights_path}") if PathManager.exists(init_weights_path): weights = load_and_broadcast_checkpoint(init_weights_path, device=torch.device("cpu")) skip_layers = params_from_file.get("SKIP_LAYERS", []) replace_prefix = params_from_file.get("REMOVE_PREFIX", None) append_prefix = params_from_file.get("APPEND_PREFIX", None) state_dict_key_name = params_from_file.get("STATE_DICT_KEY_NAME", None) # we initialize the weights from this checkpoint. However, we # don't care about the other metadata like iteration number etc. # So the method only reads the state_dict init_model_from_weights( self.config, model, weights, state_dict_key_name=state_dict_key_name, skip_layers=skip_layers, replace_prefix=replace_prefix, append_prefix=append_prefix, ) return model
def _construct_loader(self):
    """
    Construct the video loader.

    Reads ``train.csv`` (train mode) or ``val.csv`` (any other mode) from
    the data directory, then repeats every video path and label
    ``self._num_clips`` times and records the clip index of each entry.
    """
    split_name = "train" if self.mode == "train" else "val"
    path_to_file = os.path.join(
        self.cfg.DATA.PATH_TO_DATA_DIR,
        "{}.csv".format(split_name),
    )
    assert PathManager.exists(path_to_file), "{} dir not found".format(
        path_to_file)

    videos, labels = utils.load_image_lists(path_to_file,
                                            self.cfg.DATA.PATH_PREFIX,
                                            return_list=True)
    self._path_to_videos = videos
    self._labels = labels

    if self.mode != "train":
        # Form video-level labels from frame level annotations.
        self._labels = utils.convert_to_video_level_labels(self._labels)

    # Each entry is repeated once per clip, keeping the original order.
    self._path_to_videos = [
        video for video in self._path_to_videos
        for _ in range(self._num_clips)
    ]
    self._labels = [
        label for label in self._labels for _ in range(self._num_clips)
    ]
    self._spatial_temporal_idx = [
        clip for _ in range(len(self._labels))
        for clip in range(self._num_clips)
    ]

    logger.info(
        "Charades dataloader constructed (size: {}) from {}".format(
            len(self._path_to_videos), path_to_file))
def has_checkpoint(self) -> bool:
    """
    Returns:
        bool: whether the ``last_checkpoint`` marker file exists in
        ``self.save_dir``.
    """
    marker_path = os.path.join(self.save_dir, "last_checkpoint")
    marker_found = PathManager.exists(marker_path)
    return marker_found
def _construct_loader(self):
    """
    Construct the video loader.

    Reads ``<mode>.txt`` from the data directory. Each line holds four
    whitespace-separated fields, ``path start end label``. Every line is
    expanded into ``self._num_clips`` entries of ``_path_to_videos``,
    ``_labels``, ``_duration`` and ``_spatial_temporal_idx``. The paths are
    finally stored as a numpy array of byte strings.
    """
    path_to_file = os.path.join(self.cfg.DATA.PATH_TO_DATA_DIR,
                                "{}.txt".format(self.mode))
    assert PathManager.exists(path_to_file), "{} dir not found".format(
        path_to_file)

    self._path_to_videos = []
    self._labels = []
    self._duration = []
    self._spatial_temporal_idx = []
    with PathManager.open(path_to_file, "r") as f:
        for path_label in f.read().splitlines():
            path, start, end, label = path_label.split()
            # Parse once per line; the values are identical for every clip.
            video_path = os.path.join(self.cfg.DATA.PATH_PREFIX, path)
            video_label = int(label)
            duration = (float(start), float(end))
            for idx in range(self._num_clips):
                self._path_to_videos.append(video_path)
                self._labels.append(video_label)
                self._duration.append(duration)
                self._spatial_temporal_idx.append(idx)

    assert (len(self._path_to_videos) >
            0), "Failed to load Alimedia split {} from {}".format(
                self._split_idx, path_to_file)
    logger.info(
        "Constructing Alimedia dataloader (size: {}) from {}".format(
            len(self._path_to_videos), path_to_file))
    # np.string_ was removed in NumPy 2.0; np.bytes_ is the same type and is
    # available on NumPy 1.x as well.
    self._path_to_videos = np.array(self._path_to_videos, dtype=np.bytes_)
def _evaluate_checkpoints(self):
    """
    Evaluate every checkpoint listed in ``self.evaluation_results`` whose
    ``<name>.torch`` file exists in the training checkpoint directory.
    """
    for checkpoint_name, benchmarks in self.evaluation_results.items():
        # TODO: Can we possibly retrieve this from CheckpointWriter, to
        # consolidate logic.
        checkpoint_path = os.path.join(
            self.training_config.CHECKPOINT.DIR,
            f"{checkpoint_name}.torch",
        )
        if not PathManager.exists(checkpoint_path):
            continue
        self._evaluate_checkpoint(checkpoint_path, benchmarks)
def load_monolingual_dataset(
    bin_path,
    is_source=False,
    char_source_dict=None,
    log_verbose=True,
    num_examples_limit: Optional[int] = None,
):
    """
    Load a binarized monolingual dataset from ``bin_path``.

    A word/char dataset is created when ``char_source_dict`` is given and
    ``is_source`` is true. Otherwise an indexed dataset is created, limited
    to ``num_examples_limit`` examples.

    Raises:
        ValueError: if ``bin_path`` does not exist.
    """
    if log_verbose:
        print("Starting to load binarized monolingual data file.", flush=True)

    if not PathManager.exists(bin_path):
        raise ValueError(f"Monolingual binary path {bin_path} not found!")

    use_char_dataset = char_source_dict is not None and is_source
    if use_char_dataset:
        dataset = char_data.InMemoryNumpyWordCharDataset.create_from_file(
            path=bin_path)
    else:
        dataset = pytorch_translate_data.InMemoryIndexedDataset.create_from_file(
            path=bin_path, num_examples_limit=num_examples_limit)

    if log_verbose:
        print(f"Finished loading dataset {bin_path}", flush=True)

    side = "source" if is_source else "target"
    print(f"| Loaded {len(dataset)} monolingual examples for {side}")
    return dataset
def convert_to_coco_json(dataset_name, output_file, allow_cached=True):
    """
    Converts dataset into COCO format and saves it to a json file.
    dataset_name must be registered in DatasetCatalog and in detectron2's
    standard format.

    Args:
        dataset_name: reference from the config file to the catalogs
            must be registered in DatasetCatalog and in detectron2's
            standard format
        output_file: path of json file that will be saved to
        allow_cached: if json file is already present then skip conversion
    """
    # TODO: The dataset or the conversion script *may* change,
    # a checksum would be useful for validating the cached data

    # A bare filename has an empty dirname; there is no directory to create
    # in that case and mkdirs("") would fail.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        PathManager.mkdirs(output_dir)

    # The lock is held for both the cache check and the write.
    with file_lock(output_file):
        if PathManager.exists(output_file) and allow_cached:
            logger.warning(
                f"Using previously cached COCO format annotations at '{output_file}'. "
                "You need to clear the cache file if your dataset has been modified."
            )
        else:
            logger.info(
                f"Converting annotations of dataset '{dataset_name}' to COCO format ...)"
            )
            coco_dict = convert_to_coco_dict(dataset_name)

            logger.info(
                f"Caching COCO format annotations at '{output_file}' ...")
            with PathManager.open(output_file, "w") as f:
                json.dump(coco_dict, f)
def get_local_path(input_file, dest_dir):
    """
    Return the local copy of ``input_file`` when one is available.

    - If input_file is a file, the candidate is dest_dir/filename.
    - If input_file is a directory, the candidate is the directory of the
      same name under dest_dir (under the SLURM directory derived from
      dest_dir when SLURM_JOBID is set), and only when it contains a
      ``copy_complete`` file.
    - If no candidate was found or the candidate does not exist,
      input_file is returned as is.
    """
    local_copy = ""
    if PathManager.isfile(input_file):
        local_copy = os.path.join(dest_dir, os.path.basename(input_file))
    elif PathManager.isdir(input_file):
        # Last path component, ignoring leading/trailing slashes.
        data_name = input_file.strip("/").rsplit("/", 1)[-1]
        if "SLURM_JOBID" in os.environ:
            dest_dir = get_slurm_dir(dest_dir)
        dest_dir = os.path.join(dest_dir, data_name)
        if PathManager.isfile(os.path.join(dest_dir, "copy_complete")):
            local_copy = dest_dir

    return local_copy if PathManager.exists(local_copy) else input_file
def _construct_loader(self):
    """
    Construct the video loader.

    Reads ``<mode>.csv`` from the data directory. Each line holds a path and
    a label separated by ``cfg.DATA.PATH_LABEL_SEPARATOR``. Every line is
    expanded into ``self._num_clips`` entries, and an empty metadata dict is
    registered for each entry in ``self._video_meta``.
    """
    path_to_file = os.path.join(self.cfg.DATA.PATH_TO_DATA_DIR,
                                "{}.csv".format(self.mode))
    assert PathManager.exists(path_to_file), "{} dir not found".format(
        path_to_file)

    self._path_to_videos = []
    self._labels = []
    self._spatial_temporal_idx = []
    with PathManager.open(path_to_file, "r") as f:
        lines = f.read().splitlines()
        for clip_idx, path_label in enumerate(lines):
            fields = path_label.split(self.cfg.DATA.PATH_LABEL_SEPARATOR)
            assert (len(fields) == 2)
            path, label = fields
            for idx in range(self._num_clips):
                self._path_to_videos.append(
                    os.path.join(self.cfg.DATA.PATH_PREFIX, path))
                self._labels.append(int(label))
                self._spatial_temporal_idx.append(idx)
                # One metadata slot per (video, clip) pair.
                self._video_meta[clip_idx * self._num_clips + idx] = {}

    assert (len(self._path_to_videos) >
            0), "Failed to load Kinetics split {} from {}".format(
                self._split_idx, path_to_file)
    logger.info(
        "Constructing kinetics dataloader (size: {}) from {}".format(
            len(self._path_to_videos), path_to_file))
def main():
    """
    Command-line entry point that samples low-shot data.

    Parses the options, loads the targets file (which must exist) and calls
    ``generate_low_shot_samples`` with sample ids ``1..num_samples``.
    """
    parser = argparse.ArgumentParser(
        description="Sample Low-shot data for Places/VOC")

    # String options that all default to None, registered in this order.
    string_options = (
        (
            "--dataset_name",
            "choose between places | voc. These are valid choices if your dataset is similar",
        ),
        (
            "--layername",
            "Layer for which low shot is being general. Valid for voc07 only",
        ),
        ("--targets_data_file", "Numpy file containing image labels"),
        ("--images_data_file", "Numpy file containing images information"),
        ("--output_path", "path where low-shot samples should be saved"),
    )
    for flag, help_text in string_options:
        parser.add_argument(flag, type=str, default=None, help=help_text)

    parser.add_argument(
        "--k_values",
        type=str,
        default="1,2,4,8,16,32,64,96",
        help="Low-shot k-values for svm testing.",
    )
    parser.add_argument("--num_samples",
                        type=int,
                        default=5,
                        help="Number of independent samples.")
    opts = parser.parse_args()

    assert PathManager.exists(
        opts.targets_data_file), "Target file not found. Abort"
    targets = load_file(opts.targets_data_file)

    # Sample ids are 1-based.
    sample_ids = list(range(1, opts.num_samples + 1))
    generate_low_shot_samples(
        opts.dataset_name,
        targets,
        opts.k_values,
        sample_ids,
        opts.output_path,
        opts.layername,
        opts.images_data_file,
    )