def featurize(self, input_files, labels=None, weights=None, in_memory=False):
    """Featurizes image files.

    Parameters
    ----------
    input_files: list
        Each file in this list should either be of a supported image format
        (.png, .tif only for now) or of a compressed folder of image files
        (only .zip for now). Directories are traversed and their entries are
        handled the same way. A value that is not a list is treated as a
        one-element list.
    labels: optional (default None)
        Passed unchanged as `y` to the returned dataset.
    weights: optional (default None)
        Passed unchanged as `w` to the returned dataset.
    in_memory: bool
        If true, return in-memory NumpyDataset. Else return ImageDataset.

    Returns
    -------
    NumpyDataset or ImageDataset
        Dataset built from the collected image file paths, which are also
        used as the dataset ids.

    Raises
    ------
    ValueError
        If an entry is neither a directory, a .zip archive, nor a file with
        a supported image extension.
    """
    # TODO(rbharath): Add support for more extensions
    supported_extensions = (".png", ".tif")

    if not isinstance(input_files, list):
        input_files = [input_files]

    image_files = []
    # Directories may be nested: entries found inside a directory are queued
    # in `remainder` and processed on the next pass of the outer loop.
    while input_files:
        remainder = []
        for input_file in input_files:
            extension = os.path.splitext(input_file)[1].lower()
            if os.path.isdir(input_file):
                remainder += [
                    os.path.join(input_file, subfile)
                    for subfile in os.listdir(input_file)
                ]
            elif extension == ".zip":
                # The extracted files must outlive this call because their
                # paths are handed to the returned dataset, so the temporary
                # directory is not removed here.
                zip_dir = tempfile.mkdtemp()
                # Context manager guarantees the archive handle is closed
                # even if extraction fails part-way.
                with zipfile.ZipFile(input_file, 'r') as zip_ref:
                    zip_ref.extractall(path=zip_dir)
                    member_names = zip_ref.namelist()
                for name in member_names:
                    zip_file = os.path.join(zip_dir, name)
                    # Archive members with other extensions (including
                    # directory entries) are skipped silently.
                    if os.path.splitext(zip_file)[1].lower() in supported_extensions:
                        image_files.append(zip_file)
            elif extension in supported_extensions:
                image_files.append(input_file)
            else:
                raise ValueError("Unsupported file format")
        input_files = remainder

    if in_memory:
        return NumpyDataset(self.load_img(image_files),
                            y=labels,
                            w=weights,
                            ids=image_files)
    return ImageDataset(image_files, y=labels, w=weights, ids=image_files)
def create_dataset(self,
                   inputs: Union[OneOrMany[str], Tuple[Any]],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192,
                   in_memory: bool = False) -> Dataset:
    """Creates and returns a `Dataset` object by featurizing provided image files and labels/weights.

    Parameters
    ----------
    inputs: `Union[OneOrMany[str], Tuple[Any]]`
        The inputs provided should be one of the following

        - filename
        - list of filenames
        - Tuple (list of filenames, labels)
        - Tuple (list of filenames, labels, weights)

        Each file in a given list of filenames should either be of a supported
        image format (.png, .tif only for now) or of a compressed folder of
        image files (only .zip for now). Directories are traversed and their
        entries are handled the same way. If `labels` or `weights` are
        provided, they must correspond to the sorted order of all filenames
        provided, with one label/weight per file.
    data_dir: str, optional (default None)
        Directory to store featurized dataset. Only used when `in_memory` is
        true.
    shard_size: int, optional (default 8192)
        Shard size when loading data. Only used when `in_memory` is true and
        `data_dir` is given.
    in_memory: bool, optional (default False)
        If true, return in-memory NumpyDataset (or a DiskDataset when
        `data_dir` is given). Else return ImageDataset.

    Returns
    -------
    Dataset
        A `Dataset` object containing a featurized representation of data
        from `input_files`, `labels`, and `weights`.

    Raises
    ------
    ValueError
        If `inputs` is a tuple whose length is not 1, 2, or 3, or if an entry
        is neither a directory, a .zip archive, nor a file with a supported
        image extension.
    """
    # TODO(rbharath): Add support for more extensions
    supported_extensions = (".png", ".tif")

    labels, weights = None, None
    if isinstance(inputs, tuple):
        if len(inputs) == 1:
            input_files = inputs[0]
        elif len(inputs) == 2:
            input_files, labels = inputs
        elif len(inputs) == 3:
            input_files, labels, weights = inputs
        else:
            raise ValueError("Input must be a tuple of length 1, 2, or 3")
    else:
        input_files = inputs
    # A bare filename (given directly or as the first tuple element) is
    # normalized to a one-element list.
    if isinstance(input_files, str):
        input_files = [input_files]

    image_files = []
    # Sometimes zip files contain directories within. Traverse directories:
    # entries found inside a directory are queued in `remainder` and
    # processed on the next pass of the outer loop.
    while len(input_files) > 0:
        remainder = []
        for input_file in input_files:
            extension = os.path.splitext(input_file)[1].lower()
            if os.path.isdir(input_file):
                remainder += [
                    os.path.join(input_file, subfile)
                    for subfile in os.listdir(input_file)
                ]
            elif extension == ".zip":
                # The extracted files must outlive this call because their
                # paths are handed to the returned dataset, so the temporary
                # directory is not removed here.
                zip_dir = tempfile.mkdtemp()
                # Context manager guarantees the archive handle is closed
                # even if extraction fails part-way.
                with zipfile.ZipFile(input_file, 'r') as zip_ref:
                    zip_ref.extractall(path=zip_dir)
                    member_names = zip_ref.namelist()
                for name in member_names:
                    zip_file = os.path.join(zip_dir, name)
                    # Archive members with other extensions (including
                    # directory entries) are skipped silently.
                    if os.path.splitext(zip_file)[1].lower() in supported_extensions:
                        image_files.append(zip_file)
            elif extension in supported_extensions:
                image_files.append(input_file)
            else:
                raise ValueError("Unsupported file format")
        input_files = remainder

    # Sort image files so that labels/weights line up with a deterministic
    # file order.
    image_files = sorted(image_files)

    if not in_memory:
        return ImageDataset(image_files, y=labels, w=weights, ids=image_files)

    if data_dir is None:
        return NumpyDataset(load_image_files(image_files),
                            y=labels,
                            w=weights,
                            ids=image_files)

    dataset = DiskDataset.from_numpy(load_image_files(image_files),
                                     y=labels,
                                     w=weights,
                                     ids=image_files,
                                     tasks=self.tasks,
                                     data_dir=data_dir)
    if shard_size is not None:
        dataset.reshard(shard_size)
    return dataset