def get_dataset(clazz, ext='', override=False):
    """Download, decrypt and unzip the dataset archive for the given loader
    class `clazz`, returning it as a read-only `Dataset`. A previously
    extracted dataset is reused unless `override=True`."""
    # ====== all paths ====== #
    name = clazz.get_name(ext) + '.zip'
    path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
    zip_path = clazz.get_zip_path(ext)
    out_path = clazz.get_ds_path(ext)
    # ====== check out_path ====== #
    if os.path.isfile(out_path):
        raise RuntimeError("Found a file at path: %s, but a folder is needed "
                           "to unzip the downloaded files." % out_path)
    elif os.path.isdir(out_path):
        if override or len(os.listdir(out_path)) == 0:
            shutil.rmtree(out_path)
        else:
            return Dataset(out_path, read_only=True)
    # ====== download the file ====== #
    if os.path.exists(zip_path) and override:
        os.remove(zip_path)
    if not os.path.exists(zip_path):
        get_file(name, path, DataLoader.BASE_DIR)
    # ====== unzip dataset ====== #
    unzip_aes(in_path=zip_path, out_path=out_path)
    ds = Dataset(out_path, read_only=True)
    if os.path.exists(zip_path):
        os.remove(zip_path)
    return ds
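# A minimal sketch (stdlib only) of the `ORIGIN` convention used above: the
# download origin is stored base64-encoded and decoded at call time. The URL
# and dataset name below are made-up placeholders, not the real origin.
def _origin_example():
    import base64
    origin = base64.encodebytes(b'https://example.com/datasets/')
    base_url = base64.decodebytes(origin).decode()
    # e.g. 'https://example.com/datasets/mnist.zip'
    return base_url + 'mnist.zip'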
def get_dataset(clazz, ext='', override=False):
    """Variant of `get_dataset` that additionally verifies the unzipped
    dataset against the loader's MD5 checksum; a mismatch indicates the
    AES password used for decryption was wrong (used for DIGITS)."""
    # ====== all paths ====== #
    name = clazz.get_name(ext) + '.zip'
    path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
    zip_path = clazz.get_zip_path(ext)
    out_path = clazz.get_ds_path(ext)
    # ====== check out_path ====== #
    if os.path.isfile(out_path):
        raise RuntimeError("Found a file at path: %s, but a folder is needed "
                           "to unzip the downloaded files." % out_path)
    elif os.path.isdir(out_path):
        if override or len(os.listdir(out_path)) == 0:
            shutil.rmtree(out_path)
        else:
            return Dataset(out_path, read_only=True)
    # ====== download the file ====== #
    if os.path.exists(zip_path) and override:
        os.remove(zip_path)
    if not os.path.exists(zip_path):
        get_file(name, path, DataLoader.BASE_DIR)
    # ====== unzip dataset ====== #
    unzip_aes(in_path=zip_path, out_path=out_path)
    ds = Dataset(out_path, read_only=True)
    if ds.md5 != clazz.md5():
        ds.close()
        shutil.rmtree(out_path)
        raise RuntimeError("Incorrect password for loading DIGITS dataset")
    else:
        os.remove(zip_path)
    return ds
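# The md5 comparison above guards against a wrong decryption password leaving
# garbage on disk. A minimal sketch of such a checksum, using only `hashlib`
# (the actual `Dataset.md5` / `clazz.md5()` implementations may differ):
def _file_md5_example(path, chunk_size=1 << 20):
    import hashlib
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # stream the file in chunks so large archives fit in memory
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()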
def load_voxceleb_list():
    """Download (once) and load the VoxCeleb file lists as a read-only
    `Dataset`."""
    link = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL3ZveGNlbGViX2xpc3RzLnppcA==\n'
    link = str(base64.decodebytes(link), 'utf-8')
    ds_path = get_datasetpath(name='voxceleb_lists', root='~',
                              is_folder=False, override=False)
    if not os.path.exists(ds_path):
        path = get_file(fname=os.path.basename(link),
                        origin=link,
                        outdir=get_datasetpath(root='~'))
        unzip_folder(zip_path=path, out_path=os.path.dirname(path),
                     remove_zip=True)
    return Dataset(ds_path, read_only=True)
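# `get_file` and `unzip_folder` come from odin's utilities; a rough stdlib
# stand-in for this download-then-extract pattern would look like the sketch
# below (illustration only, not the odin implementation):
def _fetch_and_unzip_example(url, out_dir):
    import os
    import zipfile
    from urllib.request import urlretrieve
    os.makedirs(out_dir, exist_ok=True)
    zip_path = os.path.join(out_dir, os.path.basename(url))
    if not os.path.exists(zip_path):  # skip the download when already cached
        urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(out_dir)
    os.remove(zip_path)  # mirror remove_zip=True
    return out_dir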
def load_sre_list():
    """Download (once) and load the SRE file lists as a read-only
    `Dataset`."""
    link = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL1NSRV9GSUxFUy56aXA=\n'
    link = str(base64.decodebytes(link), 'utf-8')
    ds_path = get_datasetpath(name='SRE_FILES', root='~',
                              is_folder=False, override=False)
    # the extracted folder is expected to hold exactly 24 entries; any other
    # count is treated as a stale cache and triggers a fresh download
    if os.path.exists(ds_path) and len(os.listdir(ds_path)) != 24:
        shutil.rmtree(ds_path)
    if not os.path.exists(ds_path):
        path = get_file(fname=os.path.basename(link),
                        origin=link,
                        outdir=get_datasetpath(root='~'))
        unzip_folder(zip_path=path, out_path=ds_path, remove_zip=True)
    return Dataset(ds_path, read_only=True)
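# The 24-entry check above is a cheap integrity test. Generalized as a sketch
# (the constant 24 is specific to the SRE_FILES archive; the helper name is
# illustrative):
def _refresh_if_stale_example(ds_path, expected_entries):
    import os
    import shutil
    if os.path.isdir(ds_path) and len(os.listdir(ds_path)) != expected_entries:
        shutil.rmtree(ds_path)  # force a fresh download on the next load
    return not os.path.exists(ds_path)  # True when a re-download is needed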
def save_cache(self, path, name=None, dtype=None, batch_size=1024):
    """ Save all preprocessed data to a Dataset

    Parameters
    ----------
    path: string
        path to a folder
    name: None, or list of string
        specific name for each `numpy.ndarray` returned during iteration
    dtype: None, or list of dtype, or single dtype
        specific dtype for all, or for each, of the `numpy.ndarray`
        returned during iteration
    batch_size: int
        number of samples in each batch (larger batches iterate faster)

    Note
    ----
    Only returned `numpy.ndarray` are saved
    """
    from odin.fuel.dataset import Dataset
    if not is_string(path):
        raise ValueError("`path` must be a string path to a folder.")
    if os.path.exists(path) and os.path.isfile(path):
        raise ValueError("`path` is a file; a folder is required for "
                         "saving all cache data.")
    # ====== start caching ====== #
    prog = Progbar(target=len(self),
                   name='Saving cache of preprocessed data',
                   print_report=True, print_summary=True)
    ds = Dataset(path, override=True)
    with self.set_batch_context(batch_size=int(batch_size), seed=None,
                                start=0, end=-1, shuffle_level=0):
        for X in self:
            if not isinstance(X, (tuple, list)):
                X = (X,)
            n = 0
            i = 0
            # saving preprocessed data
            for x in X:
                if isinstance(x, np.ndarray):
                    # checking name
                    if name is None:
                        x_name = 'X%d' % i
                    else:
                        x_name = name[i]
                    # checking dtype
                    if isinstance(dtype, (tuple, list)):
                        x = x.astype(dtype[i])
                    elif dtype is not None:
                        x = x.astype(dtype)
                    # saving to the dataset
                    if x_name in ds:
                        ds[x_name].append(x)
                    else:
                        ds[(x_name, 'memmap')] = x
                    # update the samples count and the data index
                    n = x.shape[0]
                    i += 1
            # print progress
            prog.add(n)
    # ====== flush and close everything ====== #
    ds.flush()
    ds.close()
    with open(os.path.join(path, 'README'), 'w') as f:
        f.write(str(self))
    # ====== check one more time ====== #
    ds = Dataset(path, read_only=True)
    print(ds)
    print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
    ds.close()
    return self
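# Hypothetical usage of `save_cache` (the feeder variable and paths below are
# illustrative, not from the library):
#
#     feeder.save_cache(path='/tmp/mnist_cache',
#                       name=['X', 'y'],            # one name per returned array
#                       dtype=['float32', 'int32'],  # one dtype per array
#                       batch_size=2048)
#     ds = Dataset('/tmp/mnist_cache', read_only=True)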