Example #1
def get_dataset(clazz, ext='', override=False):
    # ====== all paths ====== #
    name = clazz.get_name(ext) + '.zip'
    path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
    zip_path = clazz.get_zip_path(ext)
    out_path = clazz.get_ds_path(ext)
    # ====== check out_path ====== #
    if os.path.isfile(out_path):
        raise RuntimeError("Found a file at path: %s, we need a folder "
                           "to unzip downloaded files." % out_path)
    elif os.path.isdir(out_path):
        if override or len(os.listdir(out_path)) == 0:
            shutil.rmtree(out_path)
        else:
            return Dataset(out_path, read_only=True)
    # ====== download the file ====== #
    if os.path.exists(zip_path) and override:
        os.remove(zip_path)
    if not os.path.exists(zip_path):
        get_file(name, path, DataLoader.BASE_DIR)
    # ====== unzip dataset ====== #
    unzip_aes(in_path=zip_path, out_path=out_path)
    ds = Dataset(out_path, read_only=True)
    if os.path.exists(zip_path):
        os.remove(zip_path)
    return ds
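`get_file`, `unzip_aes`, `Dataset`, and `DataLoader` come from the surrounding library and are not shown here. For reference, the same cache-or-download pattern can be sketched with the standard library alone; the function below is a minimal illustration under that assumption, not the library's implementation:

import os
import shutil
import urllib.request
import zipfile

def fetch_and_unpack(url, zip_path, out_path, override=False):
    # reuse an existing, non-empty output folder unless overriding
    if os.path.isdir(out_path) and os.listdir(out_path) and not override:
        return out_path
    if os.path.isdir(out_path):
        shutil.rmtree(out_path)
    # download the archive only when it is not already cached
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    # unpack, then drop the archive to free disk space
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(out_path)
    os.remove(zip_path)
    return out_path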
Example #2
def get_dataset(clazz, ext='', override=False):
  # ====== all paths ====== #
  name = clazz.get_name(ext) + '.zip'
  path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
  zip_path = clazz.get_zip_path(ext)
  out_path = clazz.get_ds_path(ext)
  # ====== check out_path ====== #
  if os.path.isfile(out_path):
    raise RuntimeError("Found a file at path: %s, we need a folder "
                       "to unzip downloaded files." % out_path)
  elif os.path.isdir(out_path):
    if override or len(os.listdir(out_path)) == 0:
      shutil.rmtree(out_path)
    else:
      return Dataset(out_path, read_only=True)
  # ====== download the file ====== #
  if os.path.exists(zip_path) and override:
    os.remove(zip_path)
  if not os.path.exists(zip_path):
    get_file(name, path, DataLoader.BASE_DIR)
  # ====== unzip dataset ====== #
  unzip_aes(in_path=zip_path, out_path=out_path)
  ds = Dataset(out_path, read_only=True)
  if ds.md5 != clazz.md5():
    ds.close()
    shutil.rmtree(out_path)
    raise RuntimeError("Incorrect password for loading DIGITS dataset")
  else:
    os.remove(zip_path)
  return ds
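This variant differs from Example #1 by one step: after extraction it compares the dataset's checksum against the expected value and discards the folder on mismatch (`ds.md5` and `clazz.md5()` are library-specific). The same integrity check for a plain file can be written with `hashlib`; the file name below is illustrative:

import hashlib

def file_md5(path, chunk_size=8192):
    # stream the file so large archives need not fit in memory
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# compare against a published checksum before trusting the data, e.g.
# if file_md5('dataset.zip') != expected_md5: raise RuntimeError(...)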
Example #3
def load_voxceleb_list():
    link = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL3ZveGNlbGViX2xpc3RzLnppcA==\n'
    link = str(base64.decodebytes(link), 'utf-8')
    ds_path = get_datasetpath(name='voxceleb_lists',
                              root='~',
                              is_folder=False,
                              override=False)
    if not os.path.exists(ds_path):
        path = get_file(fname=os.path.basename(link),
                        origin=link,
                        outdir=get_datasetpath(root='~'))
        unzip_folder(zip_path=path,
                     out_path=os.path.dirname(path),
                     remove_zip=True)
    return Dataset(ds_path, read_only=True)
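The download link is stored base64-encoded, which is light obfuscation rather than encryption. Decoding it is a standard-library one-liner; the literal above decodes to a plain S3 URL:

import base64

link = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL3ZveGNlbGViX2xpc3RzLnppcA==\n'
print(str(base64.decodebytes(link), 'utf-8'))
# https://s3.amazonaws.com/ai-datasets/voxceleb_lists.zip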
Example #4
def load_sre_list():
    link = b'aHR0cHM6Ly9zMy5hbWF6b25hd3MuY29tL2FpLWRhdGFzZXRzL1NSRV9GSUxFUy56aXA=\n'
    link = str(base64.decodebytes(link), 'utf-8')
    ds_path = get_datasetpath(name='SRE_FILES',
                              root='~',
                              is_folder=False,
                              override=False)
    if os.path.exists(ds_path) and len(os.listdir(ds_path)) != 24:
        shutil.rmtree(ds_path)
    if not os.path.exists(ds_path):
        path = get_file(fname=os.path.basename(link),
                        origin=link,
                        outdir=get_datasetpath(root='~'))
        unzip_folder(zip_path=path, out_path=ds_path, remove_zip=True)
    return Dataset(ds_path, read_only=True)
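Unlike Example #3, this loader also invalidates a stale cache: if the folder exists but does not hold exactly 24 entries (evidently the expected size of a complete SRE_FILES extraction), it is deleted and fetched again. The heuristic in isolation, assuming the expected count is known in advance:

import os

def is_complete(ds_path, expected=24):
    # a partially extracted folder holds fewer entries than expected
    return os.path.isdir(ds_path) and len(os.listdir(ds_path)) == expected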
Example #5
def save_cache(self, path, name=None, dtype=None, batch_size=1024):
  """ Save all preprocessed data to a Dataset

  Parameters
  ----------
  path: string
      path to a folder
  name: None, or list of string
      specific name for each returned `numpy.ndarray` during iteration
  dtype: None, or list of dtype, or single dtype
      specific dtype for all or each of the returned `numpy.ndarray`
      during iteration
  batch_size: int
      number of samples per batch (larger batches iterate faster)

  Note
  ----
  Only returned `numpy.ndarray` arrays are saved
  """
  from odin.fuel.dataset import Dataset
  if not is_string(path):
    raise ValueError("`path` must be a string path to a folder.")
  if os.path.exists(path) and os.path.isfile(path):
    raise ValueError("`path` is a file, a folder is required for "
                     "saving all cache data.")
  # ====== start caching ====== #
  prog = Progbar(target=len(self),
                 name='Saving cache of preprocessed data',
                 print_report=True, print_summary=True)
  ds = Dataset(path, override=True)
  with self.set_batch_context(batch_size=int(batch_size), seed=None,
                              start=0, end=-1, shuffle_level=0):
    for X in self:
      if not isinstance(X, (tuple, list)):
        X = (X,)
      n = 0
      i = 0
      # saving preprocessed data
      for x in X:
        if isinstance(x, np.ndarray):
          # checking name
          if name is None:
            x_name = 'X%d' % i
          else:
            x_name = name[i]
          # checking dtype
          if isinstance(dtype, (tuple, list)):
            x = x.astype(dtype[i])
          elif dtype is not None:
            x = x.astype(dtype)
          # saving to the dataset
          if x_name in ds:
            ds[x_name].append(x)
          else:
            ds[(x_name, 'memmap')] = x
          # update samples count, and data count
          n = x.shape[0]
          i += 1
      # print progress
      prog.add(n)
  # ====== flush and close everything ====== #
  ds.flush()
  ds.close()
  # NOTE: `str(self)` is text, so the README must be opened in text mode
  with open(os.path.join(path, 'README'), 'w') as f:
    f.write(str(self))
  # ====== check one more time ====== #
  ds = Dataset(path, read_only=True)
  print(ds)
  print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
  ds.close()
  return self
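`ds[x_name].append(x)` on a `'memmap'` entry is specific to `odin.fuel.dataset.Dataset`. As a rough analogue of the same batch-append caching loop, here is a sketch using resizable `h5py` datasets; the file name, shapes, and random data are assumptions for illustration, not odin's API:

import h5py
import numpy as np

def append_batch(h5file, name, batch):
    if name not in h5file:
        # first batch: create a dataset that can grow along axis 0
        h5file.create_dataset(name, data=batch,
                              maxshape=(None,) + batch.shape[1:],
                              chunks=True)
    else:
        dset = h5file[name]
        n = dset.shape[0]
        dset.resize(n + batch.shape[0], axis=0)
        dset[n:] = batch

with h5py.File('cache.h5', 'w') as f:
    for _ in range(3):  # stands in for iterating the feeder
        append_batch(f, 'X0', np.random.rand(1024, 13).astype('float32'))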