Example #1
def _split_list(x, rng, train=0.6, idfunc=None, inc_test=True):
  # ====== shuffle input ====== #
  if idfunc is not None:
    x_id = defaultdict(list)
    for i in x:
      x_id[idfunc(i)].append(i)
  else:
    x_id = {i: [j] for i, j in enumerate(x)}
  # shuffle ID(s)
  id_list = list(x_id.keys())
  rng.shuffle(id_list)
  # ====== split ====== #
  N = len(id_list)
  if N == 1:
    raise ValueError("Only found 1 sample, cannot split")
  train = int(np.floor(float(train) * N))
  if train >= N:
    raise ValueError("train proportion must be larger than 0 and smaller than 1.")
  valid = (N - train) // (2 if inc_test else 1)
  # ====== return split data ====== #
  rets = (flatten_list((x_id[i] for i in id_list[:train]), level=1),
          flatten_list((x_id[i] for i in id_list[train: train + valid]), level=1))
  if inc_test:
    rets += (flatten_list((x_id[i] for i in id_list[train + valid:]), level=1),)
  else:
    rets += ([],)
  assert sum(len(r) for r in rets) == len(x), \
      "Number of returned data inconsistent with original data, %d != %d" % \
      (sum(len(r) for r in rets), len(x))
  return rets
Example #2
def _split_list(x, rng, train=0.6, idfunc=None, inc_test=True):
    # ====== shuffle input ====== #
    if idfunc is not None:
        x_id = defaultdict(list)
        for i in x:
            x_id[idfunc(i)].append(i)
    else:
        x_id = {i: [j] for i, j in enumerate(x)}
    # shuffle ID(s)
    id_list = list(x_id.keys())
    rng.shuffle(id_list)
    # ====== split ====== #
    N = len(id_list)
    if N == 1:
        raise ValueError("Only found 1 sample, cannot split")
    train = int(np.floor(float(train) * N))
    if train >= N:
        raise ValueError(
            "train proportion must be larger than 0 and smaller than 1.")
    valid = (N - train) // (2 if inc_test else 1)
    # ====== return split data ====== #
    rets = (flatten_list((x_id[i] for i in id_list[:train]), level=1),
            flatten_list((x_id[i] for i in id_list[train:train + valid]),
                         level=1))
    if inc_test:
        rets += (flatten_list((x_id[i] for i in id_list[train + valid:]),
                              level=1), )
    else:
        rets += ([], )
    assert sum(len(r) for r in rets) == len(x), \
        "Number of returned data inconsistent with original data, %d != %d" % \
        (sum(len(r) for r in rets), len(x))
    return rets
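A minimal usage sketch of `_split_list` (a hypothetical call; it assumes numpy, collections.defaultdict and odin's flatten_list are importable so the function runs as written):

import numpy as np

rng = np.random.RandomState(1234)
data = list(range(10))
train, valid, test = _split_list(data, rng, train=0.6, inc_test=True)
# 60% of the shuffled samples go to train, the remainder is split evenly
# between valid and test; the final assert guarantees no sample is lost.
assert sorted(train + valid + test) == data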
Example #3
 def __init__(self, *posteriors, verbose=True):
     super(ResultsSheet, self).__init__()
     posteriors = flatten_list(posteriors)
     assert len(posteriors) > 0, "No posteriors for analysis!"
     assert all(isinstance(i, Posterior) for i in posteriors)
     self._posteriors = posteriors
     self.verbose = bool(verbose)
Example #4
 def __init__(self, inputs, outputs, updates=[], defaults={},
              training=None, batch_size=None, batch_vars=[],
              strict=False):
   self.training = training
   self.batch_size = batch_size
   self.batch_vars = list(batch_vars)
   self._strict = bool(strict)
   # ====== validate input ====== #
   if isinstance(inputs, Mapping):
     self.inputs_name = inputs.keys()
     inputs = inputs.values()
   elif not isinstance(inputs, (tuple, list)):
     inputs = [inputs]
   self.inputs = flatten_list(inputs, level=None)
   if not hasattr(self, 'inputs_name'):
     self.inputs_name = [i.name.split(':')[0] for i in self.inputs]
   # ====== defaults ====== #
   defaults = dict(defaults)
   self.defaults = defaults
   # ====== validate outputs ====== #
   return_list = True
   if not isinstance(outputs, (tuple, list)):
     outputs = (outputs,)
     return_list = False
   self.outputs = flatten_list(list(outputs), level=None)
   self._return_list = return_list
   # ====== validate updates ====== #
   if isinstance(updates, Mapping):
     updates = updates.items()
   with tf.control_dependencies(self.outputs):
     # create updates ops
     if not isinstance(updates, tf.Operation):
       updates_ops = []
       for update in updates:
         if isinstance(update, (tuple, list)):
           p, new_p = update
           updates_ops.append(tf.assign(p, new_p))
         else: # assumed already an assign op
           updates_ops.append(update)
       self.updates_ops = tf.group(*updates_ops)
     else: # already a tensorflow Op
       self.updates_ops = updates
   # ====== cached shape ====== #
   self._input_shape = [tuple(i.shape.as_list()) for i in self.inputs]
   self._output_shape = [tuple(i.shape.as_list()) for i in self.outputs]
Example #5
 def __init__(self, outputs=None, trace_up=False):
   # it is important not to have duplicated outputs,
   # otherwise it can go into an infinite loop
   outputs = list(set([o for o in flatten_list(as_list(outputs),
                                          level=None)
                       if o is not None]))
   self.outputs = outputs
   self._trace_up = trace_up
   self._get_variables()
Example #6
 def set_recipes(self, *recipes):
   # filter out None values and non-FeederRecipe objects
   recipes = flatten_list(as_tuple(recipes))
   recipes = [rcp for rcp in recipes
              if rcp is not None and isinstance(rcp, FeederRecipe)]
   # ====== set the new recipes ====== #
   if len(recipes) > 0:
     self._recipes = recipes
     for rcp in self._recipes:
       rcp.set_feeder_info(self.nb_desc)
   return self
Example #7
 def set_recipes(self, *recipes):
     # filter out None values and non-FeederRecipe objects
     recipes = flatten_list(as_tuple(recipes))
     recipes = [
         rcp for rcp in recipes
         if rcp is not None and isinstance(rcp, FeederRecipe)
     ]
     # ====== set the new recipes ====== #
     if len(recipes) > 0:
         self._recipes = recipes
         for rcp in self._recipes:
             rcp.set_feeder_info(self.nb_desc)
     return self
Example #8
def set_extractor_debug(extractors, debug):
  # ====== prepare ====== #
  if isinstance(extractors, (tuple, list)):
    extractors = [i for i in flatten_list(extractors)
                  if isinstance(i, Extractor)]
  elif isinstance(extractors, Pipeline):
    extractors = [i[-1] for i in extractors.steps]
  elif isinstance(extractors, Mapping):
    extractors = [i[-1] for i in extractors.items()]
  else:
    raise ValueError("No support for `extractors` type: %s" % type(extractors))
  # ====== set the value ====== #
  for i in extractors:
    i._debug = bool(debug)
  return extractors
Example #9
def set_extractor_debug(extractors, debug):
    # ====== prepare ====== #
    if isinstance(extractors, (tuple, list)):
        extractors = [
            i for i in flatten_list(extractors) if isinstance(i, Extractor)
        ]
    elif isinstance(extractors, Pipeline):
        extractors = [i[-1] for i in extractors.steps]
    elif isinstance(extractors, Mapping):
        extractors = [i[-1] for i in extractors.items()]
    else:
        raise ValueError("No support for `extractors` type: %s" %
                         type(extractors))
    # ====== set the value ====== #
    for i in extractors:
        i._debug = bool(debug)
    return extractors
Example #10
File: bnf.py Project: imito/odin
 def _initialize(self):
   param_names = flatten_list([('b%d' % i, 'w%d' % i)
                               for i in range(self.nb_layers)])
   weights = self.get_loaded_param(param_names)
   # ====== create ====== #
   layers = []
   for i in range(self.nb_layers):
     b = weights[i * 2].ravel()
     W = weights[i * 2 + 1].T
     num_units = b.shape[0]
     if i == self.nb_layers - 1:
       name = 'Bottleneck'
       nonlinearity = K.linear
     else:
       name = "Layer%d" % (i + 1)
       nonlinearity = K.relu
     op = Dense(num_units=num_units, W_init=W, b_init=b,
                activation=nonlinearity, name=name)
     layers.append(op)
   self.layers = layers
Example #11
 def _initialize(self):
     param_names = flatten_list([('b%d' % i, 'w%d' % i)
                                 for i in range(self.nb_layers)])
     weights = self.get_loaded_param(param_names)
     # ====== create ====== #
     layers = []
     for i in range(self.nb_layers):
         b = weights[i * 2].ravel()
         W = weights[i * 2 + 1].T
         num_units = b.shape[0]
         if i == self.nb_layers - 1:
             name = 'Bottleneck'
             nonlinearity = K.linear
         else:
             name = "Layer%d" % (i + 1)
             nonlinearity = K.relu
         op = Dense(num_units=num_units,
                    W_init=W,
                    b_init=b,
                    activation=nonlinearity,
                    name=name)
         layers.append(op)
     self.layers = layers
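`_initialize` relies on `flatten_list` collapsing the per-layer ('b%d', 'w%d') pairs into one flat list of parameter names. A small illustrative sketch of that step alone (it assumes flatten_list is importable from odin.utils and flattens tuples one level, as the example above relies on):

from odin.utils import flatten_list

param_names = flatten_list([('b%d' % i, 'w%d' % i) for i in range(3)])
# expected: ['b0', 'w0', 'b1', 'w1', 'b2', 'w2'], which matches the
# interleaved indexing weights[i * 2] (bias) and weights[i * 2 + 1]
# (weight matrix) used in the loop above.
print(param_names)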
Example #12
@contextmanager  # from contextlib; required for the `with role_scope(...)` usage shown in the docstring
def role_scope(*roles):
    """
  Example
  -------
  >>> X = K.variable(np.random.rand(12, 8))
  >>> with role_scope(Weight, Variational, VariationalMean):
  ...     add_roles(X)
  >>> print(X.tag.roles)
  ... # [<class 'odin.basic.Weight'>, <class 'odin.basic.VariationalMean'>]
  """
    roles = [
        r for r in flatten_list(roles, level=None)
        if isinstance(r, type) and issubclass(r, Role)
    ]
    # ====== shrink the roles so there is NO subrole ====== #
    roles = __ROLE_STACK[-1] + roles
    roles = [
        r for r in roles
        if not any(r != r0 and issubclass(r0, r) for r0 in roles)
    ]
    __ROLE_STACK.append(roles)
    yield roles
    __ROLE_STACK.pop()
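The list comprehension that "shrinks" the roles keeps only the most specific classes: whenever both a role and one of its subclasses are on the stack, the parent is dropped. A self-contained sketch of just that step (the role classes here are hypothetical stand-ins, not odin's real hierarchy):

class Role: pass
class Weight(Role): pass
class ConvKernel(Weight): pass   # hypothetical subclass for illustration

roles = [Weight, ConvKernel]
roles = [r for r in roles
         if not any(r != r0 and issubclass(r0, r) for r0 in roles)]
print(roles)   # only ConvKernel remains; Weight is removed because a more
               # specific subclass of it is already in the list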
Example #13
def fast_pca(*x, n_components=None, algo='rpca', y=None,
             batch_size=1024, return_model=False,
             random_state=5218):
  """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    the labels, required when `algo` is 'sppca' or 'plda'
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
  x = flatten_list(x, level=None)
  x = [i[:] if i.__class__.__name__ == 'MmapData' else i
       for i in x]
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2: # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None: # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because
    # it is significantly faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components, iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False, print_summary=False, name="Fitting PCA")
    for start, end in batching(batch_size=batch_size, n=x_train.shape[0],
                               seed=5218):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] + x_test) if not return_model else tuple([pca, x_train] + x_test)
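A hypothetical usage sketch for fast_pca (it assumes the function is importable, e.g. from odin.ml, and that the inputs are plain numpy arrays):

import numpy as np

X_train = np.random.rand(500, 20)   # the first matrix is used for fitting
X_test = np.random.rand(100, 20)    # remaining matrices are only transformed
Z_train, Z_test = fast_pca(X_train, X_test, n_components=5, algo='pca')

# with return_model=True the fitted estimator comes first in the result:
pca, Z_train, Z_test = fast_pca(X_train, X_test, n_components=5,
                                algo='pca', return_model=True)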
Example #14
    def copy(self,
             destination,
             indices_filter=None,
             data_filter=None,
             override=False):
        """ Copy the dataset to a new folder and close
    the old dataset

    """
        from distutils.dir_util import copy_tree
        read_only = self.read_only
        # indices
        if indices_filter is not None and \
        not is_callable(indices_filter) and \
        not isinstance(indices_filter, (tuple, list)):
            raise ValueError(
                '`indices_filter` must be callable, tuple, list or None')
        if isinstance(indices_filter, (tuple, list)):
            tmp = tuple(indices_filter)
            indices_filter = lambda x: x in tmp
        # data name
        if data_filter is not None and \
        not is_callable(data_filter) and \
        not isinstance(data_filter, (tuple, list)):
            raise ValueError(
                '`data_filter` must be callable, tuple, list or None')
        if isinstance(data_filter, (tuple, list)):
            tmp = tuple(data_filter)
            data_filter = lambda x: x in tmp
        # ====== other files which are not Data ====== #
        other_files = [i for i in os.listdir(self.path) if i not in self]
        # ====== preprocessing ====== #
        destination = os.path.abspath(str(destination))
        if not os.path.exists(destination):
            os.mkdir(destination)
        elif not os.path.isdir(destination):
            raise ValueError('path at "%s" must be a folder' % destination)
        elif override:
            shutil.rmtree(destination)
            os.mkdir(destination)
        else:
            raise ValueError(
                "A folder exists at path: '%s', cannot be overridden." %
                destination)
        # ====== copy everything ====== #
        if indices_filter is None and data_filter is None:
            print("Copying %s files from '%s' to '%s' ..." %
                  (ctext(len(self), 'cyan'), ctext(
                      self.path, 'yellow'), ctext(destination, 'yellow')))
            copy_tree(self.path, destination)
        # ====== only data_filter ====== #
        elif indices_filter is None:
            data_list = [i for i in self.keys() if data_filter(i)]
            # copy all the data
            for name in data_list:
                org_path = os.path.join(self.path, name)
                dst_path = os.path.join(destination, name)
                print("Copying from '%s' to '%s' ..." %
                      (ctext(org_path, 'yellow'), ctext(dst_path, 'yellow')))
                shutil.copy2(org_path, dst_path)
            # copy all the related indices
            for name in self.keys():
                org_path = os.path.join(self.path, name)
                dst_path = os.path.join(destination, name)
                if not os.path.exists(dst_path) and \
                ('indices' == name or any(i in data_list for i in name.split('_')[1:])):
                    print("Copying Indices from '%s' to '%s'" %
                          (ctext(org_path, 'cyan'), ctext(dst_path, 'cyan')))
                    shutil.copy2(org_path, dst_path)
        # ====== use indices_filter and data_filter ====== #
        else:
            if data_filter is None:
                all_data = list(self.keys())
            else:
                all_data = [i for i in self.keys() if data_filter(i)]
            # list of data with separated indices
            separated_data = flatten_list(
                [k.split('_')[1:] for k in self.keys() if 'indices_' == k[:8]])
            # iterate over indices and copy one by one data
            for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
                indices = [(n, (s, e)) for n, (s, e) in self[ids_name]
                           if indices_filter(n)]
                # no match indices, skip
                if len(indices) == 0:
                    continue
                nb_samples = sum(e - s for n, (s, e) in indices)
                # get all data assigned to given indices
                data = ids_name.split('_')[1:]
                if len(data) == 0:
                    data = [i for i in all_data if i not in separated_data]
                else:
                    data = [i for i in data if i in all_data]
                # if still no data found, skip
                if len(data) == 0:
                    continue
                # copy each data
                for data_name in data:
                    X = self[data_name]
                    # copy big MmapDict
                    if isinstance(X, MmapDict) and len(X) == len(
                            self[ids_name]):
                        new_path = os.path.join(destination,
                                                os.path.basename(X.path))
                        print("Copying MmapDict from '%s' to '%s'" %
                              (ctext(X.path, 'cyan'), ctext(new_path, 'cyan')))
                        new_dict = MmapDict(new_path,
                                            cache_size=80000,
                                            read_only=False)
                        for n, (s, e) in indices:
                            new_dict[n] = X[n]
                        new_dict.flush(save_all=True)
                        new_dict.close()
                    # copy MmapData
                    elif isinstance(X, MmapData):
                        Y = MmapData(path=os.path.join(destination, data_name),
                                     dtype=X.dtype,
                                     shape=(0, ) + X.shape[1:],
                                     read_only=False)
                        prog = Progbar(target=nb_samples,
                                       print_report=True,
                                       print_summary=True,
                                       name="Copying data: '%s' to path:'%s'" %
                                       (ctext(data_name, 'yellow'),
                                        ctext(Y.data_info, 'cyan')))
                        for n, (s, e) in indices:
                            Y.append(X[s:e])
                            prog.add(e - s)
                    # unknown data-type
                    else:
                        org_path = os.path.join(self.path, data_name)
                        new_path = os.path.join(destination, data_name)
                        # just copy directly the files
                        if os.path.isfile(org_path) or \
                        not os.path.exists(new_path):
                            shutil.copy2(org_path, new_path)
                            print("Copying '%s' to '%s' ..." % (ctext(
                                org_path, 'cyan'), ctext(new_path, 'yellow')))
                        else:
                            wprint("Cannot copy: '%s' - %s" %
                                   (ctext(data_name, 'cyan'),
                                    ctext(type(self[data_name]), 'yellow')))
                # copy the indices
                new_indices = MmapDict(os.path.join(destination, ids_name),
                                       cache_size=80000,
                                       read_only=False)
                start = 0
                for n, (s, e) in indices:
                    size = e - s
                    new_indices[n] = (start, start + size)
                    start += size
                new_indices.flush(save_all=True)
                new_indices.close()
        # ====== copy others files ====== #
        for f in other_files:
            org_path = os.path.join(self.path, f)
            dst_path = os.path.join(destination, f)
            if not os.path.exists(dst_path):
                if os.path.isdir(org_path):  # directory
                    copy_tree(org_path, dst_path)
                else:  # single file
                    shutil.copy2(org_path, dst_path)
        # ====== readme ====== #
        readme_name = os.path.basename(self._readme_path)
        dst_path = os.path.join(destination, readme_name)
        if not os.path.exists(dst_path):
            shutil.copy2(self._readme_path, dst_path)
        return Dataset(destination, read_only=read_only)
Example #15
  def copy(self, destination,
           indices_filter=None, data_filter=None,
           override=False):
    """ Copy the dataset to a new folder and close
    the old dataset

    """
    from distutils.dir_util import copy_tree
    read_only = self.read_only
    # indices
    if indices_filter is not None and \
    not is_callable(indices_filter) and \
    not isinstance(indices_filter, (tuple, list)):
      raise ValueError('`indices_filter` must be callable, tuple, list or None')
    if isinstance(indices_filter, (tuple, list)):
      tmp = tuple(indices_filter)
      indices_filter = lambda x: x in tmp
    # data name
    if data_filter is not None and \
    not is_callable(data_filter) and \
    not isinstance(data_filter, (tuple, list)):
      raise ValueError('`data_filter` must be callable, tuple, list or None')
    if isinstance(data_filter, (tuple, list)):
      tmp = tuple(data_filter)
      data_filter = lambda x: x in tmp
    # ====== other files which are not Data ====== #
    other_files = [i for i in os.listdir(self.path)
                   if i not in self]
    # ====== preprocessing ====== #
    destination = os.path.abspath(str(destination))
    if not os.path.exists(destination):
      os.mkdir(destination)
    elif not os.path.isdir(destination):
      raise ValueError('path at "%s" must be a folder' % destination)
    elif override:
      shutil.rmtree(destination)
      os.mkdir(destination)
    else:
      raise ValueError("A folder exists at path: '%s', cannot be overridden." %
                       destination)
    # ====== copy everything ====== #
    if indices_filter is None and data_filter is None:
      print("Copying %s files from '%s' to '%s' ..." %
        (ctext(len(self), 'cyan'),
         ctext(self.path, 'yellow'),
         ctext(destination, 'yellow')))
      copy_tree(self.path, destination)
    # ====== only data_filter ====== #
    elif indices_filter is None:
      data_list = [i for i in self.keys() if data_filter(i)]
      # copy all the data
      for name in data_list:
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        print("Copying from '%s' to '%s' ..." %
              (ctext(org_path, 'yellow'),
               ctext(dst_path, 'yellow')))
        shutil.copy2(org_path, dst_path)
      # copy all the related indices
      for name in self.keys():
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        if not os.path.exists(dst_path) and \
        ('indices' == name or any(i in data_list for i in name.split('_')[1:])):
          print("Copying Indices from '%s' to '%s'" % (ctext(org_path, 'cyan'),
                                                       ctext(dst_path, 'cyan')))
          shutil.copy2(org_path, dst_path)
    # ====== use indices_filter and data_filter ====== #
    else:
      if data_filter is None:
        all_data = list(self.keys())
      else:
        all_data = [i for i in self.keys()
                    if data_filter(i)]
      # list of data with separated indices
      separated_data = flatten_list(
          [k.split('_')[1:] for k in self.keys()
         if 'indices_' == k[:8]])
      # iterate over indices and copy one by one data
      for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
        indices = [(n, (s, e))
                   for n, (s, e) in self[ids_name]
                   if indices_filter(n)]
        # no match indices, skip
        if len(indices) == 0:
          continue
        nb_samples = sum(e - s for n, (s, e) in indices)
        # get all data assigned to given indices
        data = ids_name.split('_')[1:]
        if len(data) == 0:
          data = [i for i in all_data if i not in separated_data]
        else:
          data = [i for i in data if i in all_data]
        # if still no data found, skip
        if len(data) == 0:
          continue
        # copy each data
        for data_name in data:
          X = self[data_name]
          # copy big MmapDict
          if isinstance(X, MmapDict) and len(X) == len(self[ids_name]):
            new_path = os.path.join(destination, os.path.basename(X.path))
            print("Copying MmapDict from '%s' to '%s'" % (
                ctext(X.path, 'cyan'),
                ctext(new_path, 'cyan')))
            new_dict = MmapDict(new_path, cache_size=80000, read_only=False)
            for n, (s, e) in indices:
              new_dict[n] = X[n]
            new_dict.flush(save_all=True)
            new_dict.close()
          # copy MmapData
          elif isinstance(X, MmapData):
            Y = MmapData(path=os.path.join(destination, data_name),
                         dtype=X.dtype, shape=(0,) + X.shape[1:],
                         read_only=False)
            prog = Progbar(target=nb_samples,
                           print_report=True, print_summary=True,
                           name="Copying data: '%s' to path:'%s'" %
                           (ctext(data_name, 'yellow'),
                            ctext(Y.data_info, 'cyan')))
            for n, (s, e) in indices:
              Y.append(X[s:e])
              prog.add(e - s)
          # unknown data-type
          else:
            org_path = os.path.join(self.path, data_name)
            new_path = os.path.join(destination, data_name)
            # just copy directly the files
            if os.path.isfile(org_path) or \
            not os.path.exists(new_path):
              shutil.copy2(org_path, new_path)
              print("Copying '%s' to '%s' ..." %
                (ctext(org_path, 'cyan'), ctext(new_path, 'yellow')))
            else:
              wprint("Cannot copy: '%s' - %s" %
                (ctext(data_name, 'cyan'),
                 ctext(type(self[data_name]), 'yellow')))
        # copy the indices
        new_indices = MmapDict(os.path.join(destination, ids_name),
                               cache_size=80000, read_only=False)
        start = 0
        for n, (s, e) in indices:
          size = e - s
          new_indices[n] = (start, start + size)
          start += size
        new_indices.flush(save_all=True)
        new_indices.close()
    # ====== copy others files ====== #
    for f in other_files:
      org_path = os.path.join(self.path, f)
      dst_path = os.path.join(destination, f)
      if not os.path.exists(dst_path):
        if os.path.isdir(org_path): # directory
          copy_tree(org_path, dst_path)
        else: # single file
          shutil.copy2(org_path, dst_path)
    # ====== readme ====== #
    readme_name = os.path.basename(self._readme_path)
    dst_path = os.path.join(destination, readme_name)
    if not os.path.exists(dst_path):
      shutil.copy2(self._readme_path, dst_path)
    return Dataset(destination, read_only=read_only)
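A hypothetical call of the copy method shown in the two examples above (the destination path and the filters are purely illustrative; ds is assumed to be an opened odin Dataset):

new_ds = ds.copy('/tmp/dataset_subset',
                 indices_filter=lambda name: name.startswith('speaker1'),
                 data_filter=('mfcc', 'energy'),
                 override=True)
# copies only the 'mfcc' and 'energy' data (plus the matching indices and
# any non-Data files) for utterances whose name starts with 'speaker1',
# then returns a Dataset opened on the new folder.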
Example #16
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds_or_processor` must be a FeatureProcessor, "
                         "path string, or Dataset. No support for given "
                         "input type: %s" % str(type(ds_or_processor)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (including the features in external_indices)
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Indices are not contiguous"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data integrity for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure saved at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
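A hypothetical invocation of validate_features (both paths are placeholders; the dataset folder is assumed to have been produced by odin's FeatureProcessor so that 'config' and 'indices' exist):

validate_features('/path/to/extracted_features',
                  path='/tmp/feature_report',
                  nb_samples=8,
                  override=True)
# writes log.txt, one PDF of plotted features per sampled utterance and a
# stats.pdf into /tmp/feature_report, printing a ✓/✗ line for every check.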
Example #17
 def __call__(self, *inputs, **kwargs):
   show_progress = kwargs.pop('show_progress', False)
   # dictionary as inputs
   if len(kwargs) == len(self.inputs_name):
     inputs = [kwargs[i] for i in self.inputs_name]
   # ====== delete un-matched inputs ====== #
   inputs_new = []
   tmp = list(inputs)
   shapes = list(self._input_shape)
   # this process iteratively remove inputs with mismatch shape
   # to current given input
   for s in shapes:
     for i in tuple(tmp):
       if len(i.shape) != len(s) or \
       any(a is not None and a > 0 and a != b
               for a, b in zip(s, i.shape)): # different ndim, or shape
         tmp.remove(i)
       else:
         inputs_new.append(i)
         tmp.remove(i)
         break
   if len(inputs_new) != len(self.inputs):
     raise ValueError("Given inputs have shape: %s, cannot match the shape of "
                      "defined inputs: %s" %
                      ('; '.join([str(i.shape) for i in inputs]),
                       '; '.join([str(i) for i in self.input_shape])))
   if not self._strict:
     inputs = inputs_new
   # ====== create feed_dict ====== #
   feed_dict = {}
   inputs = flatten_list(inputs, level=None)
   for tensor, value in zip(self.inputs, inputs):
     feed_dict[tensor] = value
   feed_dict.update(self.defaults)
   # check if modifying training mode
   if self.training is None:
     pass
   elif self.training:
     feed_dict.update({is_training(): True})
   else:
     feed_dict.update({is_training(): False})
   session = get_session()
   outputs = None
   # ====== mini-batches ====== #
   if self.batch_size is not None:
     batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                   if len(self.batch_vars) == 0 else self.batch_vars)
     batch_vars = [i for i in batch_vars
                   if i in feed_dict and hasattr(feed_dict[i], 'shape')]
     n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
     assert len(n_samples) == 1, \
     "Data have multiple batching dimensions: %s" % str(n_samples)
     n_samples = n_samples[0]
     # only continue if we have more samples than `batch_size`
     if n_samples > self.batch_size:
       n_output = len(self.outputs)
       outputs = []
       all_batches = []
       # (optional) showing progress
       if show_progress:
         prog = Progbar(target=n_samples,
                        print_report=False, print_summary=False,
                        name='')
       for s, e in batching(batch_size=int(self.batch_size),
                            n=n_samples):
         if show_progress:
           prog.add(e - s)
         all_batches.append(e - s)
         feed_dict_minibatch = OrderedDict([(k, v[s:e])
                                            if k in batch_vars else (k, v)
                                            for k, v in feed_dict.items()])
         updated = session.run(self.outputs + [self.updates_ops],
                               feed_dict=feed_dict_minibatch)
         updated = updated[:n_output]
         if not self._return_list:
           updated = updated[0]
         outputs.append(updated)
       ## concatenate all outputs
       if not self._return_list:
         o_ndim = outputs[0].ndim
         if o_ndim == 0: # returned scalars
           outputs = np.array(outputs)
         else: # returned array
           for o_axis in range(o_ndim):
             all_n = [o.shape[o_axis] for o in outputs]
             if all_n == all_batches:
               break
           outputs = np.concatenate(outputs, axis=o_axis)
       ## returning a list of outputs
       else:
         new_outputs = []
         for output_idx in range(len(outputs[0])):
           o = [x[output_idx] for x in outputs]
           o_ndim = o[0].ndim
           if o_ndim == 0: # returned scalars
             o = np.array(o)
           else: # returned array
             for o_axis in range(o[0].ndim):
               all_n = [val.shape[o_axis] for val in o]
               if all_n == all_batches:
                 break
             o = np.concatenate(o, axis=o_axis)
           new_outputs.append(o)
         outputs = new_outputs
   # ====== single batch ====== #
   if outputs is None:
     updated = session.run(self.outputs + [self.updates_ops],
                           feed_dict=feed_dict)
     outputs = updated[:len(self.outputs)]
     if not self._return_list:
       outputs = outputs[0]
   # ====== return final output ====== #
   return outputs
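A purely illustrative sketch of how the __init__ from Example #4 and this __call__ fit together (the class name Function is an assumption, since the surrounding class is not shown on this page; a TensorFlow 1.x graph is assumed):

import numpy as np
import tensorflow as tf

X = tf.placeholder(tf.float32, shape=(None, 64), name='X')
y_pred = tf.layers.dense(X, 10)          # any output tensor works here
f_pred = Function(inputs=[X], outputs=y_pred,
                  batch_size=256, training=False)

X_data = np.random.rand(1000, 64).astype('float32')
predictions = f_pred(X_data, show_progress=True)
# X_data is fed through the session in chunks of 256 rows; the per-batch
# outputs are concatenated back along the batch axis before being returned.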
Example #18
def fast_pca(*x,
             n_components=None,
             algo='rpca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
    """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    the labels, required when `algo` is 'sppca' or 'plda'
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
    batch_size = int(batch_size)
    algo = str(algo).lower()
    if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
        raise ValueError(
            "`algo` must be one of the following: 'pca', "
            "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
    if algo in ('sppca', 'plda') and y is None:
        raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
    x = flatten_list(x, level=None)
    x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
    # ====== check input ====== #
    x_train = x[0]
    x_test = x[1:]
    input_shape = None
    if x_train.ndim > 2:  # only 2D for PCA
        input_shape = (-1, ) + x_train.shape[1:]
        new_shape = (-1, np.prod(input_shape[1:]))
        x_train = np.reshape(x_train, new_shape)
        x_test = [np.reshape(x, new_shape) for x in x_test]
        if n_components is not None:  # no need to reshape back
            input_shape = None
    # ====== train PCA ====== #
    if algo == 'sppca':
        pca = SupervisedPPCA(n_components=n_components,
                             random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'plda':
        from odin.ml import PLDA
        pca = PLDA(n_phi=n_components, random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'pca':
        pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    elif algo == 'rpca':
        # we copy the implementation of RandomizedPCA because
        # it is significantly faster than PCA(svd_solver='randomized')
        pca = RandomizedPCA(n_components=n_components,
                            iterated_power=2,
                            random_state=random_state)
        pca.fit(x_train)
    elif algo == 'ipca':
        pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        prog = Progbar(target=x_train.shape[0],
                       print_report=False,
                       print_summary=False,
                       name="Fitting PCA")
        for start, end in batching(batch_size=batch_size,
                                   n=x_train.shape[0],
                                   seed=1234):
            pca.partial_fit(x_train[start:end], check_input=False)
            prog.add(end - start)
    elif algo == 'ppca':
        pca = PPCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    # ====== transform ====== #
    x_train = pca.transform(x_train)
    x_test = [pca.transform(x) for x in x_test]
    # reshape back to original shape if necessary
    if input_shape is not None:
        x_train = np.reshape(x_train, input_shape)
        x_test = [np.reshape(x, input_shape) for x in x_test]
    # return the results
    if len(x_test) == 0:
        return x_train if not return_model else (pca, x_train)
    return tuple([x_train] +
                 x_test) if not return_model else tuple([pca, x_train] +
                                                        x_test)
Example #19
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('   *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds_or_processor` must be a FeatureProcessor, "
                     "path string, or Dataset. No support for given "
                     "input type: %s" % str(type(ds_or_processor)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Indices are not contiguous"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final length match length of Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
          "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0 # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
        "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all value closed to zeros
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Check data integrity for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Check statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(
            ds[feat_name][start:(start + nb_samples)],
            n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Check PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed) # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistic
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_name
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()