Example #1
def send_notification(self, msg):
  # prefix the message with the owning class name (colored magenta)
  if self._log:
    add_notification(
        '[%s] %s' % (ctext(self.__class__.__name__, 'magenta'), msg))
  # return self so notification calls can be chained
  return self
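Because the method returns self, notification calls can be chained. A minimal usage sketch, assuming a hypothetical Trainer class that defines the _log flag and the method above (add_notification and ctext are the odin helpers used throughout these examples, assumed in scope):

class Trainer(object):
  def __init__(self, log=True):
    self._log = log

  def send_notification(self, msg):
    if self._log:
      add_notification(
          '[%s] %s' % (ctext(self.__class__.__name__, 'magenta'), msg))
    return self

# chained calls: each notification is prefixed with "[Trainer]" in magenta
Trainer().send_notification('training started').send_notification('epoch 1/10')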
Example #2
# (is_string, as_tuple, ctext, add_notification, Dataset, FeatureProcessor,
#  MPI and Progbar are helpers from the odin codebase, assumed in scope)
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Run MiniBatchPCA in parallel to fit a PCA model for multiple
  features at once.
  """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # TODO: also add reading data from indices
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    # select every 2-D feature with more than one dimension
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name
                 for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # count the total number of samples across all selected features
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
                   ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # reuse an existing PCA model for this feature unless overriding
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # otherwise create a new PCA
    else:
      pca = MiniBatchPCA(n_components=None, whiten=False,
                         copy=True, batch_size=None)
    # no shuffling makes iteration much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save the fitted PCA model next to the data
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finished; yield the feature name to mark completion
    yield name

  mpi = MPI(jobs=feat_name, func=map_pca,
            ncpu=None, batch=1, hwm=12082518,
            backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples, print_summary=True, print_report=True,
                 name='PCA')
  for n in mpi:
    # a string marks a finished feature; an integer is a processed sample count
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
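A short usage sketch under stated assumptions: the dataset path and the feature names below are hypothetical, and the odin helpers used by calculate_pca are assumed to be importable:

# scan a dataset directory for 2-D features and fit one PCA model per feature;
# each fitted model is pickled as 'pca_<feature>' inside the dataset directory
calculate_pca('/path/to/dataset', feat_name='auto')

# restrict PCA to specific features and re-fit even if a model already exists
calculate_pca('/path/to/dataset', feat_name=('mspec', 'mfcc'), override=True)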
Example #3
def _show_noti(self, msg):
  # only notify at verbosity levels 2 and 4+; level 3 is excluded
  if self._verbose > 1 and self._verbose != 3:
    add_notification(msg)
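The guard emits notifications only when the verbosity level is 2 or at least 4; level 3 is explicitly excluded. A minimal sketch, assuming a hypothetical Processor class that owns the _verbose attribute:

class Processor(object):
  def __init__(self, verbose=0):
    self._verbose = verbose

  def _show_noti(self, msg):
    if self._verbose > 1 and self._verbose != 3:
      add_notification(msg)

Processor(verbose=2)._show_noti('shown')    # level 2: notification emitted
Processor(verbose=3)._show_noti('skipped')  # level 3: excluded by the guard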