Example #1
    def __init__(self,
                 num_units,
                 W_init=init_ops.glorot_uniform_initializer(seed=randint()),
                 b_init=init_ops.constant_initializer(0.),
                 rnn_mode='lstm',
                 num_layers=1,
                 skip_input=False,
                 is_bidirectional=False,
                 return_states=False,
                 dropout=0.,
                 **kwargs):
        super(CudnnRNN, self).__init__(**kwargs)
        # ====== defaults recurrent control ====== #
        self.num_units = int(num_units)
        self.num_layers = int(num_layers)
        self.rnn_mode = str(rnn_mode)
        self.skip_input = bool(skip_input)
        self.is_bidirectional = bool(is_bidirectional)
        self.return_states = bool(return_states)
        self.dropout = dropout

        self.W_init = W_init
        self.b_init = b_init
        if skip_input:
            wprint("`skip_input` is not supported in Tensorflow.")
Example #2
def _preprocessing_losses(losses,
                          y_true,
                          y_pred,
                          inherit_losses=None,
                          sample_weights=None):
    """ Can be used for both objectives and metrics """
    from odin import backend as K
    # ====== special cases, only one inputs outputs, and multiple loss ====== #
    nb_losses = len(losses)
    if len(y_true) == 0:
        y_true = [None] * nb_losses
    elif len(y_true) == 1:
        y_true = y_true * nb_losses
    if len(y_pred) == 0:
        y_pred = [None] * nb_losses
    elif len(y_pred) == 1:
        y_pred = y_pred * nb_losses
    # ====== applying ====== #
    cost = []
    for idx, fn in enumerate(as_tuple(losses)):
        weight = 1
        kwargs = {}
        # preprocess
        if isinstance(fn, (tuple, list)):
            if len(fn) == 1:
                fn = fn[0]
            else:
                weight = [i for i in fn if is_number(i)]
                weight = 1 if len(weight) == 0 else weight[0]
                kwargs = [i for i in fn if isinstance(i, Mapping)]
                kwargs = {} if len(kwargs) == 0 else kwargs[0]
                fn = [i for i in fn if i != weight and i != kwargs][0]
        # apply the loss
        if is_number(fn):
            if inherit_losses is None or fn >= len(inherit_losses):
                raise ValueError("Cannot find losses at index: '%d'" % fn)
            obj = inherit_losses[fn]
        elif K.is_tensor(fn):
            obj = fn
        elif hasattr(fn, '__call__'):
            try:
                sign = inspect.signature(fn)
                if 'weights' in sign.parameters and sample_weights is not None:
                    kwargs['weights'] = sample_weights
            except ValueError:
                pass
            finally:
                obj = fn(y_true[idx], y_pred[idx], **kwargs)
            if isinstance(obj, (tuple, list)):
                wprint(
                    "function: '%s' return %d outputs (%s), only pick the first one"
                    % (fn.__name__, len(obj), '; '.join([str(i)
                                                         for i in obj])))
                obj = obj[0]
        cost.append((weight, obj))
    # ====== reduce ====== #
    return [c if w == 1 else w * c for w, c in cost]
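
Based only on the branches above, `losses` may mix a plain callable, a (weight, callable) pair, a (callable, kwargs-mapping) pair, a ready-made tensor, or an integer index into `inherit_losses`. A sketch with placeholder callables and tensors:

# Placeholder names; any callables taking (y_true, y_pred) work here.
losses = [
    categorical_crossentropy,            # plain callable, weight defaults to 1
    (0.5, categorical_accuracy),         # (weight, callable): result scaled by 0.5
    (mean_squared_error, {'axis': -1}),  # (callable, mapping): mapping passed as **kwargs
    0,                                   # integer index into `inherit_losses`
]
costs = _preprocessing_losses(losses,
                              y_true=[y_true_tensor],
                              y_pred=[y_pred_tensor],
                              inherit_losses=previous_costs)
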
Example #3
def _preprocessing_losses(losses, y_true, y_pred, inherit_losses=None,
                          sample_weights=None):
  """ Can be used for both objectives and metrics """
  from odin import backend as K
  # ====== special cases, only one inputs outputs, and multiple loss ====== #
  nb_losses = len(losses)
  if len(y_true) == 0:
    y_true = [None] * nb_losses
  elif len(y_true) == 1:
    y_true = y_true * nb_losses
  if len(y_pred) == 0:
    y_pred = [None] * nb_losses
  elif len(y_pred) == 1:
    y_pred = y_pred * nb_losses
  # ====== applying ====== #
  cost = []
  for idx, fn in enumerate(as_tuple(losses)):
    weight = 1
    kwargs = {}
    # preprocess
    if isinstance(fn, (tuple, list)):
      if len(fn) == 1:
        fn = fn[0]
      else:
        weight = [i for i in fn if is_number(i)]
        weight = 1 if len(weight) == 0 else weight[0]
        kwargs = [i for i in fn if isinstance(i, Mapping)]
        kwargs = {} if len(kwargs) == 0 else kwargs[0]
        fn = [i for i in fn if i != weight and i != kwargs][0]
    # apply the loss
    if is_number(fn):
      if inherit_losses is None or fn >= len(inherit_losses):
        raise ValueError("Cannot find losses at index: '%d'" % fn)
      obj = inherit_losses[fn]
    elif K.is_tensor(fn):
      obj = fn
    elif hasattr(fn, '__call__'):
      try:
        sign = inspect.signature(fn)
        if 'weights' in sign.parameters and sample_weights is not None:
          kwargs['weights'] = sample_weights
      except ValueError:
        pass
      finally:
        obj = fn(y_true[idx], y_pred[idx], **kwargs)
      if isinstance(obj, (tuple, list)):
        wprint("function: '%s' return %d outputs (%s), only pick the first one"
               % (fn.__name__,
                  len(obj),
                  '; '.join([str(i) for i in obj])))
        obj = obj[0]
    cost.append((weight, obj))
  # ====== reduce ====== #
  return [c if w == 1 else w * c for w, c in cost]
Example #4
 def save_mean_std(sum1, sum2, name):
     N = dataset[name.split('_')[0]].shape[0]
     mean = sum1 / N
     std = np.sqrt(sum2 / N - np.power(mean, 2))
     if np.any(np.isnan(mean)):
         wprint('Mean contains NaN, name: %s' % name)
     if np.any(np.isnan(std)):
         wprint('Std contains NaN, name: %s' % name)
     dataset[name + 'sum1'] = sum1
     dataset[name + 'sum2'] = sum2
     dataset[name + 'mean'] = mean
     dataset[name + 'std'] = std
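
The helper above derives mean and std from two running sums: sum1 accumulates x and sum2 accumulates x**2 over the sample axis. A sketch of the accumulation, with `feature_blocks`, `dataset`, and the name 'mfcc_' as placeholders:

import numpy as np

sum1, sum2 = 0., 0.
for block in feature_blocks:             # each block: 2-D array (samples, features)
    sum1 = sum1 + block.sum(axis=0)
    sum2 = sum2 + np.power(block, 2).sum(axis=0)
save_mean_std(sum1, sum2, name='mfcc_')  # stores 'mfcc_sum1', 'mfcc_sum2', 'mfcc_mean', 'mfcc_std'
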
Example #5
 def save_mean_std(sum1, sum2, name):
   N = dataset[name.split('_')[0]].shape[0]
   mean = sum1 / N
   std = np.sqrt(sum2 / N - np.power(mean, 2))
   if np.any(np.isnan(mean)):
     wprint('Mean contains NaN, name: %s' % name)
   if np.any(np.isnan(std)):
     wprint('Std contains NaN, name: %s' % name)
   dataset[name + 'sum1'] = sum1
   dataset[name + 'sum2'] = sum2
   dataset[name + 'mean'] = mean
   dataset[name + 'std'] = std
Example #6
 def _restore_variables(self):
   """ This method can be called anywhere to make sure
   the variable related to this NNOp is restored after
   pickling.
   """
   if hasattr(self, '_restore_vars_path') and \
   self._restore_vars_path is not None:
     folder_path = os.path.dirname(self._restore_vars_path)
     if os.path.exists(folder_path):
       K.restore_variables(self._restore_vars_path)
       # delete cached folder if necessary
       if self._delete_vars_folder:
         shutil.rmtree(folder_path)
     else:
       wprint("NNOp: '%s' cannot restore variables from path: '%s'"
              (self.name, folder_path))
     # reset info
     self._set_restore_info(None, False)
Example #7
  def __init__(self, num_units,
          W_init=init_ops.glorot_uniform_initializer(seed=randint()),
          b_init=init_ops.constant_initializer(0.),
          rnn_mode='lstm', num_layers=1,
          skip_input=False, is_bidirectional=False,
          return_states=False, dropout=0., **kwargs):
    super(CudnnRNN, self).__init__(**kwargs)
    # ====== defaults recurrent control ====== #
    self.num_units = int(num_units)
    self.num_layers = int(num_layers)
    self.rnn_mode = str(rnn_mode)
    self.skip_input = bool(skip_input)
    self.is_bidirectional = bool(is_bidirectional)
    self.return_states = bool(return_states)
    self.dropout = dropout

    self.W_init = W_init
    self.b_init = b_init
    if skip_input:
      wprint("`skip_input` is not supported in Tensorflow.")
Example #8
def fast_tsne(*X,
              n_components=2,
              n_samples=None,
              perplexity=30.0,
              early_exaggeration=8.0,
              learning_rate=200.0,
              n_iter=1000,
              n_iter_without_progress=300,
              min_grad_norm=1e-7,
              metric="euclidean",
              init="random",
              verbose=0,
              random_state=1234,
              method='barnes_hut',
              angle=0.5,
              n_jobs=4):
    """
  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.

  n_samples : {int, None}
      if given, downsampling the data to given number of sample

  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.

  early_exaggeration : float, optional (default: 8.0)
      Controls how tight natural clusters in the original space are in
      the embedded space and how much space will be between them. For
      larger values, the space between natural clusters will be larger
      in the embedded space. Again, the choice of this parameter is not
      very critical. If the cost function increases during initial
      optimization, the early exaggeration factor or the learning rate
      might be too high.

  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
      the learning rate is too high, the data may look like a 'ball' with any
      point approximately equidistant from its nearest neighbours. If the
      learning rate is too low, most points may look compressed in a dense
      cloud with few outliers. If the cost function gets stuck in a bad local
      minimum increasing the learning rate may help.

  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.

  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations so
      this value is rounded to the next multiple of 50.

  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.

  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
      If metric is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on each
      pair of instances (rows) and the resulting value recorded. The callable
      should take two arrays from X as input and return a value indicating
      the distance between them. The default is "euclidean" which is
      interpreted as squared euclidean distance.

  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components).
      PCA initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.

  verbose : int, optional (default: 0)
      Verbosity level.

  random_state : int, RandomState instance or None, optional (default: None)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`.  Note that different initializations might result in
      different local minima of the cost function.

  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact'
      will run on the slower, but exact, algorithm in O(N^2) time. The
      exact algorithm should be used when nearest-neighbor errors need
      to be better than 3%. However, the exact method cannot scale to
      millions of examples.

  angle : float (default: 0.5)
      Only used if method='barnes_hut'
      This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
      'angle' is the angular size (referred to as theta in [3]) of a distant
      node as measured from a point. If this size is below 'angle' then it is
      used as a summary node of all points contained within it.
      This method is not very sensitive to changes in this parameter
      in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
      computation time and angle greater 0.8 has quickly increasing error.
  """
    assert len(X) > 0, "No input is given!"
    if isinstance(X[0], (tuple, list)):
        X = X[0]
    if not all(isinstance(x, np.ndarray) for x in X):
        raise ValueError(
            "`X` can only be list of numpy.ndarray or numpy.ndarray")
    # ====== kwarg for creating T-SNE class ====== #
    kwargs = dict(locals())
    del kwargs['X']
    n_samples = kwargs.pop('n_samples', None)
    # ====== downsampling ====== #
    if n_samples is not None:
        n_samples = int(n_samples)
        assert n_samples > 0
        new_X = []
        rand = random_state if isinstance(random_state, np.random.RandomState) else \
        np.random.RandomState(seed=random_state)
        for x in X:
            if x.shape[0] > n_samples:
                ids = rand.permutation(x.shape[0])[:n_samples]
                x = x[ids]
            new_X.append(x)
        X = new_X
    # ====== import proper T-SNE ====== #
    tsne_version = None
    try:
        from tsnecuda import TSNE
        from tsnecuda.NaiveTSNE import NaiveTSNE as _exact_TSNE
        tsne_version = 'cuda'
    except ImportError:
        # wprint("Install CUDA-TSNE from `https://github.com/CannyLab/tsne-cuda` "
        #        "for significant speed up.")
        try:
            from MulticoreTSNE import MulticoreTSNE as TSNE
            tsne_version = 'multicore'
        except ImportError:
            wprint(
                "Install MulticoreTSNE from `pip install git+https://github.com/DmitryUlyanov/Multicore-TSNE.git`"
                ' to accelerate the T-SNE on multiple CPU cores.')
            try:
                from sklearn.manifold import TSNE
                tsne_version = 'sklearn'
            except Exception as e:
                raise e
    # ====== modify kwargs ====== #
    if tsne_version == 'cuda':
        kwargs['random_seed'] = kwargs['random_state']
        kwargs['theta'] = angle
        if method == 'exact':
            TSNE = _exact_TSNE
            del kwargs['theta']
        del kwargs['random_state']
        del kwargs['n_jobs']
        del kwargs['angle']
        del kwargs['method']
    elif tsne_version == 'multicore':
        pass
    else:
        del kwargs['n_jobs']
    # ====== getting cached values ====== #
    results = []
    X_new = []
    for i, x in enumerate(X):
        md5 = md5_checksum(x)
        key = _create_key(kwargs, md5)
        if key in _cached_values:
            results.append((i, _cached_values[key]))
        else:
            X_new.append((i, md5, x))

    # ====== perform T-SNE ====== #
    def apply_tsne(j):
        idx, md5, x = j
        tsne = TSNE(**kwargs)
        return (idx, md5, tsne.fit_transform(x))

    # only 1 X, no need for MPI
    if len(X_new) == 1:
        idx, md5, x = apply_tsne(X_new[0])
        results.append((idx, x))
        _cached_values[_create_key(kwargs, md5)] = x
    else:
        mpi = MPI(jobs=X_new,
                  func=apply_tsne,
                  batch=1,
                  ncpu=min(len(X_new),
                           cpu_count() - 1))
        for idx, md5, x in mpi:
            results.append((idx, x))
            _cached_values[_create_key(kwargs, md5)] = x
    # ====== return and clean ====== #
    results = sorted(results, key=lambda a: a[0])
    results = [r[1] for r in results]
    return results[0] if len(results) == 1 else results
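
A usage sketch for `fast_tsne` above, with random placeholder inputs and only parameters taken from the signature; passing several arrays returns one embedding per array:

import numpy as np

X_a = np.random.randn(1000, 64)
X_b = np.random.randn(800, 64)
Z_a, Z_b = fast_tsne(X_a, X_b,
                     n_components=2,
                     perplexity=30.0,
                     n_samples=500,      # optional down-sampling before embedding
                     random_state=1234)
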
Example #9
 def __init__(self,
              jobs,
              path,
              extractor,
              n_cache=0.12,
              ncpu=1,
              override=False,
              identifier='name',
              log_path=None,
              stop_on_failure=False):
     super(FeatureProcessor, self).__init__()
     # ====== check outpath ====== #
     path = os.path.abspath(str(path))
     if os.path.isfile(path):
         raise ValueError("`path` must be path to a directory, but found a "
                          "path to file.")
     # check override
     if os.path.exists(path) and override:
         wprint("Remove existed Dataset at path: %s" % path)
         for i in os.listdir(path):
             i = os.path.join(path, i)
             if os.path.isdir(i):  # remove folder
                 shutil.rmtree(i)
             else:  # remove file
                 os.remove(i)
     # set path and name
     self.path = path
     # ====== check jobs ====== #
     if not isinstance(jobs, (tuple, list, np.ndarray)):
         raise ValueError(
             "Provided `jobs` must be instance of tuple, list or ndarray.")
     if isinstance(jobs, np.ndarray):
         jobs = jobs.tolist()
     self.jobs = tuple(jobs)
     # ====== check multiprocessing ====== #
     if ncpu is None:  # auto select number of CPU
         ncpu = min(len(jobs), cpu_count() - 1)
     ncpu = int(ncpu)
     if ncpu <= 0 or n_cache <= 0:
         raise ValueError(
             '`ncpu` and `n_cache` must be greater than 0, but '
             'given values ncpu=%d n_cache=%f' % (ncpu, n_cache))
     self.n_cpu = ncpu
     self.n_cache = n_cache
     # ====== internal control for feature processor ====== #
     if isinstance(extractor, Pipeline):
         pass
     elif isinstance(extractor, (tuple, list)):
         steps = [('%s_%d' % (e.__class__.__name__, i), e)
                  for i, e in enumerate(extractor)]
         extractor = Pipeline(steps=steps)
     elif isinstance(extractor, Mapping):
         steps = [(str(n), e) for n, e in extractor.items()]
         extractor = Pipeline(steps=steps)
     elif isinstance(extractor, Extractor):
         extractor = Pipeline(steps=[(extractor.__class__.__name__,
                                      extractor)])
     self.extractor = extractor
     # ====== check identifier and log path ====== #
     self._identifier = str(identifier)
     if log_path is None:
         log_path = os.path.join(self.path, 'log.txt')
     else:
         log_path = str(log_path)
     self._log_path = _check_logpath(log_path)
     # ====== others ====== #
     self.config = {}
     self._error_log = []
     self.stop_on_failure = bool(stop_on_failure)
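
A construction sketch for `FeatureProcessor` above; `job_list` and the extractor instances are placeholders, and a list of extractors is wrapped into an sklearn Pipeline by the constructor:

processor = FeatureProcessor(jobs=job_list,
                             path='/tmp/features',
                             extractor=[AudioReader(), SpectraExtractor()],  # placeholder extractors
                             ncpu=None,            # auto: min(len(jobs), cpu_count() - 1)
                             override=True,
                             stop_on_failure=False)
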
Example #10
 def fit(self, X, y=None, cv=None):
     self._initialize(X)
     if not hasattr(X, 'shape') or not hasattr(X, '__iter__') or \
     not hasattr(X, '__len__'):
         raise ValueError(
             "`X` must has 'shape', '__len__' and '__iter__' attributes")
     nb_train_samples = len(X)
     # convert to odin.fuel.Data if possible
     if isinstance(X, (np.ndarray, list, tuple)):
         X = F.as_data(X)
     if isinstance(y, (np.ndarray, list, tuple)):
         y = F.as_data(y)
     start_tr = 0
     end_tr = nb_train_samples
     # ====== check if cross validating ====== #
     create_it_cv = None
     if is_number(cv):
         cv = int(float(cv) * nb_train_samples) if cv < 1. else int(cv)
         end_tr = nb_train_samples - cv
         start_cv = end_tr
         end_cv = nb_train_samples
         nb_cv_samples = end_cv - start_cv
         create_it_cv = _create_it_func(X=X,
                                        y=y,
                                        batch_size=self.batch_size,
                                        start=start_cv,
                                        end=end_cv)
     elif isinstance(cv, (tuple, list)):
         X_cv, y_cv = cv
         nb_cv_samples = X_cv.shape[0]
         create_it_cv = _create_it_func(X=X_cv,
                                        y=y_cv,
                                        batch_size=self.batch_size,
                                        start=0,
                                        end=X_cv.shape[0])
     elif hasattr(cv, 'set_batch'):
         nb_cv_samples = cv.shape[0]
         create_it_cv = _create_it_func(X=cv,
                                        y=None,
                                        batch_size=self.batch_size,
                                        start=0,
                                        end=cv.shape[0])
     elif cv is not None:
         raise ValueError(
             '`cv` can be float (0-1), tuple or list of X and y, '
             'any object that have "shape" and "__iter__" attributes, '
             'or None')
     # ====== preprocessing ====== #
     create_it = _create_it_func(X=X,
                                 y=y,
                                 batch_size=self.batch_size,
                                 start=start_tr,
                                 end=end_tr)
     # ====== prepare ====== #
     curr_niter = sum(epoch[0] for epoch in self._train_history)
     curr_nepoch = len(self._train_history)
     curr_patience = int(self.patience)
     last_losses = None
     last_checkpoint = None
     best_epoch = None
     is_converged = False
     # ====== fitting ====== #
     while not is_converged:
         curr_nepoch += 1
         seed = self._rand_state.randint(0, 10e8)
         # ====== training ====== #
         nb_iter, duration, results = _fitting_helper(
             create_it(seed),
             fn=self._f_train,
             nb_samples=nb_train_samples,
             nb_classes=self.nb_classes,
             title='Epoch %d' % curr_nepoch)
         curr_niter += nb_iter
         self._train_history.append(
             (nb_train_samples, nb_iter, duration, results))
         # ====== cross validation ====== #
         if create_it_cv is not None:
             nb_iter, duration_valid, results = _fitting_helper(
                 create_it_cv(seed),
                 fn=self._f_score,
                 nb_samples=nb_cv_samples,
                 nb_classes=self.nb_classes,
                 title="Validating")
             self._valid_history.append(
                 (nb_train_samples, nb_iter, duration_valid, results))
             duration += duration_valid
         # ====== print log ====== #
         if self.verbose >= 2:
             print(
                 ctext('#epoch:', 'cyan') + str(curr_nepoch),
                 ctext('#iter:', 'cyan') + str(curr_niter),
                 ctext("Loss:", 'yellow') + '%.5f' % results[0],
                 ctext("Acc:", 'yellow') + '%.3f' % results[1],
                 ctext("%.2f(s)" % duration, 'magenta'))
             if self.confusion_matrix and (curr_nepoch - 1) % 8 == 0:
                 print(V.print_confusion(results[-1], labels=self.labels))
         # ====== early stopping ====== #
         losses = results[0]
         if last_checkpoint is None:  # first check point
             last_checkpoint = self.parameters
         if last_losses is not None:
             # degraded, smaller is better
             if last_losses - losses <= self.tol:
                 curr_patience -= 1
                 if self.rollback:
                     if self.verbose >= 2:
                         wprint(
                             '[LogisticRegression] Rollback to the best checkpoint '
                             'at epoch:%s patience:%s' %
                             (ctext(best_epoch,
                                    'cyan'), ctext(curr_patience, 'cyan')))
                     self.set_parameters(*last_checkpoint)
             # save best checkpoint
             else:
                 last_checkpoint = self.parameters
                 best_epoch = curr_nepoch
                 if self._path is not None:
                     with open(self._path, 'wb') as f:
                         pickle.dump(self, f)
         last_losses = losses
         if curr_patience <= 0:
             is_converged = True
         # end the training
         if self.max_iter is not None and \
         curr_niter >= self.max_iter:
             break
         if self.max_epoch is not None and \
         curr_nepoch >= self.max_epoch:
             break
     # ====== print summary plot ====== #
     if self.verbose >= 1:
         train_losses = [epoch[-1][0] for epoch in self._train_history]
         print(
             V.print_bar(train_losses,
                         height=12,
                         bincount=min(20, len(train_losses)),
                         title='Training Losses'))
         if create_it_cv is not None:
             valid_losses = [epoch[-1][0] for epoch in self._valid_history]
             print(
                 V.print_bar(valid_losses,
                             height=12,
                             bincount=min(20, len(train_losses)),
                             title='Validation Losses'))
         if self.confusion_matrix:
             print(
                 ctext("======== Training Confusion Matrix ========",
                       'cyan'))
             print(
                 V.print_confusion(arr=self._train_history[-1][-1][-1],
                                   labels=self.labels))
             if create_it_cv is not None:
                 print(
                     ctext("======== Validation Confusion Matrix ========",
                           'cyan'))
                 print(
                     V.print_confusion(arr=self._valid_history[-1][-1][-1],
                                       labels=self.labels))
     # ====== reset to best points ====== #
     self.set_parameters(*last_checkpoint)
     self._is_fitted = True
     if self._path is not None:
         with open(self._path, 'wb') as f:
             pickle.dump(self, f)
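
A sketch of the cross-validation forms accepted by `fit` above; `model` and the arrays are placeholders:

model.fit(X_train, y_train, cv=0.2)             # hold out the last 20% of training samples
model.fit(X_train, y_train, cv=5000)            # hold out the last 5000 samples
model.fit(X_train, y_train, cv=(X_val, y_val))  # explicit validation arrays
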
Example #11
def fast_tsne(*X, n_components=2, n_samples=None, perplexity=30.0,
              early_exaggeration=8.0, learning_rate=200.0, n_iter=1000,
              n_iter_without_progress=300, min_grad_norm=1e-7,
              metric="euclidean", init="random", verbose=0,
              random_state=5218, method='barnes_hut', angle=0.5,
              n_jobs=4):
  """
  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.

  n_samples : {int, None}
      if given, downsampling the data to given number of sample

  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.

  early_exaggeration : float, optional (default: 8.0)
      Controls how tight natural clusters in the original space are in
      the embedded space and how much space will be between them. For
      larger values, the space between natural clusters will be larger
      in the embedded space. Again, the choice of this parameter is not
      very critical. If the cost function increases during initial
      optimization, the early exaggeration factor or the learning rate
      might be too high.

  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
      the learning rate is too high, the data may look like a 'ball' with any
      point approximately equidistant from its nearest neighbours. If the
      learning rate is too low, most points may look compressed in a dense
      cloud with few outliers. If the cost function gets stuck in a bad local
      minimum increasing the learning rate may help.

  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.

  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations so
      this value is rounded to the next multiple of 50.

  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.

  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
      If metric is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on each
      pair of instances (rows) and the resulting value recorded. The callable
      should take two arrays from X as input and return a value indicating
      the distance between them. The default is "euclidean" which is
      interpreted as squared euclidean distance.

  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components).
      PCA initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.

  verbose : int, optional (default: 0)
      Verbosity level.

  random_state : int, RandomState instance or None, optional (default: None)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`.  Note that different initializations might result in
      different local minima of the cost function.

  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact'
      will run on the slower, but exact, algorithm in O(N^2) time. The
      exact algorithm should be used when nearest-neighbor errors need
      to be better than 3%. However, the exact method cannot scale to
      millions of examples.

  angle : float (default: 0.5)
      Only used if method='barnes_hut'
      This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
      'angle' is the angular size (referred to as theta in [3]) of a distant
      node as measured from a point. If this size is below 'angle' then it is
      used as a summary node of all points contained within it.
      This method is not very sensitive to changes in this parameter
      in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
      computation time and angle greater 0.8 has quickly increasing error.
  """
  assert len(X) > 0, "No input is given!"
  if isinstance(X[0], (tuple, list)):
    X = X[0]
  if not all(isinstance(x, np.ndarray) for x in X):
    raise ValueError("`X` can only be list of numpy.ndarray or numpy.ndarray")
  # ====== kwarg for creating T-SNE class ====== #
  kwargs = dict(locals())
  del kwargs['X']
  n_samples = kwargs.pop('n_samples', None)
  # ====== downsampling ====== #
  if n_samples is not None:
    n_samples = int(n_samples)
    assert n_samples > 0
    new_X = []
    rand = random_state if isinstance(random_state, np.random.RandomState) else \
    np.random.RandomState(seed=random_state)
    for x in X:
      if x.shape[0] > n_samples:
        ids = rand.permutation(x.shape[0])[:n_samples]
        x = x[ids]
      new_X.append(x)
    X = new_X
  # ====== import proper T-SNE ====== #
  tsne_version = None
  try:
    from tsnecuda import TSNE
    from tsnecuda.NaiveTSNE import NaiveTSNE as _exact_TSNE
    tsne_version = 'cuda'
  except ImportError:
    # wprint("Install CUDA-TSNE from `https://github.com/CannyLab/tsne-cuda` "
    #        "for significant speed up.")
    try:
      from MulticoreTSNE import MulticoreTSNE as TSNE
      tsne_version = 'multicore'
    except ImportError:
      wprint("Install MulticoreTSNE from `pip install git+https://github.com/DmitryUlyanov/Multicore-TSNE.git`"
             ' to accelerate the T-SNE on multiple CPU cores.')
      try:
        from sklearn.manifold import TSNE
        tsne_version = 'sklearn'
      except Exception as e:
        raise e
  # ====== modify kwargs ====== #
  if tsne_version == 'cuda':
    kwargs['random_seed'] = kwargs['random_state']
    kwargs['theta'] = angle
    if method == 'exact':
      TSNE = _exact_TSNE
      del kwargs['theta']
    del kwargs['random_state']
    del kwargs['n_jobs']
    del kwargs['angle']
    del kwargs['method']
  elif tsne_version == 'multicore':
    pass
  else:
    del kwargs['n_jobs']
  # ====== getting cached values ====== #
  results = []
  X_new = []
  for i, x in enumerate(X):
    md5 = md5_checksum(x)
    key = _create_key(kwargs, md5)
    if key in _cached_values:
      results.append((i, _cached_values[key]))
    else:
      X_new.append((i, md5, x))

  # ====== perform T-SNE ====== #
  def apply_tsne(j):
    idx, md5, x = j
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x))
  # only 1 X, no need for MPI
  if len(X_new) == 1:
    idx, md5, x = apply_tsne(X_new[0])
    results.append((idx, x))
    _cached_values[_create_key(kwargs, md5)] = x
  else:
    mpi = MPI(jobs=X_new, func=apply_tsne, batch=1,
              ncpu=min(len(X_new), cpu_count() - 1))
    for idx, md5, x in mpi:
      results.append((idx, x))
      _cached_values[_create_key(kwargs, md5)] = x
  # ====== return and clean ====== #
  results = sorted(results, key=lambda a: a[0])
  results = [r[1] for r in results]
  return results[0] if len(results) == 1 else results
Example #12
    def copy(self,
             destination,
             indices_filter=None,
             data_filter=None,
             override=False):
        """ Copy the dataset to a new folder and closed
    the old dataset

    """
        from distutils.dir_util import copy_tree
        read_only = self.read_only
        # indices
        if indices_filter is not None and \
        not is_callable(indices_filter) and \
        not isinstance(indices_filter, (tuple, list)):
            raise ValueError(
                '`indices_filter` must be callable, tuple, list or None')
        if isinstance(indices_filter, (tuple, list)):
            tmp = tuple(indices_filter)
            indices_filter = lambda x: x in tmp
        # data name
        if data_filter is not None and \
        not is_callable(data_filter) and \
        not isinstance(data_filter, (tuple, list)):
            raise ValueError(
                '`data_filter` must be callable, tuple, list or None')
        if isinstance(data_filter, (tuple, list)):
            tmp = tuple(data_filter)
            data_filter = lambda x: x in tmp
        # ====== other files which are not Data ====== #
        other_files = [i for i in os.listdir(self.path) if i not in self]
        # ====== preprocessing ====== #
        destination = os.path.abspath(str(destination))
        if not os.path.exists(destination):
            os.mkdir(destination)
        elif not os.path.isdir(destination):
            raise ValueError('path at "%s" must be a folder' % destination)
        elif override:
            shutil.rmtree(destination)
            os.mkdir(destination)
        else:
            raise ValueError(
                "A folder exist at path: '%s', cannot be overrided." %
                destination)
        # ====== copy everything ====== #
        if indices_filter is None and data_filter is None:
            print("Copying %s files from '%s' to '%s' ..." %
                  (ctext(len(self), 'cyan'), ctext(
                      self.path, 'yellow'), ctext(destination, 'yellow')))
            copy_tree(self.path, destination)
        # ====== only data_filter ====== #
        elif indices_filter is None:
            data_list = [i for i in self.keys() if data_filter(i)]
            # copy all the data
            for name in data_list:
                org_path = os.path.join(self.path, name)
                dst_path = os.path.join(destination, name)
                print("Copying from '%s' to '%s' ..." %
                      (ctext(org_path, 'yellow'), ctext(dst_path, 'yellow')))
                shutil.copy2(org_path, dst_path)
            # copy all the related indices
            for name in self.keys():
                org_path = os.path.join(self.path, name)
                dst_path = os.path.join(destination, name)
                if not os.path.exists(dst_path) and \
                ('indices' == name or any(i in data_list for i in name.split('_')[1:])):
                    print("Copying Indices from '%s' to '%s'" %
                          (ctext(org_path, 'cyan'), ctext(dst_path, 'cyan')))
                    shutil.copy2(org_path, dst_path)
        # ====== use indices_filter and data_filter ====== #
        else:
            if data_filter is None:
                all_data = list(self.keys())
            else:
                all_data = [i for i in self.keys() if data_filter(i)]
            # list of data with separated indices
            separated_data = flatten_list(
                [k.split('_')[1:] for k in self.keys() if 'indices_' == k[:8]])
            # iterate over indices and copy one by one data
            for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
                indices = [(n, (s, e)) for n, (s, e) in self[ids_name]
                           if indices_filter(n)]
                # no match indices, skip
                if len(indices) == 0:
                    continue
                nb_samples = sum(e - s for n, (s, e) in indices)
                # get all data assigned to given indices
                data = ids_name.split('_')[1:]
                if len(data) == 0:
                    data = [i for i in all_data if i not in separated_data]
                else:
                    data = [i for i in data if i in all_data]
                # if still no data found, skip
                if len(data) == 0:
                    continue
                # copy each data
                for data_name in data:
                    X = self[data_name]
                    # copy big MmapDict
                    if isinstance(X, MmapDict) and len(X) == len(
                            self[ids_name]):
                        new_path = os.path.join(destination,
                                                os.path.basename(X.path))
                        print("Copying MmapDict from '%s' to '%s'" %
                              (ctext(X.path, 'cyan'), ctext(new_path, 'cyan')))
                        new_dict = MmapDict(new_path,
                                            cache_size=80000,
                                            read_only=False)
                        for n, (s, e) in indices:
                            new_dict[n] = X[n]
                        new_dict.flush(save_all=True)
                        new_dict.close()
                    # copy MmapData
                    elif isinstance(X, MmapData):
                        Y = MmapData(path=os.path.join(destination, data_name),
                                     dtype=X.dtype,
                                     shape=(0, ) + X.shape[1:],
                                     read_only=False)
                        prog = Progbar(target=nb_samples,
                                       print_report=True,
                                       print_summary=True,
                                       name="Copying data: '%s' to path:'%s'" %
                                       (ctext(data_name, 'yellow'),
                                        ctext(Y.data_info, 'cyan')))
                        for n, (s, e) in indices:
                            Y.append(X[s:e])
                            prog.add(e - s)
                    # unknown data-type
                    else:
                        org_path = os.path.join(self.path, data_name)
                        new_path = os.path.join(destination, data_name)
                        # just copy directly the files
                        if os.path.isfile(org_path) or \
                        not os.path.exists(new_path):
                            shutil.copy2(org_path, new_path)
                            print("Copying '%s' to '%s' ..." % (ctext(
                                org_path, 'cyan'), ctext(new_path, 'yellow')))
                        else:
                            wprint("Cannot copy: '%s' - %s" %
                                   (ctext(data_name, 'cyan'),
                                    ctext(type(self[data_name]), 'yellow')))
                # copy the indices
                new_indices = MmapDict(os.path.join(destination, ids_name),
                                       cache_size=80000,
                                       read_only=False)
                start = 0
                for n, (s, e) in indices:
                    size = e - s
                    new_indices[n] = (start, start + size)
                    start += size
                new_indices.flush(save_all=True)
                new_indices.close()
        # ====== copy others files ====== #
        for f in other_files:
            org_path = os.path.join(self.path, f)
            dst_path = os.path.join(destination, f)
            if not os.path.exists(dst_path):
                if os.path.isdir(org_path):  # directory
                    copy_tree(org_path, dst_path)
                else:  # single file
                    shutil.copy2(org_path, dst_path)
        # ====== readme ====== #
        readme_name = os.path.basename(self._readme_path)
        dst_path = os.path.join(destination, readme_name)
        if not os.path.exists(dst_path):
            shutil.copy2(self._readme_path, dst_path)
        return Dataset(destination, read_only=read_only)
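
A usage sketch for `copy` above; `ds` stands for an already opened dataset, and the filters keep only utterances whose name starts with 'train' and only the 'mfcc' and 'energy' data:

new_ds = ds.copy('/tmp/dataset_subset',
                 indices_filter=lambda name: name.startswith('train'),
                 data_filter=('mfcc', 'energy'),
                 override=True)
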
Example #13
 def fit(self, X, y=None, cv=None):
   self._initialize(X)
   if not hasattr(X, 'shape') or not hasattr(X, '__iter__') or \
   not hasattr(X, '__len__'):
     raise ValueError("`X` must has 'shape', '__len__' and '__iter__' attributes")
   nb_train_samples = len(X)
   # convert to odin.fuel.Data if possible
   if isinstance(X, (np.ndarray, list, tuple)):
     X = F.as_data(X)
   if isinstance(y, (np.ndarray, list, tuple)):
     y = F.as_data(y)
   start_tr = 0
   end_tr = nb_train_samples
   # ====== check if cross validating ====== #
   create_it_cv = None
   if is_number(cv):
     cv = int(float(cv) * nb_train_samples) if cv < 1. else int(cv)
     end_tr = nb_train_samples - cv
     start_cv = end_tr
     end_cv = nb_train_samples
     nb_cv_samples = end_cv - start_cv
     create_it_cv = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                                    start=start_cv, end=end_cv)
   elif isinstance(cv, (tuple, list)):
     X_cv, y_cv = cv
     nb_cv_samples = X_cv.shape[0]
     create_it_cv = _create_it_func(X=X_cv, y=y_cv, batch_size=self.batch_size,
                                    start=0, end=X_cv.shape[0])
   elif hasattr(cv, 'set_batch'):
     nb_cv_samples = cv.shape[0]
     create_it_cv = _create_it_func(X=cv, y=None, batch_size=self.batch_size,
                                    start=0, end=cv.shape[0])
   elif cv is not None:
     raise ValueError('`cv` can be float (0-1), tuple or list of X and y, '
                      'any object that have "shape" and "__iter__" attributes, '
                      'or None')
   # ====== preprocessing ====== #
   create_it = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                               start=start_tr, end=end_tr)
   # ====== prepare ====== #
   curr_niter = sum(epoch[0] for epoch in self._train_history)
   curr_nepoch = len(self._train_history)
   curr_patience = int(self.patience)
   last_losses = None
   last_checkpoint = None
   best_epoch = None
   is_converged = False
   # ====== fitting ====== #
   while not is_converged:
     curr_nepoch += 1
     seed = self._rand_state.randint(0, 10e8)
     # ====== training ====== #
     nb_iter, duration, results = _fitting_helper(create_it(seed),
                                                  fn=self._f_train,
                                                  nb_samples=nb_train_samples,
                                                  nb_classes=self.nb_classes,
                                                  title='Epoch %d' % curr_nepoch)
     curr_niter += nb_iter
     self._train_history.append(
         (nb_train_samples, nb_iter, duration, results))
     # ====== cross validation ====== #
     if create_it_cv is not None:
       nb_iter, duration_valid, results = _fitting_helper(create_it_cv(seed),
                                                    fn=self._f_score,
                                                    nb_samples=nb_cv_samples,
                                                    nb_classes=self.nb_classes,
                                                    title="Validating")
       self._valid_history.append(
           (nb_train_samples, nb_iter, duration_valid, results))
       duration += duration_valid
     # ====== print log ====== #
     if self.verbose >= 2:
       print(ctext('#epoch:', 'cyan') + str(curr_nepoch),
             ctext('#iter:', 'cyan') + str(curr_niter),
             ctext("Loss:", 'yellow') + '%.5f' % results[0],
             ctext("Acc:", 'yellow') + '%.3f' % results[1],
             ctext("%.2f(s)" % duration, 'magenta'))
       if self.confusion_matrix and (curr_nepoch - 1) % 8 == 0:
         print(V.print_confusion(results[-1], labels=self.labels))
     # ====== early stopping ====== #
     losses = results[0]
     if last_checkpoint is None: # first check point
       last_checkpoint = self.parameters
     if last_losses is not None:
       # degraded, smaller is better
       if last_losses - losses <= self.tol:
         curr_patience -= 1
         if self.rollback:
           if self.verbose >= 2:
             wprint('[LogisticRegression] Rollback to the best checkpoint '
                    'at epoch:%s patience:%s' %
                    (ctext(best_epoch, 'cyan'),
                     ctext(curr_patience, 'cyan')))
           self.set_parameters(*last_checkpoint)
       # save best checkpoint
       else:
         last_checkpoint = self.parameters
         best_epoch = curr_nepoch
         if self._path is not None:
           with open(self._path, 'wb') as f:
             pickle.dump(self, f)
     last_losses = losses
     if curr_patience <= 0:
       is_converged = True
     # end the training
     if self.max_iter is not None and \
     curr_niter >= self.max_iter:
       break
     if self.max_epoch is not None and \
     curr_nepoch >= self.max_epoch:
       break
   # ====== print summary plot ====== #
   if self.verbose >= 1:
     train_losses = [epoch[-1][0] for epoch in self._train_history]
     print(V.print_bar(train_losses, height=12,
                       bincount=min(20, len(train_losses)),
                       title='Training Losses'))
     if create_it_cv is not None:
       valid_losses = [epoch[-1][0] for epoch in self._valid_history]
       print(V.print_bar(valid_losses, height=12,
                         bincount=min(20, len(train_losses)),
                         title='Validation Losses'))
     if self.confusion_matrix:
       print(ctext("======== Training Confusion Matrix ========", 'cyan'))
       print(V.print_confusion(arr=self._train_history[-1][-1][-1],
                               labels=self.labels))
       if create_it_cv is not None:
         print(ctext("======== Validation Confusion Matrix ========", 'cyan'))
         print(V.print_confusion(arr=self._valid_history[-1][-1][-1],
                                 labels=self.labels))
   # ====== reset to best points ====== #
   self.set_parameters(*last_checkpoint)
   self._is_fitted = True
   if self._path is not None:
     with open(self._path, 'wb') as f:
       pickle.dump(self, f)
Example #14
  def copy(self, destination,
           indices_filter=None, data_filter=None,
           override=False):
    """ Copy the dataset to a new folder and closed
    the old dataset

    """
    from distutils.dir_util import copy_tree
    read_only = self.read_only
    # indices
    if indices_filter is not None and \
    not is_callable(indices_filter) and \
    not isinstance(indices_filter, (tuple, list)):
      raise ValueError('`indices_filter` must be callable, tuple, list or None')
    if isinstance(indices_filter, (tuple, list)):
      tmp = tuple(indices_filter)
      indices_filter = lambda x: x in tmp
    # data name
    if data_filter is not None and \
    not is_callable(data_filter) and \
    not isinstance(data_filter, (tuple, list)):
      raise ValueError('`data_filter` must be callable, tuple, list or None')
    if isinstance(data_filter, (tuple, list)):
      tmp = tuple(data_filter)
      data_filter = lambda x: x in tmp
    # ====== other files which are not Data ====== #
    other_files = [i for i in os.listdir(self.path)
                   if i not in self]
    # ====== preprocessing ====== #
    destination = os.path.abspath(str(destination))
    if not os.path.exists(destination):
      os.mkdir(destination)
    elif not os.path.isdir(destination):
      raise ValueError('path at "%s" must be a folder' % destination)
    elif override:
      shutil.rmtree(destination)
      os.mkdir(destination)
    else:
      raise ValueError("A folder exist at path: '%s', cannot be overrided." %
                       destination)
    # ====== copy everything ====== #
    if indices_filter is None and data_filter is None:
      print("Copying %s files from '%s' to '%s' ..." %
        (ctext(len(self), 'cyan'),
         ctext(self.path, 'yellow'),
         ctext(destination, 'yellow')))
      copy_tree(self.path, destination)
    # ====== only data_filter ====== #
    elif indices_filter is None:
      data_list = [i for i in self.keys() if data_filter(i)]
      # copy all the data
      for name in data_list:
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        print("Copying from '%s' to '%s' ..." %
              (ctext(org_path, 'yellow'),
               ctext(dst_path, 'yellow')))
        shutil.copy2(org_path, dst_path)
      # copy all the related indices
      for name in self.keys():
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        if not os.path.exists(dst_path) and \
        ('indices' == name or any(i in data_list for i in name.split('_')[1:])):
          print("Copying Indices from '%s' to '%s'" % (ctext(org_path, 'cyan'),
                                                       ctext(dst_path, 'cyan')))
          shutil.copy2(org_path, dst_path)
    # ====== use indices_filter and data_filter ====== #
    else:
      if data_filter is None:
        all_data = list(self.keys())
      else:
        all_data = [i for i in self.keys()
                    if data_filter(i)]
      # list of data with separated indices
      separated_data = flatten_list(
          [k.split('_')[1:] for k in self.keys()
         if 'indices_' == k[:8]])
      # iterate over indices and copy one by one data
      for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
        indices = [(n, (s, e))
                   for n, (s, e) in self[ids_name]
                   if indices_filter(n)]
        # no match indices, skip
        if len(indices) == 0:
          continue
        nb_samples = sum(e - s for n, (s, e) in indices)
        # get all data assigned to given indices
        data = ids_name.split('_')[1:]
        if len(data) == 0:
          data = [i for i in all_data if i not in separated_data]
        else:
          data = [i for i in data if i in all_data]
        # if still no data found, skip
        if len(data) == 0:
          continue
        # copy each data
        for data_name in data:
          X = self[data_name]
          # copy big MmapDict
          if isinstance(X, MmapDict) and len(X) == len(self[ids_name]):
            new_path = os.path.join(destination, os.path.basename(X.path))
            print("Copying MmapDict from '%s' to '%s'" % (
                ctext(X.path, 'cyan'),
                ctext(new_path, 'cyan')))
            new_dict = MmapDict(new_path, cache_size=80000, read_only=False)
            for n, (s, e) in indices:
              new_dict[n] = X[n]
            new_dict.flush(save_all=True)
            new_dict.close()
          # copy MmapData
          elif isinstance(X, MmapData):
            Y = MmapData(path=os.path.join(destination, data_name),
                         dtype=X.dtype, shape=(0,) + X.shape[1:],
                         read_only=False)
            prog = Progbar(target=nb_samples,
                           print_report=True, print_summary=True,
                           name="Copying data: '%s' to path:'%s'" %
                           (ctext(data_name, 'yellow'),
                            ctext(Y.data_info, 'cyan')))
            for n, (s, e) in indices:
              Y.append(X[s:e])
              prog.add(e - s)
          # unknown data-type
          else:
            org_path = os.path.join(self.path, data_name)
            new_path = os.path.join(destination, data_name)
            # just copy directly the files
            if os.path.isfile(org_path) or \
            not os.path.exists(new_path):
              shutil.copy2(org_path, new_path)
              print("Copying '%s' to '%s' ..." %
                (ctext(org_path, 'cyan'), ctext(new_path, 'yellow')))
            else:
              wprint("Cannot copy: '%s' - %s" %
                (ctext(data_name, 'cyan'),
                 ctext(type(self[data_name]), 'yellow')))
        # copy the indices
        new_indices = MmapDict(os.path.join(destination, ids_name),
                               cache_size=80000, read_only=False)
        start = 0
        for n, (s, e) in indices:
          size = e - s
          new_indices[n] = (start, start + size)
          start += size
        new_indices.flush(save_all=True)
        new_indices.close()
    # ====== copy others files ====== #
    for f in other_files:
      org_path = os.path.join(self.path, f)
      dst_path = os.path.join(destination, f)
      if not os.path.exists(dst_path):
        if os.path.isdir(org_path): # directory
          copy_tree(org_path, dst_path)
        else: # single file
          shutil.copy2(org_path, dst_path)
    # ====== readme ====== #
    readme_name = os.path.basename(self._readme_path)
    dst_path = os.path.join(destination, readme_name)
    if not os.path.exists(dst_path):
      shutil.copy2(self._readme_path, dst_path)
    return Dataset(destination, read_only=read_only)
Example #15
 def __init__(self, jobs, path, extractor,
              n_cache=0.12, ncpu=1, override=False,
              identifier='name',
              log_path=None,
              stop_on_failure=False):
   super(FeatureProcessor, self).__init__()
   # ====== check outpath ====== #
   path = os.path.abspath(str(path))
   if os.path.isfile(path):
     raise ValueError("`path` must be path to a directory, but found a "
                      "path to file.")
   # check override
   if os.path.exists(path) and override:
     wprint("Remove existed Dataset at path: %s" % path)
     for i in os.listdir(path):
       i = os.path.join(path, i)
       if os.path.isdir(i): # remove folder
         shutil.rmtree(i)
       else: # remove file
         os.remove(i)
   # set path and name
   self.path = path
   # ====== check jobs ====== #
   if not isinstance(jobs, (tuple, list, np.ndarray)):
     raise ValueError("Provided `jobs` must be instance of tuple, list or ndarray.")
   if isinstance(jobs, np.ndarray):
     jobs = jobs.tolist()
   self.jobs = tuple(jobs)
   # ====== check multiprocessing ====== #
   if ncpu is None: # auto select number of CPU
     ncpu = min(len(jobs), cpu_count() - 1)
   ncpu = int(ncpu)
   if ncpu <= 0 or n_cache <= 0:
     raise ValueError('`ncpu` and `n_cache` must be greater than 0, but '
                      'given values ncpu=%d n_cache=%f' % (ncpu, n_cache))
   self.n_cpu = ncpu
   self.n_cache = n_cache
   # ====== internal control for feature processor ====== #
   if isinstance(extractor, Pipeline):
     pass
   elif isinstance(extractor, (tuple, list)):
     steps = [('%s_%d' % (e.__class__.__name__, i), e)
              for i, e in enumerate(extractor)]
     extractor = Pipeline(steps=steps)
   elif isinstance(extractor, Mapping):
     steps = [(str(n), e) for n, e in extractor.items()]
     extractor = Pipeline(steps=steps)
   elif isinstance(extractor, Extractor):
     extractor = Pipeline(
         steps=[(extractor.__class__.__name__, extractor)])
   self.extractor = extractor
   # ====== check identifier and log path ====== #
   self._identifier = str(identifier)
   if log_path is None:
     log_path = os.path.join(self.path, 'log.txt')
   else:
     log_path = str(log_path)
   self._log_path = _check_logpath(log_path)
   # ====== others ====== #
   self.config = {}
   self._error_log = []
   self.stop_on_failure = bool(stop_on_failure)