Example #1
 def __init__(self, func, data, epoch=1, p=1.0,
              batch_size=128, seed=None, shuffle_level=2,
              callbacks=None, labels=None, name=None,
              verbose=2):
   super(Task, self).__init__()
   self.set_func(func, data)
   # this Progbar will record the history as well
   self._labels = [str(l) for l in labels] \
       if labels is not None else None
   self._progbar = Progbar(target=self.nb_samples, name=name,
                           interval=0.,
                           print_report=True, print_summary=True)
   self._progbar.set_labels(self._labels)
   # ====== set callback and verbose ====== #
   self._callback = CallbackList(callbacks)
   self.set_verbose(verbose)
   # ====== assign other arguments ====== #
   self._nb_epoch = epoch
   self._p = np.clip(p, 0., 1.)
   self._seed = seed
   self.set_batch(batch_size, seed, shuffle_level)
   self._name = name
   # ====== current info ====== #
   self._curr_epoch = 0
   self._curr_iter = 0
   self._curr_samples = 0
   self._curr_epoch_iter = 0
   self._curr_epoch_samples = 0
   self._callback_msg = []
   # ====== iter tracking ====== #
   self._created_iter = None
   self._stop = False
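
Taken together, these examples share one basic pattern: build a Progbar with a known target, attach named values by item assignment, and advance it with add(). Below is a minimal sketch of that pattern; the import path and the 'Loss' key are illustrative assumptions, not taken from any example on this page.

import numpy as np
from odin.utils import Progbar  # assumed import path

data = np.random.rand(1000, 8)  # toy data standing in for a real dataset
prog = Progbar(target=len(data), name="Demo",
               print_report=True, print_summary=True)
for start in range(0, len(data), 128):
  batch = data[start:start + 128]
  prog['Loss'] = float(batch.mean())  # named values show up in the progress report
  prog.add(batch.shape[0])            # advance by the number of samples processed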
Example #2
 def _load_archive(self, path, extract_path):
   from zipfile import ZipFile, ZIP_DEFLATED
   try:
     zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
     allfile = zfile.namelist()
     # validate extract_path
     if not os.path.isdir(extract_path):
       raise ValueError('Extract path must be a folder, but path'
                         '={} is a file'.format(extract_path))
     extract_path = os.path.join(extract_path,
                                 os.path.basename(path).replace('.zip', ''))
     # found the extracted dir, use it
     if os.path.isdir(extract_path) and \
        set(os.listdir(extract_path)) == set(allfile):
       self._set_path(extract_path)
       return
     # decompress everything
     if not os.path.exists(extract_path):
       os.mkdir(extract_path)
     maxlen = max([len(i) for i in allfile])
     pb = Progbar(target=len(allfile), name="[Dataset] Loading Archive",
                  print_summary=True, print_report=True)
     for i, f in enumerate(allfile):
       zfile.extract(f, path=extract_path)
       pb['File'] = ('Unarchiving: %-' + str(maxlen) + 's') % f
       pb.add(1)
     # ====== finally set path ====== #
     self._set_path(extract_path)
   except IOError as e:
     raise IOError('Error loading archived dataset, path:{}, error:{}'
                   '.'.format(path, e))
   return None
Example #3
    def fit(self, X, y=None, print_progress=False):
        """Fit the model with X, using minibatches of size batch_size.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        y: Passthrough for ``Pipeline`` compatibility.

        Returns
        -------
        self: object
            Returns the instance itself.
        """
        if isinstance(X, Data):
            X = X[:]
        X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape

        if self.batch_size is None:
            batch_size = 12 * n_features
        else:
            batch_size = self.batch_size

        if print_progress:
            prog = Progbar(target=n_samples)
        for batch in gen_batches(n_samples, batch_size):
            x = X[batch]
            self.partial_fit(x, check_input=False)
            if print_progress:
                prog.add(x.shape[0])
        return self
Example #4
 def _load_archive(self, path, extract_path):
     from zipfile import ZipFile, ZIP_DEFLATED
     try:
         zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
         allfile = zfile.namelist()
         # validate extract_path
         if not os.path.isdir(extract_path):
             raise ValueError('Extract path must be a folder, but path'
                              '={} is a file'.format(extract_path))
         extract_path = os.path.join(
             extract_path,
             os.path.basename(path).replace('.zip', ''))
         # found the extracted dir, use it
         if os.path.isdir(extract_path) and \
            set(os.listdir(extract_path)) == set(allfile):
             self._set_path(extract_path)
             return
         # decompress everything
         if not os.path.exists(extract_path):
             os.mkdir(extract_path)
         maxlen = max([len(i) for i in allfile])
         progbar = Progbar(len(allfile))
         for i, f in enumerate(allfile):
             zfile.extract(f, path=extract_path)
             progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
             progbar.update(i + 1)
         # ====== finally set path ====== #
         self._set_path(extract_path)
     except IOError as e:
         raise IOError('Error loading archived dataset, path:{}, error:{}'
                       '.'.format(path, e))
     return None
Example #5
def _extract_zero_and_first_stats(X, sad, indices, gmm, z_path, f_path,
                                  name_path):
    n_samples = X.shape[0]
    # indices is None, every row is single sample (utterance or image ...)
    if indices is None:
        if os.path.exists(z_path):
            os.remove(z_path)
        if os.path.exists(f_path):
            os.remove(f_path)
        Z = MmapArrayWriter(path=z_path,
                            dtype='float32',
                            shape=(n_samples, gmm.nmix),
                            remove_exist=True)
        F = MmapArrayWriter(path=f_path,
                            dtype='float32',
                            shape=(n_samples, gmm.feat_dim * gmm.nmix),
                            remove_exist=True)
        jobs, _ = _split_jobs(n_samples,
                              ncpu=mpi.cpu_count(),
                              device='cpu',
                              gpu_factor=1)

        def map_transform(start_end):
            start, end = start_end
            for i in range(start, end):
                # removed by SAD
                if sad is not None and not bool(sad[i]):
                    yield None, None, None
                else:
                    z, f = gmm.transform(X[i][np.newaxis, :],
                                         zero=True,
                                         first=True,
                                         device='cpu')
                    yield i, z, f

        prog = Progbar(target=n_samples,
                       print_report=True,
                       print_summary=False,
                       name="Extracting zero and first order statistics")
        for i, z, f in mpi.MPI(jobs, map_transform, ncpu=None, batch=1):
            if i is not None:  # i None means removed by SAD
                Z[i] = z
                F[i] = f
            prog.add(1)
        Z.flush()
        F.flush()
        Z.close()
        F.close()
    # use directly the transform_to_disk function
    else:
        gmm.transform_to_disk(X,
                              indices=indices,
                              sad=sad,
                              pathZ=z_path,
                              pathF=f_path,
                              name_path=name_path,
                              dtype='float32',
                              device=None,
                              ncpu=None,
                              override=True)
Example #6
    def test_conv_deconv_transpose(self):
        def feval(X, y):
            f = K.function(X, y)
            shape = (np.random.randint(8, 18), ) + tuple(X.shape.as_list()[1:])
            x = np.random.rand(*shape)
            return f(x)

        prog = Progbar(target=2 * 3 * 3 * 2 * 2, print_report=True)
        for X in (K.placeholder(shape=(None, 13, 12, 25)),
                  K.placeholder(shape=(None, 13, 12, 8, 25))):
            for strides in (1, 2, 3):
                for filter_size in (3, 4, 5):
                    for num_filters in (8, 25):
                        for pad in ("same", "valid"):
                            for dilation in (1, ):
                                # ====== progress ====== #
                                prog['test'] = "#Dim:%d;Stride:%d;Filter:%d;Channel:%d;Pad:%s" % \
                                    (X.shape.ndims, strides, filter_size, num_filters, pad)
                                prog.add(1)
                                # ====== test Conv ====== #
                                f = N.Conv(num_filters=num_filters,
                                           filter_size=filter_size,
                                           pad=pad,
                                           strides=strides,
                                           activation=tf.nn.relu,
                                           dilation=dilation)
                                fT = f.T
                                y = f(X)
                                self.assertEqual(
                                    feval(X, y).shape[1:],
                                    tuple(y.shape.as_list()[1:]))
                                yT = fT(y)
                                self.assertEqual(
                                    feval(X, yT).shape[1:],
                                    tuple(yT.shape.as_list()[1:]))
                                self.assertEqual(X.shape.as_list(),
                                                 yT.shape.as_list())
                                # ====== test Transpose ====== #
                                f = N.TransposeConv(num_filters=num_filters,
                                                    filter_size=filter_size,
                                                    pad=pad,
                                                    strides=strides,
                                                    activation=K.relu,
                                                    dilation=dilation)
                                fT = f.T
                                y = f(X)
                                self.assertEqual(
                                    feval(X, y).shape[1:],
                                    tuple(y.shape.as_list()[1:]))
                                yT = fT(y)
                                self.assertEqual(
                                    feval(X, yT).shape[1:],
                                    tuple(yT.shape.as_list()[1:]))
                                self.assertEqual(X.shape.as_list(),
                                                 yT.shape.as_list())
Example #7
 def fit(self, texts, vocabulary=None):
     """
 Parameters
 ----------
 texts: iterator of unicode
     iterator, generator or list (e.g. [u'a', u'b', ...])
     of unicode documents.
 """
     texts = self._validate_texts(texts)
     word_counts = self._word_counts
     word_docs = self._word_docs
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== start processing ====== #
     prog = Progbar(target=1234,
                    name="Fitting tokenizer",
                    print_report=True,
                    print_summary=True)
     start_time = timeit.default_timer()
     for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
         total_docs_tokens = 0
         seen_words = {}
         # update words->count
         for token in doc:
             total_docs_tokens += 1
             word_counts[token] += 1
             # update words->doc
             if token not in seen_words:
                 seen_words[token] = 1
                 word_docs[token] += 1
         # save longest docs
         if total_docs_tokens > self.__longest_document[-1]:
             self.__longest_document = [doc, total_docs_tokens]
         # print progress
         prog['#Doc'] = nb_docs
         prog['#Tok'] = len(word_counts)
         prog.add(1)
         if prog.seen_so_far >= 0.8 * prog.target:
             prog.target = 1.2 * prog.target
     # ====== print summary of the process ====== #
     # if self.print_progress:
     #     prog.target = nb_docs; prog.update(nb_docs)
     processing_time = timeit.default_timer() - start_time
     print('Processed %d-docs, %d-tokens in %f seconds.' %
           (nb_docs, len(word_counts), processing_time))
     self.nb_docs += nb_docs
     # ====== sorting ====== #
     self._refresh_dictionary()
     return self
Example #8
 def fit(self, texts, vocabulary=None):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   """
   texts = self._validate_texts(texts)
   word_counts = self._word_counts
   word_docs = self._word_docs
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== start processing ====== #
   prog = Progbar(target=1208, name="Fitting tokenizer",
                  print_report=True, print_summary=True)
   start_time = timeit.default_timer()
   for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
     total_docs_tokens = 0
     seen_words = {}
     # update words->count
     for token in doc:
       total_docs_tokens += 1
       word_counts[token] += 1
       # update words->doc
       if token not in seen_words:
         seen_words[token] = 1
         word_docs[token] += 1
     # save longest docs
     if total_docs_tokens > self.__longest_document[-1]:
       self.__longest_document = [doc, total_docs_tokens]
     # print progress
     prog['#Doc'] = nb_docs
     prog['#Tok'] = len(word_counts)
     prog.add(1)
     if prog.seen_so_far >= 0.8 * prog.target:
       prog.target = 1.2 * prog.target
   # ====== print summary of the process ====== #
   # if self.print_progress:
   #     prog.target = nb_docs; prog.update(nb_docs)
   processing_time = timeit.default_timer() - start_time
   print('Processed %d-docs, %d-tokens in %f seconds.' %
       (nb_docs, len(word_counts), processing_time))
   self.nb_docs += nb_docs
   # ====== sorting ====== #
   self._refresh_dictionary()
   return self
Example #9
 def __init__(self, name, format='', tracking=[]):
     super(ProgressMonitor, self).__init__()
     self.name = name
     self._history = []
     self._prog = Progbar(100, title='')
     # ====== format ====== #
     self._format_results = 0
     for i in _PLACEHOLDER:
         self._format_results += len(i.findall(format))
     self._format = format
     # ====== one-time tracking at epoch_end ====== #
     if isinstance(tracking, dict):
         tracking = tracking.iteritems()
     self.tracking = [(int(i), j) for i, j in tracking if callable(j)]
     self._tracking_history = defaultdict(list)
Example #10
 def __init__(self, func, data, epoch=1, p=1.0,
              batch_size=128, seed=None, shuffle_level=2,
              callbacks=None, labels=None, name=None,
              verbose=2):
   super(Task, self).__init__()
   self.set_func(func, data)
   # this Progbar will record the history as well
   self._labels = [str(l) for l in labels] \
       if labels is not None else None
   self._progbar = Progbar(target=self.nb_samples, name=name,
                           interval=0.,
                           print_report=True, print_summary=True)
   self._progbar.set_labels(self._labels)
   # ====== set callback and verbose ====== #
   self._callback = CallbackList(callbacks)
   self.set_verbose(verbose)
   # ====== assign other arguments ====== #
   self._nb_epoch = epoch
   self._p = np.clip(p, 0., 1.)
   self._seed = seed
   self.set_batch(batch_size, seed, shuffle_level)
   self._name = name
   # ====== current info ====== #
   self._curr_epoch = 0
   self._curr_iter = 0
   self._curr_samples = 0
   self._curr_epoch_iter = 0
   self._curr_epoch_samples = 0
   self._callback_msg = []
   # ====== iter tracking ====== #
   self._created_iter = None
   self._stop = False
Example #11
    def archive(self):
        from zipfile import ZipFile, ZIP_DEFLATED
        path = self.archive_path
        zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)

        files = set([_[-1] for _ in self._data_map.itervalues()])

        progbar = Progbar(len(files), title='Archiving:')
        maxlen = max([len(os.path.basename(i)) for i in files])
        for i, f in enumerate(files):
            zfile.write(f, os.path.basename(f))
            progbar.title = ('Archiving: %-' + str(maxlen) +
                             's') % os.path.basename(f)
            progbar.update(i + 1)
        zfile.close()
        return path
Example #12
  def archive(self):
    from zipfile import ZipFile, ZIP_DEFLATED
    path = self.archive_path
    zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)

    files = set([_[-1] for _ in self._data_map.values()])

    prog = Progbar(target=len(files), name="[Dataset] Archiving",
                   print_report=True, print_summary=True)
    maxlen = max([len(os.path.basename(i)) for i in files])
    for i, f in enumerate(files):
      zfile.write(f, os.path.basename(f))
      prog['Data'] = ('Archiving: %-' + str(maxlen) + 's') \
          % os.path.basename(f)
      prog.add(1)
    zfile.close()
    return path
Example #13
 def _saveable_variables(self):
     return {'_format': self._format,
             '_format_results': self._format_results,
             '_history': [],
             'tracking': self.tracking,
             '_tracking_history': defaultdict(list),
             '_prog': Progbar(100, title=''),
             'name': self.name}
Example #14
    def transform_mpi(self,
                      X,
                      y=None,
                      keep_order=True,
                      ncpu=4,
                      n_components=None,
                      print_progress=False):
        """ Sample as transform but using multiprocessing """
        n = X.shape[0]
        if self.batch_size is None:
            batch_size = 12 * len(self.mean_)
        else:
            batch_size = self.batch_size
        batch_list = [(i, min(i + batch_size, n))
                      for i in range(0, n + batch_size, batch_size) if i < n]
        if print_progress:
            prog = Progbar(target=n)

        # ====== run MPI jobs ====== #
        def map_func(batch):
            for start, end in batch:
                x = super(MiniBatchPCA, self).transform(X=X[start:end], y=y)
                # doing dim reduction here save a lot of memory for
                # inter-processors transfer
                if n_components is not None:
                    x = x[:, :n_components]
                # just need to return the start for ordering
                yield start, x

        mpi = MPI(batch_list,
                  map_func=map_func,
                  ncpu=ncpu,
                  buffer_size=1,
                  maximum_queue_size=ncpu * 12)
        # ====== process the return ====== #
        X_transformed = []
        for start, x in mpi:
            X_transformed.append((start, x))
            if print_progress:
                prog.add(x.shape[0])
        if keep_order:
            X_transformed = sorted(X_transformed, key=lambda x: x[0])
        X_transformed = np.concatenate([x[-1] for x in X_transformed], axis=0)
        return X_transformed
Example #15
    def save_cache(self, path, datatype='memmap', print_progress=True):
        """ Save all preprocessed data to a Dataset """
        if not isinstance(path, str) or os.path.isfile(path):
            raise ValueError('path must be string path to a folder.')
        if os.path.exists(path):
            print('Remove old dataset at path:', path)
            shutil.rmtree(path)

        ds = Dataset(path)
        # ====== start caching ====== #
        if print_progress:
            prog = Progbar(target=self.shape[0], title='Caching:')
        for X in self:
            if not isinstance(X, (tuple, list)):
                X = (X, )
            # saving preprocessed data
            for i, x in enumerate(X):
                name = 'data%d' % i
                if name in ds: ds[name].append(x)
                else: ds[(name, datatype)] = x
            # print progress
            if print_progress:
                prog.add(X[0].shape[0])
        if print_progress:
            prog.target = prog.seen_so_far
            prog.add(0)
        ds.flush()
        ds.close()
        # end
        return self
Example #16
def _extract_zero_and_first_stats(X, sad, indices, gmm, z_path, f_path, name_path):
  n_samples = X.shape[0]
  # indices is None, every row is single sample (utterance or image ...)
  if indices is None:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
    Z = MmapData(path=z_path, dtype='float32',
                 shape=(n_samples, gmm.nmix), read_only=False)
    F = MmapData(path=f_path, dtype='float32',
                 shape=(n_samples, gmm.feat_dim * gmm.nmix),
                 read_only=False)
    jobs, _ = _split_jobs(n_samples, ncpu=mpi.cpu_count(),
                       device='cpu', gpu_factor=1)

    def map_transform(start_end):
      start, end = start_end
      for i in range(start, end):
        # removed by SAD
        if sad is not None and not bool(sad[i]):
          yield None, None, None
        else:
          z, f = gmm.transform(X[i][np.newaxis, :],
                               zero=True, first=True, device='cpu')
          yield i, z, f
    prog = Progbar(target=n_samples,
                   print_report=True, print_summary=False,
                   name="Extracting zero and first order statistics")
    for i, z, f in mpi.MPI(jobs, map_transform,
                           ncpu=None, batch=1):
      if i is not None: # i None means removed by SAD
        Z[i] = z
        F[i] = f
      prog.add(1)
    Z.flush(); Z.close()
    F.flush(); F.close()
  # use directly the transform_to_disk function
  else:
    gmm.transform_to_disk(X, indices=indices, sad=sad,
                          pathZ=z_path,
                          pathF=f_path,
                          name_path=name_path,
                          dtype='float32', device=None, ncpu=None,
                          override=True)
Example #17
    def archive(self):
        from zipfile import ZipFile, ZIP_DEFLATED
        path = self.archive_path
        zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)

        files = set([_[-1] for _ in self._data_map.values()])

        prog = Progbar(target=len(files),
                       name="[Dataset] Archiving",
                       print_report=True,
                       print_summary=True)
        maxlen = max([len(os.path.basename(i)) for i in files])
        for i, f in enumerate(files):
            zfile.write(f, os.path.basename(f))
            prog['Data'] = ('Archiving: %-' + str(maxlen) + 's') \
                % os.path.basename(f)
            prog.add(1)
        zfile.close()
        return path
Example #18
 def transform(self, X, y=None, n_components=None, print_progress=False):
     n = X.shape[0]
     if self.batch_size is None:
         batch_size = 12 * len(self.mean_)
     else:
         batch_size = self.batch_size
     batch_list = [(i, min(i + batch_size, n))
                   for i in range(0, n + batch_size, batch_size) if i < n]
     if print_progress:
         prog = Progbar(target=n)
     # ====== start transforming ====== #
     X_transformed = []
     for start, end in batch_list:
         x = super(MiniBatchPCA, self).transform(X=X[start:end], y=y)
         if n_components is not None:
             x = x[:, :n_components]
         X_transformed.append(x)
         if print_progress:
             prog.add(x.shape[0])
     return np.concatenate(X_transformed, axis=0)
Example #19
def _fitting_helper(it, fn, nb_samples, nb_classes, title):
  prog = Progbar(target=nb_samples, print_report=True,
                 print_summary=False, name=title)
  results = None
  start_time = time.time()
  for nb_iter, (x, y) in enumerate(it):
    # ====== preprocessing ====== #
    x, y = _preprocess_xy(x, y, nb_classes)
    # ====== post-processing results ====== #
    if results is None:
      results = list(fn(x, y))
    else:
      for idx, r in enumerate(fn(x, y)):
        results[idx] += r
    # ====== update progress ====== #
    prog.add(x.shape[0])
  duration = time.time() - start_time
  return (nb_iter + 1,
          duration,
          [r if isinstance(r, np.ndarray) else r / (nb_iter + 1)
           for r in results])
Example #20
 def _predict(self, X, f_pred):
   if not self.is_fitted:
     raise RuntimeError("LogisticRegression hasn't been initialized or "
                        "fitted.")
   if hasattr(X, 'set_batch'):
     it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
   elif hasattr(X, '__getitem__'):
     it = (X[start:end]
           for start, end in batching(batch_size=self.batch_size,
                                      n=X.shape[0]))
   else:
     raise ValueError("`X` must has attributes 'set_batch' or '__getitem__'")
   # ====== make prediction ====== #
   y = []
   prog = Progbar(target=X.shape[0], print_report=True,
                  print_summary=False, name="Predicting")
   for x in it:
     x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
     y.append(f_pred(x))
     prog.add(x.shape[0])
   return np.concatenate(y, axis=0)
Example #21
 def _extract_test_data(feat, label, utt_length):
   prog = Progbar(target=len(feeder_test),
                  print_summary=True, name="Preprocessing test set")
   X_test = defaultdict(list)
   for name, idx, X, y in feeder_test:
     # validate everything as expected
     assert fn_label(name) == np.argmax(y), name # label is right
     # save to list
     X_test[name].append((idx, X))
     prog.add(X.shape[0])
   # ====== create 1 array for data and dictionary for indices ====== #
   X_test_name = []
   X_test_data = []
   for name, X in X_test.items():
     X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                        axis=0).astype('float16')
     X_test_name += [name + '.%d' % i for i in range(len(X))]
     X_test_data.append(X)
   X_test_name = np.array(X_test_name)
   X_test_data = np.concatenate(X_test_data, axis=0)
   return X_test_name, X_test_data
Example #22
def _fitting_helper(it, fn, nb_samples, nb_classes, title):
    prog = Progbar(target=nb_samples,
                   print_report=True,
                   print_summary=False,
                   name=title)
    results = None
    start_time = time.time()
    for nb_iter, (x, y) in enumerate(it):
        # ====== preprocessing ====== #
        x, y = _preprocess_xy(x, y, nb_classes)
        # ====== post-processing results ====== #
        if results is None:
            results = list(fn(x, y))
        else:
            for idx, r in enumerate(fn(x, y)):
                results[idx] += r
        # ====== update progress ====== #
        prog.add(x.shape[0])
    duration = time.time() - start_time
    return (nb_iter + 1, duration, [
        r if isinstance(r, np.ndarray) else r / (nb_iter + 1) for r in results
    ])
Example #23
    def predict_proba(self, *args):
        self._auto_create_inputs(args)
        self._create_function()

        n = 0
        nb_samples = args[0].shape[0]
        batch_size = self._batch_size
        prediction = []
        prog = Progbar(target=nb_samples, title='Predicting')
        while n < nb_samples:
            end = min(n + batch_size, nb_samples)
            x = [i[n:end] for i in args]
            x = self._functions['pred'](*x)
            _min = np.min(x, axis=-1)[:, None]
            _max = np.max(x, axis=-1)[:, None]
            x = (x - _min) / (_max - _min)
            x = x / x.sum(-1)[:, None]
            prediction.append(x)
            n = end
            prog.update(n)

        return np.concatenate(prediction, axis=0)
Example #24
def make_prediction(feeder, title):
  prog = Progbar(target=len(feeder), print_summary=True, name=title)
  name_list = []
  y_pred = []
  y_true = []
  for name, idx, X, y in feeder.set_batch(batch_size=100000,
                                          batch_mode='file',
                                          seed=None, shuffle_level=0):
    name_list.append(name)

    y = np.argmax(y, axis=-1)
    assert len(np.unique(y)) == 1, name
    spk = label2spk[y[0]]
    assert spkid[name] == spk, name
    y_true.append(y)

    y_ = f_prob(X)
    y_pred.append(y_)

    assert len(y) == len(y_)
    prog.add(X.shape[0])
  evaluate_prediction(name_list, y_pred, y_true, title=title)
Example #25
 def _extract_test_data(feat, label, utt_length):
     prog = Progbar(target=len(feeder_test),
                    print_summary=True,
                    name="Preprocessing test set")
     X_test = defaultdict(list)
     for name, idx, X, y in feeder_test:
         # validate everything as expected
         assert fn_label(name) == np.argmax(y), name  # label is right
         # save to list
         X_test[name].append((idx, X))
         prog.add(X.shape[0])
     # ====== create 1 array for data and dictionary for indices ====== #
     X_test_name = []
     X_test_data = []
     for name, X in X_test.items():
         X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                            axis=0).astype('float16')
         X_test_name += [name + '.%d' % i for i in range(len(X))]
         X_test_data.append(X)
     X_test_name = np.array(X_test_name)
     X_test_data = np.concatenate(X_test_data, axis=0)
     return X_test_name, X_test_data
Example #26
def evaluate_latent(fn, feeder, title):
    y_true = []
    Z = []
    for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                           name=title,
                           print_report=True,
                           print_summary=False,
                           count_func=lambda x: x[-1].shape[0]):
        name = str(outputs[0])
        idx = int(outputs[1])
        data = outputs[2:]
        assert idx == 0
        y_true.append(name)
        Z.append(fn(*data))
    Z = np.concatenate(Z, axis=0)
    # ====== visualize spectrogram ====== #
    if Z.ndim >= 3:
        sample = np.random.choice(range(len(Z)), size=3, replace=False)
        spec = Z[sample.astype('int32')]
        y = [y_true[int(i)] for i in sample]
        plot_figure(nrow=6, ncol=6)
        for i, (s, tit) in enumerate(zip(spec, y)):
            s = s.reshape(len(s), -1)
            plot_spectrogram(s.T, ax=(1, 3, i + 1), title=tit)
    # ====== visualize each point ====== #
    # flattent to 2D
    Z = np.reshape(Z, newshape=(len(Z), -1))
    # tsne if necessary
    if Z.shape[-1] > 3:
        Z = fast_tsne(Z,
                      n_components=3,
                      n_jobs=8,
                      random_state=K.get_rng().randint(0, 10e8))
    # color and marker
    Z_color = [digit_color_map[i.split('_')[-1]] for i in y_true]
    Z_marker = [gender_marker_map[i.split('_')[1]] for i in y_true]
    plot_figure(nrow=6, ncol=20)
    for i, azim in enumerate((15, 60, 120)):
        plot_scatter(x=Z[:, 0],
                     y=Z[:, 1],
                     z=Z[:, 2],
                     ax=(1, 3, i + 1),
                     size=4,
                     color=Z_color,
                     marker=Z_marker,
                     azim=azim,
                     legend=legends if i == 1 else None,
                     legend_ncol=11,
                     fontsize=10,
                     title=title)
    plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
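
evaluate_latent above (and evaluate_feeder further down) shows a second style: instead of passing a fixed target, Progbar wraps an iterable directly and count_func decides how far each yielded item advances the bar. The sketch below is a minimal illustration of that style; toy_batches is a made-up generator standing in for feeder.set_batch(batch_mode='file'), and the import path is assumed.

import numpy as np
from odin.utils import Progbar  # assumed import path

def toy_batches():
  # stands in for feeder.set_batch(batch_mode='file'): yields (name, index, data)
  for _ in range(10):
    yield 'utt', 0, np.random.rand(32, 4)

for name, idx, X in Progbar(toy_batches(),
                            name="Demo over iterable",
                            print_report=True,
                            print_summary=False,
                            count_func=lambda x: x[-1].shape[0]):
  pass  # each yielded tuple advances the bar by X.shape[0]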
Example #27
def make_prediction(feeder, title):
    prog = Progbar(target=len(feeder), print_summary=True, name=title)
    name_list = []
    y_pred = []
    y_true = []
    for name, idx, X, y in feeder.set_batch(batch_size=100000,
                                            batch_mode='file',
                                            seed=None,
                                            shuffle_level=0):
        name_list.append(name)

        y = np.argmax(y, axis=-1)
        assert len(np.unique(y)) == 1, name
        spk = label2spk[y[0]]
        assert spkid[name] == spk, name
        y_true.append(y)

        y_ = f_prob(X)
        y_pred.append(y_)

        assert len(y) == len(y_)
        prog.add(X.shape[0])
    evaluate_prediction(name_list, y_pred, y_true, title=title)
Example #28
def make_dnn_prediction(functions, X, batch_size=256, title=''):
  return_list = True
  if not isinstance(functions, (tuple, list)):
    functions = [functions]
    return_list = False
  n_functions = len(functions)
  results = [[] for i in range(n_functions)]
  # ====== prepare progress bar ====== #
  n_samples = len(X)
  prog = Progbar(target=n_samples, print_summary=True,
                 name="Making prediction: %s" % str(title))
  # ====== for feeder ====== #
  if isinstance(X, F.Feeder):
    y_true = []
    for x, y in X.set_batch(batch_size=batch_size):
      for res, fn in zip(results, functions):
        res.append(fn(x))
      prog.add(x.shape[0])
      y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
    results = [np.concatenate(res, axis=0)
               for res in results]
    y_true = np.concatenate(y_true, axis=0)
    if return_list:
      return results, y_true
    return results[0], y_true
  # ====== for numpy array ====== #
  else:
    for start, end in batching(batch_size=batch_size, n=n_samples):
      y = X[start:end]
      for res, fn in zip(results, functions):
        res.append(fn(y))
      prog.add(end - start)
    results = [np.concatenate(res, axis=0)
               for res in results]
    if return_list:
      return results
    return results[0]
Example #29
 def _predict(self, X, f_pred):
     if not self.is_fitted:
         raise RuntimeError("LogisticRegression hasn't been initialized or "
                            "fitted.")
     if hasattr(X, 'set_batch'):
         it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
     elif hasattr(X, '__getitem__'):
         it = (X[start:end]
               for start, end in batching(batch_size=self.batch_size,
                                          n=X.shape[0]))
     else:
         raise ValueError(
             "`X` must have attribute 'set_batch' or '__getitem__'")
     # ====== make prediction ====== #
     y = []
     prog = Progbar(target=X.shape[0],
                    print_report=True,
                    print_summary=False,
                    name="Predicting")
     for x in it:
         x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
         y.append(f_pred(x))
         prog.add(x.shape[0])
     return np.concatenate(y, axis=0)
Example #30
    def test_pool_depool(self):
        X1 = K.placeholder(shape=(None, 12, 8, 25), name='X1')
        X2 = K.placeholder(shape=(None, 12, 8, 25, 18), name='X2')
        x1 = np.random.rand(13, 12, 8, 25)
        x2 = np.random.rand(13, 12, 8, 25, 18)
        prog = Progbar(target=2 * 2 * 2 * 3, print_report=True)

        def check_shape(s1, s2):
            self.assertEqual(tuple(s1),
                             tuple(s2),
                             msg="%s != %s" % (str(s1), str(s2)))

        for pool_size in (2, 3):
            for strides in (2, 3):
                # strides > window_shape not supported due to inconsistency
                # between CPU and GPU implementations
                if pool_size < strides:
                    prog.add(1)
                    continue
                for pad in ('valid', 'same'):
                    for transpose_mode in ('nn', 'pad_margin', 'repeat'):
                        # ====== print prog ====== #
                        prog['test'] = "Size:%d,Stride:%d,Pad:%s,T:%s" % \
                            (pool_size, strides, pad, transpose_mode)
                        prog.add(1)
                        # ====== check ops 4D ====== #
                        down = N.Pool(pool_size=pool_size,
                                      strides=strides,
                                      pad=pad,
                                      mode='max',
                                      transpose_mode=transpose_mode)
                        up = down.T
                        y1 = down(X1)
                        check_shape(
                            K.eval(y1, {
                                X1: x1
                            }).shape[1:],
                            y1.shape.as_list()[1:])
                        y2 = up(y1)
                        check_shape(K.eval(y2, {X1: x1}).shape, x1.shape)
                        # ====== check ops 5D ====== #
                        down = N.Pool(pool_size=pool_size,
                                      strides=strides,
                                      pad=pad,
                                      mode='max',
                                      transpose_mode=transpose_mode)
                        up = down.T
                        y1 = down(X2)
                        check_shape(
                            K.eval(y1, {
                                X2: x2
                            }).shape[1:], y1.shape[1:])
                        y2 = up(y1)
                        check_shape(K.eval(y2, {X2: x2}).shape, x2.shape)
Example #31
def validating_noise_data(in_path_raw):
    # preparing
    noise_dataset = ['musan', 'rirs']
    all_files = defaultdict(list)
    n_files = sum(
        len(sre_file_list[i]) for i in noise_dataset if i in sre_file_list)
    n_non_exist = 0
    n_exist = 0
    prog = Progbar(target=n_files,
                   print_summary=True,
                   name="Validating noise dataset")
    prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
    prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
    # check all dataset
    for ds_name in noise_dataset:
        if ds_name not in sre_file_list:
            continue
        if ds_name not in in_path_raw:
            continue
        base_path = in_path_raw[ds_name]
        base_ds = all_files[ds_name]
        # start validating
        for row in sre_file_list[ds_name]:
            # check file
            path, channel, name, noise_type, duration = row[:5]
            path = os.path.join(base_path, path)
            if os.path.exists(path):
                base_ds.append([path, channel, name, noise_type, duration])
                n_exist += 1
            else:
                n_non_exist += 1
            # update progress
            prog['ds'] = ds_name
            prog['#Exist'] = n_exist
            prog['#Non-exist'] = n_non_exist
            prog.add(1)
    # ====== return ====== #
    # Header:
    #  0       1      2         3         4
    # path, channel, name, noise_type, duration
    return {
        key: np.array(sorted(val, key=lambda x: x[0]))
        for key, val in all_files.items()
    }
Example #32
def validating_noise_data(in_path_raw):
  # preparing
  noise_dataset = ['musan', 'rirs']
  all_files = defaultdict(list)
  n_files = sum(len(sre_file_list[i])
                for i in noise_dataset
                if i in sre_file_list)
  n_non_exist = 0
  n_exist = 0
  prog = Progbar(target=n_files, print_summary=True,
                 name="Validating noise dataset")
  prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
  prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
  # check all dataset
  for ds_name in noise_dataset:
    if ds_name not in sre_file_list:
      continue
    if ds_name not in in_path_raw:
      continue
    base_path = in_path_raw[ds_name]
    base_ds = all_files[ds_name]
    # start validating
    for row in sre_file_list[ds_name]:
      # check file
      path, channel, name, noise_type, duration = row[:5]
      path = os.path.join(base_path, path)
      if os.path.exists(path):
        base_ds.append([path, channel, name, noise_type, duration])
        n_exist += 1
      else:
        n_non_exist += 1
      # update progress
      prog['ds'] = ds_name
      prog['#Exist'] = n_exist
      prog['#Non-exist'] = n_non_exist
      prog.add(1)
  # ====== return ====== #
  # Header:
  #  0       1      2         3         4
  # path, channel, name, noise_type, duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}
Example #33
def evaluate_feeder(feeder, title):
    y_true_digit = []
    y_true_gender = []
    y_pred = []
    for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                           name=title,
                           print_report=True,
                           print_summary=False,
                           count_func=lambda x: x[-1].shape[0]):
        name = str(outputs[0])
        idx = int(outputs[1])
        data = outputs[2:]
        assert idx == 0
        y_true_digit.append(f_digits(name))
        y_true_gender.append(f_genders(name))
        y_pred.append(f_pred(*data))
    # ====== post processing ====== #
    y_true_digit = np.array(y_true_digit, dtype='int32')
    y_true_gender = np.array(y_true_gender, dtype='int32')
    y_pred_proba = np.concatenate(y_pred, axis=0)
    y_pred_all = np.argmax(y_pred_proba, axis=-1).astype('int32')
    # ====== plotting for each gender ====== #
    plot_figure(nrow=6, ncol=25)
    for gen in range(len(genders)):
        y_true, y_pred = [], []
        for i, g in enumerate(y_true_gender):
            if g == gen:
                y_true.append(y_true_digit[i])
                y_pred.append(y_pred_all[i])
        if len(y_true) == 0:
            continue
        cm = confusion_matrix(y_true, y_pred, labels=range(len(digits)))
        plot_confusion_matrix(cm,
                              labels=digits,
                              fontsize=8,
                              ax=(1, 4, gen + 1),
                              title='[%s]%s' % (genders[gen], title))
    plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
Example #34
def make_dnn_prediction(functions, X, batch_size=256, title=''):
    return_list = True
    if not isinstance(functions, (tuple, list)):
        functions = [functions]
        return_list = False
    n_functions = len(functions)
    results = [[] for i in range(n_functions)]
    # ====== prepare progress bar ====== #
    n_samples = len(X)
    prog = Progbar(target=n_samples,
                   print_summary=True,
                   name="Making prediction: %s" % str(title))
    # ====== for feeder ====== #
    if isinstance(X, F.Feeder):
        y_true = []
        for x, y in X.set_batch(batch_size=batch_size):
            for res, fn in zip(results, functions):
                res.append(fn(x))
            prog.add(x.shape[0])
            y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
        results = [np.concatenate(res, axis=0) for res in results]
        y_true = np.concatenate(y_true, axis=0)
        if return_list:
            return results, y_true
        return results[0], y_true
    # ====== for numpy array ====== #
    else:
        for start, end in batching(batch_size=batch_size, n=n_samples):
            y = X[start:end]
            for res, fn in zip(results, functions):
                res.append(fn(y))
            prog.add(end - start)
        results = [np.concatenate(res, axis=0) for res in results]
        if return_list:
            return results
        return results[0]
Example #35
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" % str(type(ds)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (included the features in external_indices
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Zero length in indices"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
Example #36
  def run(self):
    njobs = len(self.jobs)
    dataset = Dataset(self.path)
    if self.n_cache <= 1:
      cache_limit = max(2, int(0.12 * njobs))
    else:
      cache_limit = int(self.n_cache)
    # ====== indices ====== #
    databases = defaultdictkey(lambda key:
        MmapDict(path=os.path.join(dataset.path, key), cache_size=10000,
                 read_only=False))
    last_start = defaultdict(int)
    # ====== statistic ====== #
    # load old statistics
    stats = defaultdict(lambda: [0, 0]) # name -> (sum1, sum2)
    for key in dataset.keys():
      if 'sum1' == key[-4:]:
        stats[key[:-4]][0] = dataset[key][:]
      elif 'sum2' == key[-4:]:
        stats[key[:-4]][1] = dataset[key][:]
    # all data are cached for periodically flushed
    cache = defaultdict(list)
    n_processed = [0] # store the value as reference

    # ====== helper ====== #
    def flush_feature(feat_name, X_cached):
      if len(X_cached) > 0:
        X_cached = np.concatenate(X_cached, 0)
        # flush data
        if feat_name in dataset:
          dataset[feat_name].append(X_cached)
        else:
          dataset[(feat_name, 'memmap')] = X_cached

    # ====== repeated for each result returned ====== #
    def post_processing(result):
      # search for file name
      if self.identifier not in result:
        raise RuntimeError(
            "Cannot find identifier '%s' in returned dictionary" % self.identifier)
      file_name = result[self.identifier]
      # invalid file_name
      if not is_string(file_name):
        raise RuntimeError("Cannot find file name in returned features "
            "list, the file name can be specified in key: 'name', 'path' "
            "and the type of the value must be string. All available "
            "keys are: %s" % str(result.keys()))
      # store all new indices
      # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
      all_indices = {}
      # processing
      for feat_name, X in result.items():
        # some invalid feat_name
        if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
          raise RuntimeError("Returned features' name cannot be one "
                             "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
        # ignore some feat_name
        if feat_name in ('name',):
          continue
        # if numpy ndarray, save to MmapData
        if isinstance(X, np.ndarray) or \
        'sum1' == feat_name[-4:] or \
        'sum2' == feat_name[-4:]:
          # save statistics instead
          if 'sum1' == feat_name[-4:]:
            stats[feat_name[:-4]][0] += X
          elif 'sum2' == feat_name[-4:]:
            stats[feat_name[:-4]][1] += X
          # save features array
          else:
            all_indices[feat_name] = X.shape[0]
            # cache data, only if we have more than 0 sample
            if X.shape[0] > 0:
              cache[feat_name].append(X)
        # else all other kind of data save to MmapDict
        else:
          databases[feat_name][file_name] = X
        # remove data
        del X
      # ====== update indices ====== #
      if len(all_indices) > 0:
        for feat_name, n in all_indices.items():
          ids_name = 'indices_%s' % feat_name
          databases[ids_name][file_name] = (last_start[ids_name],
                                            last_start[ids_name] + n)
          last_start[ids_name] += n
      # ====== flush cache ====== #
      n_processed[0] += 1
      if n_processed[0] % cache_limit == 0: # 12 + 8
        for feat_name, X_cached in cache.items():
          flush_feature(feat_name, X_cached)
        cache.clear()
      # ====== update progress ====== #
      return file_name

    # ====== mapping function ====== #
    def _map_func(dat):
      try:
        ret = self.extractor.transform(dat)
      except Exception as e: # Non-handled exception
        ret = '\n========\n'
        ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
        ret += 'Error : `%s`\n' % str(e)
        ret += 'Input : `%s`\n' % str(dat)
        import traceback
        etype, value, tb = sys.exc_info()
        for line in traceback.TracebackException(
                type(value), value, tb, limit=None).format(chain=True):
          ret += line
      return ret
    # ====== processing ====== #
    mpi = MPI(jobs=self.jobs,
              func=_map_func,
              ncpu=self.n_cpu,
              batch=1,
              hwm=self.n_cpu * 3,
              backend='python')
    # initialize
    prog = Progbar(target=njobs, name=self.path,
                   interval=0.12, print_report=True, print_summary=True)
    start_time = time.time()
    last_time = time.time()
    last_count = 0
    with open(self._log_path, 'w') as flog:
      # writing the log head
      flog.write('============================\n')
      flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False))
      flog.write('Outpath    : %s\n' % self.path)
      flog.write('Extractor  : %s\n' % '->'.join([s[-1].__class__.__name__
                                                  for s in self.extractor.steps]))
      flog.write('#Jobs      : %d\n' % njobs)
      flog.write('#CPU       : %d\n' % self.n_cpu)
      flog.write('#Cache     : %d\n' % cache_limit)
      flog.write('============================\n')
      flog.flush()
      # start processing the file list
      for count, result in enumerate(mpi):
        # Non-handled exception
        if isinstance(result, string_types):
          flog.write(result)
          flog.flush()
          self._error_log.append(result)
          if self.stop_on_failure:
            raise RuntimeError(result)
        # some error might have happened
        elif isinstance(result, ExtractorSignal):
          flog.write(str(result)); flog.flush()
          if result.action == 'error':
            prog.add_notification(str(result))
            raise RuntimeError("ExtractorSignal requests terminating processor!")
          elif result.action == 'warn':
            prog.add_notification(str(result))
          elif result.action == 'ignore':
            self._error_log.append(result)
          else:
            raise RuntimeError("Unknown action from ExtractorSignal: %s" % result.action)
          prog['File'] = '%-48s' % result.message[:48]
        # otherwise, no error happened, do post-processing
        else:
          name = post_processing(result)
          prog['File'] = '%-48s' % str(name)[:48]
        # update progress
        prog.add(1)
        # manually write to external log file
        if (count + 1) % max(1, int(0.01 * njobs)) == 0:
          curr_time = time.time()
          elap = curr_time - start_time
          avg_speed = (count + 1) / elap
          cur_speed = (count + 1 - last_count) / (curr_time - last_time)
          avg_est = (njobs - count - 1) / avg_speed
          cur_est = (njobs - count - 1) / cur_speed
          flog.write('[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                     '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                     '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                     (get_formatted_datetime(only_number=False),
                      count + 1, njobs - count - 1, elap,
                      avg_speed, avg_est,
                      cur_speed, cur_est))
          flog.flush()
          last_time = curr_time
          last_count = count + 1
    # ====== end, flush the last time ====== #
    for feat_name, X_cached in cache.items():
      flush_feature(feat_name, X_cached)
    cache.clear()
    cache = None
    dataset.flush()
    prog.add_notification("Flushed all data to disk")
    # ====== saving indices ====== #
    for name, db in databases.items():
      db.flush(save_all=True)
      db_size = len(db)
      db.close()
      prog.add_notification('Flush MmapDict "%s" to disk, size: %s' %
                            (ctext(name, 'yellow'),
                             ctext(str(db_size), 'yellow')))

    # ====== save mean and std ====== #
    def save_mean_std(sum1, sum2, name):
      N = dataset[name.split('_')[0]].shape[0]
      mean = sum1 / N
      std = np.sqrt(sum2 / N - np.power(mean, 2))
      if np.any(np.isnan(mean)):
        wprint('Mean contains NaN, name: %s' % name)
      if np.any(np.isnan(std)):
        wprint('Std contains NaN, name: %s' % name)
      dataset[name + 'sum1'] = sum1
      dataset[name + 'sum2'] = sum2
      dataset[name + 'mean'] = mean
      dataset[name + 'std'] = std
    # save all stats
    if len(stats) > 0:
      for feat_name, (sum1, sum2) in stats.items():
        save_mean_std(sum1, sum2, feat_name)
        prog.add_notification('Saved statistics of: %s, shape: %s' %
                              (ctext(feat_name.split('_')[0], 'yellow'),
                               ctext(str(sum1.shape), 'yellow')))
    # ====== dataset flush() ====== #
    dataset.flush()
    dataset.close()
    # ====== saving the extractor ====== #
    # not a good idea to save the extractor every time
    # pipeline_path = os.path.join(dataset.path, 'pipeline')
    # with open(pipeline_path, 'wb') as f:
    #   cPickle.dump(self.extractor, f, protocol=2)
    # prog.add_notification("Saved Extractor pipeline at: %s" %
    #                       ctext(pipeline_path, 'yellow'))
    # ====== saving the configuration ====== #
    config_path = os.path.join(dataset.path, 'config')
    config = MmapDict(config_path)
    config['__configuration_time__'] = time.time()
    config['__processor__'] = self.path
    for i in dir(self):
      if _default_module.match(i) is not None:
        continue
      j = getattr(self, i)
      if isinstance(j, (Number, string_types, bool)):
        config[i] = j
    config.flush(save_all=True)
    self.config = {i: j
                   for i, j in config}
    config.close()
    prog.add_notification("Saved configuration at: %s" %
                          ctext(config_path, 'yellow'))
    # ====== final notification ====== #
    prog.add_notification("Closed all dataset.")
    prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
Exemple #37
0
  def save_cache(self, path, name=None, dtype=None, batch_size=1024):
    """ Save all preprocessed data to a Dataset

    Parameters
    ----------
    path: string
        path to a folder
    name: None, or list of string
        specific name for each returned `numpy.ndarray` during iteration
    dtype: None, or list of dtype, or single dtype
        specific dtype for all or each of returned `numpy.ndarray`
        during iteration
    batch_size: int
        number of samples per batch (larger batches iterate faster)

    Note
    ----
    Only returned `numpy.ndarray` values are saved
    """
    from odin.fuel.dataset import Dataset
    if not is_string(path):
      raise ValueError("`path` must be string path to a folder.")
    if os.path.exists(path) and os.path.isfile(path):
      raise ValueError("`path` is a file, required a folder for "
                       "saving all cache data.")
    # ====== start caching ====== #
    prog = Progbar(target=len(self),
                   name='Saving cache of preprocessed data',
                   print_report=True, print_summary=True)
    ds = Dataset(path, override=True)
    with self.set_batch_context(batch_size=int(batch_size), seed=None,
                                start=0, end=-1, shuffle_level=0):
      for X in self:
        if not isinstance(X, (tuple, list)):
          X = (X,)
        n = 0
        i = 0
        # saving preprocessed data
        for x in X:
          if isinstance(x, np.ndarray):
            # checking name
            if name is None:
              x_name = 'X%d' % i
            else:
              x_name = name[i]
            # checking dtype
            if isinstance(dtype, (tuple, list)):
              x = x.astype(dtype[i])
            elif dtype is not None:
              x = x.astype(dtype)
            # saving to the dataset
            if x_name in ds:
              ds[x_name].append(x)
            else:
              ds[(x_name, 'memmap')] = x
            # update samples count, and data count
            n = x.shape[0]
            i += 1
        # print progress
        prog.add(n)
    # ====== flush and close everything ====== #
    ds.flush()
    ds.close()
    with open(os.path.join(path, 'README'), 'w') as f:
      f.write(str(self))
    # end
    # ====== check one more time ====== #
    ds = Dataset(path, read_only=True)
    print(ds)
    print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
    ds.close()
    return self
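
For reference, a call to the method above might look like the following sketch; `feeder` stands for whatever object exposes `save_cache` (e.g. a Feeder yielding `(X, y)` pairs per iteration), and the path and names are invented:

# cache the two arrays yielded per iteration under the names 'X' and 'y',
# cast both to float32, and read 2048 samples per batch while caching
feeder.save_cache(path='/tmp/cached_dataset',
                  name=['X', 'y'],
                  dtype='float32',
                  batch_size=2048)
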
Exemple #38
0
  def copy(self, destination,
           indices_filter=None, data_filter=None,
           override=False):
    """ Copy the dataset to a new folder and closed
    the old dataset

    """
    from distutils.dir_util import copy_tree
    read_only = self.read_only
    # indices
    if indices_filter is not None and \
    not is_callable(indices_filter) and \
    not isinstance(indices_filter, (tuple, list)):
      raise ValueError('`indices_filter` must be callable, tuple, list or None')
    if isinstance(indices_filter, (tuple, list)):
      tmp = tuple(indices_filter)
      indices_filter = lambda x: x in tmp
    # data name
    if data_filter is not None and \
    not is_callable(data_filter) and \
    not isinstance(data_filter, (tuple, list)):
      raise ValueError('`data_filter` must be callable, tuple, list or None')
    if isinstance(data_filter, (tuple, list)):
      tmp = tuple(data_filter)
      data_filter = lambda x: x in tmp
    # ====== other files which are not Data ====== #
    other_files = [i for i in os.listdir(self.path)
                   if i not in self]
    # ====== preprocessing ====== #
    destination = os.path.abspath(str(destination))
    if not os.path.exists(destination):
      os.mkdir(destination)
    elif not os.path.isdir(destination):
      raise ValueError('path at "%s" must be a folder' % destination)
    elif override:
      shutil.rmtree(destination)
      os.mkdir(destination)
    else:
      raise ValueError("A folder exist at path: '%s', cannot be overrided." %
                       destination)
    # ====== copy everything ====== #
    if indices_filter is None and data_filter is None:
      print("Copying %s files from '%s' to '%s' ..." %
        (ctext(len(self), 'cyan'),
         ctext(self.path, 'yellow'),
         ctext(destination, 'yellow')))
      copy_tree(self.path, destination)
    # ====== only data_filter ====== #
    elif indices_filter is None:
      data_list = [i for i in self.keys() if data_filter(i)]
      # copy all the data
      for name in data_list:
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        print("Copying from '%s' to '%s' ..." %
              (ctext(org_path, 'yellow'),
               ctext(dst_path, 'yellow')))
        shutil.copy2(org_path, dst_path)
      # copy all the related indices
      for name in self.keys():
        org_path = os.path.join(self.path, name)
        dst_path = os.path.join(destination, name)
        if not os.path.exists(dst_path) and \
        ('indices' == name or any(i in data_list for i in name.split('_')[1:])):
          print("Copying Indices from '%s' to '%s'" % (ctext(org_path, 'cyan'),
                                                       ctext(dst_path, 'cyan')))
          shutil.copy2(org_path, dst_path)
    # ====== use indices_filter and data_filter ====== #
    else:
      if data_filter is None:
        all_data = list(self.keys())
      else:
        all_data = [i for i in self.keys()
                    if data_filter(i)]
      # list of data with separated indices
      separated_data = flatten_list(
          [k.split('_')[1:] for k in self.keys()
         if 'indices_' == k[:8]])
      # iterate over indices and copy one by one data
      for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
        indices = [(n, (s, e))
                   for n, (s, e) in self[ids_name]
                   if indices_filter(n)]
        # no match indices, skip
        if len(indices) == 0:
          continue
        nb_samples = sum(e - s for n, (s, e) in indices)
        # get all data assigned to given indices
        data = ids_name.split('_')[1:]
        if len(data) == 0:
          data = [i for i in all_data if i not in separated_data]
        else:
          data = [i for i in data if i in all_data]
        # if still no data found, skip
        if len(data) == 0:
          continue
        # copy each data
        for data_name in data:
          X = self[data_name]
          # copy big MmapDict
          if isinstance(X, MmapDict) and len(X) == len(self[ids_name]):
            new_path = os.path.join(destination, os.path.basename(X.path))
            print("Copying MmapDict from '%s' to '%s'" % (
                ctext(X.path, 'cyan'),
                ctext(new_path, 'cyan')))
            new_dict = MmapDict(new_path, cache_size=80000, read_only=False)
            for n, (s, e) in indices:
              new_dict[n] = X[n]
            new_dict.flush(save_all=True)
            new_dict.close()
          # copy MmapData
          elif isinstance(X, MmapData):
            Y = MmapData(path=os.path.join(destination, data_name),
                         dtype=X.dtype, shape=(0,) + X.shape[1:],
                         read_only=False)
            prog = Progbar(target=nb_samples,
                           print_report=True, print_summary=True,
                           name="Copying data: '%s' to path:'%s'" %
                           (ctext(data_name, 'yellow'),
                            ctext(Y.data_info, 'cyan')))
            for n, (s, e) in indices:
              Y.append(X[s:e])
              prog.add(e - s)
          # unknown data-type
          else:
            org_path = os.path.join(self.path, data_name)
            new_path = os.path.join(destination, data_name)
            # just copy directly the files
            if os.path.isfile(org_path) or \
            not os.path.exists(new_path):
              shutil.copy2(org_path, new_path)
              print("Copying '%s' to '%s' ..." %
                (ctext(org_path, 'cyan'), ctext(new_path, 'yellow')))
            else:
              wprint("Cannot copy: '%s' - %s" %
                (ctext(data_name, 'cyan'),
                 ctext(type(self[data_name]), 'yellow')))
        # copy the indices
        new_indices = MmapDict(os.path.join(destination, ids_name),
                               cache_size=80000, read_only=False)
        start = 0
        for n, (s, e) in indices:
          size = e - s
          new_indices[n] = (start, start + size)
          start += size
        new_indices.flush(save_all=True)
        new_indices.close()
    # ====== copy others files ====== #
    for f in other_files:
      org_path = os.path.join(self.path, f)
      dst_path = os.path.join(destination, f)
      if not os.path.exists(dst_path):
        if os.path.isdir(org_path): # directory
          copy_tree(org_path, dst_path)
        else: # single file
          shutil.copy2(org_path, dst_path)
    # ====== readme ====== #
    readme_name = os.path.basename(self._readme_path)
    dst_path = os.path.join(destination, readme_name)
    if not os.path.exists(dst_path):
      shutil.copy2(self._readme_path, dst_path)
    return Dataset(destination, read_only=read_only)
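
A hypothetical use of this `copy` method, assuming `ds` is an opened Dataset that contains an 'mspec' feature (the path, data name, and name prefix below are made up):

# keep only the 'mspec' data (and its indices) and only the utterances
# whose name starts with 'sw0', writing everything into a fresh folder
new_ds = ds.copy('/tmp/swb_subset',
                 indices_filter=lambda name: name.startswith('sw0'),
                 data_filter=('mspec',),
                 override=True)
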
Exemple #39
0
def filter_utterances(X, indices, spkid,
                      min_dur=None, min_utt=None,
                      remove_min_length=True, remove_min_uttspk=True,
                      n_speakers=None, ncpu=None, save_path=None,
                      title=''):
  """
  X : 2-D matrix
    input features

  indices : Mapping
    utterance_name -> (start, end) in `X`

  spkid : Mapping
    utterance_name -> speaker_id

  remove_min_length : bool (default: True)
    if True, remove all files shorter than MINIMUM_UTT_DURATION

  remove_min_uttspk : bool (default: True)
    if True, remove all speakers with fewer utterances than
    MINIMUM_UTT_PER_SPEAKERS

  n_speakers : {None, int} (default: None)
    if given, downsample the dataset to the given number of speakers

  save_path : {None, str} (default: None)
    if given, pickle all filtered files to disk

  """
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS

  minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
  save_data = {}

  prog = Progbar(target=len(indices),
                 print_report=True, print_summary=True,
                 name='Filtering broken utterances: %s' % title)
  prog.set_summarizer('zero-length', fn=lambda x: x[-1])
  prog.set_summarizer('min-frames', fn=lambda x: x[-1])
  prog.set_summarizer('zero-var', fn=lambda x: x[-1])
  prog.set_summarizer('small-var', fn=lambda x: x[-1])
  prog.set_summarizer('overflow', fn=lambda x: x[-1])

  # ====== mpi function for checking ====== #
  @nb.jit(nopython=True, nogil=True)
  def _fast_mean_var_ax0(z):
    # using this function for calculating mean and variance
    # can double the speed but cannot check overflow,
    # only accept float32 or float64 input
    s1 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    s2 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    for i in range(z.shape[0]):
      s1 += z[i]
      s2 += np.power(z[i], 2)
    mean = s1 / z.shape[0]
    var = s2 / z.shape[0] - np.power(mean, 2)
    return mean, var

  def _mpi_func(jobs):
    for name, (start, end) in jobs:
      y = X[start:end]
      # flags
      is_zero_len = False
      is_zero_var = False
      is_small_var = False
      is_min_frames = False
      is_overflow = False
      # checking length
      if y.shape[0] == 0:
        is_zero_len = True
      elif y.shape[0] < minimum_amount_of_frames:
        is_min_frames = True
      # checking statistics
      else:
        with catch_warnings_error(RuntimeWarning):
          try:
            # mean = np.mean(y, axis=-1)
            var = np.var(y, axis=-1)
            # min_val = np.min(y, axis=-1)
            # max_val = np.max(y, axis=-1)
          # numerically unstable
          except RuntimeWarning as w:
            if 'overflow encountered' in str(w):
              is_overflow = True
            else:
              print(name, ':', w)
          # process with more numerical filtering
          else:
            if np.any(np.isclose(var, 0)):
              is_zero_var = True
            # very heuristic and aggressive here
            # filter out anything where more than ~16.67% of frames have low variance
            # this could remove 1/3 of the original data
            if np.sum(var < 0.01) > (len(y) / 6):
              is_small_var = True
      # return the flags
      yield (name, is_zero_len, is_min_frames,
             is_zero_var, is_small_var,
             is_overflow)
  # ====== running the multiprocessing filter ====== #
  zero_len_files = {}
  min_frame_files = {}
  zero_var_files = {}
  small_var_files = {}
  overflow_files = {}
  for res in mpi.MPI(jobs=sorted(indices.items(),
                                 key=lambda x: x[1][0]),
                     func=_mpi_func,
                     ncpu=NCPU if ncpu is None else int(ncpu),
                     batch=250):
    name = res[0]
    if res[1]: zero_len_files[name] = 1
    if res[2]: min_frame_files[name] = 1
    if res[3]: zero_var_files[name] = 1
    if res[4]: small_var_files[name] = 1
    if res[5]: overflow_files[name] = 1
    # update progress
    prog['name'] = name[:48]
    prog['zero-length'] = len(zero_len_files)
    prog['min-frames'] = len(min_frame_files)
    prog['zero-var'] = len(zero_var_files)
    prog['small-var'] = len(small_var_files)
    prog['overflow'] = len(overflow_files)
    prog.add(1)
  # ====== remove broken files ====== #
  if not bool(remove_min_length):
    min_frame_files = {}
  new_indices = {name: (start, end)
                 for name, (start, end) in indices.items()
                 if name not in zero_len_files and
                 name not in min_frame_files and
                 name not in zero_var_files and
                 name not in small_var_files and
                 name not in overflow_files}
  print("Filtered #utterances: %s/%s (files)" %
    (ctext(len(indices) - len(new_indices), 'lightcyan'),
     ctext(len(indices), 'cyan')))
  indices = new_indices
  # ====== store save data ====== #
  save_data['zero_len'] = zero_len_files
  save_data['min_dur'] = min_frame_files
  save_data['zero_var'] = zero_var_files
  save_data['small_var'] = small_var_files
  save_data['overflow'] = overflow_files
  # ====== filter-out by number of utt-per-speaker ====== #
  if bool(remove_min_uttspk):
    spk2utt = defaultdict(list)
    for name in indices.keys():
      spk2utt[spkid[name]].append(name)

    n_utt_removed = 0
    n_spk_removed = 0
    removed_utt = []
    keep_utt = []
    for spk, utt in spk2utt.items():
      if len(utt) < min_utt:
        n_utt_removed += len(utt)
        n_spk_removed += 1
        removed_utt += utt
      else:
        keep_utt += utt

    removed_utt = set(removed_utt)
    keep_utt = set(keep_utt)
    save_data['min_utt'] = removed_utt

    print("Removed min-utt/spk:  %s/%s(utt)  %s/%s(spk)" % (
        ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
        ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')
    ))
    assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"

    indices = {name: (start, end)
               for name, (start, end) in indices.items()
               if name in keep_utt}
  # ====== sample by number of speakers ====== #
  if isinstance(n_speakers, Number) and n_speakers > 0:
    spk2utt = defaultdict(list)
    for name, (start, end) in indices.items():
      spk2utt[spkid[name]].append((name, (start, end)))

    n_org_spk = len(spk2utt)
    n_org_ids = len(indices)
    # only need down-sampling with smaller number of speaker
    if n_speakers < n_org_spk:
      rand = np.random.RandomState(seed=Config.SUPER_SEED)
      tmp = list(spk2utt.keys())
      rand.shuffle(tmp)
      sampled_spk = tmp[:n_speakers]

      indices = []
      for spk in sampled_spk:
        indices += spk2utt[spk]
      indices = dict(indices)
    else:
      sampled_spk = spk2utt
    # print some log
    print("Selected: %s/%s(spk) which have %s/%s(utt)" % (
        ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
        ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')
    ))
  # ====== return the new indices ====== #
  if save_path is not None:
    try:
      with open(save_path, 'wb') as save_file:
        pickle.dump(save_data, save_file)
    except Exception as e:
      print("Cannot save filtering data to path: '%s', error: '%s'" %
        (save_path, str(e)))
  return indices
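
The per-utterance checks inside `_mpi_func` reduce to a few NumPy tests on each slice of `X`. A standalone sketch of the same flags, with an invented `min_frames` threshold in place of `minimum_amount_of_frames`:

import numpy as np

def check_utterance(y, min_frames=100):
  """Simplified version of the checks above; returns the first failing flag."""
  if y.shape[0] == 0:
    return 'zero-length'
  if y.shape[0] < min_frames:
    return 'min-frames'
  var = np.var(y, axis=-1)                  # per-frame variance
  if np.any(np.isclose(var, 0)):
    return 'zero-var'
  if np.sum(var < 0.01) > (len(y) / 6):     # > ~16.67% low-variance frames
    return 'small-var'
  return 'ok'

print(check_utterance(np.random.randn(500, 40)))   # 'ok' (almost surely)
print(check_utterance(np.zeros((500, 40))))        # 'zero-var'
print(check_utterance(np.random.randn(20, 40)))    # 'min-frames'
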
Exemple #40
0
 def __call__(self, *inputs, **kwargs):
   show_progress = kwargs.pop('show_progress', False)
   # dictionary as inputs
   if len(kwargs) == len(self.inputs_name):
     inputs = [kwargs[i] for i in self.inputs_name]
   # ====== delete un-matched inputs ====== #
   inputs_new = []
   tmp = list(inputs)
   shapes = list(self._input_shape)
   # this process iteratively remove inputs with mismatch shape
   # to current given input
   for s in shapes:
     for i in tuple(tmp):
       if len(i.shape) != len(s) or \
       any(a is not None and a > 0 and a != b
               for a, b in zip(s, i.shape)): # different ndim, or shape
         tmp.remove(i)
       else:
         inputs_new.append(i)
         tmp.remove(i)
         break
   if len(inputs_new) != len(self.inputs):
     raise ValueError("Given inputs have shape: %s, cannot match the shape of "
                      "defined inputs: %s" %
                      ('; '.join([str(i.shape) for i in inputs]),
                       '; '.join([str(i) for i in self.input_shape])))
   if not self._strict:
     inputs = inputs_new
   # ====== create feed_dict ====== #
   feed_dict = {}
   inputs = flatten_list(inputs, level=None)
   for tensor, value in zip(self.inputs, inputs):
     feed_dict[tensor] = value
   feed_dict.update(self.defaults)
   # check if modifying training mode
   if self.training is None:
     pass
   elif self.training:
     feed_dict.update({is_training(): True})
   else:
     feed_dict.update({is_training(): False})
   session = get_session()
   outputs = None
   # ====== mini-batches ====== #
   if self.batch_size is not None:
     batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                   if len(self.batch_vars) == 0 else self.batch_vars)
     batch_vars = [i for i in batch_vars
                   if i in feed_dict and hasattr(feed_dict[i], 'shape')]
     n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
     assert len(n_samples) == 1, \
     "Data have multiple batching dimension: %s" % str(n_samples)
     n_samples = n_samples[0]
     # only continue if we have more samples than `batch_size`
     if n_samples > self.batch_size:
       n_output = len(self.outputs)
       outputs = []
       all_batches = []
       # (optional) showing progress
       if show_progress:
         prog = Progbar(target=n_samples,
                        print_report=False, print_summary=False,
                        name='')
       for s, e in batching(batch_size=int(self.batch_size),
                            n=n_samples):
         if show_progress:
           prog.add(e - s)
         all_batches.append(e - s)
         feed_dict_minibatch = OrderedDict([(k, v[s:e])
                                            if k in batch_vars else (k, v)
                                            for k, v in feed_dict.items()])
         updated = session.run(self.outputs + [self.updates_ops],
                               feed_dict=feed_dict_minibatch)
         updated = updated[:n_output]
         if not self._return_list:
           updated = updated[0]
         outputs.append(updated)
       ## concatenate all outputs
       if not self._return_list:
         o_ndim = outputs[0].ndim
         if o_ndim == 0: # returned scalars
           outputs = np.array(outputs)
         else: # returned array
           for o_axis in range(o_ndim):
             all_n = [o.shape[o_axis] for o in outputs]
             if all_n == all_batches:
               break
           outputs = np.concatenate(outputs, axis=o_axis)
       ## returning a list of outputs
       else:
         new_outputs = []
         for output_idx in range(len(outputs[0])):
           o = [x[output_idx] for x in outputs]
           o_ndim = o[0].ndim
           if o_ndim == 0: # returned scalars
             o = np.array(o)
           else: # returned array
             for o_axis in range(o[0].ndim):
               all_n = [val.shape[o_axis] for val in o]
               if all_n == all_batches:
                 break
             o = np.concatenate(o, axis=o_axis)
           new_outputs.append(o)
         outputs = new_outputs
   # ====== single batch ====== #
   if outputs is None:
     updated = session.run(self.outputs + [self.updates_ops],
                           feed_dict=feed_dict)
     outputs = updated[:len(self.outputs)]
     if not self._return_list:
       outputs = outputs[0]
   # ====== return final output ====== #
   return outputs
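
The mini-batch branch above has to find which axis of each returned array is the batch axis before concatenating; it does so by matching per-axis sizes against the recorded mini-batch sizes. A standalone NumPy sketch of that reassembly step, with invented shapes:

import numpy as np

all_batches = [32, 32, 11]                         # sizes of the processed mini-batches
outputs = [np.ones((n, 8)) for n in all_batches]   # pretend per-batch outputs

o_ndim = outputs[0].ndim
for o_axis in range(o_ndim):
  all_n = [o.shape[o_axis] for o in outputs]
  if all_n == all_batches:                         # this axis grows with the batches
    break
merged = np.concatenate(outputs, axis=o_axis)
print(merged.shape)                                # (75, 8)
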
Exemple #41
0
def prepare_dnn_data(save_dir, feat_name=None,
                     utt_length=None, seq_mode=None,
                     min_dur=None, min_utt=None,
                     exclude=None, train_proportion=None,
                     return_dataset=False):
  assert os.path.isdir(save_dir), \
      "Path to '%s' is not a directory" % save_dir
  if feat_name is None:
    feat_name = FEATURE_NAME
  if utt_length is None:
    utt_length = int(_args.utt)
  if seq_mode is None:
    seq_mode = str(_args.seq).strip().lower()
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS
  if exclude is None:
    exclude = str(_args.exclude).strip()
  print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
  print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
  # ******************** prepare dataset ******************** #
  path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
  assert os.path.exists(path), "Cannot find acoustic dataset at path: %s" % path
  ds = F.Dataset(path=path, read_only=True)
  rand = np.random.RandomState(seed=Config.SUPER_SEED)
  # ====== find the right feature ====== #
  assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
  X = ds[feat_name]
  ids_name = 'indices_%s' % feat_name
  assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
  # ====== basic path ====== #
  path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
  path_train_files = os.path.join(save_dir, 'train_files.pkl')
  path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
  # ******************** cannot find cached data ******************** #
  if any(not os.path.exists(p) for p in [path_filtered_data,
                                         path_train_files,
                                         path_speaker_info]):
    # ====== exclude some dataset ====== #
    if len(exclude) > 0:
      exclude_dataset = {i: 1 for i in exclude.split(',')}
      print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
      indices = {name: (start, end)
                 for name, (start, end) in ds[ids_name].items()
                 if ds['dsname'][name] not in exclude_dataset}
      # special case exclude all the noise data
      if 'noise' in exclude_dataset:
        indices = {name: (start, end)
                   for name, (start, end) in indices.items()
                   if '/' not in name}
    else:
      indices = {i: j for i, j in ds[ids_name].items()}
    # ====== down-sampling if necessary ====== #
    if _args.downsample > 1000:
      dataset2name = defaultdict(list)
      # ordering the indices so we sample the same set every time
      for name in sorted(indices.keys()):
        dataset2name[ds['dsname'][name]].append(name)
      n_total_files = len(indices)
      n_sample_files = int(_args.downsample)
      # get the percentage of each dataset
      dataset2per = {i: len(j) / n_total_files
                     for i, j in dataset2name.items()}
      # sampling based on percentage
      _ = {}
      for dsname, flist in dataset2name.items():
        rand.shuffle(flist)
        n_dataset_files = int(dataset2per[dsname] * n_sample_files)
        _.update({i: indices[i]
                  for i in flist[:n_dataset_files]})
      indices = _
    # ====== * filter out "bad" sample ====== #
    indices = filter_utterances(X=X, indices=indices, spkid=ds['spkid'],
                                min_utt=min_utt, min_dur=min_dur,
                                remove_min_length=True,
                                remove_min_uttspk=True,
                                n_speakers=None, ncpu=None,
                                save_path=path_filtered_data)
    # ====== all training file name ====== #
    # modify here to train full dataset
    all_name = sorted(indices.keys())
    rand.shuffle(all_name); rand.shuffle(all_name)
    n_files = len(all_name)
    print("#Files:", ctext(n_files, 'cyan'))
    # ====== speaker mapping ====== #
    name2spk = {name: ds['spkid'][name]
                for name in all_name}
    all_speakers = sorted(set(name2spk.values()))
    spk2label = {spk: i
                 for i, spk in enumerate(all_speakers)}
    name2label = {name: spk2label[spk]
                  for name, spk in name2spk.items()}
    assert len(name2label) == len(all_name)
    print("#Speakers:", ctext(len(all_speakers), 'cyan'))
    # ====== stratify sampling based on speaker ====== #
    valid_name = []
    # create speakers' cluster
    label2name = defaultdict(list)
    for name, label in sorted(name2label.items(),
                              key=lambda x: x[0]):
      label2name[label].append(name)
    # for each speaker with >= 3 utterances
    for label, name_list in sorted(label2name.items(),
                                   key=lambda x: x[0]):
      if len(name_list) < 3:
        continue
      n = max(1, int(0.05 * len(name_list))) # 5% for validation
      valid_name += rand.choice(a=name_list, size=n, replace=False).tolist()
    # train list is the rest
    _ = set(valid_name)
    train_name = [i for i in all_name if i not in _]
    # ====== split training and validation ====== #
    train_indices = {name: indices[name] for name in train_name}
    valid_indices = {name: indices[name] for name in valid_name}
    # ====== save cached data ====== #
    with open(path_train_files, 'wb') as fout:
      pickle.dump({'train': train_indices, 'valid': valid_indices},
                  fout)
    with open(path_speaker_info, 'wb') as fout:
      pickle.dump({'all_speakers': all_speakers,
                   'name2label': name2label,
                   'spk2label': spk2label},
                  fout)
  # ******************** load cached data ******************** #
  else:
    with open(path_train_files, 'rb') as fin:
      obj = pickle.load(fin)
      train_indices = obj['train']
      valid_indices = obj['valid']
    with open(path_speaker_info, 'rb') as fin:
      obj = pickle.load(fin)
      all_speakers = obj['all_speakers']
      name2label = obj['name2label']
      spk2label = obj['spk2label']

  # ******************** print log ******************** #
  def summary_indices(ids):
    datasets = defaultdict(int)
    speakers = defaultdict(list)
    text = ''
    for name in sorted(ids.keys()):
      text += name + str(ids[name])
      dsname = ds['dsname'][name]
      datasets[dsname] += 1
      speakers[dsname].append(ds['spkid'][name])
    for dsname in sorted(datasets.keys()):
      print('  %-18s: %s(utt) %s(spk)' % (
          dsname,
          ctext('%6d' % datasets[dsname], 'cyan'),
          ctext(len(set(speakers[dsname])), 'cyan')))
    print('  MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))
  # ====== training files ====== #
  print("#Train files:", ctext('%-8d' % len(train_indices), 'cyan'),
        "#spk:", ctext(len(set(name2label[name]
                               for name in train_indices.keys())), 'cyan'),
        "#noise:", ctext(len([name for name in train_indices.keys()
                              if '/' in name]), 'cyan'))
  summary_indices(ids=train_indices)
  # ====== valid files ====== #
  print("#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'),
        "#spk:", ctext(len(set(name2label[name]
                               for name in valid_indices.keys())), 'cyan'),
        "#noise:", ctext(len([name for name in valid_indices.keys()
                              if '/' in name]), 'cyan'))
  summary_indices(ids=valid_indices)
  # ******************** create the recipe ******************** #
  assert all(name in name2label
             for name in train_indices.keys())
  assert all(name in name2label
            for name in valid_indices.keys())
  recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                      n_speakers=len(all_speakers),
                                      utt_length=utt_length, seq_mode=seq_mode)
  # ====== downsample training set for analyzing if required ====== #
  if train_proportion is not None:
    assert 0 < train_proportion < 1
    n_training = len(train_indices)
    train_indices = list(train_indices.items())
    rand.shuffle(train_indices); rand.shuffle(train_indices)
    train_indices = dict(train_indices[:int(n_training * train_proportion)])
  # ====== create feeder ====== #
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X,
                              indices=train_indices),
      batch_mode='batch', ncpu=NCPU, buffer_size=256)

  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X,
                              indices=valid_indices),
      batch_mode='batch', ncpu=max(2, NCPU // 4), buffer_size=64)

  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  print(valid_feeder)
  # ====== debugging ====== #
  if IS_DEBUGGING:
    import matplotlib
    matplotlib.use('Agg')
    prog = Progbar(target=len(valid_feeder), print_summary=True,
                   name="Iterating validation set")
    samples = []
    n_visual = 250
    for name, idx, X, y in valid_feeder.set_batch(batch_size=100000,
                                                  batch_mode='file',
                                                  seed=None, shuffle_level=0):
      assert idx == 0, "Utterances longer than %.2f(sec)" % (100000 * Config.STEP_LENGTH)
      prog['X'] = X.shape
      prog['y'] = y.shape
      prog.add(X.shape[0])
      # random sampling
      if rand.rand(1) < 0.5 and len(samples) < n_visual:
        for i in rand.randint(0, X.shape[0], size=4, dtype='int32'):
          samples.append((name, X[i], np.argmax(y[i], axis=-1)))
    # plot the spectrogram
    n_visual = len(samples)
    V.plot_figure(nrow=n_visual, ncol=8)
    for i, (name, X, y) in enumerate(samples):
      is_noise = '/' in name
      assert name2label[name] == y, "Speaker label mismatch for file: %s" % name
      name = name.split('/')[0]
      dsname = ds['dsname'][name]
      spkid = ds['spkid'][name]
      y = np.argmax(y, axis=-1)
      ax = V.plot_spectrogram(X.T,
                              ax=(n_visual, 1, i + 1),
                              title='#%d' % (i + 1))
      ax.set_title('[%s][%s]%s  %s' %
                   ('noise' if is_noise else 'clean', dsname, name, spkid),
                   fontsize=6)
    # don't need to be high resolutions
    V.plot_save('/tmp/tmp.pdf', dpi=12)
    exit()
  # ====== return ====== #
  if bool(return_dataset):
    return train_feeder, valid_feeder, all_speakers, ds
  return train_feeder, valid_feeder, all_speakers
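
The validation split above is stratified by speaker: every speaker with at least 3 utterances contributes roughly 5% of its utterances to the validation set. A self-contained sketch of that sampling, with a toy `name2label` mapping and an arbitrary seed instead of `Config.SUPER_SEED`:

import numpy as np
from collections import defaultdict

rand = np.random.RandomState(seed=1234)
name2label = {'utt%03d' % i: i % 7 for i in range(200)}   # toy utterance -> speaker

label2name = defaultdict(list)
for name, label in sorted(name2label.items()):
  label2name[label].append(name)

valid_name = []
for label, name_list in sorted(label2name.items()):
  if len(name_list) < 3:                    # skip speakers with too few utterances
    continue
  n = max(1, int(0.05 * len(name_list)))    # hold out ~5% per speaker
  valid_name += rand.choice(a=name_list, size=n, replace=False).tolist()

train_name = [name for name in name2label if name not in set(valid_name)]
print(len(train_name), len(valid_name))     # 193 7
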
Exemple #42
0
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
        ]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        # found existing pca model
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None,
                               whiten=False,
                               copy=True,
                               batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name,
              func=map_pca,
              ncpu=None,
              batch=1,
              hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True,
                   print_report=True,
                   name='PCA')
    for n in mpi:
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
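
The `map_pca` generator above streams each feature matrix through an incremental PCA in mini-batches and then pickles the fitted model next to the data. A comparable self-contained loop, with scikit-learn's IncrementalPCA standing in for odin's MiniBatchPCA and an invented array, batch size, and output path:

import pickle
import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.randn(2000, 40)               # stand-in for one feature matrix
batch_size = 512

pca = IncrementalPCA(n_components=20)
for start in range(0, X.shape[0], batch_size):
  pca.partial_fit(X[start:start + batch_size])

# persist the fitted model, mirroring the cPickle.dump(...) call above
with open('/tmp/pca_demo.pkl', 'wb') as f:
  pickle.dump(pca, f, protocol=pickle.HIGHEST_PROTOCOL)
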
Exemple #43
0
 with np.warnings.catch_warnings():
     rand = np.random.RandomState(seed=Config.SUPER_SEED)
     np.warnings.filterwarnings('ignore')
     # ====== stratify sampling from each dataset ====== #
     clusters = defaultdict(list)
     clusters_count = defaultdict(int)
     samples = []
     for row in sorted(ALL_FILES, key=lambda x: x[0]):
         clusters[row[4]].append(row)
         clusters_count[row[4]] += 1
     for k, v in clusters.items():
         rand.shuffle(v)
         samples += v[:18]  # 18 files from each dataset
     # ====== run the MPI for feature extraction ====== #
     prog = Progbar(target=len(samples),
                    print_report=True,
                    print_summary=False,
                    name=FEATURE_RECIPE)
     error_signal = []
     for feat in mpi.MPI(jobs=samples,
                         func=recipe.transform,
                         ncpu=NCPU,
                         batch=1):
         assert FEATURE_NAME in feat
         # update progress
         if isinstance(feat, pp.base.ExtractorSignal):
             error_signal.append(feat)
             prog.add(1)
             continue
         prog['spkid'] = feat['spkid']
         prog['name'] = feat['name']
         prog['dsname'] = feat['dsname']
Exemple #44
0
class Task(object):
  """
  Parameters
  ----------
  func: call-able
      function will be executed for each iteration
  data: single or list of odin.fuel.Data, numpy.ndarray
      iterate over all these data and execute function on
      the data.
  epoch: int
      how many epoch will be repeated
  p: float (0.0 - 1.0)
      probability that `func` will be executed for each iteration
  batch_size: int (> 0)
      number of samples for each iteration
  seed: int
      random seed for shuffling the data
  shuffle_level: int (0, 1, 2)
      if 0, shuffle the file lists
      if 1, shuffle the buffer (i.e. the list of files currently being
          processed) as well as level 0
      if 2, shuffle the returned batches as well as levels 0 and 1
  callbacks: None, or list of `odin.training.Callback`
      callbacks that will be invoked during the execution of the task
  labels: None, or list of string
      labels for printing the confusion matrix in `odin.utils.Progbar`
  name: None or string
      unique name for Task identity.
  verbose : {0, 1, 2, 3, 4, 5}
      specific verbose level controlling the log output
      0 - turn off all logging
      1 - progress off, only notification
      2 - progress off, notification and summary
      3 - progress on, nothing else
      4 - progress on, notification and summary
      5 - progress on, notification, summary and batch report
  """

  def __init__(self, func, data, epoch=1, p=1.0,
               batch_size=128, seed=None, shuffle_level=2,
               callbacks=None, labels=None, name=None,
               verbose=2):
    super(Task, self).__init__()
    self.set_func(func, data)
    # this Progbar will record the history as well
    self._labels = [str(l) for l in labels] \
        if labels is not None else None
    self._progbar = Progbar(target=self.nb_samples, name=name,
                            interval=0.,
                            print_report=True, print_summary=True)
    self._progbar.set_labels(self._labels)
    # ====== set callback and verbose ====== #
    self._callback = CallbackList(callbacks)
    self.set_verbose(verbose)
    # ====== assign other arguments ====== #
    self._nb_epoch = epoch
    self._p = np.clip(p, 0., 1.)
    self._seed = seed
    self.set_batch(batch_size, seed, shuffle_level)
    self._name = name
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False

  def __str__(self):
    return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \
    (ctext(self.name, 'lightyellow'),
     ctext(self.probability, 'cyan'),
     ctext(self.batch_size, 'cyan'),
     ctext(self.curr_epoch, 'lightcyan'), ctext(self.nb_epoch, 'cyan'),
     ctext(self.curr_epoch_iter, 'lightcyan'), ctext(self.curr_iter, 'cyan'),
     ctext(self.curr_epoch_samples, 'lightcyan'), ctext(self.curr_samples, 'cyan'),
     ','.join([ctext(i.__class__.__name__, 'cyan')
               for i in self._callback._callbacks]))

  def __getstate__(self):
    return (self._progbar, self._nb_epoch, self._p, self._name,
            self._batch_size, self._rng, self._seed,
            self._shuffle_level, self._verbose)

  def __setstate__(self, states):
    (self._progbar, self._nb_epoch, self._p, self._name,
     self._batch_size, self._rng, self._seed,
     self._shuffle_level, self._verbose) = states
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False
    # ====== reset value of func and data ====== #
    self._func = None
    self._data = None

  def set_callbacks(self, callbacks):
    self._callback.set_callbacks(callbacks)
    if self._verbose == 0:
      self._callback.set_notification(False)
    else:
      self._callback.set_notification(True)
    return self

  def set_verbose(self, verbose):
    verbose = int(verbose)
    self._verbose = verbose
    if verbose == 0: # turn off everything
      self._callback.set_notification(False)
      self._progbar.print_progress = False
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 1: # progress off, only notification
      self._callback.set_notification(True)
      self._progbar.print_progress = False
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 2: # progress off, notification + summary
      self._callback.set_notification(True)
      self._progbar.print_progress = False
      self._progbar.print_summary = True
      self._progbar.print_report = False
    elif verbose == 3: # progress on, nothing else
      self._callback.set_notification(False)
      self._progbar.print_progress = True
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 4: # progress on, notification + summary
      self._callback.set_notification(True)
      self._progbar.print_progress = True
      self._progbar.print_summary = True
      self._progbar.print_report = False
    elif verbose == 5: # progress on, notification, report, summary
      self._callback.set_notification(True)
      self._progbar.print_progress = True
      self._progbar.print_summary = True
      self._progbar.print_report = True
    else:
      raise ValueError(
          "Only support verbose value: 0, 1, 2, 3, 4, 5; but given: %s" % str(verbose))

  def set_func(self, func, data):
    # ====== check function ====== #
    self._func = func
    if isinstance(func, K.Function):
      self._output_info = [(o.name, o.shape.as_list())
                           for o in self._func.outputs]
    elif hasattr(func, '__call__'):
      self._output_info = [] # No info (normal function)
    else:
      raise ValueError("No support for function type: %s" %
          func.__class__.__name__)
    # ====== check data ====== #
    if not isinstance(data, (tuple, list)):
      data = [data]
    self._data = [fuel.as_data(i, copy=not isinstance(i, fuel.Feeder))
                  for i in data]
    self._nb_samples = min([d.iter_len for d in self._data])
    return self

  def set_batch(self, batch_size=None, seed=-1, shuffle_level=None):
    if batch_size is not None:
      self._batch_size = batch_size
    if seed is None or seed >= 0:
      if seed is not None:
        self._rng = np.random.RandomState(seed)
      else:
        self._rng = struct()
        self._rng.randint = lambda x: None
        self._rng.rand = get_rng().rand
    if shuffle_level is not None:
      self._shuffle_level = min(max(int(shuffle_level), 0), 2)
    return self

  # ==================== Properties ==================== #
  @property
  def history(self):
    """ Return : dictionary type
      {epoch_id : {tensor_name0: [batch_return1, batch_return2, ...],
                   tensor_name1: [batch_return1, batch_return2, ...],
                   ...},
       1 : {tensor_name0: [batch_return1, batch_return2, ...],
                  tensor_name1: [batch_return1, batch_return2, ...],
                  ...},
       ... }

    Example
    -------
    >>> for task_name, task_hist in task.history.items():
    >>>   print(task_name)
    >>>   for epoch_id, values in task_hist.items():
    >>>     print('  Epoch:', epoch_id)
    >>>     for tensor_name, v in values.items():
    >>>       print('  ', tensor_name, len(v))
    """
    return self._progbar.history

  @property
  def progbar(self):
    return self._progbar

  @property
  def name(self):
    return str(self._name)

  @property
  def labels(self):
    return self._labels

  @property
  def nb_epoch(self):
    return self._nb_epoch

  @property
  def nb_samples(self):
    ''' Estimated number of samples for each epoch '''
    return self._nb_samples

  @property
  def probability(self):
    """Chance that the func will be execute during iteration"""
    return self._p

  @property
  def iter_per_epoch(self):
    ''' Estimated number of iterations for each epoch '''
    return int(np.ceil(self._nb_samples / self._batch_size))

  @property
  def batch_size(self):
    return self._batch_size

  @property
  def curr_epoch(self):
    """Total number of epoch finished since the beginning of the Task"""
    return self._curr_epoch

  @property
  def curr_iter(self):
    """Total number of iteration finished since the beginning of the Task"""
    return self._curr_iter

  @property
  def curr_samples(self):
    """Total number of samples finished since the beginning of the Task"""
    return self._curr_samples

  @property
  def curr_epoch_iter(self):
    """Number of iteration within current epoch"""
    return self._curr_epoch_iter

  @property
  def curr_epoch_samples(self):
    """Number of samples within current epoch"""
    return self._curr_epoch_samples

  @property
  def callback_msg(self):
    return self._callback_msg

  # ==================== control function ==================== #
  def stop(self):
    """ Stop all iterations running for this Task"""
    if self._created_iter is not None:
      self._stop = True
      # just run to end of the iterators
      for i in self._created_iter:
        pass
      self._stop = False
      self._created_iter = None

  def copy(self):
    return Task(self._func, self._data,
                epoch=self.nb_epoch, p=self.probability,
                batch_size=self.batch_size, seed=self._seed,
                shuffle_level=self._shuffle_level,
                name=self._name, verbose=self._verbose)

  def __iter(self):
    '''
    Return
    ------
    One of the following:
    * 'task_start' : beginning of the task
    * 'epoch_start' : beginning of epoch
    * 'epoch_end' : epoch ended
    * 'task_end' : task ended
    * (results, nb_iter, nb_samples,
       nb_total_samples, nb_epoch) : results of executing the function on the data

    Note
    ----
    'task_end' also marks the end of the final epoch
    '''
    yield None # just to initialize the iterator
    self._callback_msg = self._callback.task_start(self)
    yield 'task_start'
    if self._stop:
      yield 'task_end'
    else:
      # ====== start of training ====== #
      while self._curr_epoch < self._nb_epoch:
        self._callback_msg = self._callback.epoch_start(self, self._data)
        yield 'epoch_start'
        seed = self._rng.randint(10e8)
        # if there is only 1 Data, no need to zip (it would mess up the batches)
        if len(self._data) == 1:
          data_it = iter(self._data[0].set_batch(batch_size=self._batch_size,
                                                 seed=seed,
                                                 shuffle_level=self._shuffle_level))
          data = data_it
        else:
          data_it = [iter(d.set_batch(batch_size=self._batch_size,
                                      seed=seed,
                                      shuffle_level=self._shuffle_level))
                     for d in self._data]
          data = zip(*data_it)
        # ======  start the iteration ====== #
        self._curr_epoch_samples = 0
        self._curr_epoch_iter = 0
        with self._progbar.safe_progress():
          for i, x in enumerate(data):
            # already terminated, try to exhaust the iterator
            # if forced_to_terminate: continue
            # preprocess the data
            if not isinstance(x, (tuple, list)):
              x = [x]
            # update some info
            shape0 = x[0].shape[0]
            self._curr_samples += shape0
            self._curr_iter += 1
            self._curr_epoch_samples += shape0
            self._curr_epoch_iter += 1
            self._callback_msg = self._callback.batch_start(self, x)
            # apply the function
            if self.probability >= 1. or self._rng.rand() < self.probability:
              results = self._func(*x)
              # add msg from batch_end event
              self._callback_msg += self._callback.batch_end(self, results)
              # return results
              yield results
              # update the progress bar
              for (name, shape), res in zip(self._output_info,
                                            as_tuple(results)):
                if len(shape) == 0: # return single value
                  self._progbar[name] = res
                else: # return tensor
                  self._progbar[name] = res
              self._progbar.add(shape0)
            # check TERMINATE signal
            if self._stop:
              # send signal to the data iterators also
              for i in data_it:
                if hasattr(i, 'stop'):
                  i.stop()
                else: # just iterate all over
                  for _ in i: pass
              # break the epoch loop
              break
        ### Epoch end signaling
        self._curr_epoch += 1
        self._callback_msg = self._callback.epoch_end(
            self, self._progbar.history[self._curr_epoch - 1])
        yield 'epoch_end'
        # ====== check if we got the right number for epoch iter ====== #
        if self._curr_epoch_samples != self._nb_samples:
          # just for sure should not smaller than the real number
          self._nb_samples = self._curr_epoch_samples
        # ======  end_epoch or task ====== #
        if self._stop or self._curr_epoch >= self._nb_epoch:
          self._callback_msg = self._callback.task_end(
              self, self._progbar.history)
          yield 'task_end'
          # showing notification
          if self._verbose >= 1 and self._verbose != 3:
            self._progbar.add_notification('Task "%s" ended!' % str(self.name))
          break
    # ====== end of iteration ====== #
    self._created_iter = None

  def __iter__(self):
    if self._created_iter is None:
      # reset all information
      self._curr_epoch = 0
      self._curr_iter = 0
      self._curr_samples = 0
      self._curr_epoch_iter = 0
      self._curr_epoch_samples = 0
      self._callback_msg = []
      # create new iter
      self._created_iter = self.__iter()
      # initialize the iteration
      next(self._created_iter)
    return self._created_iter

  def __del__(self):
    self.stop()
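
A minimal sketch of consuming the iterator above, assuming `task` is an initialized Task instance. The string markers 'epoch_start', 'epoch_end' and 'task_end' are the signals yielded by `__iter`; everything else is the value returned by the wrapped function for one mini-batch.

for item in task:
  if isinstance(item, str):
    # control markers: 'epoch_start', 'epoch_end', 'task_end'
    if item == 'task_end':
      break
  else:
    # value(s) returned by self._func(*x) for one mini-batch
    batch_results = item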
Exemple #45
0
def fast_pca(*x, n_components=None, algo='rpca', y=None,
             batch_size=1024, return_model=False,
             random_state=5218):
  """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    required for labels in case of `sppca`
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must be not None if `algo='sppca'`")
  x = flatten_list(x, level=None)
  x = [i[:] if i.__class__.__name__ == 'MmapData' else i
       for i in x]
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2: # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None: # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because
    # it is significantly faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components, iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False, print_summary=False, name="Fitting PCA")
    for start, end in batching(batch_size=batch_size, n=x_train.shape[0],
                               seed=5218):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] + x_test) if not return_model else tuple([pca, x_train] + x_test)
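
A minimal usage sketch for the shortcut above; the random matrices are purely illustrative.

import numpy as np

X_train = np.random.randn(1000, 64)
X_test = np.random.randn(200, 64)
# PCA is fitted on the first matrix only, then applied to every matrix
Z_train, Z_test = fast_pca(X_train, X_test, n_components=16, algo='ipca')
# with return_model=True the fitted model is returned first
pca, Z_train = fast_pca(X_train, n_components=16, algo='pca', return_model=True)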
Exemple #46
0
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # add reading data from indices also
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" %
                     str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name
                 for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # init PCA
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
      ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # reuse an existing PCA model if found
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # create new PCA
    else:
      pca = MiniBatchPCA(n_components=None, whiten=False,
                         copy=True, batch_size=None)
    # no shuffling makes iteration much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save PCA model
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finish return feature name
    yield name
  mpi = MPI(jobs=feat_name, func=map_pca,
            ncpu=None, batch=1, hwm=12082518,
            backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples, print_summary=True, print_report=True,
                 name='PCA')
  for n in mpi:
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
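
A minimal usage sketch for the function above, assuming a dataset directory produced by a FeatureProcessor; the path is hypothetical.

# fit MiniBatchPCA for every 2-D feature found in the dataset and
# save each model as 'pca_<feature_name>' inside the dataset folder
calculate_pca('/path/to/extracted_dataset', feat_name='auto',
              batch_size=5218, override=False)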
Exemple #48
0
    shutil.rmtree(wav_path)
  elif len(os.listdir(wav_path)) != TOTAL_FILES:
    print("Found only %d files at '%s', delete old wave files" %
      (len(os.listdir(wav_path)), wav_path))
    shutil.rmtree(wav_path)
# ====== convert all compress audio to .wav using sph2pipe ====== #
if not os.path.exists(wav_path):
  os.mkdir(wav_path)
  cmds = ["sph2pipe %s %s -f rif" % (path, os.path.join(wav_path, get_name(path)))
          for path in audio_files]

  def mpi_fn(cmd):
    exec_commands(cmd, print_progress=False)
    yield len(cmd)
  prog = Progbar(target=len(cmds),
                 print_report=True, print_summary=True,
                 name='Converting .sph to .wav')
  # run the MPI tasks
  mpi = MPI(jobs=cmds, func=mpi_fn,
            ncpu=cpu_count() - 1, batch=12)
  for i in mpi:
    prog.add(i)
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
jobs = get_all_files(wav_path,
                     filter_func=lambda x: '.wav' == x[-4:])
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
if not os.path.exists(outpath) or args.ds:
  extractors = pp.make_pipeline(steps=[
Exemple #49
0
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached for periodically flushed
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                'sum1' == feat_name[-4:] or \
                'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, only if we have more than 0 sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # else all other kind of data save to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # 12 + 8
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some error might happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to external log file
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not good idea to save the extractor all the time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
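
For reference, a sketch of the per-file dictionary that `post_processing` above expects from the extractor pipeline. The feature names and shapes are illustrative, and the identifier key is whatever `self.identifier` is configured to (assumed here to be 'name').

import numpy as np

result = {
    'name': 'utt_0001',            # file identifier, must be a string
    'mspec': np.empty((320, 40)),  # ndarray -> appended to MmapData, tracked in 'indices_mspec'
    'mspec_sum1': np.zeros(40),    # running sum, accumulated for the mean
    'mspec_sum2': np.zeros(40),    # running sum of squares, accumulated for the std
    'duration': 3.2,               # any non-array value -> stored per file in a MmapDict
}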
Exemple #50
0
for feat_name in features:
  # ====== check all indices not overlap ====== #
  all_indices = [F.MmapDict(path=os.path.join(i, 'indices_%s' % feat_name),
                        read_only=True)
                 for i in inpath]
  _ = []
  for ids in all_indices:
    _ += list(ids.keys())
  assert len(_) == len(set(_)), "Overlapping indices names"
  # ====== initialize ====== #
  out_data = None
  out_indices = {}
  start = 0
  curr_nfile = 0
  prog = Progbar(target=sum(len(i) for i in all_indices),
                 print_summary=True,
                 name=outpath)

  for i, path in enumerate(inpath):
    in_data = F.MmapData(path=os.path.join(path, feat_name),
                         read_only=True)
    in_indices = all_indices[i]
    # initialize
    if out_data is None:
      out_data = F.MmapData(path=os.path.join(outpath, feat_name),
                            dtype=in_data.dtype,
                            shape=(0,) + in_data.shape[1:],
                            read_only=False)
    # copy data
    for name, (s, e) in list(in_indices.items()):
      X = in_data[s:e]
Exemple #51
0
def prepare_dnn_data(recipe, feat, utt_length, seed=52181208):
  """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speakers in the training set
  """
  # Load dataset
  frame_length = int(utt_length / FRAME_SHIFT)
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                 read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name]
                   for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  train_indices, valid_indices = train_valid_test_split(
      x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Valid files:", ctext(len(valid_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length, step_length=frame_length,
                           end='pad', pad_value=0, pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name:TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % (feat, int(utt_length)))
  # validate cache files
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as f:
      ids = pickle.load(f)
    if len(ids) != len(test_indices):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # caching
  if not os.path.exists(cache_dat):
    dat = F.MmapData(cache_dat, dtype='float16',
                     shape=(0, frame_length, X.shape[1]))
    ids = {}
    prog = Progbar(target=len(test_indices))
    s = 0
    for name, (start, end) in test_indices.items():
      y = X[start:end]
      y = segment_axis(y, axis=0,
                       frame_length=frame_length, step_length=frame_length,
                       end='pad', pad_value=0, pad_mode='post')
      dat.append(y)
      # update indices
      ids[name] = (s, s + len(y))
      s += len(y)
      # update progress
      prog.add(1)
    dat.flush()
    dat.close()
    with open(cache_ids, 'wb') as f:
      pickle.dump(ids, f)
  # ====== re-load ====== #
  dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as f:
    ids = pickle.load(f)
  # ====== save some sample ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  for i, (name, (start, end)) in enumerate(
      sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]), k=12, seed=52181208)):
    x = dat[start:end][:].astype('float32')
    ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                            ax=(12, 1, i + 1), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder,
          ids, dat, all_speakers)
Exemple #52
0
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
               token_not_found='ignore'):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
       'binary', a binary document-term matrix (1 if a token appears)
       'tfidf', a tf-idf weighted document-term matrix
       'count', a document-term matrix of raw token counts
       'freq', a document-term matrix of token frequencies (count / document length)
       'seq', sequences of token indices, padded or truncated to `maxlen`
   token_not_found: 'ignore', 'raise', a token string, an integer
       how to handle tokens missing from the dictionary: silently skip them,
       raise an error, or substitute the given token / index
   """
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise", '
                      'an integer token index, or a string '
                      'representing a token.')
   if is_number(token_not_found):
     token_not_found = int(token_not_found)
   elif token_not_found not in ('ignore', 'raise'):
     token_not_found = int(self.dictionary[token_not_found])
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   if isinstance(end_document, str):
     end_document = dictionary.index(end_document)
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
   else:
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       idx = dictionary.get(x, -1)
       if idx >= 0: vec.append(idx)
       # not found the token in dictionary
       elif token_not_found == 'ignore':
         continue
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
         vec.append(token_not_found)
     # append ending document token
     if end_document is not None:
       vec.append(end_document)
     # add the final results
     results.append(vec)
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       prog.add(1)
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
   # transform into one-hot matrix
   else:
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         X[i, seq] = 1
       elif mode == 'freq':
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
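
A minimal usage sketch, assuming `tokenizer` is a fitted instance of the class this method belongs to; the documents are illustrative.

docs = [u'the cat sat on the mat', u'the dog barked']
# sequences of token indices, padded to a fixed length
seqs = tokenizer.transform(docs, mode='seq', maxlen=8, padding='pre')
# document-term matrices
counts = tokenizer.transform(docs, mode='count')  # raw counts
tfidf = tokenizer.transform(docs, mode='tfidf')   # tf-idf weights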
Exemple #53
0
def fast_pca(*x,
             n_components=None,
             algo='rpca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
    """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    required for labels in case of `sppca`
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
    batch_size = int(batch_size)
    algo = str(algo).lower()
    if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
        raise ValueError(
            "`algo` must be one of the following: 'pca', "
            "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
    if algo in ('sppca', 'plda') and y is None:
        raise RuntimeError("`y` must be not None if `algo='sppca'`")
    x = flatten_list(x, level=None)
    x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
    # ====== check input ====== #
    x_train = x[0]
    x_test = x[1:]
    input_shape = None
    if x_train.ndim > 2:  # only 2D for PCA
        input_shape = (-1, ) + x_train.shape[1:]
        new_shape = (-1, np.prod(input_shape[1:]))
        x_train = np.reshape(x_train, new_shape)
        x_test = [np.reshape(x, new_shape) for x in x_test]
        if n_components is not None:  # no need to reshape back
            input_shape = None
    # ====== train PCA ====== #
    if algo == 'sppca':
        pca = SupervisedPPCA(n_components=n_components,
                             random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'plda':
        from odin.ml import PLDA
        pca = PLDA(n_phi=n_components, random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'pca':
        pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    elif algo == 'rpca':
        # we copy the implementation of RandomizedPCA because
        # it is significantly faster than PCA(svd_solver='randomized')
        pca = RandomizedPCA(n_components=n_components,
                            iterated_power=2,
                            random_state=random_state)
        pca.fit(x_train)
    elif algo == 'ipca':
        pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        prog = Progbar(target=x_train.shape[0],
                       print_report=False,
                       print_summary=False,
                       name="Fitting PCA")
        for start, end in batching(batch_size=batch_size,
                                   n=x_train.shape[0],
                                   seed=1234):
            pca.partial_fit(x_train[start:end], check_input=False)
            prog.add(end - start)
    elif algo == 'ppca':
        pca = PPCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    # ====== transform ====== #
    x_train = pca.transform(x_train)
    x_test = [pca.transform(x) for x in x_test]
    # reshape back to original shape if necessary
    if input_shape is not None:
        x_train = np.reshape(x_train, input_shape)
        x_test = [np.reshape(x, input_shape) for x in x_test]
    # return the results
    if len(x_test) == 0:
        return x_train if not return_model else (pca, x_train)
    return tuple([x_train] +
                 x_test) if not return_model else tuple([pca, x_train] +
                                                        x_test)
Exemple #54
0
      training.EarlyStopGeneralizationLoss('valid', ce,
                                           threshold=5, patience=5)
  ])
  task.set_train_task(func=f_train, data=train,
                      epoch=args.epoch, name='train')
  task.set_valid_task(func=f_score, data=valid,
                      freq=training.Timer(percentage=0.8),
                      name='valid')
  task.run()
# ===========================================================================
# Saving the test data
# CSV separated by tab
# ===========================================================================
sep = '\t'
prog = Progbar(target=len(test_ids) + len(train) + len(valid),
               print_summary=True, print_report=True,
               name="Extracting x-vector")
with open(TRAIN_PATH, 'w') as f_train, open(TEST_PATH, 'w') as f_test:
  # ====== save training set ====== #
  for name, idx, X, y in train.set_batch(batch_size=8000,
                                         batch_mode='file', seed=None):
    assert idx == 0
    y = np.argmax(y, axis=-1)
    assert len(set(y)) == 1
    y = y[0]
    z = np.mean(f_z(X), axis=0, keepdims=False).astype('float32')
    f_train.write(sep.join([str(y)] + [str(i) for i in z]) + '\n')
    prog.add(X.shape[0])
  # ====== save validation set ====== #
  for name, idx, X, y in valid.set_batch(batch_size=8000,
                                         batch_mode='file', seed=None):
Exemple #55
0
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speakers in the training set
  """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
    test_indices = {
        name: start_end
        for name, start_end in ds['indices'].items() if name not in TRAIN_DATA
    }
    train_indices, valid_indices = train_valid_test_split(x=list(
        train_indices.items()),
                                                          train=0.9,
                                                          inc_test=False,
                                                          seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        if len(ids) != len(test_indices):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)
    elif os.path.exists(cache_dat):
        os.remove(cache_dat)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        s = 0
        for name, (start, end) in test_indices.items():
            y = X[start:end]
            y = segment_axis(y,
                             axis=0,
                             frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post')
            dat.append(y)
            # update indices
            ids[name] = (s, s + len(y))
            s += len(y)
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    for i, (name, (start, end)) in enumerate(
            sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                          k=12,
                          seed=87654321)):
        x = dat[start:end][:].astype('float32')
        ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                                ax=(12, 1, i + 1),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
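
A minimal usage sketch; the recipe and feature names are hypothetical, and the assumption that each Feeder yields (features, one-hot label) batches follows from the recipes configured above.

train, valid, test_ids, test_dat, speakers = prepare_dnn_data(
    recipe='mfcc', feat='mspec', utt_length=3.0)
print("#Speakers:", len(speakers))
for X, y in train:  # sequenced features and one-hot speaker labels
  pass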
Exemple #56
0
def validating_training_data(in_path_raw, training_dataset):
  file_list = {ds: sre_file_list[ds]
               for ds in training_dataset
               if ds in sre_file_list}
  # ====== meta info ====== #
  all_files = []
  non_exist_files = []
  extension_count = defaultdict(int)
  total_data = sum(v.shape[0]
                   for k, v in file_list.items()
                   if k not in ('musan', 'rirs'))
  # ====== progress ====== #
  prog = Progbar(target=total_data,
                 print_summary=True, print_report=True,
                 name="Preprocessing File List")
  prog.set_summarizer('#Files', fn=lambda x: x[-1])
  prog.set_summarizer('#Non-exist', fn=lambda x: x[-1])
  # ====== iterating ====== #
  for ds_name, data in sorted(file_list.items(),
                              key=lambda x: x[0]):
    if ds_name in ('musan', 'rirs'):
      continue
    for row in data:
      path, channel, name, spkid = row[:4]
      assert channel in ('0', '1')
      # check path provided
      if ds_name in in_path_raw:
        path = os.path.join(in_path_raw[ds_name], path)
      # create new row
      start_time = '-'
      end_time = '-'
      if ds_name == 'mx6':
        start_time, end_time = row[-2:]
      new_row = [path, channel, name,
                 ds_name + '_' + spkid, ds_name,
                 start_time, end_time]
      # check file exist
      if os.path.exists(path):
        all_files.append(new_row)
      else:
        non_exist_files.append(new_row)
      # extension
      ext = os.path.splitext(path)[-1]
      extension_count[ext + '-' + ds_name] += 1
      # update progress
      prog['Dataset'] = ds_name
      prog['#Files'] = len(all_files)
      prog['#Non-exist'] = len(non_exist_files)
      prog.add(1)
  # final results
  all_files = np.array(all_files)
  if len(all_files) == 0:
    return all_files, np.array(non_exist_files), extension_count
  # ====== check no duplicated name ====== #
  n_files = len(all_files)
  n_unique_files = len(np.unique(all_files[:, 2]))
  assert n_files == n_unique_files, \
  'Found duplicated name: %d != %d' % (n_files, n_unique_files)
  # ====== check no duplicated speaker ====== #
  n_spk = sum(len(np.unique(dat[:, 3]))
              for name, dat in file_list.items()
              if name not in ('musan', 'rirs'))
  n_unique_spk = len(np.unique(all_files[:, 3]))
  assert n_spk == n_unique_spk, \
  'Found duplicated speakers: %d != %d' % (n_spk, n_unique_spk)
  # ====== return ====== #
  # Header:
  #  0       1      2      3       4          5         6
  # path, channel, name, spkid, dataset, start_time, end_time
  return all_files, np.array(non_exist_files), extension_count
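
A minimal usage sketch; the dataset names and raw-data paths are hypothetical.

in_path_raw = {'swb': '/path/to/swb', 'mx6': '/path/to/mx6'}
all_files, non_exist, ext_count = validating_training_data(
    in_path_raw, training_dataset=['swb', 'mx6'])
# each row: path, channel, name, spkid, dataset, start_time, end_time
print("Found: %d files, missing: %d" % (len(all_files), len(non_exist)))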
Exemple #57
0
  loss = tf.losses.log_loss(labels=X, predictions=X_probas)
# ===========================================================================
# Optimizing the network
# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
  # ====== training ====== #
  train_losses = []
  prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
  start_time = timeit.default_timer()
  for start, end in batching(batch_size=args.bs, n=X_train.shape[0],
                             seed=K.get_rng().randint(10e8)):
    _ = K.eval(loss, feed_dict={X: X_train[start:end]},
               update_after=update_ops)
    prog.add(end - start)
    train_losses.append(_)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'), '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set]    Loss: %.4f" % lo)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))
Exemple #58
0
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('   *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" % str(type(ds)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (included the features in external_indices
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Indices are not contiguous"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final length match length of Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
          "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0 # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
        "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all value closed to zeros
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Check data incredibility for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Check statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(
            ds[feat_name][start:(start + nb_samples)],
            n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Check PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed) # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistic
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_name
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure save at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
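
A minimal usage sketch for the validation routine above; both paths are hypothetical.

validate_features('/path/to/extracted_dataset',
                  path='/tmp/feature_report',
                  nb_samples=25, override=True)
# the report folder then contains log.txt, one .pdf per sampled file, and stats.pdf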