Example no. 1
 def write_history(self, *msg):
     with open(os.path.join(self._save_path, 'history.txt'), 'a+') as f:
         f.write("[%s]" % get_formatted_datetime(only_number=False))
         for i, m in enumerate(msg):
             sep = " " if i == 0 else "\t"
             f.write("%s%s\n" % (sep, str(m)))
     return self
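
Every snippet on this page exercises odin's get_formatted_datetime utility, whose own source is not shown. A minimal stand-in that is consistent with these call sites (a plain string in normal use, a parsed datetime when convert_text is passed, as in Example no. 4) could look like the sketch below; the concrete format strings are assumptions, not necessarily odin's.

from datetime import datetime

def get_formatted_datetime(only_number=True, convert_text=None):
    # Hypothetical stand-in for odin's helper, inferred from the call sites
    # in these examples; the real format strings may differ.
    fmt = '%H%M%S%d%m%y' if only_number else '%H:%M:%S-%d%b%y'
    if convert_text is not None:
        # Parse a previously formatted string back into a datetime, so the
        # caller can sort snapshots by .timestamp() as in Example no. 4.
        return datetime.strptime(convert_text, fmt)
    return datetime.now().strftime(fmt)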
Example no. 2
File: base.py Project: imito/odin
 def __init__(self):
     super(ExtractorSignal, self).__init__()
     self._timestamp = get_formatted_datetime(only_number=False)
     self._extractor = None
     self._msg = ''
     self._action = 'ignore'
     self._last_input = {}
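
The attributes set here (timestamp, action, message) are what the feature processor inspects when an extractor yields a signal instead of a result; Example no. 7 below branches on them. A condensed view of that dispatch, reduced to prints, is sketched here (signal is assumed to expose the same action and message properties used in Example no. 7):

def handle_signal(signal):
    # Mirrors the branching in Example no. 7: 'error' aborts the run,
    # 'warn' is reported, 'ignore' is merely logged.
    if signal.action == 'error':
        raise RuntimeError("ExtractorSignal requests terminating processor!")
    elif signal.action == 'warn':
        print('WARNING:', signal.message)
    elif signal.action == 'ignore':
        print('ignored:', signal.message)
    else:
        raise RuntimeError("Unknown action from ExtractorSignal: %s" % signal.action)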
Example no. 3
 def get_config_path(self, cfg: DictConfig = None, datetime=False):
     output_path = self.get_output_path(cfg)
     if datetime:
         return os.path.join(
             output_path,
             'configs_%s.yaml' % get_formatted_datetime(only_number=False))
     return os.path.join(output_path, 'configs.yaml')
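
Together with the timestamped file name, a typical save step might look like the following sketch; the output directory and config values are made up, and OmegaConf.save is used because Example no. 4 loads these files back with OmegaConf:

import os
from omegaconf import OmegaConf

out_dir = '/tmp/exp_demo'                              # hypothetical output path
os.makedirs(out_dir, exist_ok=True)
cfg = OmegaConf.create({'model': 'vae', 'lr': 1e-3})   # hypothetical config
# One timestamped snapshot per run, plus a fixed "current" file,
# matching the naming scheme of get_config_path above.
OmegaConf.save(cfg, os.path.join(
    out_dir, 'configs_%s.yaml' % get_formatted_datetime(only_number=False)))
OmegaConf.save(cfg, os.path.join(out_dir, 'configs.yaml'))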
Example no. 4
    def fetch_exp_cfg(self, conditions={}, require_model=True) -> dict:
        r"""

    Arguments:
      require_model : a Boolean. If True, only return exp with saved model

    Return:
      A dictionary mapping from path to experiments and list of configs
    """
        conditions = _prepare_conditions(conditions)

        def get_attr(c, name):
            if '.' in name:
                for key in name.split('.'):
                    c = c.get(key)
                return c
            return c[name]

        # prepare the path
        path = self._save_path
        exp_path = [
            os.path.join(path, name) for name in os.listdir(path)
            if 'exp_' == name[:4]
        ]
        # filter path with require_model
        if require_model:
            exp_path = list(
                filter(lambda x: os.path.isdir(os.path.join(x, 'model')),
                       exp_path))
        ret = {}
        for path in exp_path:
            cfg = sorted(
                [
                    os.path.join(path, i)
                    for i in os.listdir(path) if 'configs_' == i[:8]
                ],
                key=lambda x: get_formatted_datetime(
                    only_number=False,
                    convert_text=x.split('_')[-1].split('.')[0]).timestamp())
            if len(cfg) > 0:
                if len(conditions) > 0:
                    last_cfg = cfg[-1]  # latest config
                    with open(last_cfg, 'r') as f:
                        last_cfg = OmegaConf.load(f)
                    # filter the conditions
                    if all(
                            get_attr(last_cfg, key) in val
                            for key, val in conditions.items()):
                        ret[path] = cfg
                    del last_cfg
                else:
                    ret[path] = cfg
        return ret
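
The filtering step is easiest to see in isolation: conditions maps a (possibly dotted) config key to a collection of accepted values, and an experiment is kept only if its latest config satisfies every entry. A self-contained toy version of that check, with a plain dict standing in for the loaded OmegaConf object:

def get_attr(c, name):
    # same dotted-key lookup as in fetch_exp_cfg above
    if '.' in name:
        for key in name.split('.'):
            c = c.get(key)
        return c
    return c[name]

cfg = {'model': {'name': 'vae'}, 'dataset': 'mnist'}              # toy config
conditions = {'model.name': ('vae', 'beta_vae'), 'dataset': ('mnist',)}
print(all(get_attr(cfg, k) in v for k, v in conditions.items()))  # -> True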
Example no. 5
 def _map_func(dat):
   try:
     ret = self.extractor.transform(dat)
   except Exception as e: # Non-handled exception
     ret = '\n========\n'
     ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
     ret += 'Error : `%s`\n' % str(e)
     ret += 'Input : `%s`\n' % str(dat)
     import traceback
     etype, value, tb = sys.exc_info()
     for line in traceback.TracebackException(
             type(value), value, tb, limit=None).format(chain=True):
       ret += line
   return ret
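
Because the extractor runs inside worker processes (see the MPI setup in Example no. 7), a failure is returned to the parent as a plain string, which the main loop detects with isinstance(result, string_types). The traceback formatting itself can be reproduced with the standard library alone; the time line is left out here to keep the sketch self-contained:

import traceback

def format_failure(exc, inp):
    # Build the same kind of report as the example: header, error, input,
    # then the full chained traceback as text.
    lines = ['\n========\n',
             'Error : `%s`\n' % exc,
             'Input : `%s`\n' % (inp,)]
    lines.extend(traceback.TracebackException.from_exception(exc).format(chain=True))
    return ''.join(lines)

try:
    1 / 0
except Exception as e:
    print(format_failure(e, {'path': 'dummy.wav'}))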
Example no. 6
File: utils.py Project: imito/odin
def get_exp_path(system_name, args, override=False):
    """ Return: exp_dir, model_path, log_path """
    exp_dir = get_exppath(tag='TIDIGITS_%s_%s_%s' %
                          (system_name, args.task, args.feat))
    if 'nmix' in args:
        exp_dir += '_%d' % args.nmix
    if 'tdim' in args:
        exp_dir += '_%d' % args.tdim
    # ====== check override ====== #
    if bool(override) and os.path.exists(exp_dir):
        shutil.rmtree(exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
    # ====== basic paths ====== #
    model_path = os.path.join(exp_dir, 'model.ai')
    log_path = os.path.join(
        exp_dir, 'log_%s.txt' % get_formatted_datetime(only_number=True))
    print("Exp dir:", ctext(exp_dir, 'cyan'))
    print("Model path:", ctext(model_path, 'cyan'))
    print("Log path:", ctext(log_path, 'cyan'))
    return exp_dir, model_path, log_path
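
args only needs attribute access plus the in operator, so an argparse.Namespace is enough; a hypothetical call (the task, feat and nmix values are made up, and the odin helpers get_exppath and ctext are assumed to be importable as in the original module) would be:

from argparse import Namespace

# 'nmix' is optional: when present it only extends the experiment directory name.
args = Namespace(task='spk', feat='mspec', nmix=256)
exp_dir, model_path, log_path = get_exp_path('gmm', args, override=False)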
Example no. 7
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached and flushed to disk periodically
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping feat_name -> X.shape[0] (number of rows for this file)
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                'sum1' == feat_name[-4:] or \
                'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, only if we have more than 0 sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # else all other kind of data save to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # 12 + 8
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some error might have happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to external log file
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not a good idea to save the extractor every time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
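
The stats dictionary only ever accumulates per-feature first and second moments (sum1, sum2) returned by the extractor; save_mean_std then recovers the mean and the population standard deviation from them. The identity it relies on, checked on toy data:

import numpy as np

X = np.random.randn(1000, 40)         # toy feature matrix: 1000 frames, 40 dims
sum1 = X.sum(axis=0)                  # accumulated first moment
sum2 = (X ** 2).sum(axis=0)           # accumulated second moment
N = X.shape[0]
mean = sum1 / N
std = np.sqrt(sum2 / N - mean ** 2)   # same formula as save_mean_std
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(std, X.std(axis=0))

Numerically, sum2 / N - mean ** 2 can dip slightly below zero for near-constant features, making the square root return NaN; that is presumably why save_mean_std warns when the mean or std contains NaN.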
Example no. 8
from odin.stats import describe

from helpers import (SCORING_DATASETS, BACKEND_DATASETS, SCORE_SYSTEM_NAME,
                     SCORE_SYSTEM_ID, N_PLDA, N_LDA, PLDA_MAXIMUM_LIKELIHOOD,
                     PLDA_SHOW_LLK, PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE,
                     FEATURE_NAME, get_model_path, NCPU, get_logpath,
                     prepare_dnn_feeder_recipe, sre_file_list, Config, EXP_DIR,
                     VECTORS_DIR, RESULT_DIR, filter_utterances)
# ====== scoring log ====== #
stdio(
    get_logpath(name='make_score.log',
                increasing=True,
                odin_base=False,
                root=EXP_DIR))
print('=' * 48)
print(get_formatted_datetime(only_number=False))
print("System name    :", SCORE_SYSTEM_NAME)
print("System id      :", SCORE_SYSTEM_ID)
print("Feature recipe :", FEATURE_RECIPE)
print("Feature name   :", FEATURE_NAME)
print("Backend dataset:", ','.join(BACKEND_DATASETS.keys()))
print("Scoring dataset:", ','.join(SCORING_DATASETS.keys()))
print('=' * 48)


# ===========================================================================
# Some helper
# ===========================================================================
def _check_running_feature_extraction(feat_dir, n_files):
    # True means the feature extraction still needs to be run
    if not os.path.exists(feat_dir):