Code Example #1
File: processor.py  Project: professorlust/odin-ai
    def run(self):
        njobs = len(self.jobs)
        dataset = Dataset(self.path)
        if self.n_cache <= 1:
            cache_limit = max(2, int(0.12 * njobs))
        else:
            cache_limit = int(self.n_cache)
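        # (n_cache <= 1 selects the default flush interval: roughly 12% of all
        #  jobs, but at least 2; any larger value is an absolute file count)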
        # ====== indices ====== #
        databases = defaultdictkey(
            lambda key: MmapDict(path=os.path.join(dataset.path, key),
                                 cache_size=10000,
                                 read_only=False))
        last_start = defaultdict(int)
        # ====== statistic ====== #
        # load old statistics
        stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
        for key in dataset.keys():
            if 'sum1' == key[-4:]:
                stats[key[:-4]][0] = dataset[key][:]
            elif 'sum2' == key[-4:]:
                stats[key[:-4]][1] = dataset[key][:]
        # all data are cached and flushed periodically
        cache = defaultdict(list)
        n_processed = [0]  # store the value as reference
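        # (a one-element list acts as a mutable cell: the nested
        #  post_processing() closure below updates n_processed[0] in place,
        #  which a plain integer would not allow without `nonlocal`)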

        # ====== helper ====== #
        def flush_feature(feat_name, X_cached):
            if len(X_cached) > 0:
                X_cached = np.concatenate(X_cached, 0)
                # flush data
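                # (the first flush creates a new memmap array for the feature;
                #  later flushes append to it, so each feature grows into one
                #  big array addressed via the `indices_<feat>` ranges below)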
                if feat_name in dataset:
                    dataset[feat_name].append(X_cached)
                else:
                    dataset[(feat_name, 'memmap')] = X_cached

        # ====== repeated for each result returned ====== #
        def post_processing(result):
            # search for file name
            if self.identifier not in result:
                raise RuntimeError(
                    "Cannot find identifier '%s' in returned dictionary" %
                    self.identifier)
            file_name = result[self.identifier]
            # invalid file_name
            if not is_string(file_name):
                raise RuntimeError(
                    "Cannot find file name in returned features "
                    "list, the file name can be specified in key: 'name', 'path' "
                    "and the type of the value must be string. All available "
                    "keys are: %s" % str(result.keys()))
            # store all new indices
            # mapping feat_name -> X.shape[0] (number of rows for this file)
            all_indices = {}
            # processing
            for feat_name, X in result.items():
                # some invalid feat_name
                if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
                    raise RuntimeError(
                        "Returned features' name cannot be one "
                        "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
                    )
                # ignore some feat_name
                if feat_name in ('name',):
                    continue
                # if numpy ndarray, save to MmapData
                if isinstance(X, np.ndarray) or \
                        'sum1' == feat_name[-4:] or \
                        'sum2' == feat_name[-4:]:
                    # save statistics instead
                    if 'sum1' == feat_name[-4:]:
                        stats[feat_name[:-4]][0] += X
                    elif 'sum2' == feat_name[-4:]:
                        stats[feat_name[:-4]][1] += X
                    # save features array
                    else:
                        all_indices[feat_name] = X.shape[0]
                        # cache data, but only if there is at least one sample
                        if X.shape[0] > 0:
                            cache[feat_name].append(X)
                # all other kinds of data are saved to MmapDict
                else:
                    databases[feat_name][file_name] = X
                # remove data
                del X
            # ====== update indices ====== #
            if len(all_indices) > 0:
                for feat_name, n in all_indices.items():
                    ids_name = 'indices_%s' % feat_name
                    databases[ids_name][file_name] = (last_start[ids_name],
                                                      last_start[ids_name] + n)
                    last_start[ids_name] += n
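            # (each `indices_<feat>` database maps file_name -> (start, end),
            #  the row range of that file inside the concatenated feature array)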
            # ====== flush cache ====== #
            n_processed[0] += 1
            if n_processed[0] % cache_limit == 0:  # flush every `cache_limit` files
                for feat_name, X_cached in cache.items():
                    flush_feature(feat_name, X_cached)
                cache.clear()
            # ====== update progress ====== #
            return file_name

        # ====== mapping function ====== #
        def _map_func(dat):
            try:
                ret = self.extractor.transform(dat)
            except Exception as e:  # Non-handled exception
                ret = '\n========\n'
                ret += 'Time  : `%s`\n' % str(
                    get_formatted_datetime(only_number=False))
                ret += 'Error : `%s`\n' % str(e)
                ret += 'Input : `%s`\n' % str(dat)
                import traceback
                etype, value, tb = sys.exc_info()
                for line in traceback.TracebackException(
                        type(value), value, tb, limit=None).format(chain=True):
                    ret += line
            return ret
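        # (exceptions are serialized as formatted strings instead of being
        #  raised, so they survive the trip back from the MPI worker
        #  processes; the main loop below recognizes them via
        #  isinstance(result, string_types))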

        # ====== processing ====== #
        mpi = MPI(jobs=self.jobs,
                  func=_map_func,
                  ncpu=self.n_cpu,
                  batch=1,
                  hwm=self.n_cpu * 3,
                  backend='python')
        # initialize
        prog = Progbar(target=njobs,
                       name=self.path,
                       interval=0.12,
                       print_report=True,
                       print_summary=True)
        start_time = time.time()
        last_time = time.time()
        last_count = 0
        with open(self._log_path, 'w') as flog:
            # writing the log head
            flog.write('============================\n')
            flog.write('Start Time : %s\n' %
                       get_formatted_datetime(only_number=False))
            flog.write('Outpath    : %s\n' % self.path)
            flog.write('Extractor  : %s\n' % '->'.join(
                [s[-1].__class__.__name__ for s in self.extractor.steps]))
            flog.write('#Jobs      : %d\n' % njobs)
            flog.write('#CPU       : %d\n' % self.n_cpu)
            flog.write('#Cache     : %d\n' % cache_limit)
            flog.write('============================\n')
            flog.flush()
            # start processing the file list
            for count, result in enumerate(mpi):
                # Non-handled exception
                if isinstance(result, string_types):
                    flog.write(result)
                    flog.flush()
                    self._error_log.append(result)
                    if self.stop_on_failure:
                        raise RuntimeError(result)
                # some error might have happened
                elif isinstance(result, ExtractorSignal):
                    flog.write(str(result))
                    flog.flush()
                    if result.action == 'error':
                        prog.add_notification(str(result))
                        raise RuntimeError(
                            "ExtractorSignal requests terminating processor!")
                    elif result.action == 'warn':
                        prog.add_notification(str(result))
                    elif result.action == 'ignore':
                        self._error_log.append(result)
                    else:
                        raise RuntimeError(
                            "Unknown action from ExtractorSignal: %s" %
                            result.action)
                    prog['File'] = '%-48s' % result.message[:48]
                # otherwise, no error happened, do post-processing
                else:
                    name = post_processing(result)
                    prog['File'] = '%-48s' % str(name)[:48]
                # update progress
                prog.add(1)
                # manually write to the external log file (about every 1% of jobs)
                if (count + 1) % max(1, int(0.01 * njobs)) == 0:
                    curr_time = time.time()
                    elap = curr_time - start_time
                    avg_speed = (count + 1) / elap
                    cur_speed = (count + 1 - last_count) / (curr_time -
                                                            last_time)
                    avg_est = (njobs - count - 1) / avg_speed
                    cur_est = (njobs - count - 1) / cur_speed
                    flog.write(
                        '[%s] Processed: %d(files)   Remain: %d(files)   Elap.: %.2f(secs)\n'
                        '   Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                        '   Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                        (get_formatted_datetime(only_number=False), count + 1,
                         njobs - count - 1, elap, avg_speed, avg_est,
                         cur_speed, cur_est))
                    flog.flush()
                    last_time = curr_time
                    last_count = count + 1
        # ====== end, flush the last time ====== #
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
        cache = None
        dataset.flush()
        prog.add_notification("Flushed all data to disk")
        # ====== saving indices ====== #
        for name, db in databases.items():
            db.flush(save_all=True)
            db_size = len(db)
            db.close()
            prog.add_notification(
                'Flush MmapDict "%s" to disk, size: %s' %
                (ctext(name, 'yellow'), ctext(str(db_size), 'yellow')))

        # ====== save mean and std ====== #
        def save_mean_std(sum1, sum2, name):
            N = dataset[name.split('_')[0]].shape[0]
            mean = sum1 / N
            std = np.sqrt(sum2 / N - np.power(mean, 2))
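            # (sum2/N - mean^2 is the biased variance E[X^2] - E[X]^2; floating
            #  point rounding can push it slightly negative, so sqrt may yield
            #  NaN, which the checks below report)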
            if np.any(np.isnan(mean)):
                wprint('Mean contains NaN, name: %s' % name)
            if np.any(np.isnan(std)):
                wprint('Std contains NaN, name: %s' % name)
            dataset[name + 'sum1'] = sum1
            dataset[name + 'sum2'] = sum2
            dataset[name + 'mean'] = mean
            dataset[name + 'std'] = std

        # save all stats
        if len(stats) > 0:
            for feat_name, (sum1, sum2) in stats.items():
                save_mean_std(sum1, sum2, feat_name)
                prog.add_notification(
                    'Saved statistics of: %s, shape: %s' %
                    (ctext(feat_name.split('_')[0],
                           'yellow'), ctext(str(sum1.shape), 'yellow')))
        # ====== dataset flush() ====== #
        dataset.flush()
        dataset.close()
        # ====== saving the extractor ====== #
        # not a good idea to save the extractor every time
        # pipeline_path = os.path.join(dataset.path, 'pipeline')
        # with open(pipeline_path, 'wb') as f:
        #   cPickle.dump(self.extractor, f, protocol=2)
        # prog.add_notification("Saved Extractor pipeline at: %s" %
        #                       ctext(pipeline_path, 'yellow'))
        # ====== saving the configuration ====== #
        config_path = os.path.join(dataset.path, 'config')
        config = MmapDict(config_path)
        config['__configuration_time__'] = time.time()
        config['__processor__'] = self.path
        for i in dir(self):
            if _default_module.match(i) is not None:
                continue
            j = getattr(self, i)
            if isinstance(j, (Number, string_types, bool)):
                config[i] = j
        config.flush(save_all=True)
        self.config = {i: j for i, j in config}
        config.close()
        prog.add_notification("Saved configuration at: %s" %
                              ctext(config_path, 'yellow'))
        # ====== final notification ====== #
        prog.add_notification("Closed all dataset.")
        prog.add_notification("Dataset at path: %s" %
                              ctext(dataset.path, 'yellow'))
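
The save_mean_std step above is the classic streaming-statistics trick: accumulate sum1 = sum(x) and sum2 = sum(x**2) over all batches, then recover mean = sum1/N and std = sqrt(sum2/N - mean**2). A minimal self-contained sketch of the same technique (the function and variable names here are illustrative, not part of the odin-ai API):

import numpy as np

def streaming_mean_std(batches):
    """Mean/std over row-batches without holding all data in memory."""
    sum1, sum2, n = 0.0, 0.0, 0
    for x in batches:                             # x: array of shape (rows, dims)
        sum1 = sum1 + x.sum(axis=0)               # running sum
        sum2 = sum2 + np.square(x).sum(axis=0)    # running sum of squares
        n += x.shape[0]
    mean = sum1 / n
    # biased variance E[X^2] - E[X]^2; clamp at 0 so floating-point error
    # cannot push sqrt() into NaN (the code above warns instead)
    std = np.sqrt(np.maximum(sum2 / n - mean ** 2, 0.0))
    return mean, std

# sanity check against the direct computation
batches = [np.random.randn(100, 8) for _ in range(5)]
mean, std = streaming_mean_std(batches)
full = np.concatenate(batches, axis=0)
assert np.allclose(mean, full.mean(axis=0))
assert np.allclose(std, full.std(axis=0))
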
Code Example #2
File: trainer.py  Project: professorlust/odin-ai
class Task(object):
  """
  Parameters
  ----------
  func: callable
      function to be executed at each iteration
  data: single or list of odin.fuel.Data, numpy.ndarray
      iterate over all these data and execute function on
      the data.
  epoch: int
      how many epochs to repeat
  p: float (0.0 - 1.0)
      probability that `func` will be executed at each iteration
  batch_size: int (> 0)
      number of samples for each iteration
  seed: int
      random seed for shuffling the data
  shuffle_level: int (0, 1, 2)
      if 0, shuffle the file list
      if 1, shuffle the buffer (i.e. the list of files being processed)
          as well as the previous level
      if 2, shuffle the returned batches as well as all previous levels
  callbacks: None, or list of `odin.training.Callback`
      callbacks invoked during the execution of the task
  labels: None, or list of string
      labels for printing the confusion matrix in `odin.utils.Progbar`
  name: None or string
      unique name for Task identity.
  verbose : {0, 1, 2, 3, 4, 5}
      verbose level controlling the log output
      0 - turn off all logging
      1 - progress off, only notification
      2 - progress off, notification and summary
      3 - progress on, nothing else
      4 - progress on, notification and summary
      5 - progress on, notification, summary and batch report
  """

  def __init__(self, func, data, epoch=1, p=1.0,
               batch_size=128, seed=None, shuffle_level=2,
               callbacks=None, labels=None, name=None,
               verbose=2):
    super(Task, self).__init__()
    self.set_func(func, data)
    # this Progbar will record the history as well
    self._labels = [str(l) for l in labels] \
        if labels is not None else None
    self._progbar = Progbar(target=self.nb_samples, name=name,
                            interval=0.,
                            print_report=True, print_summary=True)
    self._progbar.set_labels(self._labels)
    # ====== set callback and verbose ====== #
    self._callback = CallbackList(callbacks)
    self.set_verbose(verbose)
    # ====== assign other arguments ====== #
    self._nb_epoch = epoch
    self._p = np.clip(p, 0., 1.)
    self._seed = seed
    self.set_batch(batch_size, seed, shuffle_level)
    self._name = name
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False

  def __str__(self):
    return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \
    (ctext(self.name, 'lightyellow'),
     ctext(self.probability, 'cyan'),
     ctext(self.batch_size, 'cyan'),
     ctext(self.curr_epoch, 'lightcyan'), ctext(self.nb_epoch, 'cyan'),
     ctext(self.curr_epoch_iter, 'lightcyan'), ctext(self.curr_iter, 'cyan'),
     ctext(self.curr_epoch_samples, 'lightcyan'), ctext(self.curr_samples, 'cyan'),
     ','.join([ctext(i.__class__.__name__, 'cyan')
               for i in self._callback._callbacks]))

  def __getstate__(self):
    return (self._progbar, self._nb_epoch, self._p, self._name,
            self._batch_size, self._rng, self._seed,
            self._shuffle_level, self._verbose)

  def __setstate__(self, states):
    (self._progbar, self._nb_epoch, self._p, self._name,
     self._batch_size, self._rng, self._seed,
     self._shuffle_level, self._verbose) = states
    # ====== current info ====== #
    self._curr_epoch = 0
    self._curr_iter = 0
    self._curr_samples = 0
    self._curr_epoch_iter = 0
    self._curr_epoch_samples = 0
    self._callback_msg = []
    # ====== iter tracking ====== #
    self._created_iter = None
    self._stop = False
    # ====== reset value of func and data ====== #
    self._func = None
    self._data = None

  def set_callbacks(self, callbacks):
    self._callback.set_callbacks(callbacks)
    if self._verbose == 0:
      self._callback.set_notification(False)
    else:
      self._callback.set_notification(True)
    return self

  def set_verbose(self, verbose):
    verbose = int(verbose)
    self._verbose = verbose
    if verbose == 0: # turn off everything
      self._callback.set_notification(False)
      self._progbar.print_progress = False
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 1: # progress off, only notification
      self._callback.set_notification(True)
      self._progbar.print_progress = False
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 2: # progress off, notification + summary
      self._callback.set_notification(True)
      self._progbar.print_progress = False
      self._progbar.print_summary = True
      self._progbar.print_report = False
    elif verbose == 3: # progress on, nothing else
      self._callback.set_notification(False)
      self._progbar.print_progress = True
      self._progbar.print_summary = False
      self._progbar.print_report = False
    elif verbose == 4: # progress on, notification + summary
      self._callback.set_notification(True)
      self._progbar.print_progress = True
      self._progbar.print_summary = True
      self._progbar.print_report = False
    elif verbose == 5: # progress on, notification, report, summary
      self._callback.set_notification(True)
      self._progbar.print_progress = True
      self._progbar.print_summary = True
      self._progbar.print_report = True
    else:
      raise ValueError(
          "Only support verbose value: 0, 1, 2, 3, 4, 5; but given: %s" % str(verbose))

  def set_func(self, func, data):
    # ====== check function ====== #
    self._func = func
    if isinstance(func, K.Function):
      self._output_info = [(o.name, o.shape.as_list())
                           for o in self._func.outputs]
    elif hasattr(func, '__call__'):
      self._output_info = [] # No info (normal function)
    else:
      raise ValueError("No support for function type: %s" %
          func.__class__.__name__)
    # ====== check data ====== #
    if not isinstance(data, (tuple, list)):
      data = [data]
    self._data = [fuel.as_data(i, copy=not isinstance(i, fuel.Feeder))
                  for i in data]
    self._nb_samples = min([d.iter_len for d in self._data])
    return self

  def set_batch(self, batch_size=None, seed=-1, shuffle_level=None):
    if batch_size is not None:
      self._batch_size = batch_size
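    # (seed == -1, the default, is a sentinel meaning "keep the current rng";
    #  seed=None installs a stub rng whose randint() returns None, so the
    #  data is not reshuffled between epochs)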
    if seed is None or seed >= 0:
      if seed is not None:
        self._rng = np.random.RandomState(seed)
      else:
        self._rng = struct()
        self._rng.randint = lambda x: None
        self._rng.rand = get_rng().rand
    if shuffle_level is not None:
      self._shuffle_level = min(max(int(shuffle_level), 0), 2)
    return self

  # ==================== Properties ==================== #
  @property
  def history(self):
    """ Return : dictionary type
      {epoch_id : {tensor_name0: [batch_return1, batch_return2, ...],
                   tensor_name1: [batch_return1, batch_return2, ...],
                   ...},
       1 : {tensor_name0: [batch_return1, batch_return2, ...],
                  tensor_name1: [batch_return1, batch_return2, ...],
                  ...},
       ... }

    Example
    -------
    >>> for task_name, task_hist in task.history.items():
    >>>   print(task_name)
    >>>   for epoch_id, values in task_hist.items():
    >>>     print('  Epoch:', epoch_id)
    >>>     for tensor_name, v in values.items():
    >>>       print('  ', tensor_name, len(v))
    """
    return self._progbar.history

  @property
  def progbar(self):
    return self._progbar

  @property
  def name(self):
    return str(self._name)

  @property
  def labels(self):
    return self._labels

  @property
  def nb_epoch(self):
    return self._nb_epoch

  @property
  def nb_samples(self):
    ''' Estimated number of samples for each epoch '''
    return self._nb_samples

  @property
  def probability(self):
    """Chance that the func will be execute during iteration"""
    return self._p

  @property
  def iter_per_epoch(self):
    ''' Estimated number of iteration for each epoch '''
    return int(np.ceil(self._nb_samples / self._batch_size))

  @property
  def batch_size(self):
    return self._batch_size

  @property
  def curr_epoch(self):
    """Total number of epoch finished since the beginning of the Task"""
    return self._curr_epoch

  @property
  def curr_iter(self):
    """Total number of iteration finished since the beginning of the Task"""
    return self._curr_iter

  @property
  def curr_samples(self):
    """Total number of samples finished since the beginning of the Task"""
    return self._curr_samples

  @property
  def curr_epoch_iter(self):
    """Number of iteration within current epoch"""
    return self._curr_epoch_iter

  @property
  def curr_epoch_samples(self):
    """Number of samples within current epoch"""
    return self._curr_epoch_samples

  @property
  def callback_msg(self):
    return self._callback_msg

  # ==================== control function ==================== #
  def stop(self):
    """ Stop all iterations running for this Task"""
    if self._created_iter is not None:
      self._stop = True
      # just run to the end of the iterator
      for i in self._created_iter:
        pass
      self._stop = False
      self._created_iter = None

  def copy(self):
    return Task(self._func, self._data,
                epoch=self.nb_epoch, p=self.probability,
                batch_size=self.batch_size, seed=self._seed,
                shuffle_level=self._shuffle_level,
                name=self._name, verbose=self._verbose)

  def __iter(self):
    '''
    Return
    ------
    One of the following:
    * 'task_start' : the task started
    * 'epoch_start' : beginning of an epoch
    * 'epoch_end' : an epoch ended
    * 'task_end' : the task ended
    * results : the value returned by executing `func` on a batch of data

    Note
    ----
    'task_end' also marks the end of the final epoch
    '''
    yield None  # just to initialize the iterator
    self._callback_msg = self._callback.task_start(self)
    yield 'task_start'
    if self._stop:
      yield 'task_end'
    else:
      # ====== start of training ====== #
      while self._curr_epoch < self._nb_epoch:
        self._callback_msg = self._callback.epoch_start(self, self._data)
        yield 'epoch_start'
        seed = self._rng.randint(10e8)
        # with only one Data, zip is unnecessary (and would mess up the batches)
        if len(self._data) == 1:
          data_it = iter(self._data[0].set_batch(batch_size=self._batch_size,
                                                 seed=seed,
                                                 shuffle_level=self._shuffle_level))
          data = data_it
        else:
          data_it = [iter(d.set_batch(batch_size=self._batch_size,
                                      seed=seed,
                                      shuffle_level=self._shuffle_level))
                     for d in self._data]
          data = zip(*data_it)
        # ======  start the iteration ====== #
        self._curr_epoch_samples = 0
        self._curr_epoch_iter = 0
        with self._progbar.safe_progress():
          for i, x in enumerate(data):
            # preprocess the data
            if not isinstance(x, (tuple, list)):
              x = [x]
            # update some info
            shape0 = x[0].shape[0]
            self._curr_samples += shape0
            self._curr_iter += 1
            self._curr_epoch_samples += shape0
            self._curr_epoch_iter += 1
            self._callback_msg = self._callback.batch_start(self, x)
            # apply the function
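            # (when p < 1 each batch is executed with probability p and
            #  silently skipped otherwise; the progress bar does not advance
            #  for skipped batches)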
            if self.probability >= 1. or self._rng.rand() < self.probability:
              results = self._func(*x)
              # add msg from batch_end event
              self._callback_msg += self._callback.batch_end(self, results)
              # return results
              yield results
              # update the progress bar
              for (name, shape), res in zip(self._output_info,
                                            as_tuple(results)):
                # scalar and tensor returns are recorded the same way
                self._progbar[name] = res
              self._progbar.add(shape0)
            # check TERMINATE signal
            if self._stop:
              # send the stop signal to the data iterators as well
              for it in data_it:
                if hasattr(it, 'stop'):
                  it.stop()
                else: # just iterate to exhaustion
                  for _ in it: pass
              # break the epoch loop
              break
        ### Epoch end signaling
        self._curr_epoch += 1
        self._callback_msg = self._callback.epoch_end(
            self, self._progbar.history[self._curr_epoch - 1])
        yield 'epoch_end'
        # ====== check if we got the right number for epoch iter ====== #
        if self._curr_epoch_samples != self._nb_samples:
          # update the estimate with the actual number of samples seen
          self._nb_samples = self._curr_epoch_samples
        # ======  end_epoch or task ====== #
        if self._stop or self._curr_epoch >= self._nb_epoch:
          self._callback_msg = self._callback.task_end(
              self, self._progbar.history)
          yield 'task_end'
          # showing notification
          if self._verbose >= 1 and self._verbose != 3:
            self._progbar.add_notification('Task "%s" ended!' % str(self.name))
          break
    # ====== end of iteration ====== #
    self._created_iter = None

  def __iter__(self):
    if self._created_iter is None:
      # reset all information
      self._curr_epoch = 0
      self._curr_iter = 0
      self._curr_samples = 0
      self._curr_epoch_iter = 0
      self._curr_epoch_samples = 0
      self._callback_msg = []
      # create new iter
      self._created_iter = self.__iter()
      # initialize the iteration
      next(self._created_iter)
    return self._created_iter

  def __del__(self):
    self.stop()
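
Iterating a Task interleaves string markers ('task_start', 'epoch_start', 'epoch_end', 'task_end') with the per-batch results of `func`, so a driver loop has to dispatch on the yielded value. A minimal consumer sketch, assuming the Task class above is importable and that `train_fn`, `X` and `y` were prepared elsewhere (those three names are placeholders, not part of the API):

# hypothetical setup: train_fn is any callable, X/y are numpy arrays
task = Task(func=train_fn, data=[X, y],
            epoch=3, batch_size=64, seed=1234,
            shuffle_level=2, name='train', verbose=2)
for item in task:
  if item == 'task_start':
    print('started task:', task.name)
  elif item == 'epoch_start':
    print('epoch %d/%d' % (task.curr_epoch + 1, task.nb_epoch))
  elif item == 'epoch_end':
    print('epoch done after %d samples' % task.curr_epoch_samples)
  elif item == 'task_end':
    break                 # the generator resets itself after this marker
  else:
    batch_result = item   # whatever train_fn returned for this batch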