Exemple #1
 def add_indices(self, indices, name, override=False):
   """Store a named set of indices on disk and register it in this object.

   Parameters
   ----------
   indices : Mapping, tuple, list or numpy.ndarray
     either a mapping `entry -> (start, end)`, or a sequence whose items
     are an entry name (resolved through `self['indices']`),
     `(entry, (start, end))` or `(entry, start, end)`.
   name : str
     identifier of the index, also used as file name under
     `self.index_path`.
   override : bool
     allow replacing an index already registered under `name`.

   Returns
   -------
   self

   Raises
   ------
   ValueError
     if `name` is not a string, if `name` is already registered and
     `override` is False, or if an item of `indices` cannot be parsed.
   """
   # ====== validate name ====== #
   if not is_string(name):
     raise ValueError("`name` must be string, but given: %s" % str(type(name)))
   if name in self._saved_indices and not override:
     raise ValueError("Cannot override pre-defined INDEX with name: '%s'"
                      % name)
   # ====== validate indices ====== #
   path = os.path.join(self.index_path, name)
   ids = MmapDict(path)
   # NOTE: the loop variables are deliberately NOT called `name`, otherwise
   # the index would be registered below under the last entry's name.
   # predefined mapping, save or copy everything to a MmapDict
   if isinstance(indices, Mapping):
     for entry, (start, end) in indices.items():
       ids[entry] = (start, end)
   # list of name, or (name, (start, end))
   elif isinstance(indices, (tuple, list, np.ndarray)):
     for i in indices:
       if is_string(i): # only name
         ids[i] = self['indices'][i]
       elif len(i) == 2: # name, (start, end)
         entry, (start, end) = i
         ids[entry] = (int(start), int(end))
       elif len(i) == 3: # name, start, end
         entry, start, end = i
         ids[entry] = (int(start), int(end))
       else:
         raise ValueError("Unsupport index parsing (name, start, end)"
                          "for: %s" % str(i))
   # NOTE(review): the original comment here says "flush everything to
   # disk" but no flush/close call is visible -- confirm whether `ids`
   # must be flushed explicitly before it is re-opened read-only.
   # ====== assign new index ====== #
   self._saved_indices[name] = MmapDict(path, read_only=True)
   return self
Exemple #2
  def create_feeder(self, data, recipes, indices=None,
                    batch_filter=None, batch_mode='batch',
                    name=None, override=False):
    """Create a Feeder over selected data of this dataset.

    Parameters
    ----------
    data: list of str
        list of name for all data used, the order of this
        list is the order of returned data.
    recipes: list or single odin.fuel.FeederRecipe
        the list of recipes defining the rule of transforming
        the data
    indices: None, string, dict, list
        list of (name, (start, end)) for iterating over files in Feeder
    batch_filter: call-able
        must be a function has take a list of np.ndarray as first arguments
        ([X]) or ([X, y]), you can return None to ignore given batch, return the
        data for accepting the batch
    batch_mode: 'batch' or 'file' (string type)
        'batch' mode return shuffling and return everything in small batches
        'file' mode return [(file_name, order_index, data...), ...]
    name: None, or string
        if name is provided, the feeder information will be saved,
        which include the `indices`, `recipes`
    override: bool
        when `name` is given, replace indices/recipes already saved
        under that name.

    Note
    ----
    by defaults, the Feeder is created using only 1 CPU with `buffer_size=1`
    using the method `set_multiprocessing(ncpu=None, buffer_size=None,
    maximum_queue_size=None)` for changing this information.
    """
    from odin.fuel.feeder import Feeder, IndexedData
    # check data
    # NOTE(review): the fallback expression for non-string entries was
    # missing in this source; they are passed through unchanged here --
    # confirm against the upstream implementation.
    data = [self.__getitem__(dat) if is_string(dat) else dat
            for dat in as_tuple(data)]
    # check recipes
    if is_string(recipes):
      recipes = self._saved_recipes[recipes]
    else:
      recipes = as_tuple(recipes, t=FeederRecipe)
    # check indices
    if indices is None:
      indices = self.__getitem__('indices')
    elif is_string(indices):
      indices = self._saved_indices[indices]
    elif isinstance(indices, (Mapping, tuple, list, np.ndarray)):
      pass # already an explicit index specification, used as given
    # ====== saving recipes and indices, if name is not None ====== #
    if is_string(name):
      if name not in self._saved_indices or override:
        self.add_indices(indices, name, override=True)
      if name not in self._saved_recipes or override:
        self.add_recipes(recipes, name, override=True)
    # ====== create Feeder ====== #
    feeder = Feeder(IndexedData(data=data, indices=indices),
                    batch_filter=batch_filter, batch_mode=batch_mode,
                    ncpu=1, buffer_size=1)
    return feeder.set_recipes(recipes)
Exemple #3
 def _validate_texts(self, texts):
     """Check `texts` and wrap a single string into a 1-tuple.

     Returns `texts` unchanged when it is a non-string iterable, or
     `(texts,)` when it is a string.

     Raises ValueError when `texts` is neither iterable nor a string.
     """
     acceptable = (isinstance(texts, Iterable) or
                   isinstance(texts, Iterator) or
                   is_string(texts))
     if not acceptable:
         raise ValueError(
             'texts must be an iterator, generator or a string.')
     # a lone string is treated as a collection holding one text
     return (texts, ) if is_string(texts) else texts
Exemple #4
Exemple #5
Exemple #6
 def item2step(x):
     """Convert one entry into a `(name, extractor)` pair, or None.

     Accepted forms: an Extractor, a 1-item tuple/list holding an
     Extractor (both get an auto-generated name built from the class name
     and the running counter `ID[0]`), or a 2-item tuple/list holding a
     string and an Extractor in either order.
     """
     def _auto_named(extractor):
         # bump the shared counter so generated names stay unique
         ID[0] += 1
         return (extractor.__class__.__name__ + str(ID[0]), extractor)

     if isinstance(x, (tuple, list)):
         n_items = len(x)
         if n_items == 1:
             if isinstance(x[0], Extractor):
                 return _auto_named(x[0])
         elif n_items == 2:
             first, second = x[0], x[1]
             if is_string(first) and isinstance(second, Extractor):
                 return x
             if is_string(second) and isinstance(first, Extractor):
                 return (second, first)
         return None
     if isinstance(x, Extractor):
         return _auto_named(x)
     return None
Exemple #7
Exemple #8
 def __new__(subclass,
     # NOTE(review): the remainder of the signature is missing from this
     # view. The body reads `path`, `read_only` and `cache_size`, so those
     # are presumably the remaining parameters -- confirm names, order and
     # defaults against the full source before relying on this.
     #
     # Purpose: return the instance already registered for `path` under
     # this subclass, or create, register and return a new one.
     if not is_string(path):
         raise ValueError("`path` for MmapDict must be string, but given "
                          "object with type: %s" % type(path))
     # absolute path is the registry key
     path = os.path.abspath(path)
     read_only = bool(read_only)
     cache_size = int(cache_size)
     # get stored instances (registry is keyed by subclass name, then path)
     all_instances = NoSQL._INSTANCES[subclass.__name__]
     # ====== Found pre-defined instance ====== #
     # NOTE: an existing instance is returned as-is; the `read_only` and
     # `cache_size` given to this call are not applied to it here.
     if path in all_instances:
         return all_instances[path]
     # ====== Create new instance ====== #
     new_instance = super(NoSQL, subclass).__new__(subclass)
     all_instances[path] = new_instance
     # some pre-defined attribute
     new_instance._cache_size = cache_size
     new_instance._read_only = read_only
     new_instance._new_args_called = False
     new_instance._path = path
     new_instance._is_closed = False
     return new_instance
Exemple #9
 def add_recipes(self, recipes, name, override=False):
   """Pickle a named list of recipes to disk and register it.

   Parameters
   ----------
   recipes : RecipeList, FeederRecipe, or sequence of them
     any RecipeList (top-level or nested in the sequence) is flattened
     into its `_recipes`.
   name : str
     identifier of the recipes, also used as file name under
     `self.recipe_path`.
   override : bool
     allow replacing recipes already registered under `name`.

   Returns
   -------
   self

   Raises
   ------
   ValueError
     if `name` is not a string, or `name` is already registered and
     `override` is False.
   """
   # ====== validate arguments ====== #
   if not is_string(name):
     raise ValueError("`name` must be string, but given: %s" % str(type(name)))
   if name in self._saved_recipes and not override:
     raise ValueError("Cannot override pre-defined RECIPE with name: '%s'"
                      % name)
   # ====== validate recipes list ====== #
   if isinstance(recipes, RecipeList):
     recipes = tuple(recipes._recipes)
   else:
     tmp = []
     for rcp in as_tuple(recipes, t=FeederRecipe):
       if isinstance(rcp, RecipeList):
         tmp += list(rcp._recipes)
       else:
         # plain recipes must be kept, not silently dropped
         tmp.append(rcp)
     recipes = tuple(tmp)
   # ====== store the recipes to disk ====== #
   path = os.path.join(self.recipe_path, name)
   with open(path, 'wb') as f:
     cPickle.dump(recipes, f, protocol=cPickle.HIGHEST_PROTOCOL)
   # ====== update local recipes list ====== #
   self._saved_recipes[name] = recipes
   return self
Exemple #10
 def embed(self, vocabulary, dtype='float32', token_not_found='ignore'):
   """Any word not found in the vocabulary will be set to all-zeros

   Parameters
   ----------
   vocabulary : Mapping
     mapping `word -> vector`; all vectors are expected to have the same
     length.
   dtype : str or numpy dtype
     dtype of the returned matrix.
   token_not_found : 'ignore', 'raise', int or str
     what to do for a dictionary word missing from `vocabulary`:
     'ignore' leaves its row all-zeros, 'raise' raises an Exception, an
     integer is a row index to copy from, and any other string is a
     token that is resolved to its index through `self.dictionary`.
     NOTE(review): this parameter was cut from the signature in this
     source; the default 'ignore' is inferred from the summary line --
     confirm against the upstream implementation.

   Returns
   -------
   numpy.ndarray of shape `(len(self.dictionary), ndim)`

   Raises
   ------
   ValueError
     if `vocabulary` is not a non-empty Mapping or `token_not_found` has an
     unsupported type.
   Exception
     if `token_not_found == 'raise'` and a word is missing.
   """
   # ====== check vocab ======= #
   if not isinstance(vocabulary, Mapping):
     raise ValueError('"vocabulary" must be any instance of dict.')
   if len(vocabulary) == 0:
     raise ValueError('"vocabulary" must contain at least one word.')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found):
     raise ValueError('token_not_found can be: "ignore", "raise"'
                      ', an integer of token index, or a string '
                      'represented a token.')
   # numbers are used directly as row index; only a token string (other
   # than the two keywords) is looked up in the dictionary
   if is_number(token_not_found):
     token_not_found = int(token_not_found)
   elif token_not_found not in ('ignore', 'raise'):
     token_not_found = int(self.dictionary[token_not_found])
   # ====== create embedding matrix ====== #
   # `values()` is a view (not an iterator) on Python 3, hence `iter`
   ndim = len(next(iter(vocabulary.values())))
   matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
   for word, idx in self.dictionary.items():
     if len(word) == 0: continue
     if word in vocabulary:
       matrix[idx, :] = vocabulary[word]
     elif token_not_found == 'raise':
       raise Exception('Cannot find token "%s" in the vocabulary.' % word)
     elif isinstance(token_not_found, int):
       # assignment (the original used `==`, a no-op comparison)
       matrix[idx, :] = matrix[token_not_found, :]
   return matrix
Exemple #11
 def _apply(self, X):
   """Upsample `X`, then pad and/or crop it toward `self.desire_shape`.

   When `self.axes` is the string 'auto', the upsampled axes are chosen
   from the rank of `X` (3 -> (1,), 4 -> (1, 2), 5 -> (1, 2, 3)); for any
   other rank `axes` is passed on unchanged.

   If `self.desire_shape` is not None, each dimension whose current and
   desired sizes are both known is padded (when smaller) or sliced (when
   larger), splitting the difference between both ends, and the static
   shape is then set from `desire_shape`.
   """
   axes = self.axes
   ndims = X.shape.ndims
   if is_string(axes) and axes.lower() == 'auto':
     if ndims == 3:
       axes = (1,)
     elif ndims == 4:
       axes = (1, 2)
     elif ndims == 5:
       axes = (1, 2, 3)
   X = K.upsample(X, scale=self.size, axes=axes, method=self.mode)
   # ====== check desire_shape ====== #
   desire_shape = self.desire_shape
   if desire_shape is not None:
     # None or negative entries mean "unconstrained dimension"
     desire_shape = [None if i is None or i < 0 else int(i)
                     for i in desire_shape]
     # do padding if necessary
     paddings = [[0, 0] if i is None or o is None or i >= o else
                 [tf.cast(tf.ceil((o - i) / 2), 'int32'),
                  tf.cast(tf.floor((o - i) / 2), 'int32')]
                 for i, o in zip(X.shape.as_list(), desire_shape)]
     if not all(i == [0, 0] for i in paddings):
       X = tf.pad(X, paddings=paddings, mode='CONSTANT')
     # do slice if necessary
     # The decision is taken on plain Python booleans: the original
     # `s is not slice(None)` identity test was True for every element.
     current_shape = X.shape.as_list()
     need_crop = [i is not None and o is not None and i > o
                  for i, o in zip(current_shape, desire_shape)]
     if any(need_crop):
       slices = [slice(tf.cast(tf.floor((i - o) / 2), 'int32'),
                       tf.cast(-tf.ceil((i - o) / 2), 'int32'), None)
                 if crop else slice(None)
                 for crop, i, o in zip(need_crop, current_shape, desire_shape)]
       X = X[tuple(slices)]
     K.set_shape(X, tuple([i if is_number(i) else None
                           for i in desire_shape]))
   return X
Exemple #12
 def __getitem__(self, key):
     """Return the data stored under `key`, or its path when no data
     object is held for it.

     Raises ValueError for a non-string key and KeyError for an unknown
     key.
     """
     if not is_string(key):
         raise ValueError('Only accept key type is string.')
     if key not in self._data_map:
         raise KeyError('%s not found in this dataset' % key)
     # entries are (dtype, shape, data, path)
     _, _, stored, location = self._data_map[key]
     if stored is None:
         return location
     return stored
Exemple #13
Exemple #14
Exemple #15
 def __setitem__(self, key, value):
     # Store `value` in the dataset under `key` and record it in
     # `self._data_map` as a (dtype-or-type-name, shape-or-length, data,
     # path) tuple.
     #
     # NOTE(review): this block is incomplete in this view -- the
     # docstring delimiters and several statements (the body of the
     # "do nothing" check, the hdf5 `else:` branch, the "other types"
     # `else:` branch and the memmap bookkeeping) are missing. The
     # comments below describe only what is visible.
     #
     # The four lines below are the remains of the parameter documentation.
     key : str or tuple
         if tuple is specified, it contain the key and the datatype
         which must be "memmap", "hdf5"
         for example: ds[('X', 'hdf5')] = numpy.ones((8, 12))
     if not is_string(key) and not isinstance(key, (tuple, list)):
         raise ValueError(
             '"key" is the name for Data and must be String or '
             'tuple specified the name and datatype (memmap, hdf5).')
     # ====== check datatype ====== #
     datatype = 'memmap'  # default datatype
     if isinstance(key, (tuple, list)):
         # split (name, datatype); datatype is compared case-insensitively
         key, datatype = key
         datatype = datatype.lower()
         if datatype not in ('memmap', 'hdf5'):
             raise ValueError(
                 'datatype can only be "memmap" or "hdf5", but '
                 'the given data type is "%s"' % datatype)
     # ====== do nothing ====== #
     # NOTE(review): the body of this check is missing; presumably an
     # early return for keys that already exist -- confirm.
     if key in self._data_map:
     # ====== dict ====== #
     path = os.path.join(self.path, key)
     if isinstance(value, dict):
         if os.path.exists(path):
             raise Exception('File with path=%s already exist.' % path)
         # copy every item of the dict into an on-disk MmapDict
         d = MmapDict(path)
         # NOTE(review): `iteritems` only exists on Python 2 dicts.
         for i, j in value.iteritems():
             d[i] = j
         # store new dict
         self._data_map[key] = (type(d).__name__, len(d), d, path)
     # ====== ndarray ====== #
     elif isinstance(value, np.ndarray):
         dtype, shape = value.dtype, value.shape
         if datatype == 'memmap':
             data = MmapData(path, dtype=dtype, shape=shape)
             # NOTE(review): the three lines below look like the hdf5
             # branch whose `else:` line was lost; as written they
             # overwrite the MmapData just created -- confirm.
             path = os.path.join(self.path, self._default_hdf5)
             f = open_hdf5(path)
             data = Hdf5Data(key, hdf=f, dtype=dtype, shape=shape)
         # store new key
         self._data_map[key] = (data.dtype, data.shape, data, path)
         # check maximum opened memmap
     # ====== other types ====== #
         # NOTE(review): presumably an `else:` branch that pickles any
         # other value to `path`; the `else:` line is missing.
         if os.path.exists(path):
             raise Exception('File with path=%s already exist.' % path)
         with open(path, 'wb') as f:
             cPickle.dump(value, f, protocol=cPickle.HIGHEST_PROTOCOL)
         # store new dict
         self._data_map[key] = (type(value).__name__, len(value) if hasattr(
             value, '__len__') else 0, value, path)
 def post_processing(result):
   # Consume one `result` dictionary (feature name -> value) produced for a
   # single file: accumulate statistics, store the other values, record the
   # per-file index range of each array feature, and return the file name.
   #
   # NOTE(review): this closure relies on names from an enclosing scope
   # that is not visible here (`self`, `stats`, `databases`, `last_start`,
   # `n_processed`, `cache_limit`, `cache`, `flush_feature`), and several
   # branch bodies / `else:` lines are missing in this view.
   # search for file name
   if self.identifier not in result:
     raise RuntimeError(
         "Cannot find identifier '%s' in returned dictionary" % self.identifier)
   file_name = result[self.identifier]
   # invalid file_name
   if not is_string(file_name):
     raise RuntimeError("Cannot find file name in returned features "
         "list, the file name can be specified in key: 'name', 'path' "
         "and the type of the value must be string. All available "
         "keys are: %s" % str(result.keys()))
   # store all new indices
   # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
   all_indices = {}
   # processing
   for feat_name, X in result.items():
     # some invalid feat_name
     if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
       raise RuntimeError("Returned features' name cannot be one "
                          "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
     # ignore some feat_name
     # NOTE(review): `('name')` is a plain string, not a tuple, so this is
     # a substring test; the branch body is also missing (presumably
     # `continue`) -- confirm.
     if feat_name in ('name'):
     # if numpy ndarray, save to MmapData
     if isinstance(X, np.ndarray) or \
     'sum1' == feat_name[-4:] or \
     'sum2' == feat_name[-4:]:
       # save statistics instead
       # (the 4-character suffix is stripped to get the feature name)
       if 'sum1' == feat_name[-4:]:
         stats[feat_name[:-4]][0] += X
       elif 'sum2' == feat_name[-4:]:
         stats[feat_name[:-4]][1] += X
       # save features array
       # NOTE(review): an `else:` line appears to be missing above.
         all_indices[feat_name] = X.shape[0]
         # cache data, only if we have more than 0 sample
         # NOTE(review): the body of this check is missing.
         if X.shape[0] > 0:
     # else all other kind of data save to MmapDict
       # NOTE(review): the `else:` line for this branch is missing.
       databases[feat_name][file_name] = X
     # remove data
     del X
   # ====== update indices ====== #
   if len(all_indices) > 0:
     for feat_name, n in all_indices.items():
       # each feature gets a running (start, end) range per file
       ids_name = 'indices_%s' % feat_name
       databases[ids_name][file_name] = (last_start[ids_name],
                                         last_start[ids_name] + n)
       last_start[ids_name] += n
   # ====== flush cache ====== #
   n_processed[0] += 1
   # flush all cached features every `cache_limit` processed files
   if n_processed[0] % cache_limit == 0: # 12 + 8
     for feat_name, X_cached in cache.items():
       flush_feature(feat_name, X_cached)
   # ====== update progress ====== #
   return file_name
Exemple #17
Exemple #18
 def __init__(self, pool_size=2, strides=None, dilation=1,
              pad='valid', mode='max', transpose_mode='nn', **kwargs):
   """Store the pooling configuration.

   `pool_size`, `strides`, `dilation` and a non-string `pad` are
   normalised with `as_tuple(..., t=int)`; `strides` defaults to
   `pool_size` and `dilation` to `(1,)` when None. A string `pad` and
   `mode` are upper-cased. Remaining keyword arguments go to the parent
   constructor.
   """
   super(Pool, self).__init__(**kwargs)
   self.pool_size = as_tuple(pool_size, t=int)
   if strides is None:
     self.strides = self.pool_size
   else:
     self.strides = as_tuple(strides, t=int)
   if dilation is None:
     self.dilation = (1,)
   else:
     self.dilation = as_tuple(dilation, t=int)
   if is_string(pad):
     self.pad = pad.upper()
   else:
     self.pad = as_tuple(pad, t=int)
   self.mode = mode.upper()
   self.transpose_mode = transpose_mode
Exemple #19
 def __init__(self, task_name, output_name,
              lower_better=True, improvement_margin=0, logging=True):
   """Store the monitored task/output and the improvement settings.

   `output_name` may be a string or an object exposing `.name`.
   `improvement_margin` is converted to float and must be >= 0.
   """
   super(CheckpointGeneralization, self).__init__(logging=logging)
   self._task_name = str(task_name)
   if is_string(output_name):
     self._output_name = output_name
   else:
     self._output_name = output_name.name
   self._lower_better = bool(lower_better)
   margin = float(improvement_margin)
   self._improvement_margin = margin
   assert self._improvement_margin >= 0
   # no score recorded yet
   self._best_score = None
Exemple #20
def _check_label_mode(mode):
  """Normalise a `label_mode` argument.

  A number is clipped to [0, 1] and returned as float; a string is
  lower-cased, with 'mid' accepted as an alias of 'middle', and must be
  one of 'common', 'last', 'first', 'middle'.

  Raises ValueError for an unknown string or an unsupported type.
  """
  if is_number(mode):
    return np.clip(float(mode), 0., 1.)
  if not is_string(mode):
    raise ValueError("No support for `label_mode`=%s" % str(mode))
  normalized = mode.lower()
  if normalized == 'mid':
    normalized = 'middle'
  if normalized in ('common', 'last', 'first', 'middle'):
    return normalized
  raise ValueError(
      "`label_mode` can be: 'common', 'last', 'first', 'middle'")
Exemple #21
 def get_loaded_param(self, name):
   """Fetch one or several parameters from the loaded parameter dataset.

   Parameters
   ----------
   name : str or sequence of str
     a single parameter name, or several names.

   Returns
   -------
   the parameter value when `name` is a single string, otherwise a tuple
   of values in the order of `name`.

   Raises
   ------
   RuntimeError
     if any requested name is not found in the dataset.
   """
   ds = self.__class__.load_parameters()
   # remember the call form before `name` is normalised to a tuple
   # (the original unconditionally reset this flag to False)
   return_1_param = is_string(name)
   name = as_tuple(name, t=str)
   if any(n not in ds for n in name):
     raise RuntimeError("Cannot find parameter with name:'%s' from loaded "
         "dataset at path: '%s'" % (name, ds.path))
   params = [ds[n][:] for n in name]
   return params[0] if return_1_param else tuple(params)
Exemple #22
def _check_dtype(dtype):
  if hasattr(dtype, '__call__'):
    return dtype
  # ====== check dtype ====== #
  if dtype is None:
    dtype = K.floatX
  elif isinstance(dtype, np.dtype) or is_string(dtype):
    dtype = str(dtype)
  elif isinstance(dtype, VariableDesc):
    dtype = dtype.dtype
  elif isinstance(dtype, tf.DType):
    dtype = dtype.base_dtype.name
  return dtype
Exemple #23
Exemple #24
Exemple #25
 def __init__(self, **kwargs):
   """Validate `get_input_info()` and register one VariableDesc per input.

   Raises ValueError unless `get_input_info()` returns a non-empty
   Mapping of string names to values accepted by
   `_validate_shape_dtype`.
   """
   super(Model, self).__init__(**kwargs)
   input_info = self.get_input_info()
   well_formed = (isinstance(input_info, Mapping) and
                  len(input_info) != 0 and
                  all(is_string(k) and _validate_shape_dtype(v)
                      for k, v in input_info.items()))
   if not well_formed:
     raise ValueError("`get_input_info` must return a (length > 0) Mapping "
         "of: 'input-name' -> (shape-tuple, dtype-string), but the "
         "returned value is: %s" % str(input_info))
   # ====== init kwargs_desc ====== #
   for name, spec in input_info.items():
     shape, dtype = spec
     self._kwargs_desc[name] = VariableDesc(
         shape=shape, name=name, dtype=dtype)
Exemple #26
def _validate_shape_dtype(x):
  if not isinstance(x, tuple):
    return False
  if not len(x) == 2:
    return False
  shape, dtype = x
  # check shape
  if not isinstance(shape, tuple) and \
  all(is_number(i) or isinstance(i, type(None)) for i in x):
    return False
  # check dtype
  if not is_string(dtype):
    return False
  return True
Exemple #27
 def add_indices(self, indices, name, override=False):
     """Store a named set of indices on disk and register it in this object.

     Parameters
     ----------
     indices : Mapping, tuple, list or numpy.ndarray
         either a mapping `entry -> (start, end)`, or a sequence whose
         items are an entry name (resolved through `self['indices']`),
         `(entry, (start, end))` or `(entry, start, end)`.
     name : str
         identifier of the index, also used as file name under
         `self.index_path`.
     override : bool
         allow replacing an index already registered under `name`.

     Returns
     -------
     self

     Raises
     ------
     ValueError
         if `name` is not a string, if `name` is already registered and
         `override` is False, or if an item of `indices` cannot be parsed.
     """
     # ====== validate name ====== #
     if not is_string(name):
         raise ValueError("`name` must be string, but given: %s" %
                          str(type(name)))
     if name in self._saved_indices and not override:
         raise ValueError(
             "Cannot override pre-defined INDEX with name: '%s'" % name)
     # ====== validate indices ====== #
     path = os.path.join(self.index_path, name)
     ids = MmapDict(path)
     # NOTE: the loop variables are deliberately NOT called `name`,
     # otherwise the index would be registered below under the last
     # entry's name.
     # predefined mapping, save or copy everything to a MmapDict
     if isinstance(indices, Mapping):
         for entry, (start, end) in indices.items():
             ids[entry] = (start, end)
     # list of name, or (name, (start, end))
     elif isinstance(indices, (tuple, list, np.ndarray)):
         for i in indices:
             if is_string(i):  # only name
                 ids[i] = self['indices'][i]
             elif len(i) == 2:  # name, (start, end)
                 entry, (start, end) = i
                 ids[entry] = (int(start), int(end))
             elif len(i) == 3:  # name, start, end
                 entry, start, end = i
                 ids[entry] = (int(start), int(end))
             else:
                 raise ValueError(
                     "Unsupport index parsing (name, start, end)"
                     "for: %s" % str(i))
     # NOTE(review): the original comment here says "flush everything to
     # disk" but no flush/close call is visible -- confirm whether `ids`
     # must be flushed explicitly before it is re-opened read-only.
     # ====== assign new index ====== #
     self._saved_indices[name] = MmapDict(path, read_only=True)
     return self
Exemple #28
 def __new__(cls, *args, **kwargs):
     """Return the Dataset registered for the given path, creating and
     registering a new instance when none exists yet.

     The path is taken from the `path` keyword, or else from the first
     positional argument, and is made absolute before the lookup.

     Raises ValueError when the path is not a string.
     """
     location = kwargs.get('path', None)
     if location is None:
         location = args[0]
     if not is_string(location):
         raise ValueError("`path` for Dataset must be string, but given "
                          "object with type: %s" % type(location))
     location = os.path.abspath(location)
     # one instance per absolute path
     if location not in Dataset.__INSTANCES:
         created = super(Dataset, cls).__new__(cls)
         Dataset.__INSTANCES[location] = created
         return created
     return Dataset.__INSTANCES[location]
Exemple #29
Exemple #30
 def __init__(self, vad, frame_length, padding=None):
     """Build the `name -> segments` mapping from one of several forms.

     `vad` may be a dict (used as is), a 2-item list/tuple
     `(indices, data)` where `indices` is an iterable of
     `(name, start, end)` or a path to a space-delimited text file of
     them, or any other list/tuple of `(name, segments)` pairs.

     Raises ValueError for any other type of `vad`.
     """
     super(VADindex, self).__init__()
     if isinstance(vad, (list, tuple)):
         if len(vad) != 2:
             # a list contain all information is given
             table = {}
             for name, segments in vad:
                 table[name] = segments
             vad = table
         else:
             indices, data = vad
             if is_string(indices) and os.path.exists(indices):
                 indices = np.genfromtxt(indices, dtype=str, delimiter=' ')
             table = {}
             for name, start, end in indices:
                 table[name] = data[int(start): int(end)]
             vad = table
     elif not isinstance(vad, dict):
         raise ValueError('Unsupport "vad" type: %s' % type(vad).__name__)
     self.vad = vad
     self.padding = padding
     self.frame_length = frame_length
Exemple #31
 def process(self, name, X, **kwargs):
   """Run `(name, X)` through every recipe in `self._recipes`, in order.

   Each recipe's `process(name, X)` must return None or a 2-item
   tuple/list `(name, [x1, x2, ...])`. The first None stops the chain and
   None is returned; otherwise the output of the last recipe is returned.

   Raises ValueError when a recipe returns a malformed value.
   """
   for recipe in self._recipes:
     outcome = recipe.process(name, X)
     # a recipe signals "drop this item" by returning None
     if outcome is None:
       return None
     well_formed = (isinstance(outcome, (tuple, list)) and
                    len(outcome) == 2 and
                    is_string(outcome[0]) and
                    isinstance(outcome[1], (tuple, list)))
     if not well_formed:
       raise ValueError("The returned from `process` must be tuple or "
           "list, of length 2 which contains (name, [x1, x2, x3,...])."
           "`name` must string type, and [x1, x2, ...] is tuple or list.")
     name, X = outcome
   return name, X
Exemple #32
def copy_dataset2(origin, destination,
                  indices_filter=None, data_filter=None,
  # Copy `origin` to `destination` through `origin.copy(...)` and return
  # the result.
  #
  # NOTE(review): this block is incomplete in this view -- the end of the
  # signature, the remaining arguments of `origin.copy(` and the body of
  # `if own_ds:` are missing. How `indices_filter` / `data_filter` are
  # used cannot be determined from here.
  # ====== prepare input ====== #
  if is_string(origin):
    # a path was given: open it read-only and remember we opened it
    origin = Dataset(origin, read_only=True)
    own_ds = True
  elif isinstance(origin, Dataset):
    own_ds = False
  # NOTE(review): any other type leaves `own_ds` undefined.
  # ====== pass ====== #
  ds = origin.copy(destination,
  # ====== end and return ====== #
  # NOTE(review): presumably the dataset opened above is released here.
  if own_ds:
  return ds
Exemple #33
  def __init__(self, task_name, output_name, threshold, patience=1,
               get_value=lambda x: np.mean(x),
    # Store the monitored task/output, the stopping threshold and patience,
    # and the function used to reduce recorded values.
    #
    # NOTE(review): the end of the signature is missing from this view; the
    # body reads `logging`, so it is presumably a further parameter --
    # confirm its default against the full source.
    super(EarlyStop, self).__init__(logging=logging)
    self._task_name = str(task_name)
    # `output_name` may be a string or an object exposing `.name`
    self._output_name = output_name if is_string(output_name) \
        else output_name.name

    self._threshold = float(threshold)
    self._patience = int(patience)

    # None means "use the value as is"; anything else must be callable
    if get_value is None:
      get_value = lambda x: x
    elif not hasattr(get_value, '__call__'):
      raise ValueError('get_value must call-able')
    self._get_value = get_value
    # ====== history ====== #
    self._history = []
Exemple #34
Exemple #35
Exemple #36
Exemple #37
 def __init__(self, task_name, output_name,
              fn_reduce=lambda x: (np.mean(x)
                                   if isinstance(x[0], Number) else
                                   sum(i for i in x)),
              print_plot=False, save_path=None,
              repeat_freq=1, logging=True):
   """Store which tasks/outputs are summarised and how.

   `task_name` is normalised to a tuple of str. `output_name` may be a
   single item or a tuple/list/set of items, each a string or an object
   exposing `.name`. `repeat_freq` must be >= 1.
   """
   super(EpochSummary, self).__init__(logging=logging)
   self._task_name = as_tuple(task_name, t=str)
   # ====== scheduling ====== #
   assert repeat_freq >= 1
   self._repeat_freq = int(repeat_freq)
   self._count = self._repeat_freq * len(self._task_name)
   self._epoch_results = defaultdict(dict)
   # ====== output identity ====== #
   if isinstance(output_name, (tuple, list, set)):
     outputs = output_name
   else:
     outputs = (output_name,)
   names = []
   for item in outputs:
     names.append(item if is_string(item) else item.name)
   self.output_name = tuple(names)
   self.fn_reduce = FuncDesc(func=fn_reduce)
   # ====== how to output ====== #
   self.print_plot = bool(print_plot)
   self.save_path = save_path
Exemple #38
 def _initialize(self):
     """Normalise the layer hyper-parameters against the input rank.

     `self.ndim` is the number of dimensions of `self.input_shape` minus
     2. `pad`, `strides`, `dilation` and `filter_size` are expanded to
     `ndim`-length int tuples (a string `pad` is upper-cased instead);
     None defaults to zeros for `pad` and ones for `strides`/`dilation`.

     NOTE(review): the remainder of this method (creation of the weights
     and use of `biases_shape`) is not visible in this source.
     """
     # ====== validate init arguments ====== #
     self.ndim = len(self.input_shape) - 2
     # padding
     if isinstance(self.pad, (tuple, list, int)):
         self.pad = as_tuple(self.pad, self.ndim, int)
     elif self.pad is None:
         self.pad = (0, ) * self.ndim
     elif is_string(self.pad):
         self.pad = self.pad.upper()
     # strides
     if self.strides is None:
         self.strides = (1, ) * self.ndim
     else:
         self.strides = as_tuple(self.strides, self.ndim, int)
     # dilation
     if self.dilation is None:
         self.dilation = (1, ) * self.ndim
     else:
         self.dilation = as_tuple(self.dilation, self.ndim, int)
     # filter size
     self.filter_size = as_tuple(self.filter_size, self.ndim, int)
     # ====== create config ====== #
     # weights
     if self.b_init is not None:
         if self.untie_biases:
             # one bias per output position
             biases_shape = self.output_shape[1:]
         else:
             # one bias per filter
             biases_shape = (self.num_filters, )
Exemple #39
  def shape_transform(self, shapes):
    """Propagate `shapes` through `shape_transform` of every recipe.

    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Returns
    -------
    new shape that transformed by this Recipe
    new indices

    Raises
    ------
    RuntimeError
        if a recipe returns something that is not a list of
        `(shape, indices)` pairs in the expected layout.
    """
    for i in self._recipes:
      shapes = i.shape_transform(shapes)
      # ====== check returned ====== #
      # only the first entry of each `indices` list is inspected
      if not all((isinstance(shp, (tuple, list)) and
                  all(is_number(s) for s in shp) and
                  is_string(ids[0][0]) and is_number(ids[0][1]))
                 for shp, ids in shapes):
        raise RuntimeError("Returned `shapes` must be the list of pair "
                           "`(shape, indices)`, where `indices` is the "
                           "list of (name, length(int)).")
    return shapes
Exemple #40
Exemple #41
def train(X, y_true, y_pred, train_data,
          valid_data=None, valid_freq=1.,
          patience=3, threshold=5, rollback=True,
          metrics=[0], training_metrics=[],
          l1_regu=0., l2_regu=0., parameters=[],
          prior_weights=None, sample_weights=None,
          batch_size=256, epochs=8, shuffle=True,
          optimizer='rmsprop', optz_kwargs={'lr': 0.001}, updates=None,
          init_vars=True, labels=None, seed=5218, verbose=2):
  # Build training / scoring / prediction functions for the given tensors,
  # configure a MainLoop with the tasks and callbacks, and return the
  # prediction function `f_pred`.
  #
  # NOTE(review): this block is incomplete in this view. The docstring
  # delimiters are missing (the lines below up to "Function used for
  # prediction" are its remains) and many statements are truncated or
  # absent (unfinished calls, missing `else:` lines and loop bodies).
  # The comments added below describe only what is visible.
  # NOTE(review): `objectives` is read below but is not in the visible
  # signature; several defaults are mutable objects ([0], [], {...}).

  rollback : bool (default: True)
    if True, allow rollback to the best checkpoint during training
  objectives : {callable, tensorflow.Tensor}
    if `callable`, the function must take `y_true`, and `y_pred`
    The objectives must be differentiable and used for training.
  metrics : {callable, tensorflow.Tensor, int}
    if `callable`, the function must take `y_true`, and `y_pred`
    The `metrics` is for monitoring the training process.
    if `int`, it is the index of the loss in `objectives`
    NOTE: the first metrics in the list will be used for
    early-stopping (smaller is better).
  training_metrics : {callable, tensorflow.Tensor, int}
    if `int`, it is the index of the loss in `metrics`
  parameters : {list or tensorflow.Variables}
    All the parameters will be updated by the `optimizer`, if None
    or empty list is given, use ComputationalGraph to get
    all variables with Parameters roles related to the objectives
  init_vars : bool (default: True)
    automatically initialize all variables
  labels : {None, list of string}
    Given labels for classification task
  seed : int
    specific random seed for reproducible
  verbose : int
    0 - Turn off all log
    1 - only show notification
    2 - show notification, important log and summary
    3 - Show progress, summary, notification and logging
    4 - Show debug information and everything

  Function used for prediction
  from odin import backend as K
  # ====== preprocess inputs ====== #
  X = as_tuple(X, t=K.is_tensor)
  y_true = as_tuple(y_true, t=K.is_tensor)
  y_pred = as_tuple(y_pred, t=K.is_tensor)
  # ====== parsing objectives and metrics ====== #
  # for training
  # NOTE(review): the call below is truncated.
  prior_weights = _preprocess_prior_weights(y_true=y_true,
  if prior_weights is not None:
    if sample_weights is not None:
      sample_weights = sample_weights + prior_weights
      # NOTE(review): an `else:` line appears to be missing here; as
      # written the sum above is immediately overwritten.
      sample_weights = prior_weights
  # NOTE(review): the call below is truncated.
  objectives = _preprocessing_losses(as_tuple(objectives), y_true, y_pred,
  # metrics for monitoring
  metrics = as_tuple(metrics)
  get_value = lambda x: np.mean(x)
  # accuracy is turned into an error (1 - mean) so that smaller is better
  if len(metrics) > 0 and \
  (metrics[0] == tf.metrics.accuracy or
   metrics[0] == K.metrics.categorical_accuracy):
    get_value = lambda x: 1 - np.mean(x)
  # NOTE(review): the call below is truncated.
  metrics = _preprocessing_losses(metrics, y_true, y_pred,
  # training_metrics
  # NOTE(review): the call below is truncated.
  training_metrics = _preprocessing_losses(as_tuple(training_metrics),
                                           y_true, y_pred,
  # sum the objectives for differentiable
  if len(objectives) > 0:
    objectives = [sum(objectives) if len(objectives) > 1 else objectives[0]]
  # ====== preprocess optimizer and get updates====== #
  if updates is None: # not given updates
    if is_string(optimizer):
      optimizer = _parse_optimizer(optimizer)
      optimizer = optimizer(**optz_kwargs)
    elif not isinstance(optimizer, K.optimizers.Optimizer):
      raise ValueError("`optimizer` must be string - name of algorithm or instance "
                       "of odin.backend.optimizers.Optimizer")
    # collect parameters from the graph only when none were given
    parameters = K.ComputationGraph(objectives).parameters\
    if len(parameters) == 0 else as_tuple(parameters, t=K.is_variable)
    # check objectives
    if len(objectives) == 0:
      raise RuntimeError("`objectives` must be given due to `updates=None`")
    weights = [p for p in parameters if K.role.has_roles(p, roles=K.role.Weight)]
    # l1 regularization
    # NOTE(review): the norm is added unscaled; `l1_regu` only acts as an
    # on/off switch in the visible code (same for `l2_regu` below).
    if l1_regu > 0.:
      l1_norm = sum(tf.norm(w, ord=1) for w in weights)
      objectives[0] += l1_norm
    # l2 regularization
    if l2_regu > 0.:
      l2_norm = sum(tf.norm(w, ord=2) for w in weights)
      objectives[0] += l2_norm
    # update rules
    updates = optimizer.get_updates(objectives[0], parameters)
    # adding global norm and learning rate
  elif K.is_operation(updates): # given updates
    optimizer = None
    # NOTE(review): an `else:` line appears to be missing before this
    # raise; as written it fires whenever `updates` is an operation.
    raise ValueError("`updates` can be None or tensorflow Operation, but given "
      "type: %s" % str(type(updates)))
  # ====== placeholders ====== #
  inputs_plh = []
  for plh in X:
    # NOTE(review): the body of this inner loop is missing.
    for i in (K.ComputationGraph(plh).placeholders
              if not K.is_placeholder(plh)
              else as_tuple(plh)):
  outputs_plh = []
  for plh in y_true: # no duplicated inputs (e.g. autoencoder X == y)
    if not K.is_placeholder(plh):
      plh = K.ComputationGraph(plh).placeholders
    for i in as_tuple(plh):
      # NOTE(review): the body of this check is missing.
      if i not in inputs_plh:
  inputs = inputs_plh + outputs_plh
  # ====== initialize variables ====== #
  # NOTE(review): the body of this check is missing.
  if bool(init_vars):
  # ====== creating function ====== #
  # training function
  f_train = K.function(inputs=inputs,
                       outputs=objectives + training_metrics,
                       updates=updates, training=True)
  # scoring function
  f_score = None
  if len(metrics) > 0:
    # NOTE(review): the call below is truncated.
    f_score = K.function(inputs=inputs, outputs=metrics,
  # prediction function
  # NOTE(review): the call below is truncated.
  f_pred = K.function(inputs=inputs_plh,
                      outputs=y_pred[0] if len(y_pred) == 1 else y_pred,
  # ====== preprocessing data ====== #
  train_data, valid_data = _preprocessing_data(train_data, valid_data)
  # print some debug information if necessary
  if verbose >= 4:
    print("%s %s %s" % (
        ctext("============", 'cyan'),
        ctext("Prepare for Training", 'red'),
        ctext("============", 'cyan')))
    print(ctext("Input placeholders:", 'yellow'))
    for i in inputs_plh:
      print(" * ", str(i))
    print(ctext("Output placeholders:", 'yellow'))
    for i in outputs_plh:
      print(" * ", str(i))
    print(ctext("Parameters:", 'yellow'))
    for p in parameters:
      print(" * ", p.name, '-', p.shape, ';', p.dtype.name)
    print(ctext("Optimizer:", 'yellow'))
    print(" * ", str(optimizer))
    print(" * Optimizer kwargs:", optz_kwargs)
    print(" * L1:", l1_regu)
    print(" * L2:", l2_regu)
    print(ctext("Training:", 'yellow'))
    print(" * Valid freq:", valid_freq)
    print(" * Patience:", patience)
    print(" * Threshold:", threshold)
    print(" * Rollback:", rollback)
    print(" * Batch size:", batch_size)
    print(" * Epoch:", epochs)
    print(" * Shuffle:", shuffle)
    print(" * Seed:", seed)
    print(ctext("Objectives:", 'yellow'))
    for o in objectives:
      print(" * ", str(o))
    print(ctext("Weights:", 'yellow'))
    print(" * Prior:", str(prior_weights))
    print(" * Sample:", str(sample_weights))
    print(ctext("Metrics:", 'yellow'))
    for m in metrics:
      print(" * ", str(m))
    print(ctext("Training metrics:", 'yellow'))
    for t in training_metrics:
      print(" * ", str(t))
    print(ctext("Training Data:", 'yellow'), str(train_data))
    print(ctext("Validating Data:", 'yellow'), str(valid_data))
    print(ctext("Labels:", 'yellow'), labels)
  # ====== create trainer ====== #
  callback_log = True if verbose > 0 else False
  # shuffling is disabled entirely (no seed, level 0) when `shuffle` is False
  trainer = MainLoop(batch_size=batch_size,
                     seed=seed if shuffle else None,
                     shuffle_level=2 if shuffle else 0,
                     verbose=verbose, labels=labels)
  # NOTE(review): the call below is truncated.
  trainer.set_checkpoint(path=None, obj=None,
  # create callback
  callbacks = [NaNDetector(patience=patience, log=callback_log)]
  if valid_data is not None and f_score is not None:
        # NOTE(review): the statement wrapping this expression (presumably
        # an append to `callbacks`) is missing.
        EarlyStopGeneralizationLoss(task_name='valid', output_name=metrics[0],
                                    threshold=threshold, patience=patience,
                                    log=callback_log, get_value=get_value))
  # set the tasks
  trainer.set_train_task(func=f_train, data=train_data,
                         epoch=epochs, name='train')
  if valid_data is not None and f_score is not None:
    # NOTE(review): the call below is truncated.
    trainer.set_valid_task(func=f_score, data=valid_data,
  # running
  # NOTE(review): no call that starts the trainer is visible here.
  return f_pred
Exemple #42
 def __init__(self, nb_classes, l1=0., l2=0.,
              fit_intercept=True, confusion_matrix=True,
              tol=1e-4, patience=3, rollback=True,
              batch_size=1024, max_epoch=100, max_iter=None,
              optimizer='adadelta', learning_rate=1.0, class_weight=None,
              dtype='float32', seed=5218,
              verbose=False, path=None, name=None):
   # Validate and store the hyper-parameters of the classifier.
   #
   # nb_classes: either a tuple/list/ndarray of labels (each one is
   #     stringified, and the class count is the sequence length) or a
   #     number n (labels become "0" .. "n-1").
   # class_weight: None (all ones), a single number (broadcast to every
   #     class), or any other object, which is stored unchanged.
   # optimizer: string key into `_optimizer_list` (compared lower-cased); the
   #     selected entry is called with lr=float(learning_rate).
   # l1, l2, tol are cast to float; batch_size, patience, seed, verbose to
   #     int; fit_intercept, confusion_matrix, rollback to bool.
   # max_epoch, max_iter, path are stored without conversion.
   # Raises ValueError when `optimizer` is not a string or is not a key of
   #     `_optimizer_list`.
   super(LogisticRegression, self).__init__()
   # ====== basic dimensions ====== #
   # NOTE(review): there is no fallback branch, so an `nb_classes` that is
   # neither a sequence nor a number leaves `_labels` and `_nb_classes` unset.
   if isinstance(nb_classes, (tuple, list, np.ndarray)):
     self._labels = tuple([str(i) for i in nb_classes])
     self._nb_classes = len(nb_classes)
   elif is_number(nb_classes):
     self._labels = tuple([str(i) for i in range(nb_classes)])
     self._nb_classes = int(nb_classes)
   # feature dimension is not known at construction time
   self._feat_dim = None
   self._dtype = np.dtype(dtype)
   # ====== preprocessing class weight ====== #
   # `self.nb_classes` and `self.dtype` are not assigned in this method;
   # presumably they are properties over `_nb_classes` / `_dtype` -- TODO confirm.
   if class_weight is None:
     # NOTE(review): this call is not closed in this listing; a trailing
     # argument line (probably the dtype) appears to be missing.
     class_weight = np.ones(shape=(self.nb_classes,),
   elif is_number(class_weight):
     # a scalar weight is expanded to one identical weight per class
     class_weight = np.zeros(shape=(self.nb_classes,),
                             dtype=self.dtype) + class_weight
   self._class_weight = class_weight
   # ====== flags ====== #
   self.l1 = float(l1)
   self.l2 = float(l2)
   self.fit_intercept = bool(fit_intercept)
   self.confusion_matrix = bool(confusion_matrix)
   # ====== internal states ====== #
   self._is_fitted = False
   # ====== others ====== #
   if name is None:
     name = uuid(length=8)
     self._name = 'LogisticRegression_%s' % name
     # NOTE(review): an `else:` line seems to be missing here; as listed, the
     # assignment below overwrites the generated name.
     self._name = str(name)
   self._path = path
   # ====== training ====== #
   self.batch_size = int(batch_size)
   self.max_epoch = max_epoch
   self.max_iter = max_iter
   # the optimizer is selected by name, so only strings are accepted
   if not is_string(optimizer):
     raise ValueError("`optimizer` must be one of the following")
   optimizer = optimizer.lower()
   if optimizer not in _optimizer_list:
     # NOTE(review): the format argument of this message is cut off in this
     # listing.
     raise ValueError("`optimizer` must be one of the following: %s" %
   self._optimizer = _optimizer_list[optimizer.lower()](lr=float(learning_rate))
   self._optimizer_name = optimizer
   self._optimizer_lr = learning_rate
   # ====== stop training ====== #
   self.tol = float(tol)
   self.patience = int(patience)
   self.rollback = bool(rollback)
   # ====== others ====== #
   self._train_history = []
   self._valid_history = []
   # private random generator seeded from `seed`
   self._rand_state = np.random.RandomState(seed=int(seed))
   self.verbose = int(verbose)
Exemple #43
 # Convert documents into an array: padded token-index sequences for mode
 # 'seq', otherwise a (number of documents, self.nb_words) matrix filled
 # according to 'binary', 'count', 'freq' or 'tfidf'.
 # NOTE(review): this listing is incomplete -- the end of the signature
 # (`token_not_found` is used below but not declared here), the docstring
 # quotes and several `else:` lines / branch bodies are missing. Comments
 # marked NOTE(review) point at the gaps.
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
       'binary', abc
       'tfidf', abc
       'count', abc
       'freq', abc
       'seq', abc
   token_not_found: 'ignore', 'raise', a token string, an integer
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise"'
                      ', an integer of token index, or a string '
                      'represented a token.')
   # NOTE(review): as listed, the `elif` below can only be reached when
   # token_not_found is 'ignore' or 'raise', so the numeric branch looks
   # dead -- part of the first condition may have been lost; verify.
   if token_not_found not in ('ignore', 'raise'):
     token_not_found = int(self.dictionary[token_not_found])
   elif is_number(token_not_found):
     token_not_found = int(token_not_found)
   # ====== pick engine ====== #
   # NOTE(review): no fallback is visible; `processor` stays unbound when the
   # engine is neither 'spacy' nor 'odin'.
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   # a string `end_document` is converted with `dictionary.index`, a number
   # is used directly as an integer
   if isinstance(end_document, str):
     end_document = dictionary.index(end_document)
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
     # NOTE(review): an `else:` seems to be missing here; 1208 looks like an
     # initial guess of the progress target for inputs without a length.
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       # -1 marks a token that is absent from the dictionary
       idx = dictionary.get(x, -1)
       if idx >= 0: vec.append(idx)
       # not found the token in dictionary
       # NOTE(review): the bodies of the 'ignore' and integer branches are
       # missing in this listing.
       elif token_not_found == 'ignore':
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
     # append ending document token
     # NOTE(review): the body of this `if`, and the statement that stores
     # `vec` into `results`, are missing in this listing.
     if end_document is not None:
     # add the final results
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       # unknown total: once 80% of the current target has been seen, the
       # target is enlarged by 20%
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     # NOTE(review): the end of this call is cut off in this listing.
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
   # transform into one-hot matrix
   # NOTE(review): an `else:` line appears to be missing before the block
   # below.
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         # mark every token index that occurs in the document
         X[i, seq] = 1
       elif mode == 'freq':
         # occurrence count divided by the document length
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           # last field of the per-token info tuple, 0 for unknown tokens;
           # presumably the number of documents containing the token
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
Exemple #44
# Evaluate `x`, which may be a string (evaluated with `builtins.eval`), a
# tensor/operation (run in the session of its graph), or a tuple, list or
# mapping of those; `update_before` / `update_after` are run in a session
# before and after the evaluation.
# NOTE(security): strings go straight to `builtins.eval`; never pass
# untrusted text.
# NOTE(review): this listing is incomplete -- the docstring is not closed,
# the session `.run(` calls are cut off and some loop/branch bodies are
# missing. Comments marked NOTE(review) point at the gaps.
def eval(x, feed_dict=None,
         update_before=None, update_after=None,
         options=None, run_metadata=None):
  ''' Generalized version of code evaluation, it
  could evaluate python and tensorflow expression.

  x : list, tuple, dictionary, `Tensor`
      tensorfow `Tensor` for evaluation
  feed_dict : dict
      Input dictionary, mapping placeholder -> values
  update_before: {None, list, or dict}
      mapping from `Tensor` to its new value which is `Tensor` or
      real value, the updates is runned before evaluating
  update_after: {None, list, or dict}
      same as `updates_before`, but run the `updates` after
      evaluate `x`
  options: tensorflow.RunOptions
      thhe options allow controlling the behavior of
      this particular step (e.g. turning tracing on).
  run_metadata: tensorflow.RunMetadata
      When appropriate, the non-Tensor output of this
      step will be collected there. For example,
      when users turn on tracing in options, the
      profiled info will be collected into
      this argument and passed back.

  >>> import tensorflow as tf
  >>> from odin import backend as K
  >>> run_metadata = tf.RunMetadata()
  >>> K.eval(...,
  ...        options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
  ...                              output_partition_graphs=True),
  ...        run_metadata=run_metadata)
  >>> with open('log_path', 'w') as f:
  >>>   f.write(str(run_metadata))

  If "Couldn't open CUDA library libcupti.so.8.0" appears when you
  adding RunOptions, try adding "/usr/local/cuda/extras/CUPTI/lib64/"
  # NOTE(review): the end of the docstring (including its closing quotes) is
  # missing in this listing.
  results = ()
  update_before = _validate_updates(update_before)
  update_after = _validate_updates(update_after)
  # ====== run updates before ====== #
  if update_before is not None:
    # NOTE(review): the remaining arguments of this call are cut off.
    get_session(update_before.graph).run(update_before, feed_dict=feed_dict,
  # ====== list of Tensor or string ====== #
  if isinstance(x, (tuple, list)):
    string_eval = []
    tensor_eval = []
    tensor_idx = []
    # evaluate string expression
    # NOTE(review): the loop body that fills `string_eval`, `tensor_eval`
    # and `tensor_idx` is missing in this listing.
    for i, j in enumerate(x):
      if is_string(j):
    # evaluate tensor
    if len(tensor_eval) > 0:
      # all tensors must belong to one graph so a single session can run them
      graph = [i.graph for i in tensor_eval]
      if len(set(graph)) > 1:
        raise RuntimeError("Cannot evaluate multiple `Tensor` come from "
                           "different `Graph`.")
      # NOTE(review): the remaining arguments of this call are cut off.
      tensor_eval = get_session(graph[0]).run(tensor_eval,
    # put the values back in the order of `x`: positions listed in
    # `tensor_idx` take the next tensor value, the others the next string value
    results = tuple([tensor_eval.pop(0) if i in tensor_idx else string_eval.pop(0)
                     for i in range(len(x))])
  # ====== mapping ====== #
  elif isinstance(x, Mapping):
    results = {}
    tensor_eval_key = []
    tensor_eval_value = []
    # NOTE(review): the branch that collects non-string values into
    # `tensor_eval_key` / `tensor_eval_value` is missing in this listing.
    for k, v in x.items():
      if is_string(v):
        results[k] = builtins.eval(v)
    # evaluate tensor
    # NOTE(review): `tensor_eval` is not defined anywhere in this branch as
    # listed; `tensor_eval_value` is probably meant -- verify.
    if len(tensor_eval) > 0:
      graph = [i.graph for i in tensor_eval_value]
      if len(set(graph)) > 1:
        raise RuntimeError("Cannot evaluate multiple `Tensor` come from "
                           "different `Graph`.")
      # NOTE(review): the remaining arguments of this call are cut off.
      tensor_eval_value = get_session(graph[0]).run(tensor_eval_value,
    # update results
    for k, v in zip(tensor_eval_key, tensor_eval_value):
      results[k] = v
  # ====== just a string ====== #
  elif is_string(x):
    results = builtins.eval(x)
  # ====== just a Tensorflow object ====== #
  elif isinstance(x, tf.Operation) or \
  is_tensor(x, inc_distribution=True, inc_variable=True):
    # NOTE(review): the remaining arguments of this call are cut off.
    results = get_session(x.graph).run(x, feed_dict=feed_dict,
  # ====== exception ====== #
  # NOTE(review): an `else:` line appears to be missing before this raise.
    raise RuntimeError("Cannot evaluate object of type: %s" % type(x))
  # ====== run updates after ====== #
  if update_after is not None:
    # NOTE(review): the remaining arguments of this call are cut off.
    get_session(update_after.graph).run(update_after, feed_dict=feed_dict,
  return results
Exemple #45
Exemple #46
# Fit one MiniBatchPCA per selected feature of a dataset (jobs are handed to
# `MPI`) and pickle each model into the dataset folder as 'pca_<feature>'.
# `dataset` may be a directory path, a `Dataset`, or a `FeatureProcessor`.
# NOTE(review): this listing is incomplete -- the docstring is not closed,
# several calls are cut off and some `else:` lines / bodies are missing.
# Comments marked NOTE(review) point at the gaps.
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

    # NOTE(review): the end of the docstring (including its closing quotes)
    # is missing in this listing.
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    # `own_dataset` is True when the Dataset is opened here and False when
    # the caller passed an existing Dataset
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
        # NOTE(review): an `else:` line seems to be missing before this
        # raise, and its format argument is cut off.
        raise ValueError("Cannot acquire Dataset from input: %s" %
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            # candidate features: 2-D data with more than one column
            # NOTE(review): the body of this `if`, and the `else:` that
            # should precede the list below, are missing in this listing.
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    # total number of rows over all selected features
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    # Generator job for one feature: yields the size of every processed batch
    # and finally the feature name once its model has been saved.
    def map_pca(name):
        X = dataset[name]
        # found exist pca model
        # NOTE(review): `feat` is not a local of `map_pca`; it resolves to the
        # variable of the `for feat in feat_name` loop above, while the rest
        # of this function uses `name` -- looks like a bug, verify.
        if 'pca_' + feat in dataset and not override:
            pca = dataset['pca_' + feat]
        # create new PCA
        # NOTE(review): an `else:` line is missing here and the constructor
        # call below is cut off.
            pca = MiniBatchPCA(n_components=None,
        # No shuffling make iter much faster
        # NOTE(review): the end of this `for` header and the statement that
        # feeds `x` to the PCA are missing in this listing.
        for x in X.set_batch(batch_size=batch_size, seed=None,
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    # NOTE(review): the remaining arguments of this call are cut off.
    mpi = MPI(jobs=feat_name,
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    # NOTE(review): the remaining arguments of this call are cut off.
    prog = Progbar(target=nb_samples,
    for n in mpi:
        # a string result is a finished feature name (see `map_pca`)
        # NOTE(review): the statements that move the feature between the two
        # lists and advance the progress bar are missing in this listing.
        if is_string(n):
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
    # ====== return ====== #
    # NOTE(review): the body of this `if` is missing in this listing.
    if own_dataset:
Exemple #47
# Sanity-check a feature dataset: index tables, dictionary entries, NaN /
# all-zero checks on data, statistics and PCA output, then plot random
# samples and the statistics into `path`.
# NOTE(review): this listing is incomplete -- the rest of the signature
# (`path`, `nb_samples`, `override`, `seed` and `fig_width` are used below
# but not declared here), several `else:` / `try:` lines and some bodies are
# missing. Comments marked NOTE(review) point at the gaps.
def validate_features(ds_or_processor,
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    # Print one coloured report line; `check` selects the colour and the
    # pass/fail mark.
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    # stays True when the Dataset is opened here, False when the caller
    # passed an existing Dataset
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
        # NOTE(review): an `else:` line seems to be missing before this raise.
        # The message reads `ds`, which is not assigned on that path;
        # `ds_or_processor` is probably meant -- verify.
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" % str(type(ds)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    # NOTE(review): the bodies of the branches below and the `else:` before
    # the raise are missing in this listing.
    if not os.path.exists(path):
    elif override:
        if os.path.isfile(path):
        raise ValueError("`path`=%s exists, cannot override." % path)
    # the current stdio path is saved before the log is redirected;
    # NOTE(review): restoring it is not visible in this listing.
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (included the features in external_indices
    # NOTE(review): nothing appends to `all_features` in this listing.
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    # NOTE(review): the end of this call is cut off.
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    # ====== checking indices ====== #
    # NOTE(review): the closing brace of this comprehension is cut off.
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    for ids_name in (k for k in all_keys if 'indices' in k):
        # entries of this index table, ordered by start position
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            # NOTE(review): this first assert checks that consecutive entries
            # are contiguous (previous end == next start), although its
            # message says "Zero length".
            assert prev[2] == now[1], "Zero length in indices"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        # NOTE(review): `now` is the variable left over from the loop above
        # (the entry with the largest start); it is undefined when the table
        # has fewer than two entries.
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
            # NOTE(review): an `else:` line appears to be missing before the
            # loop below.
            for feat_name in all_keys:
                # only MmapData entries that are neither covered by an
                # external index nor a statistic (sum1/sum2/mean/std suffix)
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
                # NOTE(review): an `else:` line seems to be missing; as
                # listed the default below replaces the 'sr' check.
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        # NOTE(review): the body that registers the statistic in `all_stats`
        # is missing in this listing.
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                # NOTE(review): the end of this call is cut off.
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                fail_test = True
            prog['Name'] = file_name
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                # load array-like statistics fully before testing them
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            # NOTE(review): this draws from the global numpy RNG before
            # `np.random.seed(seed)` is called further down, and the upper
            # bound `n - nb_samples - 1` must be positive for `randint`.
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproceducible
    # NOTE(review): the remaining arguments of this call are cut off.
    all_samples = np.random.choice(list(ds['indices'].keys()),
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            # NOTE(review): the `try:` block matching this `except` is
            # missing in this listing.
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        # only array-like statistics (ndim >= 1) are plotted
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    # NOTE(review): the body of this `if` is missing in this listing.
    if should_close_ds:
Exemple #48
  # Iterate over this object once, unshuffled, and store every returned
  # `numpy.ndarray` into a `Dataset` created at `path`; returns `self`.
  # Raises ValueError when `path` is not a string or points to a file.
  # NOTE(review): this listing is incomplete -- the docstring is not closed
  # and an `else:` line and several bodies are missing. Comments marked
  # NOTE(review) point at the gaps.
  def save_cache(self, path, name=None, dtype=None, batch_size=1024):
    """ Save all preprocessed data to a Dataset

    path: string
        path to a folder
    name: None, or list of string
        specific name for each returned `numpy.ndarray` during iteration
    dtype: None, or list of dtype, or single dtype
        specific dtype for all or each of returned `numpy.ndarray`
        during iteration
    batch_size: int
        amount of samples for each batch (higher the faster iteration)

    Only returned `numpy.ndarray` are saved
    # NOTE(review): the end of the docstring (including its closing quotes)
    # is missing in this listing.
    from odin.fuel.dataset import Dataset
    if not is_string(path):
      raise ValueError("`path` must be string path to a folder.")
    if os.path.exists(path) and os.path.isfile(path):
      raise ValueError("`path` is a file, required a folder for "
                       "saving all cache data.")
    # ====== start caching ====== #
    prog = Progbar(target=len(self),
                   name='Saving cache of preprocessed data',
                   print_report=True, print_summary=True)
    # the Dataset is opened with override=True
    ds = Dataset(path, override=True)
    with self.set_batch_context(batch_size=int(batch_size), seed=None,
                                start=0, end=-1, shuffle_level=0):
      for X in self:
        # a single returned object is handled like a 1-tuple
        if not isinstance(X, (tuple, list)):
          X = (X,)
        n = 0
        # `i` counts only the ndarray items, so it indexes `name` / `dtype`
        i = 0
        # saving preprocessed data
        for x in X:
          if isinstance(x, np.ndarray):
            # checking name
            if name is None:
              x_name = 'X%d' % i
              # NOTE(review): an `else:` line seems to be missing; as listed
              # the next line fails when `name` is None.
              x_name = name[i]
            # checking dtype
            if isinstance(dtype, (tuple, list)):
              x = x.astype(dtype[i])
            elif dtype is not None:
              x = x.astype(dtype)
            # saving to the dataset
            # NOTE(review): lines appear to be missing here (presumably the
            # append to an existing entry and an `else:`); verify.
            if x_name in ds:
              ds[(x_name, 'memmap')] = x
            # update samples count, and data count
            n = x.shape[0]
            i += 1
        # print progress
        # NOTE(review): the progress update is missing in this listing.
    # ====== flush and close everything ====== #
    # NOTE(review): the body of this `with` is missing in this listing.
    with open(os.path.join(path, 'README'), 'wb') as f:
    # end
    # ====== check one more time ====== #
    # reopen read-only to report the resulting size
    ds = Dataset(path, read_only=True)
    print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
    return self
Exemple #49
def _to_numpy_array(self, x):
  if not is_string(x[0]) and len(set(i.shape[1:] for i in x)) == 1:
    return np.concatenate(x, axis=0)
  return np.array(x)
Exemple #50
 def __getitem__(self, key):
   """Index the data; a string key is first translated into a slice.

   A string `key` is looked up in `self.indices` and the stored value is
   unpacked into `slice(...)`; any other key is forwarded unchanged to the
   parent class.
   """
   if not is_string(key):
     return super(IndexedData, self).__getitem__(key)
   bounds = self.indices[key]
   return super(IndexedData, self).__getitem__(slice(*bounds))
Exemple #51
 def _to_numpy_array(self, x):
     if not is_string(x[0]) and len(set(i.shape[1:] for i in x)) == 1:
         return np.concatenate(x, axis=0)
     return np.array(x)
Exemple #52
 # Run the external `SMILExtract` program on one audio input and return the
 # dictionary produced by `self._post_processing` from its CSV output.
 # `X` may be a mapping (keys 'path', 'sr', 'raw' are read), a path string,
 # or a raw `numpy.ndarray`.
 # Raises ValueError on a sample-rate mismatch or a non-dict
 # post-processing result, RuntimeError when no sample rate is known.
 # NOTE(review): this listing is incomplete -- `else:`, `try:` and
 # `finally:` lines, the statement that executes `command` and the cleanup
 # bodies are missing. Comments marked NOTE(review) point at the gaps.
 def _transform(self, X):
   # ====== file input file ====== #
   raw = None
   path = None
   if isinstance(X, Mapping):
     if 'path' in X:
       path = X['path']
     if 'sr' in X:
       # the first sample rate seen is adopted; later inputs must match it
       if self.sr is None:
         self.sr = X['sr']
         self._first_config_generated = True
       elif self.sr != X['sr']:
         raise ValueError("Given sample rate: %d, but the audio file has "
                          "sample rate: %d" % (self.sr, X['sr']))
     if 'raw' in X:
       raw = X['raw']
   elif is_string(X):
     path = X
   elif isinstance(X, np.ndarray):
     raw = X
     # NOTE(review): an `else:` line seems to be missing before this raise.
     raise ValueError("openSMILE extractor require path to audio file.")
   # no sample rate specified, cannot generate appropriate config
   if self.sr is None:
     raise RuntimeError("Cannot acquire sample rate for the input.")
   # ====== first time generate config ====== #
   # NOTE(review): the statement that actually generates the config is not
   # visible in this listing; only the flag is set.
   if not self._first_config_generated:
     self._first_config_generated = True
   # ====== extract SAD ====== #
   # temporary file names are made unique from the process id plus a random
   # offset
   # NOTE(review): `10e8` is a float literal (1e9); `random.randint` rejects
   # non-integer bounds on recent Python versions -- confirm the target
   # version or use an integer bound.
   unique_id = os.getpid() + random.randint(0, 10e8)
   inpath = os.path.join(
       get_logpath(), '%s%d.wav' % (self.__class__.__name__, unique_id))
   outpath = os.path.join(
       get_logpath(), '%s%d.csv' % (self.__class__.__name__, unique_id))
   # NOTE(review): the `try:` line matching the `except` below is missing.
     if path is None or not os.path.exists(path):
       if raw is None:
         raise RuntimeError("openSMILE require input audio file, since "
             "we cannot find any audio file, it is required to provide "
             "raw array and sample rate, so the audio file will be cached.")
       # no usable file on disk: write the raw samples to a temporary wav
       from soundfile import write
       write(inpath, data=raw, samplerate=self.sr)
       path = inpath
     # if in debug mode or not
     # NOTE(review): the statement that executes `command` is missing in
     # this listing.
     command = 'SMILExtract -loglevel %d -C %s -I %s -O %s' % \
         (self._log_level, self.config_path, path, outpath)
     results = np.genfromtxt(outpath, dtype='float32',
                             delimiter=',', skip_header=0)
   except Exception as e:
     import traceback; traceback.print_exc()
     raise e
   # NOTE(review): a `finally:` line and the bodies of the two checks below
   # (presumably removing the temporary files) are missing in this listing.
     if os.path.exists(inpath):
     if os.path.exists(outpath):
   # ====== post-processing ====== #
   X_update = self._post_processing(results)
   if not isinstance(X_update, dict):
     raise ValueError("_post_processing must return a dictionary.")
   return X_update