Example #1
 def add_indices(self, indices, name, override=False):
   # ====== validate name ====== #
   if not is_string(name):
     raise ValueError("`name` must be string, but given: %s" % str(type(name)))
   if name in self._saved_indices and not override:
     raise ValueError("Cannot override pre-defined INDEX with name: '%s'"
                     % name)
   # ====== validate indices ====== #
   path = os.path.join(self.index_path, name)
   ids = MmapDict(path)
   # predefined mapping, save or copy everything to a
   # MmapDict
   if isinstance(indices, Mapping):
     # use a separate loop variable so the outer `name` is not shadowed
     for key, (start, end) in indices.items():
       ids[key] = (start, end)
   # list of name, or (name, (start, end))
   elif isinstance(indices, (tuple, list, np.ndarray)):
     for i in indices:
       if is_string(i): # only name
         ids[i] = self['indices'][i]
       elif len(i) == 2: # name, (start, end)
         key, (start, end) = i
         ids[key] = (int(start), int(end))
       elif len(i) == 3: # name, start, end
         key, start, end = i
         ids[key] = (int(start), int(end))
       else:
         raise ValueError("Unsupported index format (name, start, end) "
                          "for: %s" % str(i))
   # flush everything to disk
   ids.flush(save_all=True)
   ids.close()
   # ====== assign new index ====== #
   self._saved_indices[name] = MmapDict(path, read_only=True)
   return self
Example #2
  def create_feeder(self, data, recipes, indices=None,
                    batch_filter=None, batch_mode='batch',
                    name=None, override=False):
    """
    Parameters
    ----------
    data: list of str
        list of names for all data used; the order of this
        list is the order of the returned data.
    recipes: list or single odin.fuel.FeederRecipe
        the list of recipes defining the rules for transforming
        the data
    indices: None, string, dict, list
        list of (name, (start, end)) for iterating over files in the Feeder
    batch_filter: callable
        a function that takes a list of np.ndarray as its first argument
        ([X] or [X, y]); return None to ignore the given batch, or return
        the data to accept the batch
    batch_mode: 'batch' or 'file' (string type)
        'batch' mode shuffles and returns everything in small batches
        'file' mode returns [(file_name, order_index, data...), ...]
    name: None, or string
        if a name is provided, the feeder information will be saved,
        which includes the `indices` and `recipes`

    Note
    ----
    By default, the Feeder is created using only 1 CPU with `buffer_size=1`;
    use the method `set_multiprocessing(ncpu=None, buffer_size=None,
    maximum_queue_size=None)` to change this.
    """
    from odin.fuel.feeder import Feeder, IndexedData
    # check data
    data = [self.__getitem__(dat) if is_string(dat) else
            as_data(dat)
            for dat in as_tuple(data)]
    # check recipes
    if is_string(recipes):
      recipes = self._saved_recipes[recipes]
    else:
      recipes = as_tuple(recipes, t=FeederRecipe)
    # check indices
    if indices is None:
      indices = self.__getitem__('indices')
    elif is_string(indices):
      indices = self._saved_indices[indices]
    elif isinstance(indices, (Mapping, tuple, list, np.ndarray)):
      pass
    # ====== saving recipes and indices, if name is not None ====== #
    if is_string(name):
      if name not in self._saved_indices or override:
        self.add_indices(indices, name, override=True)
      if name not in self._saved_recipes or override:
        self.add_recipes(recipes, name, override=True)
    # ====== create Feeder ====== #
    feeder = Feeder(IndexedData(data=data, indices=indices),
                    batch_filter=batch_filter, batch_mode=batch_mode,
                    ncpu=1, buffer_size=1)
    return feeder.set_recipes(recipes)
Example #3
 def _validate_texts(self, texts):
     if not isinstance(texts, Iterable) and \
     not isinstance(texts, Iterator) and \
     not is_string(texts):
         raise ValueError(
             'texts must be an iterator, generator or a string.')
     if is_string(texts):
         texts = (texts, )
     return texts
Example #4
 def _validate_texts(self, texts):
   """ Valiate the input to `fit` and `transform` """
   if not isinstance(texts, Iterable) and \
   not isinstance(texts, Iterator) and \
   not is_string(texts):
     raise ValueError('texts must be an iterator, generator or a string.')
   if is_string(texts):
     texts = (texts,)
   # convert to unicode
   texts = (t.decode('utf-8') for t in texts)
   return texts
Example #5
 def _validate_texts(self, texts):
     """ Valiate the input to `fit` and `transform` """
     if not isinstance(texts, Iterable) and \
     not isinstance(texts, Iterator) and \
     not is_string(texts):
         raise ValueError(
             'texts must be an iterator, generator or a string.')
     if is_string(texts):
         texts = (texts, )
     # convert to unicode
     texts = (t.decode('utf-8') for t in texts)
     return texts
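The three `_validate_texts` variants above differ only in whether they decode to unicode; the `t.decode('utf-8')` step assumes byte-string input (Python 2 style). A standalone sketch of the same normalization, with a plain `isinstance(..., str)` check standing in for the `is_string` helper (an assumption made for illustration only):

from collections.abc import Iterable, Iterator

def is_string(x):
  # stand-in for the odin helper used in the examples above
  return isinstance(x, str)

def validate_texts(texts):
  # reject anything that is neither iterable nor a string
  if not isinstance(texts, (Iterable, Iterator)) and not is_string(texts):
    raise ValueError('texts must be an iterator, generator or a string.')
  # a single document is wrapped into a one-element tuple
  if is_string(texts):
    texts = (texts,)
  return texts

print(validate_texts(u'a single document'))      # ('a single document',)
print(list(validate_texts([u'doc1', u'doc2'])))  # ['doc1', 'doc2']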
Example #6
 def item2step(x):
     if isinstance(x, (tuple, list)):
         if len(x) == 1 and isinstance(x[0], Extractor):
             x = x[0]
             ID[0] += 1
             return (x.__class__.__name__ + str(ID[0]), x)
         elif len(x) == 2:
             if is_string(x[0]) and isinstance(x[1], Extractor):
                 return x
             elif is_string(x[1]) and isinstance(x[0], Extractor):
                 return (x[1], x[0])
     elif isinstance(x, Extractor):
         ID[0] += 1
         return (x.__class__.__name__ + str(ID[0]), x)
     return None
Example #7
 def item2step(x):
   if isinstance(x, (tuple, list)):
     if len(x) == 1 and isinstance(x[0], Extractor):
       x = x[0]
       ID[0] += 1
       return (x.__class__.__name__ + str(ID[0]), x)
     elif len(x) == 2:
       if is_string(x[0]) and isinstance(x[1], Extractor):
         return x
       elif is_string(x[1]) and isinstance(x[0], Extractor):
         return (x[1], x[0])
   elif isinstance(x, Extractor):
     ID[0] += 1
     return (x.__class__.__name__ + str(ID[0]), x)
   return None
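A self-contained sketch of the same normalization, with a hypothetical stub Extractor class and the mutable `ID` counter that the closure above relies on; it turns a mixed list of extractors and (name, extractor) pairs into uniform (name, extractor) steps:

class Extractor(object):
  # hypothetical stub standing in for odin's Extractor base class
  pass

class MFCC(Extractor):
  pass

ID = [0]  # mutable counter shared by the closure, as in the examples above

def item2step(x):
  if isinstance(x, (tuple, list)):
    if len(x) == 1 and isinstance(x[0], Extractor):
      x = x[0]
      ID[0] += 1
      return (x.__class__.__name__ + str(ID[0]), x)
    elif len(x) == 2:
      if isinstance(x[0], str) and isinstance(x[1], Extractor):
        return x
      elif isinstance(x[1], str) and isinstance(x[0], Extractor):
        return (x[1], x[0])
  elif isinstance(x, Extractor):
    ID[0] += 1
    return (x.__class__.__name__ + str(ID[0]), x)
  return None

steps = [item2step(i) for i in [MFCC(), ('mfcc_features', MFCC()), (MFCC(),)]]
print(steps)  # [('MFCC1', <...>), ('mfcc_features', <...>), ('MFCC2', <...>)]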
Example #8
 def __new__(subclass,
             path,
             read_only=False,
             cache_size=250,
             *args,
             **kwargs):
     if not is_string(path):
         raise ValueError("`path` for MmapDict must be string, but given "
                          "object with type: %s" % type(path))
     path = os.path.abspath(path)
     read_only = bool(read_only)
     cache_size = int(cache_size)
     # get stored instances
     all_instances = NoSQL._INSTANCES[subclass.__name__]
     # ====== Found pre-defined instance ====== #
     if path in all_instances:
         return all_instances[path]
     # ====== Create new instance ====== #
     new_instance = super(NoSQL, subclass).__new__(subclass)
     all_instances[path] = new_instance
     # some pre-defined attribute
     new_instance._cache_size = cache_size
     new_instance._read_only = read_only
     new_instance._new_args_called = False
     new_instance._path = path
     new_instance._is_closed = False
     return new_instance
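The caching in `__new__` here (and in the Dataset and MmapData examples further down) keeps exactly one instance per absolute path. A minimal self-contained sketch of that path-keyed pattern, using a made-up `PathCached` class:

import os
from collections import defaultdict

class PathCached(object):
  # hypothetical class illustrating the one-instance-per-path pattern above
  _INSTANCES = defaultdict(dict)

  def __new__(cls, path, *args, **kwargs):
    if not isinstance(path, str):
      raise ValueError("`path` must be a string, but given type: %s" % type(path))
    path = os.path.abspath(path)
    instances = PathCached._INSTANCES[cls.__name__]
    if path in instances:          # reuse the instance created for this path
      return instances[path]
    new_instance = super(PathCached, cls).__new__(cls)
    new_instance._path = path
    instances[path] = new_instance
    return new_instance

a = PathCached('/tmp/example_db')
b = PathCached('/tmp/example_db')
print(a is b)  # True: the same path always returns the cached instance

One consequence of this pattern is that `__init__` still runs on every call, so a real class needs its own guard against re-initialization.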
Example #9
 def add_recipes(self, recipes, name, override=False):
   """
   Parameters
   ----------
   """
   # ====== validate arguments ====== #
   if not is_string(name):
     raise ValueError("`name` must be string, but given: %s" % str(type(name)))
   if name in self._saved_recipes and not override:
     raise ValueError("Cannot override pre-defined RECIPE with name: '%s'"
                     % name)
   # ====== validate recipes list ====== #
   if isinstance(recipes, RecipeList):
     recipes = tuple(recipes._recipes)
   else:
     tmp = []
     for rcp in as_tuple(recipes, t=FeederRecipe):
       if isinstance(rcp, RecipeList):
         tmp += list(rcp._recipes)
       else:
         tmp.append(rcp)
     recipes = tuple(tmp)
   # ====== store the recipes to disk ====== #
   path = os.path.join(self.recipe_path, name)
   with open(path, 'wb') as f:
     cPickle.dump(recipes, f, protocol=cPickle.HIGHEST_PROTOCOL)
   # ====== update local recipes list ====== #
   self._saved_recipes[name] = recipes
   return self
Example #10
 def embed(self, vocabulary, dtype='float32',
           token_not_found='ignore'):
   """Any word not found in the vocabulary will be set to all-zeros"""
   # ====== check vocab ======= #
   if not isinstance(vocabulary, Mapping):
     raise ValueError('"vocabulary" must be any instance of dict.')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise"'
                      ', an integer of token index, or a string '
                      'representing a token.')
   if is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     # a specific token string was given, map it to its dictionary index
     token_not_found = int(self.dictionary[token_not_found])
   elif is_number(token_not_found):
     token_not_found = int(token_not_found)
   # ====== create embedding matrix ====== #
   ndim = len(next(iter(vocabulary.values())))
   matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
   for word, idx in self.dictionary.items():
     if len(word) == 0: continue
     if word in vocabulary:
       matrix[idx, :] = vocabulary[word]
     elif token_not_found == 'raise':
       raise Exception('Cannot find token "%s" in the vocabulary.' % word)
     elif isinstance(token_not_found, int):
        matrix[idx, :] = matrix[token_not_found, :]
   return matrix
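A standalone sketch of the matrix-building loop at the end of `embed`, on toy data: `dictionary` maps tokens to row indices and `vocabulary` maps tokens to pretrained vectors (both made up here):

import numpy as np

dictionary = {'the': 0, 'cat': 1, 'sat': 2}        # token -> row index
vocabulary = {'the': np.array([0.1, 0.2]),         # token -> pretrained vector
              'cat': np.array([0.3, 0.4])}

ndim = len(next(iter(vocabulary.values())))
matrix = np.zeros(shape=(len(dictionary), ndim), dtype='float32')
for word, idx in dictionary.items():
  if word in vocabulary:
    matrix[idx, :] = vocabulary[word]
  # tokens not found in the vocabulary ('sat') are left as all-zeros,
  # which is the 'ignore' behaviour in the example above

print(matrix)
# [[0.1 0.2]
#  [0.3 0.4]
#  [0.  0. ]]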
Example #11
 def _apply(self, X):
   axes = self.axes
   ndims = X.shape.ndims
   if is_string(axes) and axes.lower() == 'auto':
     if ndims == 3:
       axes = (1,)
     elif ndims == 4:
       axes = (1, 2)
     elif ndims == 5:
       axes = (1, 2, 3)
   X = K.upsample(X, scale=self.size, axes=axes, method=self.mode)
   # ====== check desire_shape ====== #
   desire_shape = self.desire_shape
   if desire_shape is not None:
     desire_shape = [None if i is None or i < 0 else int(i)
                     for i in desire_shape]
     # do padding if necessary
     paddings = [[0, 0] if i is None or o is None or i >= o else
                 [tf.cast(tf.ceil((o - i) / 2), 'int32'),
                  tf.cast(tf.floor((o - i) / 2), 'int32')]
                 for i, o in zip(X.shape.as_list(), desire_shape)]
     if not all(i == [0, 0] for i in paddings):
       X = tf.pad(X, paddings=paddings, mode='CONSTANT')
     # do slice if necessary
     slices = [slice(tf.cast(tf.floor((i - o) / 2), 'int32'),
                     tf.cast(-tf.ceil((i - o) / 2), 'int32'), None)
               if i is not None and o is not None and i > o else slice(None)
               for i, o in zip(X.shape.as_list(), desire_shape)]
     if any(s is not slice(None) for s in slices):
       X = X[slices]
     K.set_shape(X, tuple([i if is_number(i) else None
                           for i in desire_shape]))
   return X
Example #12
 def __getitem__(self, key):
     if is_string(key):
         if key not in self._data_map:
             raise KeyError('%s not found in this dataset' % key)
         dtype, shape, data, path = self._data_map[key]
         return path if data is None else data
     raise ValueError('Only accept key type is string.')
Example #13
 def add_recipes(self, recipes, name, override=False):
     """
 Parameters
 ----------
 """
     # ====== validate arguments ====== #
     if not is_string(name):
         raise ValueError("`name` must be string, but given: %s" %
                          str(type(name)))
     if name in self._saved_recipes and not override:
         raise ValueError(
             "Cannot override pre-defined RECIPE with name: '%s'" % name)
     # ====== validate recipes list ====== #
     if isinstance(recipes, RecipeList):
         recipes = tuple(recipes._recipes)
     else:
         tmp = []
         for rcp in as_tuple(recipes, t=FeederRecipe):
             if isinstance(rcp, RecipeList):
                 tmp += list(rcp._recipes)
             else:
                 tmp.append(rcp)
         recipes = tuple(tmp)
     # ====== store the recipes to disk ====== #
     path = os.path.join(self.recipe_path, name)
     with open(path, 'wb') as f:
         cPickle.dump(recipes, f, protocol=cPickle.HIGHEST_PROTOCOL)
     # ====== update local recipes list ====== #
     self._saved_recipes[name] = recipes
     return self
Example #14
 def embed(self, vocabulary, dtype='float32', token_not_found='ignore'):
     """Any word not found in the vocabulary will be set to all-zeros"""
     # ====== check vocab ======= #
     if not isinstance(vocabulary, Mapping):
         raise ValueError('"vocabulary" must be any instance of dict.')
     # ====== check token_not_found ====== #
     if not is_number(token_not_found) and \
     not is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise"'
                          ', an integer of token index, or a string '
                          'representing a token.')
     if is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
         # a specific token string was given, map it to its dictionary index
         token_not_found = int(self.dictionary[token_not_found])
     elif is_number(token_not_found):
         token_not_found = int(token_not_found)
     # ====== create embedding matrix ====== #
     ndim = len(next(iter(vocabulary.values())))
     matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
     for word, idx in self.dictionary.items():
         if len(word) == 0: continue
         if word in vocabulary:
             matrix[idx, :] = vocabulary[word]
         elif token_not_found == 'raise':
             raise Exception('Cannot find token "%s" in the vocabulary.' %
                             word)
         elif isinstance(token_not_found, int):
             matrix[idx, :] = matrix[token_not_found, :]
     return matrix
Example #15
 def __setitem__(self, key, value):
     """
     Parameters
     ----------
     key : str or tuple
         if tuple is specified, it contain the key and the datatype
         which must be "memmap", "hdf5"
         for example: ds[('X', 'hdf5')] = numpy.ones((8, 12))
     """
     if not is_string(key) and not isinstance(key, (tuple, list)):
         raise ValueError(
             '"key" is the name for Data and must be String or '
             'tuple specified the name and datatype (memmap, hdf5).')
     # ====== check datatype ====== #
     datatype = 'memmap'  # default datatype
     if isinstance(key, (tuple, list)):
         key, datatype = key
         datatype = datatype.lower()
         if datatype not in ('memmap', 'hdf5'):
             raise ValueError(
                 'datatype can only be "memmap" or "hdf5", but '
                 'the given data type is "%s"' % datatype)
     # ====== do nothing ====== #
     if key in self._data_map:
         return
     # ====== dict ====== #
     path = os.path.join(self.path, key)
     if isinstance(value, dict):
         if os.path.exists(path):
             raise Exception('File with path=%s already exist.' % path)
         d = MmapDict(path)
          for i, j in value.items():
             d[i] = j
         d.flush()
         # store new dict
         self._data_map[key] = (type(d).__name__, len(d), d, path)
     # ====== ndarray ====== #
     elif isinstance(value, np.ndarray):
         dtype, shape = value.dtype, value.shape
         if datatype == 'memmap':
             data = MmapData(path, dtype=dtype, shape=shape)
         else:
             path = os.path.join(self.path, self._default_hdf5)
             f = open_hdf5(path)
             data = Hdf5Data(key, hdf=f, dtype=dtype, shape=shape)
         # store new key
         self._data_map[key] = (data.dtype, data.shape, data, path)
         data.prepend(value)
         # check maximum opened memmap
         self._validate_memmap_max_open(key)
     # ====== other types ====== #
     else:
         if os.path.exists(path):
             raise Exception('File with path=%s already exist.' % path)
         with open(path, 'wb') as f:
             cPickle.dump(value, f, protocol=cPickle.HIGHEST_PROTOCOL)
         # store new dict
         self._data_map[key] = (type(value).__name__, len(value) if hasattr(
             value, '__len__') else 0, value, path)
Example #16
 def post_processing(result):
   # search for file name
   if self.identifier not in result:
     raise RuntimeError(
         "Cannot find identifier '%s' in returned dictionary" % self.identifier)
   file_name = result[self.identifier]
   # invalid file_name
   if not is_string(file_name):
     raise RuntimeError("Cannot find file name in returned features "
         "list, the file name can be specified in key: 'name', 'path' "
         "and the type of the value must be string. All available "
         "keys are: %s" % str(result.keys()))
   # store all new indices
   # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
   all_indices = {}
   # processing
   for feat_name, X in result.items():
     # some invalid feat_name
     if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
       raise RuntimeError("Returned features' name cannot be one "
                          "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
     # ignore some feat_name
      if feat_name in ('name',):
       continue
     # if numpy ndarray, save to MmapData
     if isinstance(X, np.ndarray) or \
     'sum1' == feat_name[-4:] or \
     'sum2' == feat_name[-4:]:
       # save statistics instead
       if 'sum1' == feat_name[-4:]:
         stats[feat_name[:-4]][0] += X
       elif 'sum2' == feat_name[-4:]:
         stats[feat_name[:-4]][1] += X
       # save features array
       else:
         all_indices[feat_name] = X.shape[0]
         # cache data, only if we have more than 0 sample
         if X.shape[0] > 0:
           cache[feat_name].append(X)
     # else all other kind of data save to MmapDict
     else:
       databases[feat_name][file_name] = X
     # remove data
     del X
   # ====== update indices ====== #
   if len(all_indices) > 0:
     for feat_name, n in all_indices.items():
       ids_name = 'indices_%s' % feat_name
       databases[ids_name][file_name] = (last_start[ids_name],
                                         last_start[ids_name] + n)
       last_start[ids_name] += n
   # ====== flush cache ====== #
   n_processed[0] += 1
   if n_processed[0] % cache_limit == 0: # 12 + 8
     for feat_name, X_cached in cache.items():
       flush_feature(feat_name, X_cached)
     cache.clear()
   # ====== update progress ====== #
   return file_name
Example #17
 def __setitem__(self, key, value):
   """
   Parameters
   ----------
   key : str or tuple
       if tuple is specified, it contain the key and the datatype
       which must be "memmap", "hdf5"
       for example: ds[('X', 'hdf5')] = numpy.ones((8, 12))
   """
   if not is_string(key) and not isinstance(key, (tuple, list)):
     raise ValueError('"key" is the name for Data and must be String or '
                      'tuple specified the name and datatype (memmap, hdf5).')
   # ====== check datatype ====== #
   datatype = 'memmap' # default datatype
   if isinstance(key, (tuple, list)):
     key, datatype = key
     datatype = str(datatype).lower()
     if datatype not in ('memmap', 'hdf5'):
       raise ValueError('datatype can only be "memmap" or "hdf5", but '
                        'the given data type is "%s"' % datatype)
   # ====== do nothing ====== #
   if key in self._data_map:
     return
   # ====== dict ====== #
   path = os.path.join(self.path, key)
   if isinstance(value, Mapping):
     if os.path.exists(path):
       raise Exception('File with path=%s already exist.' % path)
     d = MmapDict(path)
     for i, j in value.items():
       d[i] = j
     d.flush()
     # store new dict
     self._data_map[key] = (type(d).__name__, len(d), d, path)
   # ====== ndarray ====== #
   elif isinstance(value, np.ndarray):
     dtype, shape = value.dtype, value.shape
     if datatype == 'memmap':
       data = MmapData(path, dtype=dtype, shape=shape)
     else:
       path = os.path.join(self.path, self._default_hdf5)
       f = open_hdf5(path)
       data = Hdf5Data(key, hdf=f, dtype=dtype, shape=shape)
     # store new key
     self._data_map[key] = (data.dtype, data.shape, data, path)
     data[:shape[0]] = value
     # check maximum opened memmap
     self._validate_memmap_max_open(key)
   # ====== other types ====== #
   else:
     if os.path.exists(path):
       raise Exception('File with path=%s already exist.' % path)
     with open(path, 'wb') as f:
       cPickle.dump(value, f, protocol=cPickle.HIGHEST_PROTOCOL)
     # store new dict
     self._data_map[key] = (type(value).__name__,
                            len(value) if hasattr(value, '__len__') else 0,
                            value, path)
Example #18
 def __init__(self, pool_size=2, strides=None, dilation=1,
              pad='valid', mode='max', transpose_mode='nn', **kwargs):
   super(Pool, self).__init__(**kwargs)
   self.pool_size = as_tuple(pool_size, t=int)
   self.strides = self.pool_size if strides is None \
       else as_tuple(strides, t=int)
   self.dilation = (1,) if dilation is None else as_tuple(dilation, t=int)
   self.pad = pad.upper() if is_string(pad) else as_tuple(pad, t=int)
   self.mode = mode.upper()
   self.transpose_mode = transpose_mode
Example #19
 def __init__(self, task_name, output_name,
              lower_better=True, improvement_margin=0, logging=True):
   super(CheckpointGeneralization, self).__init__(logging=logging)
   self._task_name = str(task_name)
   self._output_name = output_name if is_string(output_name) \
       else output_name.name
   self._lower_better = bool(lower_better)
   self._improvement_margin = float(improvement_margin)
   assert self._improvement_margin >= 0
   self._best_score = None
Example #20
def _check_label_mode(mode):
  if is_number(mode):
    return np.clip(float(mode), 0., 1.)
  if is_string(mode):
    mode = mode.lower()
    if mode == 'mid':
      mode = 'middle'
    if mode not in ('common', 'last', 'first', 'middle'):
      raise ValueError(
          "`label_mode` can be: 'common', 'last', 'first', 'middle'")
    return mode
  raise ValueError("No support for `label_mode`=%s" % str(mode))
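A self-contained restatement of the same checks with plain `isinstance` tests standing in for `is_number`/`is_string`, showing how numeric and string modes are normalized:

import numpy as np

def check_label_mode(mode):
  # standalone re-statement of `_check_label_mode` above
  if isinstance(mode, (int, float)):
    return float(np.clip(float(mode), 0., 1.))
  if isinstance(mode, str):
    mode = mode.lower()
    if mode == 'mid':
      mode = 'middle'
    if mode not in ('common', 'last', 'first', 'middle'):
      raise ValueError("`label_mode` can be: 'common', 'last', 'first', 'middle'")
    return mode
  raise ValueError("No support for `label_mode`=%s" % str(mode))

print(check_label_mode(1.5))     # 1.0  (numeric modes are clipped to [0, 1])
print(check_label_mode('MID'))   # 'middle'
print(check_label_mode('last'))  # 'last'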
Example #21
 def get_loaded_param(self, name):
   ds = self.__class__.load_parameters()
   if is_string(name):
     return_1_param = True
   else:
     return_1_param = False
   name = as_tuple(name, t=str)
   if any(n not in ds for n in name):
     raise RuntimeError("Cannot find parameter with name:'%s' from loaded "
         "dataset at path: '%s'" % (name, ds.path))
   params = [ds[n][:] for n in name]
   return params[0] if return_1_param else tuple(params)
Example #22
def _check_dtype(dtype):
  if hasattr(dtype, '__call__'):
    return dtype
  # ====== check dtype ====== #
  if dtype is None:
    dtype = K.floatX
  elif isinstance(dtype, np.dtype) or is_string(dtype):
    dtype = str(dtype)
  elif isinstance(dtype, VariableDesc):
    dtype = dtype.dtype
  elif isinstance(dtype, tf.DType):
    dtype = dtype.base_dtype.name
  return dtype
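A trimmed, self-contained variant covering only the None / numpy-dtype / string branches; the callable, `VariableDesc`, and `tf.DType` branches are omitted, and 'float32' stands in for `K.floatX` (both simplifications are assumptions for illustration):

import numpy as np

def check_dtype(dtype, floatX='float32'):
  # simplified: only the None / np.dtype / string branches of the helper above
  if dtype is None:
    return floatX                      # K.floatX in the original
  if isinstance(dtype, (np.dtype, str)):
    return str(dtype)
  return dtype                         # anything else is passed through

print(check_dtype(None))               # 'float32'
print(check_dtype(np.dtype('int8')))   # 'int8'
print(check_dtype('float64'))          # 'float64'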
Example #23
 def __getitem__(self, key):
   if is_string(key):
     if key not in self._data_map:
       raise KeyError('%s not found in this dataset' % key)
     dtype, shape, data, path = self._data_map[key]
     # return type is just a descriptor, create MmapData for it
      if data is None and \
      dtype != 'unknown' and shape != 'unknown':
       data = MmapData(path, read_only=self.read_only)
       self._data_map[key] = (data.dtype, data.shape, data, path)
       self._validate_memmap_max_open(key)
     return path if data is None else data
   raise ValueError('Only accept key type is string.')
Example #24
 def __getitem__(self, key):
     if is_string(key):
         if key not in self._data_map:
             raise KeyError('%s not found in this dataset' % key)
         dtype, shape, data, path = self._data_map[key]
         # return type is just a descriptor, create MmapData for it
          if data is None and \
          dtype != 'unknown' and shape != 'unknown':
             data = MmapData(path, read_only=self.read_only)
             self._data_map[key] = (data.dtype, data.shape, data, path)
             self._validate_memmap_max_open(key)
         return path if data is None else data
     raise ValueError('Only accept key type is string.')
Example #25
 def __init__(self, **kwargs):
   super(Model, self).__init__(**kwargs)
   input_info = self.get_input_info()
   if not isinstance(input_info, Mapping) or \
   len(input_info) == 0 or \
   not all(is_string(k) and _validate_shape_dtype(v)
           for k, v in input_info.items()):
     raise ValueError("`get_input_info` must return a (length > 0) Mapping "
         "of: 'input-name' -> (shape-tuple, dtype-string), but the "
         "returned value is: %s" % str(input_info))
   # ====== init kwargs_desc ====== #
   for name, (shape, dtype) in input_info.items():
     self._kwargs_desc[name] = VariableDesc(
         shape=shape, name=name, dtype=dtype)
Example #26
def _validate_shape_dtype(x):
  if not isinstance(x, tuple):
    return False
  if not len(x) == 2:
    return False
  shape, dtype = x
  # check shape
  # shape must be a tuple whose entries are numbers or None (unknown dims)
  if not isinstance(shape, tuple) or \
  not all(is_number(i) or isinstance(i, type(None)) for i in shape):
    return False
  # check dtype
  if not is_string(dtype):
    return False
  return True
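With the check above corrected, a valid input is a 2-tuple of (shape-tuple, dtype-string) whose shape entries are numbers or None. A quick standalone illustration, again with plain `isinstance` checks standing in for the helpers:

def validate_shape_dtype(x):
  if not isinstance(x, tuple) or len(x) != 2:
    return False
  shape, dtype = x
  # shape must be a tuple of numbers (None marks an unknown dimension)
  if not isinstance(shape, tuple) or \
      not all(isinstance(i, (int, float)) or i is None for i in shape):
    return False
  return isinstance(dtype, str)

print(validate_shape_dtype(((None, 80, 3), 'float32')))  # True
print(validate_shape_dtype(((None, 80), 32)))            # False: dtype is not a string
print(validate_shape_dtype(('80x3', 'float32')))         # False: shape is not a tuple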
Example #27
 def add_indices(self, indices, name, override=False):
     # ====== validate name ====== #
     if not is_string(name):
         raise ValueError("`name` must be string, but given: %s" %
                          str(type(name)))
     if name in self._saved_indices and not override:
         raise ValueError(
             "Cannot override pre-defined INDEX with name: '%s'" % name)
     # ====== validate indices ====== #
     path = os.path.join(self.index_path, name)
     ids = MmapDict(path)
     # predefined mapping, save or copy everything to a
     # MmapDict
     if isinstance(indices, Mapping):
         # use a separate loop variable so the outer `name` is not shadowed
         for key, (start, end) in indices.items():
             ids[key] = (start, end)
     # list of name, or (name, (start, end))
     elif isinstance(indices, (tuple, list, np.ndarray)):
         for i in indices:
             if is_string(i):  # only name
                 ids[i] = self['indices'][i]
             elif len(i) == 2:  # name, (start, end)
                 key, (start, end) = i
                 ids[key] = (int(start), int(end))
             elif len(i) == 3:  # name, start, end
                 key, start, end = i
                 ids[key] = (int(start), int(end))
             else:
                 raise ValueError(
                     "Unsupported index format (name, start, end) "
                     "for: %s" % str(i))
     # flush everything to disk
     ids.flush(save_all=True)
     ids.close()
     # ====== assign new index ====== #
     self._saved_indices[name] = MmapDict(path, read_only=True)
     return self
Example #28
 def __new__(cls, *args, **kwargs):
     path = kwargs.get('path', None)
     if path is None:
         path = args[0]
     if not is_string(path):
         raise ValueError("`path` for Dataset must be string, but given "
                          "object with type: %s" % type(path))
     path = os.path.abspath(path)
     # Found old instance
     if path in Dataset.__INSTANCES:
         return Dataset.__INSTANCES[path]
     # new Dataset
     new_instance = super(Dataset, cls).__new__(cls)
     Dataset.__INSTANCES[path] = new_instance
     return new_instance
Example #29
 def __new__(clazz, *args, **kwargs):
   path = kwargs.get('path', None)
   if path is None:
     path = args[0]
   if not is_string(path):
     raise ValueError("`path` for Dataset must be string, but given "
                      "object with type: %s" % type(path))
   path = os.path.abspath(path)
   # Found old instance
   if path in Dataset.__INSTANCES:
     return Dataset.__INSTANCES[path]
   # new Dataset
   new_instance = super(Dataset, clazz).__new__(clazz)
   Dataset.__INSTANCES[path] = new_instance
   return new_instance
Example #30
 def __init__(self, vad, frame_length, padding=None):
     super(VADindex, self).__init__()
     if isinstance(vad, (list, tuple)):
         if len(vad) == 2:
             indices, data = vad
             if is_string(indices) and os.path.exists(indices):
                 indices = np.genfromtxt(indices, dtype=str, delimiter=' ')
             vad = {name: data[int(start): int(end)]
                    for name, start, end in indices}
         else: # a list contain all information is given
             vad = {name: segments for name, segments in vad}
     elif not isinstance(vad, dict):
          raise ValueError('Unsupported "vad" type: %s' % type(vad).__name__)
     self.vad = vad
     self.padding = padding
     self.frame_length = frame_length
Example #31
 def process(self, name, X, **kwargs):
   for i, f in enumerate(self._recipes):
     # return iterator (iterate over all of them)
     args = f.process(name, X)
     # break the chain if one of the recipes get error,
     # and return None
     if args is None:
       return None
     if not isinstance(args, (tuple, list)) or \
     len(args) != 2 or \
     not is_string(args[0]) or \
     not isinstance(args[1], (tuple, list)):
        raise ValueError("The value returned from `process` must be a tuple "
            "or list of length 2 containing (name, [x1, x2, x3, ...]); "
            "`name` must be a string, and [x1, x2, ...] a tuple or list.")
     name, X = args
   return name, X
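A self-contained sketch of the chaining contract this loop enforces: each recipe's `process` returns either `(name, [x1, x2, ...])` or None to drop the sample. The two tiny recipes below are hypothetical stand-ins for FeederRecipe subclasses:

class AddOne(object):
  # hypothetical recipe: adds 1 to every value of every array in the list
  def process(self, name, X):
    return name, [[v + 1 for v in x] for x in X]

class DropShort(object):
  # hypothetical recipe: returns None to drop samples shorter than 3 frames
  def process(self, name, X):
    return None if len(X[0]) < 3 else (name, X)

def run_chain(recipes, name, X):
  for f in recipes:
    args = f.process(name, X)
    if args is None:       # one recipe dropped the sample: break the chain
      return None
    name, X = args
  return name, X

chain = [AddOne(), DropShort()]
print(run_chain(chain, 'utt_001', [[1, 2, 3]]))  # ('utt_001', [[2, 3, 4]])
print(run_chain(chain, 'utt_002', [[1, 2]]))     # None (dropped by DropShort)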
Example #32
def copy_dataset2(origin, destination,
                  indices_filter=None, data_filter=None,
                  override=False):
  # ====== prepare input ====== #
  if is_string(origin):
    origin = Dataset(origin, read_only=True)
    own_ds = True
  elif isinstance(origin, Dataset):
    own_ds = False
  # ====== pass ====== #
  ds = origin.copy(destination,
                   indices_filter=indices_filter,
                   data_filter=data_filter,
                   override=override)
  # ====== end and return ====== #
  if own_ds:
    origin.close()
  return ds
Example #33
  def __init__(self, task_name, output_name, threshold, patience=1,
               get_value=lambda x: np.mean(x),
               logging=True):
    super(EarlyStop, self).__init__(logging=logging)
    self._task_name = str(task_name)
    self._output_name = output_name if is_string(output_name) \
        else output_name.name

    self._threshold = float(threshold)
    self._patience = int(patience)

    if get_value is None:
      get_value = lambda x: x
    elif not hasattr(get_value, '__call__'):
      raise ValueError('get_value must be callable')
    self._get_value = get_value
    # ====== history ====== #
    self._history = []
Example #34
 def __new__(clazz, *args, **kwargs):
   path = kwargs.get('path', None)
   if path is None:
     path = args[0]
   if not is_string(path):
     raise ValueError("`path` for MmapData must be string, but given "
                      "object with type: %s" % type(path))
   path = os.path.abspath(path)
   # Found old instance
   if path in MmapData._INSTANCES:
     return MmapData._INSTANCES[path]
   # new MmapData
   # ====== increase memmap count ====== #
   if len(MmapData._INSTANCES) + 1 > MAX_OPEN_MMAP:
     raise ValueError('Only allowed to open maximum of {} memmap file'.format(MAX_OPEN_MMAP))
   # ====== create new instance ====== #
   new_instance = super(MmapData, clazz).__new__(clazz)
   MmapData._INSTANCES[path] = new_instance
   return new_instance
Example #35
 def process(self, name, X, **kwargs):
     for i, f in enumerate(self._recipes):
         # return iterator (iterate over all of them)
         args = f.process(name, X)
         # break the chain if one of the recipes get error,
         # and return None
         if args is None:
             return None
         if not isinstance(args, (tuple, list)) or \
         len(args) != 2 or \
         not is_string(args[0]) or \
         not isinstance(args[1], (tuple, list)):
              raise ValueError(
                  "The value returned from `process` must be a tuple or "
                  "list of length 2 containing (name, [x1, x2, x3, ...]); "
                  "`name` must be a string, and [x1, x2, ...] a tuple or list.")
         name, X = args
     return name, X
Example #36
def copy_dataset2(origin,
                  destination,
                  indices_filter=None,
                  data_filter=None,
                  override=False):
    # ====== prepare input ====== #
    if is_string(origin):
        origin = Dataset(origin, read_only=True)
        own_ds = True
    elif isinstance(origin, Dataset):
        own_ds = False
    # ====== pass ====== #
    ds = origin.copy(destination,
                     indices_filter=indices_filter,
                     data_filter=data_filter,
                     override=override)
    # ====== end and return ====== #
    if own_ds:
        origin.close()
    return ds
Example #37
 def __init__(self, task_name, output_name,
              fn_reduce=lambda x: (np.mean(x)
                                   if isinstance(x[0], Number) else
                                   sum(i for i in x)),
              print_plot=False, save_path=None,
              repeat_freq=1, logging=True):
   super(EpochSummary, self).__init__(logging=logging)
   self._task_name = as_tuple(task_name, t=str)
   # ====== scheduling ====== #
   assert repeat_freq >= 1
   self._repeat_freq = int(repeat_freq)
   self._count = self._repeat_freq * len(self._task_name)
   self._epoch_results = defaultdict(dict)
   # ====== output identity ====== #
   if not isinstance(output_name, (tuple, list, set)):
     output_name = (output_name,)
   output_name = [i if is_string(i) else i.name
                  for i in output_name]
   self.output_name = tuple(output_name)
   self.fn_reduce = FuncDesc(func=fn_reduce)
   # ====== how to output ====== #
   self.print_plot = bool(print_plot)
   self.save_path = save_path
Example #38
 def _initialize(self):
     # ====== validate init arguments ====== #
     self.ndim = len(self.input_shape) - 2
     # padding
     if isinstance(self.pad, (tuple, list, int)):
         self.pad = as_tuple(self.pad, self.ndim, int)
     elif self.pad is None:
         self.pad = (0, ) * self.ndim
     elif is_string(self.pad):
         self.pad = self.pad.upper()
     # strides
     if self.strides is None:
         self.strides = (1, ) * self.ndim
     else:
         self.strides = as_tuple(self.strides, self.ndim, int)
     # dilation
     if self.dilation is None:
         self.dilation = (1, ) * self.ndim
     else:
         self.dilation = as_tuple(self.dilation, self.ndim, int)
     # filter size
     self.filter_size = as_tuple(self.filter_size, self.ndim, int)
     # ====== create config ====== #
     # weights
     self.get_variable_nnop(initializer=self.W_init,
                            shape=self.kernel_shape,
                            name='W',
                            roles=ConvKernel)
     if self.b_init is not None:
         if self.untie_biases:
             biases_shape = self.output_shape[1:]
         else:
             biases_shape = (self.num_filters, )
         self.get_variable_nnop(initializer=self.b_init,
                                shape=biases_shape,
                                name='b',
                                roles=Bias)
Example #39
  def shape_transform(self, shapes):
    """
    Parameters
    ----------
     shapes: list of [(shape0, indices0), (shape1, indices1), ...]
         list of data shape tuples and indices, where each indices is a
         list of (name, length) tuples

     Return
     ------
     the new shapes after being transformed by this Recipe
     the new indices
    """
    for i in self._recipes:
      shapes = i.shape_transform(shapes)
      # ====== check returned ====== #
      if not all((isinstance(shp, (tuple, list)) and
                  all(is_number(s) for s in shp) and
                  is_string(ids[0][0]) and is_number(ids[0][1]))
                 for shp, ids in shapes):
        raise RuntimeError("Returned `shapes` must be the list of pair "
                           "`(shape, indices)`, where `indices` is the "
                           "list of (name, length(int)).")
    return shapes
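The check inside the loop encodes the contract from the docstring: every element of `shapes` is a pair `(shape, indices)`, where `shape` is a tuple of numbers and `indices` is a list of `(name, length)` tuples. A small standalone illustration with made-up names (plain `isinstance` checks stand in for `is_number`/`is_string`):

shapes = [((1000, 40), [('utt_001', 250), ('utt_002', 750)]),
          ((1000, 13), [('utt_001', 250), ('utt_002', 750)])]

ok = all(isinstance(shp, (tuple, list)) and
         all(isinstance(s, (int, float)) for s in shp) and
         isinstance(ids[0][0], str) and isinstance(ids[0][1], (int, float))
         for shp, ids in shapes)
print(ok)  # True: this is the structure every recipe must preserve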
Example #40
    def shape_transform(self, shapes):
        """
    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Return
    ------
    new shape that transformed by this Recipe
    new indices
    """
        for i in self._recipes:
            shapes = i.shape_transform(shapes)
            # ====== check returned ====== #
            if not all((isinstance(shp, (tuple, list)) and all(
                    is_number(s) for s in shp) and is_string(ids[0][0])
                        and is_number(ids[0][1])) for shp, ids in shapes):
                raise RuntimeError(
                    "Returned `shapes` must be the list of pair "
                    "`(shape, indices)`, where `indices` is the "
                    "list of (name, length(int)).")
        return shapes
Example #41
def train(X, y_true, y_pred, train_data,
          valid_data=None, valid_freq=1.,
          patience=3, threshold=5, rollback=True,
          objectives=[tf.losses.softmax_cross_entropy],
          metrics=[0], training_metrics=[],
          l1_regu=0., l2_regu=0., parameters=[],
          prior_weights=None, sample_weights=None,
          batch_size=256, epochs=8, shuffle=True,
          optimizer='rmsprop', optz_kwargs={'lr': 0.001}, updates=None,
          init_vars=True, labels=None, seed=5218, verbose=2):
  """

  Parameters
  ----------
  rollback : bool (default: True)
    if True, allow rollback to the best checkpoint during training
  objectives : {callable, tensorflow.Tensor}
    if `callable`, the function must take `y_true`, and `y_pred`
    The objectives must be differentiable and used for training.
  metrics : {callable, tensorflow.Tensor, int}
    if `callable`, the function must take `y_true`, and `y_pred`
    The `metrics` is for monitoring the training process.
    if `int`, it is the index of the loss in `objectives`
    NOTE: the first metrics in the list will be used for
    early-stopping (smaller is better).
  training_metrics : {callable, tensorflow.Tensor, int}
    if `int`, it is the index of the loss in `metrics`
  parameters : {list or tensorflow.Variables}
    all the parameters that will be updated by the `optimizer`; if None
    or an empty list is given, the ComputationGraph is used to collect
    all variables with the Parameter role related to the objectives
  init_vars : bool (default: True)
    automatically initialize all variables
  labels : {None, list of string}
    Given labels for classification task
  seed : int
    random seed for reproducibility
  verbose : int
    0 - Turn off all log
    1 - only show notification
    2 - show notification, important log and summary
    3 - Show progress, summary, notification and logging
    4 - Show debug information and everything

  Return
  ------
  Function used for prediction
  """
  from odin import backend as K
  # ====== preprocess inputs ====== #
  X = as_tuple(X, t=K.is_tensor)
  y_true = as_tuple(y_true, t=K.is_tensor)
  y_pred = as_tuple(y_pred, t=K.is_tensor)
  # ====== parsing objectives and metrics ====== #
  # for training
  prior_weights = _preprocess_prior_weights(y_true=y_true,
                                            prior_weights=prior_weights)
  if prior_weights is not None:
    if sample_weights is not None:
      sample_weights = sample_weights + prior_weights
    else:
      sample_weights = prior_weights
  objectives = _preprocessing_losses(as_tuple(objectives), y_true, y_pred,
                                     sample_weights=sample_weights)
  # metrics for monitoring
  metrics = as_tuple(metrics)
  get_value = lambda x: np.mean(x)
  if len(metrics) > 0 and \
  (metrics[0] == tf.metrics.accuracy or
   metrics[0] == K.metrics.categorical_accuracy):
    get_value = lambda x: 1 - np.mean(x)
  metrics = _preprocessing_losses(metrics, y_true, y_pred,
                                  inherit_losses=objectives)
  # training_metrics
  training_metrics = _preprocessing_losses(as_tuple(training_metrics),
                                           y_true, y_pred,
                                           inherit_losses=metrics)
  # sum the objectives for differentiable
  if len(objectives) > 0:
    objectives = [sum(objectives) if len(objectives) > 1 else objectives[0]]
  # ====== preprocess optimizer and get updates====== #
  if updates is None: # not given updates
    if is_string(optimizer):
      optimizer = _parse_optimizer(optimizer)
      optimizer = optimizer(**optz_kwargs)
    elif not isinstance(optimizer, K.optimizers.Optimizer):
      raise ValueError("`optimizer` must be string - name of algorithm or instance "
                       "of odin.backend.optimizers.Optimizer")
    parameters = K.ComputationGraph(objectives).parameters\
    if len(parameters) == 0 else as_tuple(parameters, t=K.is_variable)
    # check objectives
    if len(objectives) == 0:
      raise RuntimeError("`objectives` must be given due to `updates=None`")
    weights = [p for p in parameters if K.role.has_roles(p, roles=K.role.Weight)]
    # l1 regularization
    if l1_regu > 0.:
      l1_norm = sum(tf.norm(w, ord=1) for w in weights)
      objectives[0] += l1_norm
    # l2 regularization
    if l2_regu > 0.:
      l2_norm = sum(tf.norm(w, ord=2) for w in weights)
      objectives[0] += l2_norm
    # update rules
    updates = optimizer.get_updates(objectives[0], parameters)
    # adding global norm and learning rate
    training_metrics.append(optimizer.norm)
    training_metrics.append(optimizer.lr)
  elif K.is_operation(updates): # given updates
    optimizer = None
  else:
    raise ValueError("`updates` can be None or tensorflow Operation, but given "
      "type: %s" % str(type(updates)))
  # ====== placeholders ====== #
  inputs_plh = []
  for plh in X:
    for i in (K.ComputationGraph(plh).placeholders
              if not K.is_placeholder(plh)
              else as_tuple(plh)):
      inputs_plh.append(i)
  outputs_plh = []
  for plh in y_true: # no duplicated inputs (e.g. autoencoder X == y)
    if not K.is_placeholder(plh):
      plh = K.ComputationGraph(plh).placeholders
    for i in as_tuple(plh):
      if i not in inputs_plh:
        outputs_plh.append(i)
  inputs = inputs_plh + outputs_plh
  # ====== initialize variables ====== #
  if bool(init_vars):
    K.initialize_all_variables()
  # ====== creating function ====== #
  # training function
  f_train = K.function(inputs=inputs,
                       outputs=objectives + training_metrics,
                       updates=updates, training=True)
  # scoring function
  f_score = None
  if len(metrics) > 0:
    f_score = K.function(inputs=inputs, outputs=metrics,
                         training=False)
  # prediction function
  f_pred = K.function(inputs=inputs_plh,
                      outputs=y_pred[0] if len(y_pred) == 1 else y_pred,
                      training=False)
  # ====== preprocessing data ====== #
  train_data, valid_data = _preprocessing_data(train_data, valid_data)
  # print some debug information if necessary
  if verbose >= 4:
    print("%s %s %s" % (
        ctext("============", 'cyan'),
        ctext("Prepare for Training", 'red'),
        ctext("============", 'cyan')))
    print(ctext("Input placeholders:", 'yellow'))
    for i in inputs_plh:
      print(" * ", str(i))
    print(ctext("Output placeholders:", 'yellow'))
    for i in outputs_plh:
      print(" * ", str(i))
    print(ctext("Parameters:", 'yellow'))
    for p in parameters:
      print(" * ", p.name, '-', p.shape, ';', p.dtype.name)
    print(ctext("Optimizer:", 'yellow'))
    print(" * ", str(optimizer))
    print(" * Optimizer kwargs:", optz_kwargs)
    print(" * L1:", l1_regu)
    print(" * L2:", l2_regu)
    print(ctext("Training:", 'yellow'))
    print(" * Valid freq:", valid_freq)
    print(" * Patience:", patience)
    print(" * Threshold:", threshold)
    print(" * Rollback:", rollback)
    print(" * Batch size:", batch_size)
    print(" * Epoch:", epochs)
    print(" * Shuffle:", shuffle)
    print(" * Seed:", seed)
    print(ctext("Objectives:", 'yellow'))
    for o in objectives:
      print(" * ", str(o))
    print(ctext("Weights:", 'yellow'))
    print(" * Prior:", str(prior_weights))
    print(" * Sample:", str(sample_weights))
    print(ctext("Metrics:", 'yellow'))
    for m in metrics:
      print(" * ", str(m))
    print(ctext("Training metrics:", 'yellow'))
    for t in training_metrics:
      print(" * ", str(t))
    print(ctext("Training Data:", 'yellow'), str(train_data))
    print(ctext("Validating Data:", 'yellow'), str(valid_data))
    print(ctext("Labels:", 'yellow'), labels)
  # ====== create trainer ====== #
  callback_log = True if verbose > 0 else False
  trainer = MainLoop(batch_size=batch_size,
                     seed=seed if shuffle else None,
                     shuffle_level=2 if shuffle else 0,
                     allow_rollback=rollback,
                     verbose=verbose, labels=labels)
  trainer.set_checkpoint(path=None, obj=None,
                         variables=parameters)
  # create callback
  callbacks = [NaNDetector(patience=patience, log=callback_log)]
  if valid_data is not None and f_score is not None:
    callbacks.append(
        EarlyStopGeneralizationLoss(task_name='valid', output_name=metrics[0],
                                    threshold=threshold, patience=patience,
                                    log=callback_log, get_value=get_value))
  trainer.set_callbacks(callbacks)
  # set the tasks
  trainer.set_train_task(func=f_train, data=train_data,
                         epoch=epochs, name='train')
  if valid_data is not None and f_score is not None:
    trainer.set_valid_task(func=f_score, data=valid_data,
                           freq=Timer(percentage=valid_freq),
                           name='valid')
  # running
  trainer.run()
  return f_pred
Example #42
 def __init__(self, nb_classes, l1=0., l2=0.,
              fit_intercept=True, confusion_matrix=True,
              tol=1e-4, patience=3, rollback=True,
              batch_size=1024, max_epoch=100, max_iter=None,
              optimizer='adadelta', learning_rate=1.0, class_weight=None,
              dtype='float32', seed=5218,
              verbose=False, path=None, name=None):
   super(LogisticRegression, self).__init__()
   # ====== basic dimensions ====== #
   if isinstance(nb_classes, (tuple, list, np.ndarray)):
     self._labels = tuple([str(i) for i in nb_classes])
     self._nb_classes = len(nb_classes)
   elif is_number(nb_classes):
     self._labels = tuple([str(i) for i in range(nb_classes)])
     self._nb_classes = int(nb_classes)
   self._feat_dim = None
   self._dtype = np.dtype(dtype)
   # ====== preprocessing class weight ====== #
   if class_weight is None:
     class_weight = np.ones(shape=(self.nb_classes,),
                            dtype=self.dtype)
   elif is_number(class_weight):
     class_weight = np.zeros(shape=(self.nb_classes,),
                             dtype=self.dtype) + class_weight
   self._class_weight = class_weight
   # ====== flags ====== #
   self.l1 = float(l1)
   self.l2 = float(l2)
   self.fit_intercept = bool(fit_intercept)
   self.confusion_matrix = bool(confusion_matrix)
   # ====== internal states ====== #
   self._is_fitted = False
   # ====== others ====== #
   if name is None:
     name = uuid(length=8)
     self._name = 'LogisticRegression_%s' % name
   else:
     self._name = str(name)
   self._path = path
   # ====== training ====== #
   self.batch_size = int(batch_size)
   self.max_epoch = max_epoch
   self.max_iter = max_iter
    if not is_string(optimizer):
      raise ValueError("`optimizer` must be a string (name of the optimizer), "
                       "but given type: %s" % str(type(optimizer)))
   optimizer = optimizer.lower()
   if optimizer not in _optimizer_list:
     raise ValueError("`optimizer` must be one of the following: %s" %
       str(list(_optimizer_list.keys())))
   self._optimizer = _optimizer_list[optimizer.lower()](lr=float(learning_rate))
   self._optimizer_name = optimizer
   self._optimizer_lr = learning_rate
   # ====== stop training ====== #
   self.tol = float(tol)
   self.patience = int(patience)
   self.rollback = bool(rollback)
   # ====== others ====== #
   self._train_history = []
   self._valid_history = []
   self._rand_state = np.random.RandomState(seed=int(seed))
   self.verbose = int(verbose)
Example #43
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
               token_not_found='ignore'):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
    mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
        'binary': 1 if the token appears in the document, otherwise 0
        'tfidf': tf-idf weight of each token
        'count': raw count of each token
        'freq': token count normalized by the document length
        'seq': padded sequences of token indices
    token_not_found: 'ignore', 'raise', a token string, an integer
        'ignore' skips unknown tokens, 'raise' raises an error, a token
        string substitutes that token's index, and an integer substitutes
        the given index directly
   """
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
    if not is_number(token_not_found) and \
    not is_string(token_not_found) and \
    token_not_found not in ('ignore', 'raise'):
      raise ValueError('token_not_found can be: "ignore", "raise"'
                       ', an integer of token index, or a string '
                       'representing a token.')
    if is_string(token_not_found) and \
    token_not_found not in ('ignore', 'raise'):
      # a specific token string was given, map it to its dictionary index
      token_not_found = int(self.dictionary[token_not_found])
    elif is_number(token_not_found):
      token_not_found = int(token_not_found)
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   if isinstance(end_document, str):
     end_document = dictionary.index(end_document)
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
   else:
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       idx = dictionary.get(x, -1)
       if idx >= 0: vec.append(idx)
       # not found the token in dictionary
       elif token_not_found == 'ignore':
         continue
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
         vec.append(token_not_found)
     # append ending document token
     if end_document is not None:
       vec.append(end_document)
     # add the final results
     results.append(vec)
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       prog.add(1)
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
   # transform into one-hot matrix
   else:
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         X[i, seq] = 1
       elif mode == 'freq':
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
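A self-contained sketch of how the non-'seq' modes turn one token-index sequence into a row vector, mirroring the branch at the end of `transform`; `collections.Counter` stands in for the `freqcount` helper, and the document-frequency table is toy data:

import numpy as np
from collections import Counter

nb_words = 6
nb_docs = 10
docs_freq = {0: 9, 1: 4, 2: 1}   # toy document frequencies per token index
seq = [0, 1, 1, 2, 2, 2]         # one document as a sequence of token indices

def to_row(seq, mode):
  x = np.zeros(shape=(nb_words,))
  count = Counter(seq)
  for tok, n in count.items():
    if mode == 'binary':
      x[tok] = 1
    elif mode == 'count':
      x[tok] = n
    elif mode == 'freq':
      x[tok] = n / float(len(seq))
    elif mode == 'tfidf':
      tf = 1 + np.log(n)
      idf = np.log(1 + nb_docs / (1 + docs_freq.get(tok, 0)))
      x[tok] = tf * idf
  return x

for mode in ('binary', 'count', 'freq', 'tfidf'):
  print(mode, to_row(seq, mode))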
Example #44
def eval(x, feed_dict=None,
         update_before=None, update_after=None,
         options=None, run_metadata=None):
  ''' Generalized version of code evaluation: it can
  evaluate both python and tensorflow expressions.

  Parameters
  ----------
  x : list, tuple, dictionary, `Tensor`
      tensorfow `Tensor` for evaluation
  feed_dict : dict
      Input dictionary, mapping placeholder -> values
  update_before: {None, list, or dict}
      mapping from `Tensor` to its new value, which is a `Tensor` or a
      real value; the updates are run before evaluating `x`
  update_after: {None, list, or dict}
      same as `update_before`, but the updates are run after
      evaluating `x`
  options: tensorflow.RunOptions
      the options allow controlling the behavior of
      this particular step (e.g. turning tracing on).
  run_metadata: tensorflow.RunMetadata
      When appropriate, the non-Tensor output of this
      step will be collected there. For example,
      when users turn on tracing in options, the
      profiled info will be collected into
      this argument and passed back.

  Example
  -------
  >>> import tensorflow as tf
  >>> from odin import backend as K
  >>> run_metadata = tf.RunMetadata()
  >>> K.eval(...,
  ...        options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
  ...                              output_partition_graphs=True),
  ...        run_metadata=run_metadata)
  >>> with open('log_path', 'w') as f:
  >>>   f.write(str(run_metadata))

  Note
  ----
  If "Couldn't open CUDA library libcupti.so.8.0" appears when you
  adding RunOptions, try adding "/usr/local/cuda/extras/CUPTI/lib64/"
  to your LD_LIBRARY_PATH
  '''
  results = ()
  update_before = _validate_updates(update_before)
  update_after = _validate_updates(update_after)
  # ====== run updates before ====== #
  if update_before is not None:
    get_session(update_before.graph).run(update_before, feed_dict=feed_dict,
                                         options=options,
                                         run_metadata=run_metadata)
  # ====== list of Tensor or string ====== #
  if isinstance(x, (tuple, list)):
    string_eval = []
    tensor_eval = []
    tensor_idx = []
    # evaluate string expression
    for i, j in enumerate(x):
      if is_string(j):
        string_eval.append(builtins.eval(j))
      else:
        tensor_eval.append(j)
        tensor_idx.append(i)
    # evaluate tensor
    if len(tensor_eval) > 0:
      graph = [i.graph for i in tensor_eval]
      if len(set(graph)) > 1:
        raise RuntimeError("Cannot evaluate multiple `Tensor` come from "
                           "different `Graph`.")
      tensor_eval = get_session(graph[0]).run(tensor_eval,
                                              feed_dict=feed_dict,
                                              options=options,
                                              run_metadata=run_metadata)
    results = tuple([tensor_eval.pop(0) if i in tensor_idx else string_eval.pop(0)
                     for i in range(len(x))])
  # ====== mapping ====== #
  elif isinstance(x, Mapping):
    results = {}
    tensor_eval_key = []
    tensor_eval_value = []
    for k, v in x.items():
      if is_string(v):
        results[k] = builtins.eval(v)
      else:
        tensor_eval_key.append(k)
        tensor_eval_value.append(v)
    # evaluate tensor
    if len(tensor_eval_value) > 0:
      graph = [i.graph for i in tensor_eval_value]
      if len(set(graph)) > 1:
        raise RuntimeError("Cannot evaluate multiple `Tensor` come from "
                           "different `Graph`.")
      tensor_eval_value = get_session(graph[0]).run(tensor_eval_value,
                                                    feed_dict=feed_dict,
                                                    options=options,
                                                    run_metadata=run_metadata)
    # update results
    for k, v in zip(tensor_eval_key, tensor_eval_value):
      results[k] = v
  # ====== just a string ====== #
  elif is_string(x):
    results = builtins.eval(x)
  # ====== just a Tensorflow object ====== #
  elif isinstance(x, tf.Operation) or \
  is_tensor(x, inc_distribution=True, inc_variable=True):
    results = get_session(x.graph).run(x, feed_dict=feed_dict,
                                       options=options,
                                       run_metadata=run_metadata)
  # ====== exception ====== #
  else:
    raise RuntimeError("Cannot evaluate object of type: %s" % type(x))
  # ====== run updates after ====== #
  if update_after is not None:
    get_session(update_after.graph).run(update_after, feed_dict=feed_dict,
                                        options=options,
                                        run_metadata=run_metadata)
  return results
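For list input, `eval` splits the elements into Python string expressions and tensors, evaluates all tensors in one session run, and then re-interleaves both result sets by their original positions. A framework-free sketch of that merge step, with hypothetical callables standing in for string and tensor evaluation:

# Hypothetical sketch of the position-preserving merge used for list input.
def merge_results(x, is_tensor, eval_tensor, eval_string):
  """Evaluate strings and tensors separately, then restore the original order."""
  tensor_idx = [i for i, j in enumerate(x) if is_tensor(j)]
  string_out = [eval_string(j) for j in x if not is_tensor(j)]
  tensor_out = eval_tensor([x[i] for i in tensor_idx])  # one batched "run"
  return tuple(tensor_out.pop(0) if i in tensor_idx else string_out.pop(0)
               for i in range(len(x)))

# toy usage: "tensors" are ints doubled in one batch, strings go through eval()
print(merge_results(['1 + 1', 5, '2 * 3', 7],
                    is_tensor=lambda v: isinstance(v, int),
                    eval_tensor=lambda ts: [t * 2 for t in ts],
                    eval_string=eval))   # (2, 10, 6, 14)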
Beispiel #45
0
 def post_processing(result):
     # search for file name
     if self.identifier not in result:
         raise RuntimeError(
             "Cannot find identifier '%s' in returned dictionary" %
             self.identifier)
     file_name = result[self.identifier]
     # invalid file_name
     if not is_string(file_name):
         raise RuntimeError(
             "Cannot find file name in returned features "
             "list, the file name can be specified in key: 'name', 'path' "
             "and the type of the value must be string. All available "
             "keys are: %s" % str(result.keys()))
     # store all new indices
     # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
     all_indices = {}
     # processing
     for feat_name, X in result.items():
         # some invalid feat_name
         if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
             raise RuntimeError(
                 "Returned features' name cannot be one "
                 "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
             )
         # ignore some feat_name
         if feat_name in ('name',):
             continue
         # if numpy ndarray, save to MmapData
         if isinstance(X, np.ndarray) or \
         'sum1' == feat_name[-4:] or \
         'sum2' == feat_name[-4:]:
             # save statistics instead
             if 'sum1' == feat_name[-4:]:
                 stats[feat_name[:-4]][0] += X
             elif 'sum2' == feat_name[-4:]:
                 stats[feat_name[:-4]][1] += X
             # save features array
             else:
                 all_indices[feat_name] = X.shape[0]
                 # cache data, only if we have more than 0 sample
                 if X.shape[0] > 0:
                     cache[feat_name].append(X)
         # else all other kind of data save to MmapDict
         else:
             databases[feat_name][file_name] = X
         # remove data
         del X
     # ====== update indices ====== #
     if len(all_indices) > 0:
         for feat_name, n in all_indices.items():
             ids_name = 'indices_%s' % feat_name
             databases[ids_name][file_name] = (last_start[ids_name],
                                               last_start[ids_name] + n)
             last_start[ids_name] += n
     # ====== flush cache ====== #
     n_processed[0] += 1
     if n_processed[0] % cache_limit == 0:  # 12 + 8
         for feat_name, X_cached in cache.items():
             flush_feature(feat_name, X_cached)
         cache.clear()
     # ====== update progress ====== #
     return file_name
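The `all_indices`/`last_start` bookkeeping above appends each file's rows to a growing per-feature array and records the file's (start, end) row range. A self-contained sketch of that running-offset pattern (names such as `record` and `utt1` are illustrative):

# Hypothetical sketch of the running (start, end) index bookkeeping per feature.
from collections import defaultdict
import numpy as np

last_start = defaultdict(int)     # feature -> next free row
indices = defaultdict(dict)       # feature -> {file_name: (start, end)}

def record(file_name, features):
  for feat_name, X in features.items():
    n = X.shape[0]
    start = last_start[feat_name]
    indices[feat_name][file_name] = (start, start + n)
    last_start[feat_name] = start + n

record('utt1', {'mfcc': np.zeros((100, 13))})
record('utt2', {'mfcc': np.zeros((80, 13))})
print(indices['mfcc'])   # {'utt1': (0, 100), 'utt2': (100, 180)}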
Beispiel #46
0
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
  at once.

  """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [
            name for name in as_tuple(feat_name, t=str) if name in dataset
        ]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # init PCA
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        # reuse an existing PCA model if available
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None,
                               whiten=False,
                               copy=True,
                               batch_size=None)
        # No shuffling makes the iteration much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finished; yield the feature name
        yield name

    mpi = MPI(jobs=feat_name,
              func=map_pca,
              ncpu=None,
              batch=1,
              hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True,
                   print_report=True,
                   name='PCA')
    for n in mpi:
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
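Inside `map_pca`, each feature matrix is streamed batch-by-batch through `partial_fit` and the fitted model is pickled next to the dataset. A rough equivalent using scikit-learn's `IncrementalPCA` as a stand-in for `odin.ml.MiniBatchPCA` (which exposes the same `partial_fit` call above); the data, batch size, and output path are placeholders:

# Rough sketch: sklearn's IncrementalPCA standing in for odin.ml.MiniBatchPCA.
import pickle
import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.randn(10000, 40).astype('float32')  # placeholder feature matrix
batch_size = 1024
pca = IncrementalPCA(n_components=None, whiten=False, batch_size=batch_size)
for start in range(0, X.shape[0], batch_size):
  pca.partial_fit(X[start:start + batch_size])     # streaming fit, no shuffling
with open('pca_mspec.pkl', 'wb') as f:             # placeholder output path
  pickle.dump(pca, f, protocol=pickle.HIGHEST_PROTOCOL)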
Beispiel #47
0
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" % str(type(ds)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (including the features in external_indices)
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Indices are not contiguous"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # final length match length of Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # not all value closed to zeros
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
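The per-segment validation above reduces to two numpy checks: no NaN values, and not all values numerically close to zero. A minimal sketch of those checks in isolation (the helper name is illustrative):

# Minimal sketch of the two sanity checks applied to each stored segment.
import numpy as np

def check_segment(dat):
  """Return a list of problems found in one feature segment."""
  problems = []
  if np.any(np.isnan(dat)):
    problems.append('NaN values')
  if np.all(np.isclose(dat, 0.)):
    problems.append('All-closed-zeros values')
  return problems

print(check_segment(np.zeros((10, 3))))                   # ['All-closed-zeros values']
print(check_segment(np.array([[1., np.nan], [2., 3.]])))  # ['NaN values']
print(check_segment(np.ones((10, 3))))                    # []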
Beispiel #48
0
  def save_cache(self, path, name=None, dtype=None, batch_size=1024):
    """ Save all preprocessed data to a Dataset

    Parameters
    ----------
    path: string
        path to a folder
    name: None, or list of string
        specific name for each returned `numpy.ndarray` during iteration
    dtype: None, or list of dtype, or single dtype
        specific dtype for all or each of returned `numpy.ndarray`
        during iteration
    batch_size: int
        number of samples in each batch (larger batches give faster iteration)

    Note
    ----
    Only the returned `numpy.ndarray` outputs are saved
    """
    from odin.fuel.dataset import Dataset
    if not is_string(path):
      raise ValueError("`path` must be a string path to a folder.")
    if os.path.exists(path) and os.path.isfile(path):
      raise ValueError("`path` is a file; a folder is required for "
                       "saving all cached data.")
    # ====== start caching ====== #
    prog = Progbar(target=len(self),
                   name='Saving cache of preprocessed data',
                   print_report=True, print_summary=True)
    ds = Dataset(path, override=True)
    with self.set_batch_context(batch_size=int(batch_size), seed=None,
                                start=0, end=-1, shuffle_level=0):
      for X in self:
        if not isinstance(X, (tuple, list)):
          X = (X,)
        n = 0
        i = 0
        # saving preprocessed data
        for x in X:
          if isinstance(x, np.ndarray):
            # checking name
            if name is None:
              x_name = 'X%d' % i
            else:
              x_name = name[i]
            # checking dtype
            if isinstance(dtype, (tuple, list)):
              x = x.astype(dtype[i])
            elif dtype is not None:
              x = x.astype(dtype)
            # saving to the dataset
            if x_name in ds:
              ds[x_name].append(x)
            else:
              ds[(x_name, 'memmap')] = x
            # update samples count, and data count
            n = x.shape[0]
            i += 1
        # print progress
        prog.add(n)
    # ====== flush and close everything ====== #
    ds.flush()
    ds.close()
    with open(os.path.join(path, 'README'), 'w') as f:
      f.write(str(self))
    # end
    # ====== check one more time ====== #
    ds = Dataset(path, read_only=True)
    print(ds)
    print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
    ds.close()
    return self
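Inside the iteration loop, each returned array is given a name (`X0`, `X1`, ... unless `name` is provided) and optionally cast to the requested dtype before being appended to the dataset. A small, dataset-free sketch of just that naming and casting logic (the helper `prepare_outputs` is hypothetical):

# Sketch of the per-output naming and dtype handling performed inside save_cache.
import numpy as np

def prepare_outputs(X, name=None, dtype=None):
  outs = {}
  for i, x in enumerate(arr for arr in X if isinstance(arr, np.ndarray)):
    x_name = 'X%d' % i if name is None else name[i]
    if isinstance(dtype, (tuple, list)):
      x = x.astype(dtype[i])
    elif dtype is not None:
      x = x.astype(dtype)
    outs[x_name] = x
  return outs

batch = (np.random.rand(32, 13), np.arange(32))
print({k: v.dtype for k, v in prepare_outputs(batch, dtype='float32').items()})
# {'X0': dtype('float32'), 'X1': dtype('float32')}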
Beispiel #49
0
def _to_numpy_array(self, x):
  if not is_string(x[0]) and len(set(i.shape[1:] for i in x)) == 1:
    return np.concatenate(x, axis=0)
  return np.array(x)
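The helper concatenates along axis 0 only when the first element is not a string and all elements share the same trailing shape; otherwise it falls back to `np.array`, which keeps strings (or ragged entries) as a plain array. A quick demonstration, with `isinstance(x[0], str)` standing in for `is_string`:

# Demonstrating the two paths of _to_numpy_array-style conversion.
import numpy as np

def to_numpy_array(x):
  if not isinstance(x[0], str) and len(set(i.shape[1:] for i in x)) == 1:
    return np.concatenate(x, axis=0)
  return np.array(x)

print(to_numpy_array([np.ones((2, 3)), np.zeros((4, 3))]).shape)  # (6, 3)
print(to_numpy_array(['utt1', 'utt2']))                           # array of strings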
Beispiel #50
0
 def __getitem__(self, key):
   if is_string(key):
     key = slice(*self.indices[key])
   return super(IndexedData, self).__getitem__(key)
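String keys are resolved through the stored `indices` mapping into a row slice before delegating to the underlying data. A minimal sketch of that name-to-slice lookup, with a plain dict and a numpy array as hypothetical stand-ins for `IndexedData`'s indices and storage:

# Sketch of name-to-slice lookup, with a dict/ndarray standing in
# for IndexedData's indices and underlying storage.
import numpy as np

class NamedRows:
  def __init__(self, data, indices):
    self.data, self.indices = data, indices

  def __getitem__(self, key):
    if isinstance(key, str):
      key = slice(*self.indices[key])   # (start, end) -> slice(start, end)
    return self.data[key]

X = NamedRows(np.arange(20).reshape(10, 2), {'utt1': (0, 4), 'utt2': (4, 10)})
print(X['utt1'].shape)   # (4, 2)
print(X[2:5].shape)      # (3, 2)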
Beispiel #52
0
 def _transform(self, X):
   # ====== parse the input ====== #
   raw = None
   path = None
   if isinstance(X, Mapping):
     if 'path' in X:
       path = X['path']
     if 'sr' in X:
       if self.sr is None:
         self.sr = X['sr']
         self._update_config()
         self._first_config_generated = True
       elif self.sr != X['sr']:
         raise ValueError("Given sample rate: %d, but the audio file has "
                          "sample rate: %d" % (self.sr, X['sr']))
     if 'raw' in X:
       raw = X['raw']
   elif is_string(X):
     path = X
   elif isinstance(X, np.ndarray):
     raw = X
   else:
     raise ValueError("openSMILE extractor require path to audio file.")
   # no sample rate specified, cannot generate appropriate config
   if self.sr is None:
     raise RuntimeError("Cannot acquire sample rate for the input.")
   # ====== first time generate config ====== #
   if not self._first_config_generated:
     self._first_config_generated = True
     self._update_config()
   # ====== extract SAD ====== #
   unique_id = os.getpid() + random.randint(0, 10**8)
   inpath = os.path.join(
       get_logpath(), '%s%d.wav' % (self.__class__.__name__, unique_id))
   outpath = os.path.join(
       get_logpath(), '%s%d.csv' % (self.__class__.__name__, unique_id))
   try:
     if path is None or not os.path.exists(path):
       if raw is None:
         raise RuntimeError("openSMILE require input audio file, since "
             "we cannot find any audio file, it is required to provide "
             "raw array and sample rate, so the audio file will be cached.")
       from soundfile import write
       write(inpath, data=raw, samplerate=self.sr)
       path = inpath
     # build and run the SMILExtract command
     command = 'SMILExtract -loglevel %d -C %s -I %s -O %s' % \
         (self._log_level, self.config_path, path, outpath)
     os.system(command)
     results = np.genfromtxt(outpath, dtype='float32',
                             delimiter=',', skip_header=0)
   except Exception as e:
     import traceback; traceback.print_exc()
     raise e
   finally:
     if os.path.exists(inpath):
       os.remove(inpath)
     if os.path.exists(outpath):
       os.remove(outpath)
   # ====== post-processing ====== #
   X_update = self._post_processing(results)
   if not isinstance(X_update, dict):
     raise ValueError("_post_processing must return a dictionary.")
   return X_update
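The extractor round-trips through the file system: write the raw waveform to a temporary WAV, invoke the SMILExtract command line, read the CSV back with `numpy.genfromtxt`, and always remove the temporary files in `finally`. A hedged sketch of that pattern (flags copied from the command above; it assumes the `SMILExtract` binary and the `soundfile` package are available, and `config_path` is a placeholder):

# Sketch of the write -> SMILExtract -> read -> cleanup round trip.
import os
import subprocess
import tempfile
import numpy as np

def run_smile(raw, sr, config_path, log_level=1):
  fd, inpath = tempfile.mkstemp(suffix='.wav'); os.close(fd)
  fd, outpath = tempfile.mkstemp(suffix='.csv'); os.close(fd)
  try:
    from soundfile import write
    write(inpath, data=raw, samplerate=sr)  # cache the raw audio to disk
    subprocess.check_call(['SMILExtract', '-loglevel', str(log_level),
                           '-C', config_path, '-I', inpath, '-O', outpath])
    return np.genfromtxt(outpath, dtype='float32', delimiter=',', skip_header=0)
  finally:
    for p in (inpath, outpath):
      if os.path.exists(p):
        os.remove(p)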