def add_indices(self, indices, name, override=False):
    """Store a named set of indices on disk and register it.

    Parameters
    ----------
    indices : Mapping, tuple, list or numpy.ndarray
        either a mapping `entry -> (start, end)`, or a sequence whose
        elements are one of:
          * a string `entry`, its range is copied from `self['indices']`
          * a pair `(entry, (start, end))`
          * a triple `(entry, start, end)`
    name : str
        identifier of the saved indices, also used as the file name
        inside `self.index_path`
    override : bool
        if False, an already registered `name` is rejected

    Returns
    -------
    self

    Raises
    ------
    ValueError
        if `name` is not a string, if `name` is already registered and
        `override` is False, or if an element of `indices` cannot be parsed
    """
    # ====== validate name ====== #
    if not is_string(name):
        raise ValueError("`name` must be string, but given: %s" %
                         str(type(name)))
    if name in self._saved_indices and not override:
        raise ValueError("Cannot override pre-defined INDEX with name: '%s'"
                         % name)
    # ====== validate indices ====== #
    path = os.path.join(self.index_path, name)
    ids = MmapDict(path)
    # NOTE: the loop variables must NOT be called `name`; reusing that
    # identifier would register the index under the last entry's key
    # instead of the requested `name`.
    # predefined mapping, save or copy everything to a MmapDict
    if isinstance(indices, Mapping):
        for entry, (start, end) in indices.items():
            ids[entry] = (start, end)
    # list of name, or (name, (start, end))
    elif isinstance(indices, (tuple, list, np.ndarray)):
        for i in indices:
            if is_string(i):  # only name
                ids[i] = self['indices'][i]
            elif len(i) == 2:  # name, (start, end)
                entry, (start, end) = i
                ids[entry] = (int(start), int(end))
            elif len(i) == 3:  # name, start, end
                entry, start, end = i
                ids[entry] = (int(start), int(end))
            else:
                raise ValueError("Unsupport index parsing (name, start, end)"
                                 "for: %s" % str(i))
    # flush everything to disk
    ids.flush(save_all=True)
    ids.close()
    # ====== assign new index ====== #
    self._saved_indices[name] = MmapDict(path, read_only=True)
    return self
def create_feeder(self, data, recipes, indices=None, batch_filter=None, batch_mode='batch', name=None, override=False):
    """Create a `Feeder` over data stored in this object.

    Parameters
    ----------
    data: list of str
        names of all data used; the order of this list is the order of
        the returned data. Non-string items are converted by `as_data`.
    recipes: string, list or single odin.fuel.FeederRecipe
        recipes defining the rule of transforming the data, or the name
        of previously saved recipes
    indices: None, string, dict, list
        list of (name, (start, end)) for iterating over files in Feeder;
        None uses `self['indices']`, a string selects saved indices
    batch_filter: call-able
        must be a function taking a list of np.ndarray as first argument
        ([X]) or ([X, y]); return None to ignore the given batch, return
        the data to accept the batch
    batch_mode: 'batch' or 'file' (string type)
        'batch' mode shuffles and returns everything in small batches
        'file' mode returns [(file_name, order_index, data...), ...]
    name: None, or string
        if a name is provided, the feeder information (`indices` and
        `recipes`) is saved under that name
    override: bool
        if True, information already saved under `name` is replaced

    Note
    ----
    By default the Feeder is created using only 1 CPU with
    `buffer_size=1`; use the method
    `set_multiprocessing(ncpu=None, buffer_size=None, maximum_queue_size=None)`
    for changing this information.
    """
    from odin.fuel.feeder import Feeder, IndexedData
    # ====== resolve every data item ====== #
    resolved = []
    for item in as_tuple(data):
        if is_string(item):
            resolved.append(self.__getitem__(item))
        else:
            resolved.append(as_data(item))
    data = resolved
    # ====== resolve recipes ====== #
    recipes = (self._saved_recipes[recipes] if is_string(recipes)
               else as_tuple(recipes, t=FeederRecipe))
    # ====== resolve indices ====== #
    if indices is None:
        indices = self.__getitem__('indices')
    elif is_string(indices):
        indices = self._saved_indices[indices]
    # any other value (Mapping, tuple, list, ndarray, ...) is used as given
    # ====== saving recipes and indices, if name is not None ====== #
    if is_string(name):
        save_indices = override or name not in self._saved_indices
        if save_indices:
            self.add_indices(indices, name, override=True)
        save_recipes = override or name not in self._saved_recipes
        if save_recipes:
            self.add_recipes(recipes, name, override=True)
    # ====== create Feeder ====== #
    indexed = IndexedData(data=data, indices=indices)
    feeder = Feeder(indexed, batch_filter=batch_filter,
                    batch_mode=batch_mode, ncpu=1, buffer_size=1)
    return feeder.set_recipes(recipes)
def _validate_texts(self, texts):
    """Check `texts` and return it in an iterable form.

    A single string is wrapped into a 1-element tuple; any other
    iterable or iterator is returned unchanged.

    Raises
    ------
    ValueError
        if `texts` is neither a string, an Iterable nor an Iterator
    """
    acceptable = (isinstance(texts, Iterable) or
                  isinstance(texts, Iterator) or
                  is_string(texts))
    if not acceptable:
        raise ValueError(
            'texts must be an iterator, generator or a string.')
    return (texts, ) if is_string(texts) else texts
def _validate_texts(self, texts):
    """ Validate the input to `fit` and `transform`

    Parameters
    ----------
    texts : string, bytes, Iterable or Iterator
        a single text, or a collection of texts

    Returns
    -------
    generator yielding every text as unicode; byte strings are decoded
    as UTF-8, texts that are already unicode are passed through.

    Raises
    ------
    ValueError
        if `texts` is neither a string, an Iterable nor an Iterator
    """
    if is_string(texts) or isinstance(texts, bytes):
        # a single text; bytes is handled explicitly because iterating
        # over it would yield integers instead of texts
        texts = (texts,)
    elif not isinstance(texts, (Iterable, Iterator)):
        raise ValueError('texts must be an iterator, generator or a string.')
    # convert to unicode: only byte strings need (and support) decoding,
    # calling `.decode` on an unicode string fails
    return (t.decode('utf-8') if isinstance(t, bytes) else t
            for t in texts)
def _validate_texts(self, texts):
    """ Validate the input to `fit` and `transform`

    Parameters
    ----------
    texts : string, bytes, Iterable or Iterator
        a single text, or a collection of texts

    Returns
    -------
    generator yielding every text as unicode; byte strings are decoded
    as UTF-8, texts that are already unicode are passed through.

    Raises
    ------
    ValueError
        if `texts` is neither a string, an Iterable nor an Iterator
    """
    single_text = is_string(texts) or isinstance(texts, bytes)
    if not single_text and \
    not isinstance(texts, Iterable) and \
    not isinstance(texts, Iterator):
        raise ValueError(
            'texts must be an iterator, generator or a string.')
    if single_text:
        # bytes is wrapped too, iterating over it would yield integers
        texts = (texts, )

    def _to_unicode(t):
        # only byte strings need (and support) decoding
        return t.decode('utf-8') if isinstance(t, bytes) else t

    # convert to unicode
    return (_to_unicode(t) for t in texts)
def item2step(x):
    """Convert `x` into a `(name, extractor)` pair.

    Accepted forms are a bare `Extractor`, a 1-element tuple/list holding
    an `Extractor` (both get an auto-generated name built from the class
    name and the running counter `ID[0]`), or a 2-element tuple/list of a
    string and an `Extractor` in either order. Anything else gives None.
    """
    def _auto_named(extractor):
        # bump the shared counter, then build "<ClassName><counter>"
        ID[0] += 1
        return (extractor.__class__.__name__ + str(ID[0]), extractor)

    if isinstance(x, (tuple, list)):
        if len(x) == 1 and isinstance(x[0], Extractor):
            return _auto_named(x[0])
        if len(x) == 2:
            if is_string(x[0]) and isinstance(x[1], Extractor):
                return x
            if is_string(x[1]) and isinstance(x[0], Extractor):
                # given as (extractor, name), swap to (name, extractor)
                return (x[1], x[0])
        return None
    if isinstance(x, Extractor):
        return _auto_named(x)
    return None
def __new__(subclass, path, read_only=False, cache_size=250, *args, **kwargs):
    """Return the instance bound to `path`, creating it on first use.

    Instances are kept in `NoSQL._INSTANCES[<class name>]`, keyed by the
    absolute path; asking for a known path returns the stored instance.

    Raises
    ------
    ValueError
        if `path` is not a string
    """
    if not is_string(path):
        raise ValueError("`path` for MmapDict must be string, but given "
                         "object with type: %s" % type(path))
    path = os.path.abspath(path)
    read_only = bool(read_only)
    cache_size = int(cache_size)
    # ====== Found pre-defined instance ====== #
    registry = NoSQL._INSTANCES[subclass.__name__]
    if path in registry:
        return registry[path]
    # ====== Create new instance ====== #
    instance = super(NoSQL, subclass).__new__(subclass)
    registry[path] = instance
    # some pre-defined attribute
    for attr, value in (('_cache_size', cache_size),
                        ('_read_only', read_only),
                        ('_new_args_called', False),
                        ('_path', path),
                        ('_is_closed', False)):
        setattr(instance, attr, value)
    return instance
def add_recipes(self, recipes, name, override=False):
    """Pickle a named tuple of recipes to disk and register it.

    Parameters
    ----------
    recipes : RecipeList, FeederRecipe, or list of them
        any `RecipeList` is flattened into its `_recipes`
    name : str
        identifier of the saved recipes, also the file name inside
        `self.recipe_path`
    override : bool
        if False, an already registered `name` is rejected

    Returns
    -------
    self
    """
    # ====== validate arguments ====== #
    if not is_string(name):
        raise ValueError("`name` must be string, but given: %s" % str(type(name)))
    already_saved = name in self._saved_recipes
    if already_saved and not override:
        raise ValueError("Cannot override pre-defined RECIPE with name: '%s'" % name)
    # ====== flatten the recipes into a single tuple ====== #
    if isinstance(recipes, RecipeList):
        flat = list(recipes._recipes)
    else:
        flat = []
        for rcp in as_tuple(recipes, t=FeederRecipe):
            if isinstance(rcp, RecipeList):
                flat.extend(list(rcp._recipes))
            else:
                flat.append(rcp)
    recipes = tuple(flat)
    # ====== store the recipes to disk ====== #
    with open(os.path.join(self.recipe_path, name), 'wb') as f:
        cPickle.dump(recipes, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # ====== update local recipes list ====== #
    self._saved_recipes[name] = recipes
    return self
def embed(self, vocabulary, dtype='float32', token_not_found='ignore'):
    """Create an embedding matrix for `self.dictionary`.

    Parameters
    ----------
    vocabulary : Mapping
        mapping from word to its embedding vector, all vectors must have
        the same length
    dtype : str or numpy.dtype
        dtype of the returned matrix
    token_not_found : 'ignore', 'raise', number or string
        what to do with a word that is not found in `vocabulary`:
          * 'ignore': its row is left all-zeros
          * 'raise': an Exception is raised
          * a number: index of the token whose row is used instead
          * any other string: a token of `self.dictionary` whose row is
            used instead

    Returns
    -------
    numpy.ndarray of shape `(len(self.dictionary), ndim)`

    Raises
    ------
    ValueError
        if `vocabulary` is not a Mapping or is empty, or if
        `token_not_found` is neither a number nor a string
    Exception
        if `token_not_found='raise'` and a word is missing
    """
    # ====== check vocab ======= #
    if not isinstance(vocabulary, Mapping):
        raise ValueError('"vocabulary" must be any instance of dict.')
    if len(vocabulary) == 0:
        raise ValueError('"vocabulary" must contain at least one word.')
    # ====== check token_not_found ====== #
    if not is_number(token_not_found) and \
    not is_string(token_not_found):
        raise ValueError('token_not_found can be: "ignore", "raise"'
                         ', an integer of token index, or a string '
                         'represented a token.')
    # a number is already a token index, it must be checked first,
    # otherwise it would be wrongly looked up as a token
    if is_number(token_not_found):
        token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
        token_not_found = int(self.dictionary[token_not_found])
    # ====== create embedding matrix ====== #
    # `values()` is not an iterator, it must be wrapped by `iter`
    ndim = len(next(iter(vocabulary.values())))
    matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
    missing = []
    for word, idx in self.dictionary.items():
        if len(word) == 0:
            continue
        if word in vocabulary:
            matrix[idx, :] = vocabulary[word]
        elif token_not_found == 'raise':
            raise Exception('Cannot find token "%s" in the vocabulary.' % word)
        elif isinstance(token_not_found, int):
            missing.append(idx)
    # the fallback row is copied only after every known word is filled,
    # so the result does not depend on the iteration order
    if len(missing) > 0:
        matrix[missing, :] = matrix[token_not_found, :]
    return matrix
def _apply(self, X):
    """Upsample `X`, then pad/crop it to `self.desire_shape` if given.

    `self.axes='auto'` selects axes (1,), (1, 2) or (1, 2, 3) for 3-D,
    4-D and 5-D inputs. For every dimension where both the static shape
    of `X` and the desired size are known, the tensor is zero-padded
    (smaller) or center-cropped (larger); an odd difference puts the
    extra element at the beginning for padding and removes it from the
    end for cropping. `None` or negative entries of `desire_shape` leave
    the dimension untouched.
    """
    axes = self.axes
    ndims = X.shape.ndims
    if is_string(axes) and axes.lower() == 'auto':
        if ndims == 3:
            axes = (1,)
        elif ndims == 4:
            axes = (1, 2)
        elif ndims == 5:
            axes = (1, 2, 3)
    X = K.upsample(X, scale=self.size, axes=axes, method=self.mode)
    # ====== check desire_shape ====== #
    desire_shape = self.desire_shape
    if desire_shape is not None:
        desire_shape = [None if i is None or i < 0 else int(i)
                        for i in desire_shape]
        # do padding if necessary; both sizes are static python integers,
        # hence the amounts are computed with integer arithmetic:
        # ceil(d / 2) == (d + 1) // 2 and floor(d / 2) == d // 2
        paddings = []
        for i, o in zip(X.shape.as_list(), desire_shape):
            if i is None or o is None or i >= o:
                paddings.append([0, 0])
            else:
                diff = o - i
                paddings.append([(diff + 1) // 2, diff // 2])
        if any(p != [0, 0] for p in paddings):
            X = tf.pad(X, paddings=paddings, mode='CONSTANT')
        # do slice if necessary
        slices = []
        need_slice = False
        for i, o in zip(X.shape.as_list(), desire_shape):
            if i is not None and o is not None and i > o:
                diff = i - o
                slices.append(slice(diff // 2, -((diff + 1) // 2), None))
                need_slice = True
            else:
                slices.append(slice(None))
        # an explicit flag is used, identity comparison with a newly
        # created `slice(None)` is always True
        if need_slice:
            X = X[tuple(slices)]
        K.set_shape(X, tuple([i if is_number(i) else None
                              for i in desire_shape]))
    return X
def __getitem__(self, key):
    """Return the data stored under `key`, or its path if the data
    object is None.

    Raises
    ------
    ValueError
        if `key` is not a string
    KeyError
        if `key` is not in this dataset
    """
    if not is_string(key):
        raise ValueError('Only accept key type is string.')
    if key not in self._data_map:
        raise KeyError('%s not found in this dataset' % key)
    _, _, data, path = self._data_map[key]
    if data is None:
        return path
    return data
def add_recipes(self, recipes, name, override=False):
    """Save recipes to disk under `name` and remember them.

    Parameters
    ----------
    recipes : RecipeList, FeederRecipe, or list of them
        every `RecipeList` is expanded into the recipes it contains
    name : str
        name of the saved recipes; the pickle file is created with this
        name inside `self.recipe_path`
    override : bool
        allow replacing recipes already saved under `name`

    Returns
    -------
    self
    """
    # ====== validate arguments ====== #
    if not is_string(name):
        raise ValueError("`name` must be string, but given: %s" %
                         str(type(name)))
    if not override and name in self._saved_recipes:
        raise ValueError(
            "Cannot override pre-defined RECIPE with name: '%s'" % name)
    # ====== validate recipes list ====== #
    if isinstance(recipes, RecipeList):
        recipes = tuple(recipes._recipes)
    else:
        expanded = []
        for item in as_tuple(recipes, t=FeederRecipe):
            members = (list(item._recipes) if isinstance(item, RecipeList)
                       else [item])
            expanded += members
        recipes = tuple(expanded)
    # ====== store the recipes to disk ====== #
    recipe_file = os.path.join(self.recipe_path, name)
    with open(recipe_file, 'wb') as out:
        cPickle.dump(recipes, out, protocol=cPickle.HIGHEST_PROTOCOL)
    # ====== update local recipes list ====== #
    self._saved_recipes[name] = recipes
    return self
def __setitem__(self, key, value):
    """Store `value` in this dataset under `key`.

    A `dict` is copied into a `MmapDict`, a `numpy.ndarray` into a
    `MmapData` or `Hdf5Data`, anything else is pickled to a file.
    Nothing happens if `key` already exists in the dataset.

    Parameters
    ----------
    key : str or tuple
        if tuple is specified, it contain the key and the datatype
        which must be "memmap", "hdf5"
        for example: ds[('X', 'hdf5')] = numpy.ones((8, 12))
    value : dict, numpy.ndarray or any picklable object

    Raises
    ------
    ValueError
        if `key` has unsupported type, or the datatype is unknown
    Exception
        if the target file of a dict or pickled object already exists
    """
    if not is_string(key) and not isinstance(key, (tuple, list)):
        raise ValueError(
            '"key" is the name for Data and must be String or '
            'tuple specified the name and datatype (memmap, hdf5).')
    # ====== check datatype ====== #
    datatype = 'memmap'  # default datatype
    if isinstance(key, (tuple, list)):
        key, datatype = key
        # `str` guards against a non-string datatype, which now gives the
        # informative ValueError below instead of an AttributeError
        datatype = str(datatype).lower()
        if datatype not in ('memmap', 'hdf5'):
            raise ValueError(
                'datatype can only be "memmap" or "hdf5", but '
                'the given data type is "%s"' % datatype)
    # ====== do nothing ====== #
    if key in self._data_map:
        return
    # ====== dict ====== #
    path = os.path.join(self.path, key)
    if isinstance(value, dict):
        if os.path.exists(path):
            raise Exception('File with path=%s already exist.' % path)
        d = MmapDict(path)
        # `items` exists on every python version, `iteritems` does not
        for i, j in value.items():
            d[i] = j
        d.flush()
        # store new dict
        self._data_map[key] = (type(d).__name__, len(d), d, path)
    # ====== ndarray ====== #
    elif isinstance(value, np.ndarray):
        dtype, shape = value.dtype, value.shape
        if datatype == 'memmap':
            data = MmapData(path, dtype=dtype, shape=shape)
        else:
            path = os.path.join(self.path, self._default_hdf5)
            f = open_hdf5(path)
            data = Hdf5Data(key, hdf=f, dtype=dtype, shape=shape)
        # store new key
        self._data_map[key] = (data.dtype, data.shape, data, path)
        data.prepend(value)
        # check maximum opened memmap
        self._validate_memmap_max_open(key)
    # ====== other types ====== #
    else:
        if os.path.exists(path):
            raise Exception('File with path=%s already exist.' % path)
        with open(path, 'wb') as f:
            cPickle.dump(value, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # store new dict
        self._data_map[key] = (type(value).__name__,
                               len(value) if hasattr(value, '__len__') else 0,
                               value, path)
def post_processing(result):
    """Dispatch the features returned for one file to the caches and
    databases of the enclosing scope.

    Parameters
    ----------
    result : Mapping
        mapping `feature name -> value`, it must contain the key
        `self.identifier` whose value is the (string) file name.
        Names ending with 'sum1' or 'sum2' are accumulated into `stats`,
        other numpy arrays are cached for flushing and indexed, and
        everything else is saved to `databases[feat_name][file_name]`.

    Returns
    -------
    the file name

    Raises
    ------
    RuntimeError
        if the identifier is missing, the file name is not a string, or a
        feature uses one of the reserved names
    """
    # search for file name
    if self.identifier not in result:
        raise RuntimeError(
            "Cannot find identifier '%s' in returned dictionary" %
            self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
        raise RuntimeError("Cannot find file name in returned features "
            "list, the file name can be specified in key: 'name', 'path' "
            "and the type of the value must be string. All available "
            "keys are: %s" % str(result.keys()))
    # store all new indices
    # mapping [feat_name] -> X.shape[0]
    all_indices = {}
    # processing
    for feat_name, X in result.items():
        # some invalid feat_name
        if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
            raise RuntimeError("Returned features' name cannot be one "
                "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
        # ignore some feat_name; equality is required here, `in ('name')`
        # is a substring test that also skips names such as 'a' or 'me'
        if feat_name == 'name':
            continue
        is_sum1 = 'sum1' == feat_name[-4:]
        is_sum2 = 'sum2' == feat_name[-4:]
        # save statistics instead
        if is_sum1:
            stats[feat_name[:-4]][0] += X
        elif is_sum2:
            stats[feat_name[:-4]][1] += X
        # if numpy ndarray, save features array to MmapData
        elif isinstance(X, np.ndarray):
            all_indices[feat_name] = X.shape[0]
            # cache data, only if we have more than 0 sample
            if X.shape[0] > 0:
                cache[feat_name].append(X)
        # else all other kind of data save to MmapDict
        else:
            databases[feat_name][file_name] = X
    # ====== update indices ====== #
    for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        start = last_start[ids_name]
        databases[ids_name][file_name] = (start, start + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
    # ====== update progress ====== #
    return file_name
def __setitem__(self, key, value):
    """Add `value` to this dataset under `key`.

    A Mapping is copied into a `MmapDict`, a `numpy.ndarray` into a
    `MmapData` or `Hdf5Data`, any other object is pickled to a file.
    An already existing `key` is left untouched.

    Parameters
    ----------
    key : str or tuple
        if tuple is specified, it contain the key and the datatype
        which must be "memmap", "hdf5"
        for example: ds[('X', 'hdf5')] = numpy.ones((8, 12))
    value : Mapping, numpy.ndarray or any picklable object
    """
    key_is_pair = isinstance(key, (tuple, list))
    if not (is_string(key) or key_is_pair):
        raise ValueError('"key" is the name for Data and must be String or '
                         'tuple specified the name and datatype (memmap, hdf5).')
    # ====== check datatype ====== #
    datatype = 'memmap'  # default datatype
    if key_is_pair:
        key, datatype = key
        datatype = str(datatype).lower()
        if datatype not in ('memmap', 'hdf5'):
            raise ValueError('datatype can only be "memmap" or "hdf5", but '
                             'the given data type is "%s"' % datatype)
    # ====== do nothing ====== #
    if key in self._data_map:
        return
    path = os.path.join(self.path, key)
    # ====== ndarray ====== #
    if not isinstance(value, Mapping) and isinstance(value, np.ndarray):
        dtype = value.dtype
        shape = value.shape
        if datatype == 'memmap':
            data = MmapData(path, dtype=dtype, shape=shape)
        else:
            path = os.path.join(self.path, self._default_hdf5)
            data = Hdf5Data(key, hdf=open_hdf5(path), dtype=dtype, shape=shape)
        # store new key
        self._data_map[key] = (data.dtype, data.shape, data, path)
        data[:shape[0]] = value
        # check maximum opened memmap
        self._validate_memmap_max_open(key)
        return
    # ====== dict and other types both need a fresh file ====== #
    if os.path.exists(path):
        raise Exception('File with path=%s already exist.' % path)
    if isinstance(value, Mapping):
        store = MmapDict(path)
        for k, v in value.items():
            store[k] = v
        store.flush()
        # store new dict
        self._data_map[key] = (type(store).__name__, len(store), store, path)
    else:
        with open(path, 'wb') as out:
            cPickle.dump(value, out, protocol=cPickle.HIGHEST_PROTOCOL)
        length = len(value) if hasattr(value, '__len__') else 0
        # store new object
        self._data_map[key] = (type(value).__name__, length, value, path)
def __init__(self, pool_size=2, strides=None, dilation=1, pad='valid', mode='max', transpose_mode='nn', **kwargs):
    """Store the pooling configuration.

    `pool_size`, `strides` and `dilation` are normalized to tuples of
    int; `strides=None` reuses the pool size and `dilation=None` becomes
    `(1,)`. A string `pad` and `mode` are upper-cased; a non-string `pad`
    is normalized to a tuple of int.
    """
    super(Pool, self).__init__(**kwargs)
    self.pool_size = as_tuple(pool_size, t=int)
    if strides is None:
        self.strides = self.pool_size
    else:
        self.strides = as_tuple(strides, t=int)
    if dilation is None:
        self.dilation = (1,)
    else:
        self.dilation = as_tuple(dilation, t=int)
    if is_string(pad):
        self.pad = pad.upper()
    else:
        self.pad = as_tuple(pad, t=int)
    self.mode = mode.upper()
    self.transpose_mode = transpose_mode
def __init__(self, task_name, output_name, lower_better=True, improvement_margin=0, logging=True):
    """Store the checkpoint configuration.

    `output_name` is either a string or an object with a `name`
    attribute; `improvement_margin` must be non-negative.
    """
    super(CheckpointGeneralization, self).__init__(logging=logging)
    self._task_name = str(task_name)
    if is_string(output_name):
        self._output_name = output_name
    else:
        # not a string, use the `name` attribute of the given object
        self._output_name = output_name.name
    self._lower_better = bool(lower_better)
    margin = float(improvement_margin)
    self._improvement_margin = margin
    assert margin >= 0
    self._best_score = None
def _check_label_mode(mode):
    """Normalize the `label_mode` argument.

    A number is clipped into [0, 1]; a string is lower-cased, 'mid' is
    an alias of 'middle', and the result must be one of 'common', 'last',
    'first', 'middle'.

    Raises
    ------
    ValueError
        for an unknown string, or a value that is neither number nor
        string
    """
    if is_number(mode):
        return np.clip(float(mode), 0., 1.)
    if not is_string(mode):
        raise ValueError("No support for `label_mode`=%s" % str(mode))
    lowered = mode.lower()
    normalized = 'middle' if lowered == 'mid' else lowered
    if normalized not in ('common', 'last', 'first', 'middle'):
        raise ValueError(
            "`label_mode` can be: 'common', 'last', 'first', 'middle'")
    return normalized
def get_loaded_param(self, name):
    """Fetch one or several parameters from the loaded parameters dataset.

    Parameters
    ----------
    name : str, or list of str
        name(s) of the parameters

    Returns
    -------
    a single value `ds[name][:]` if `name` is a string, otherwise a tuple
    with one value per name

    Raises
    ------
    RuntimeError
        if any name is not found in the dataset
    """
    ds = self.__class__.load_parameters()
    return_1_param = True if is_string(name) else False
    name = as_tuple(name, t=str)
    for n in name:
        if n not in ds:
            raise RuntimeError("Cannot find parameter with name:'%s' from loaded "
                               "dataset at path: '%s'" % (name, ds.path))
    params = []
    for n in name:
        params.append(ds[n][:])
    if return_1_param:
        return params[0]
    return tuple(params)
def _check_dtype(dtype):
    """Normalize `dtype`.

    Call-able objects are returned untouched; None becomes `K.floatX`;
    numpy dtypes and strings are converted with `str`; a `VariableDesc`
    gives its `dtype`; a `tf.DType` gives the name of its base dtype.
    Any other value is returned as is.
    """
    if hasattr(dtype, '__call__'):
        return dtype
    # ====== check dtype ====== #
    if dtype is None:
        return K.floatX
    if isinstance(dtype, np.dtype) or is_string(dtype):
        return str(dtype)
    if isinstance(dtype, VariableDesc):
        return dtype.dtype
    if isinstance(dtype, tf.DType):
        return dtype.base_dtype.name
    return dtype
def __getitem__(self, key):
    """Return the data stored under `key`.

    If only a descriptor is known for `key` (no data object yet, but the
    dtype and shape are not 'unknown'), a `MmapData` is opened for the
    path and kept in the data map. If no data object is available the
    path is returned instead.

    Raises
    ------
    KeyError
        if `key` is not in this dataset
    ValueError
        if `key` is not a string
    """
    if is_string(key):
        if key not in self._data_map:
            raise KeyError('%s not found in this dataset' % key)
        dtype, shape, data, path = self._data_map[key]
        # return type is just a descriptor, create MmapData for it.
        # NOTE: the marker is compared by value; `is not 'unknown'` relies
        # on string interning and fails for equal but distinct strings
        if data is None and \
        dtype != 'unknown' and shape != 'unknown':
            data = MmapData(path, read_only=self.read_only)
            self._data_map[key] = (data.dtype, data.shape, data, path)
            self._validate_memmap_max_open(key)
        return path if data is None else data
    raise ValueError('Only accept key type is string.')
def __init__(self, **kwargs):
    """Initialize the model and describe its inputs.

    `self.get_input_info()` must return a non-empty Mapping of
    'input-name' -> (shape-tuple, dtype-string); every entry is stored in
    `self._kwargs_desc` as a `VariableDesc`.

    Raises
    ------
    ValueError
        if the value returned by `get_input_info` is malformed
    """
    super(Model, self).__init__(**kwargs)
    input_info = self.get_input_info()
    is_valid = (isinstance(input_info, Mapping) and
                len(input_info) != 0 and
                all(is_string(k) and _validate_shape_dtype(v)
                    for k, v in input_info.items()))
    if not is_valid:
        raise ValueError("`get_input_info` must return a (length > 0) Mapping "
            "of: 'input-name' -> (shape-tuple, dtype-string), but the "
            "returned value is: %s" % str(input_info))
    # ====== init kwargs_desc ====== #
    for name, shape_dtype in input_info.items():
        shape, dtype = shape_dtype
        self._kwargs_desc[name] = VariableDesc(
            shape=shape, name=name, dtype=dtype)
def _validate_shape_dtype(x):
    """Check that `x` is a pair `(shape, dtype)`.

    Parameters
    ----------
    x : object
        value to validate

    Returns
    -------
    True if `x` is a tuple of length 2, whose first element is a tuple
    (or list) containing only numbers or None, and whose second element
    is a string; False otherwise.
    """
    if not isinstance(x, tuple):
        return False
    if not len(x) == 2:
        return False
    shape, dtype = x
    # check shape: the container type AND every dimension of `shape`
    # (not the elements of the pair `x`) must be valid. Lists are accepted
    # as well, so that inputs tolerated previously are not rejected.
    if not isinstance(shape, (tuple, list)):
        return False
    if not all(is_number(i) or i is None for i in shape):
        return False
    # check dtype
    if not is_string(dtype):
        return False
    return True
def add_indices(self, indices, name, override=False):
    """Store a named set of indices on disk and register it.

    Parameters
    ----------
    indices : Mapping, tuple, list or numpy.ndarray
        either a mapping `entry -> (start, end)`, or a sequence whose
        elements are one of:
          * a string `entry`, its range is copied from `self['indices']`
          * a pair `(entry, (start, end))`
          * a triple `(entry, start, end)`
    name : str
        identifier of the saved indices, also used as the file name
        inside `self.index_path`
    override : bool
        if False, an already registered `name` is rejected

    Returns
    -------
    self

    Raises
    ------
    ValueError
        if `name` is not a string, if `name` is already registered and
        `override` is False, or if an element of `indices` cannot be parsed
    """
    # ====== validate name ====== #
    if not is_string(name):
        raise ValueError("`name` must be string, but given: %s" %
                         str(type(name)))
    if name in self._saved_indices and not override:
        raise ValueError(
            "Cannot override pre-defined INDEX with name: '%s'" % name)
    # ====== validate indices ====== #
    path = os.path.join(self.index_path, name)
    ids = MmapDict(path)
    # NOTE: the loop variables must NOT be called `name`; reusing that
    # identifier would register the index under the last entry's key
    # instead of the requested `name`.
    # predefined mapping, save or copy everything to a MmapDict
    if isinstance(indices, Mapping):
        for entry, (start, end) in indices.items():
            ids[entry] = (start, end)
    # list of name, or (name, (start, end))
    elif isinstance(indices, (tuple, list, np.ndarray)):
        for i in indices:
            if is_string(i):  # only name
                ids[i] = self['indices'][i]
            elif len(i) == 2:  # name, (start, end)
                entry, (start, end) = i
                ids[entry] = (int(start), int(end))
            elif len(i) == 3:  # name, start, end
                entry, start, end = i
                ids[entry] = (int(start), int(end))
            else:
                raise ValueError(
                    "Unsupport index parsing (name, start, end)"
                    "for: %s" % str(i))
    # flush everything to disk
    ids.flush(save_all=True)
    ids.close()
    # ====== assign new index ====== #
    self._saved_indices[name] = MmapDict(path, read_only=True)
    return self
def __new__(cls, *args, **kwargs):
    """Return the Dataset bound to the given path, creating it if needed.

    The path is taken from the keyword `path`, or from the first
    positional argument; instances are kept in `Dataset.__INSTANCES`,
    keyed by the absolute path.

    Raises
    ------
    ValueError
        if the path is not a string
    """
    path = kwargs.get('path', None)
    path = args[0] if path is None else path
    if not is_string(path):
        raise ValueError("`path` for Dataset must be string, but given "
                         "object with type: %s" % type(path))
    abs_path = os.path.abspath(path)
    # new Dataset, only when no old instance is found
    if abs_path not in Dataset.__INSTANCES:
        created = super(Dataset, cls).__new__(cls)
        Dataset.__INSTANCES[abs_path] = created
        return created
    # Found old instance
    return Dataset.__INSTANCES[abs_path]
def __new__(clazz, *args, **kwargs):
    """Create, or reuse, the Dataset instance of a path.

    The path comes from the keyword `path` or, when that is missing or
    None, from the first positional argument. One instance per absolute
    path is kept in `Dataset.__INSTANCES`.

    Raises
    ------
    ValueError
        if the path is not a string
    """
    given = kwargs.get('path', None)
    if given is None:
        given = args[0]
    if not is_string(given):
        raise ValueError("`path` for Dataset must be string, but given "
                         "object with type: %s" % type(given))
    location = os.path.abspath(given)
    # Found old instance
    if location in Dataset.__INSTANCES:
        return Dataset.__INSTANCES[location]
    # new Dataset
    dataset = super(Dataset, clazz).__new__(clazz)
    Dataset.__INSTANCES[location] = dataset
    return dataset
def __init__(self, vad, frame_length, padding=None):
    """Store the VAD information as a dictionary `name -> segments`.

    Parameters
    ----------
    vad : dict, list or tuple
        a dictionary is used as is; a list/tuple of length 2 is read as
        `(indices, data)` where `indices` is a collection of
        `(name, start, end)` or the path to a space-delimited text file
        containing them; any other list/tuple is read as a collection of
        `(name, segments)`.
    frame_length, padding
        stored unchanged on the instance

    Raises
    ------
    ValueError
        if `vad` is neither a list, a tuple nor a dict
    """
    super(VADindex, self).__init__()
    if isinstance(vad, (list, tuple)):
        mapping = {}
        if len(vad) == 2:
            indices, data = vad
            if is_string(indices) and os.path.exists(indices):
                indices = np.genfromtxt(indices, dtype=str, delimiter=' ')
            for name, start, end in indices:
                mapping[name] = data[int(start): int(end)]
        else:
            # a list contain all information is given
            for name, segments in vad:
                mapping[name] = segments
        vad = mapping
    elif not isinstance(vad, dict):
        raise ValueError('Unsupport "vad" type: %s' % type(vad).__name__)
    self.vad = vad
    self.padding = padding
    self.frame_length = frame_length
def process(self, name, X, **kwargs):
    """Run `name` and `X` through every recipe, in order.

    The output `(name, X)` of a recipe is the input of the next one.

    Returns
    -------
    the final `(name, X)`, or None as soon as one recipe returns None

    Raises
    ------
    ValueError
        if a recipe returns something other than a tuple/list of length
        2 made of a string and a tuple/list
    """
    for recipe in self._recipes:
        returned = recipe.process(name, X)
        # break the chain if one of the recipes get error,
        # and return None
        if returned is None:
            return None
        well_formed = (isinstance(returned, (tuple, list)) and
                       len(returned) == 2 and
                       is_string(returned[0]) and
                       isinstance(returned[1], (tuple, list)))
        if not well_formed:
            raise ValueError("The returned from `process` must be tuple or "
                "list, of length 2 which contains (name, [x1, x2, x3,...])."
                "`name` must string type, and [x1, x2, ...] is tuple or list.")
        name, X = returned
    return name, X
def copy_dataset2(origin, destination, indices_filter=None, data_filter=None, override=False):
    """Copy a dataset to `destination`.

    Parameters
    ----------
    origin : str or Dataset
        path to a dataset (opened read-only, and closed again before
        returning), or an already opened `Dataset` (left open)
    destination, indices_filter, data_filter, override
        forwarded unchanged to `Dataset.copy`

    Returns
    -------
    the value returned by `origin.copy(...)`

    Raises
    ------
    ValueError
        if `origin` is neither a string nor a `Dataset`
    """
    # ====== prepare input ====== #
    if is_string(origin):
        origin = Dataset(origin, read_only=True)
        own_ds = True
    elif isinstance(origin, Dataset):
        own_ds = False
    else:
        # fail early with a clear message, `own_ds` would be unbound below
        raise ValueError("`origin` must be a path (string) or a Dataset, "
                         "but given object with type: %s" % type(origin))
    # ====== pass ====== #
    try:
        ds = origin.copy(destination,
                         indices_filter=indices_filter,
                         data_filter=data_filter,
                         override=override)
    # ====== end and return ====== #
    finally:
        # a dataset opened here is always closed, even if the copy failed
        if own_ds:
            origin.close()
    return ds
def __init__(self, task_name, output_name, threshold, patience=1, get_value=lambda x: np.mean(x), logging=True):
    """Store the early-stopping configuration.

    `output_name` is either a string or an object with a `name`
    attribute. `get_value=None` is replaced by the identity function;
    any other value must be call-able.

    Raises
    ------
    ValueError
        if `get_value` is neither None nor call-able
    """
    super(EarlyStop, self).__init__(logging=logging)
    self._task_name = str(task_name)
    if is_string(output_name):
        self._output_name = output_name
    else:
        self._output_name = output_name.name
    self._threshold = float(threshold)
    self._patience = int(patience)
    if get_value is None:
        self._get_value = lambda x: x
    elif hasattr(get_value, '__call__'):
        self._get_value = get_value
    else:
        raise ValueError('get_value must call-able')
    # ====== history ====== #
    self._history = []
def __new__(clazz, *args, **kwargs):
    """Create, or reuse, the MmapData instance of a path.

    The path comes from the keyword `path` or, when that is missing or
    None, from the first positional argument. One instance per absolute
    path is kept in `MmapData._INSTANCES`.

    Raises
    ------
    ValueError
        if the path is not a string, or if a new instance would exceed
        `MAX_OPEN_MMAP` registered instances
    """
    given = kwargs.get('path', None)
    if given is None:
        given = args[0]
    if not is_string(given):
        raise ValueError("`path` for MmapData must be string, but given "
                         "object with type: %s" % type(given))
    location = os.path.abspath(given)
    registry = MmapData._INSTANCES
    # Found old instance
    if location in registry:
        return registry[location]
    # ====== increase memmap count ====== #
    if len(registry) + 1 > MAX_OPEN_MMAP:
        raise ValueError('Only allowed to open maximum of {} memmap file'.format(MAX_OPEN_MMAP))
    # ====== create new instance ====== #
    created = super(MmapData, clazz).__new__(clazz)
    MmapData._INSTANCES[location] = created
    return created
def process(self, name, X, **kwargs):
    """Chain the `process` method of all recipes.

    Every recipe receives the `(name, X)` produced by the previous one.

    Returns
    -------
    the last `(name, X)`; None if any recipe returns None

    Raises
    ------
    ValueError
        if the value returned by a recipe is not a tuple/list of length
        2 containing a string and a tuple/list
    """
    def _is_valid(out):
        # (name, [x1, x2, ...]) with `name` a string
        return (isinstance(out, (tuple, list)) and
                len(out) == 2 and
                is_string(out[0]) and
                isinstance(out[1], (tuple, list)))

    for f in self._recipes:
        out = f.process(name, X)
        # break the chain if one of the recipes get error,
        # and return None
        if out is None:
            return None
        if not _is_valid(out):
            raise ValueError(
                "The returned from `process` must be tuple or "
                "list, of length 2 which contains (name, [x1, x2, x3,...])."
                "`name` must string type, and [x1, x2, ...] is tuple or list."
            )
        name, X = out
    return name, X
def __init__(self, task_name, output_name, fn_reduce=lambda x: (np.mean(x) if isinstance(x[0], Number) else sum(i for i in x)), print_plot=False, save_path=None, repeat_freq=1, logging=True):
    """Store the epoch-summary configuration.

    `task_name` is normalized to a tuple of str; `output_name` may be a
    single item or a tuple/list/set of items, each being a string or an
    object with a `name` attribute; `repeat_freq` must be >= 1.
    """
    super(EpochSummary, self).__init__(logging=logging)
    self._task_name = as_tuple(task_name, t=str)
    # ====== scheduling ====== #
    assert repeat_freq >= 1
    self._repeat_freq = int(repeat_freq)
    self._count = self._repeat_freq * len(self._task_name)
    self._epoch_results = defaultdict(dict)
    # ====== output identity ====== #
    if not isinstance(output_name, (tuple, list, set)):
        output_name = (output_name,)
    names = []
    for out in output_name:
        names.append(out if is_string(out) else out.name)
    self.output_name = tuple(names)
    self.fn_reduce = FuncDesc(func=fn_reduce)
    # ====== how to output ====== #
    self.print_plot = bool(print_plot)
    self.save_path = save_path
def _initialize(self):
    """Normalize the convolution arguments and create the variables.

    `pad`, `strides`, `dilation` and `filter_size` are expanded to
    `self.ndim` values (`ndim = len(input_shape) - 2`); a string `pad` is
    upper-cased. The kernel variable 'W' is always created, the bias 'b'
    only if `self.b_init` is not None.
    """
    # ====== validate init arguments ====== #
    ndim = len(self.input_shape) - 2
    self.ndim = ndim
    # padding
    pad = self.pad
    if isinstance(pad, (tuple, list, int)):
        self.pad = as_tuple(pad, ndim, int)
    elif pad is None:
        self.pad = (0, ) * ndim
    elif is_string(pad):
        self.pad = pad.upper()
    # strides
    self.strides = ((1, ) * ndim if self.strides is None
                    else as_tuple(self.strides, ndim, int))
    # dilation
    self.dilation = ((1, ) * ndim if self.dilation is None
                     else as_tuple(self.dilation, ndim, int))
    # filter size
    self.filter_size = as_tuple(self.filter_size, ndim, int)
    # ====== create config ====== #
    # weights
    self.get_variable_nnop(initializer=self.W_init,
                           shape=self.kernel_shape,
                           name='W', roles=ConvKernel)
    # biases are optional
    if self.b_init is None:
        return
    biases_shape = (self.output_shape[1:] if self.untie_biases
                    else (self.num_filters, ))
    self.get_variable_nnop(initializer=self.b_init,
                           shape=biases_shape,
                           name='b', roles=Bias)
def shape_transform(self, shapes):
    """Apply the `shape_transform` of every recipe, in order.

    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Return
    ------
    new shape that transformed by this Recipe
    new indices

    Raises
    ------
    RuntimeError
        if the shapes returned by a recipe are malformed
    """
    for recipe in self._recipes:
        shapes = recipe.shape_transform(shapes)
        # ====== check returned ====== #
        for shp, ids in shapes:
            valid = (isinstance(shp, (tuple, list)) and
                     all(is_number(s) for s in shp) and
                     is_string(ids[0][0]) and
                     is_number(ids[0][1]))
            if not valid:
                raise RuntimeError("Returned `shapes` must be the list of pair "
                                   "`(shape, indices)`, where `indices` is the "
                                   "list of (name, length(int)).")
    return shapes
def shape_transform(self, shapes):
    """Chain the `shape_transform` of all recipes.

    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Return
    ------
    new shape that transformed by this Recipe
    new indices

    Raises
    ------
    RuntimeError
        if the shapes returned by a recipe are malformed
    """
    def _is_invalid(shp, ids):
        # a valid pair is (tuple/list of numbers, [(name, length), ...])
        return not (isinstance(shp, (tuple, list)) and
                    all(is_number(s) for s in shp) and
                    is_string(ids[0][0]) and
                    is_number(ids[0][1]))

    for i in self._recipes:
        shapes = i.shape_transform(shapes)
        # ====== check returned ====== #
        if any(_is_invalid(shp, ids) for shp, ids in shapes):
            raise RuntimeError(
                "Returned `shapes` must be the list of pair "
                "`(shape, indices)`, where `indices` is the "
                "list of (name, length(int)).")
    return shapes
def train(X, y_true, y_pred, train_data, valid_data=None, valid_freq=1., patience=3, threshold=5, rollback=True, objectives=[tf.losses.softmax_cross_entropy], metrics=[0], training_metrics=[], l1_regu=0., l2_regu=0., parameters=[], prior_weights=None, sample_weights=None, batch_size=256, epochs=8, shuffle=True, optimizer='rmsprop', optz_kwargs={'lr': 0.001}, updates=None, init_vars=True, labels=None, seed=5218, verbose=2):
    """Build the training, scoring and prediction functions for the given
    graph, run a `MainLoop` on the data, and return the prediction
    function.

    Parameters
    ----------
    X, y_true, y_pred : tensor or list of tensor
        inputs, targets and outputs of the model; each is normalized
        with `as_tuple(..., t=K.is_tensor)`
    train_data, valid_data
        training and (optional) validating data, both are passed
        through `_preprocessing_data`
    valid_freq : float
        given to `Timer(percentage=valid_freq)` as the frequency of the
        validating task
    patience, threshold
        given to the `NaNDetector` (patience only) and
        `EarlyStopGeneralizationLoss` callbacks
    rollback : bool (default: True)
        if True, allow rollback to the best checkpoint during training
    objectives : {callable, tensorflow.Tensor}
        if `callable`, the function must take `y_true`, and `y_pred`
        The objectives must be differentiable and used for training.
    metrics : {callable, tensorflow.Tensor, int}
        if `callable`, the function must take `y_true`, and `y_pred`
        The `metrics` is for monitoring the training process.
        if `int`, it is the index of the loss in `objectives`
        NOTE: the first metrics in the list will be used for
        early-stopping (smaller is better).
    training_metrics : {callable, tensorflow.Tensor, int}
        if `int`, it is the index of the loss in `metrics`
    l1_regu, l2_regu : float
        if greater than 0, the sum of L1 (respectively L2) norms of all
        parameters having the `Weight` role is added to the objective
        (only when `updates` is None). NOTE(review): the value is only
        compared with 0, it does not scale the norm - confirm intended.
    parameters : {list or tensorflow.Variables}
        All the parameters will be updated by the `optimizer`, if None
        or empty list is given, use ComputationalGraph to get
        all variables with Parameters roles related to the objectives
    prior_weights, sample_weights
        `prior_weights` is passed through `_preprocess_prior_weights`;
        if the result is not None it is added to `sample_weights` (or
        replaces it when `sample_weights` is None)
    batch_size, epochs, shuffle
        configuration of the `MainLoop` and of the training task
    optimizer : string or odin.backend.optimizers.Optimizer
        a string is parsed by `_parse_optimizer` and instantiated with
        `optz_kwargs`; only used when `updates` is None
    optz_kwargs : dict
        keyword arguments for creating the optimizer from its name
    updates : None or tensorflow Operation
        if None, the updates are created by the optimizer
    init_vars : bool (default: True)
        automatically initialize all variables
    labels : {None, list of string}
        Given labels for classification task
    seed : int
        specific random seed for reproducible
    verbose : int
        0 - Turn off all log
        1 - only show notification
        2 - show notification, important log and summary
        3 - Show progress, summary, notification and logging
        4 - Show debug information and everything

    Return
    ------
    Function used for prediction

    Raises
    ------
    ValueError
        if `optimizer` or `updates` has unsupported type
    RuntimeError
        if `updates` is None and no objective is given
    """
    from odin import backend as K
    # ====== preprocess inputs ====== #
    X = as_tuple(X, t=K.is_tensor)
    y_true = as_tuple(y_true, t=K.is_tensor)
    y_pred = as_tuple(y_pred, t=K.is_tensor)
    # ====== parsing objectives and metrics ====== #
    # for training
    prior_weights = _preprocess_prior_weights(y_true=y_true,
                                              prior_weights=prior_weights)
    if prior_weights is not None:
        if sample_weights is not None:
            sample_weights = sample_weights + prior_weights
        else:
            sample_weights = prior_weights
    objectives = _preprocessing_losses(as_tuple(objectives), y_true, y_pred,
                                       sample_weights=sample_weights)
    # metrics for monitoring
    metrics = as_tuple(metrics)
    get_value = lambda x: np.mean(x)
    # early stopping treats smaller as better, hence an accuracy metric is
    # monitored as `1 - accuracy`
    if len(metrics) > 0 and \
    (metrics[0] == tf.metrics.accuracy or
     metrics[0] == K.metrics.categorical_accuracy):
        get_value = lambda x: 1 - np.mean(x)
    metrics = _preprocessing_losses(metrics, y_true, y_pred,
                                    inherit_losses=objectives)
    # training_metrics
    training_metrics = _preprocessing_losses(as_tuple(training_metrics),
                                             y_true, y_pred,
                                             inherit_losses=metrics)
    # sum the objectives for differentiable
    if len(objectives) > 0:
        objectives = [sum(objectives) if len(objectives) > 1 else objectives[0]]
    # ====== preprocess optimizer and get updates====== #
    if updates is None: # not given updates
        if is_string(optimizer):
            optimizer = _parse_optimizer(optimizer)
            optimizer = optimizer(**optz_kwargs)
        elif not isinstance(optimizer, K.optimizers.Optimizer):
            raise ValueError("`optimizer` must be string - name of algorithm or instance "
                             "of odin.backend.optimizers.Optimizer")
        parameters = K.ComputationGraph(objectives).parameters\
        if len(parameters) == 0 else as_tuple(parameters, t=K.is_variable)
        # check objectives
        if len(objectives) == 0:
            raise RuntimeError("`objectives` must be given due to `updates=None`")
        weights = [p for p in parameters if K.role.has_roles(p, roles=K.role.Weight)]
        # l1 regularization
        if l1_regu > 0.:
            l1_norm = sum(tf.norm(w, ord=1) for w in weights)
            objectives[0] += l1_norm
        # l2 regularization
        if l2_regu > 0.:
            l2_norm = sum(tf.norm(w, ord=2) for w in weights)
            objectives[0] += l2_norm
        # update rules
        updates = optimizer.get_updates(objectives[0], parameters)
        # adding global norm and learning rate
        # (assumes `_preprocessing_losses` returned a list - TODO confirm)
        training_metrics.append(optimizer.norm)
        training_metrics.append(optimizer.lr)
    elif K.is_operation(updates): # given updates
        optimizer = None
    else:
        raise ValueError("`updates` can be None or tensorflow Operation, but given "
                         "type: %s" % str(type(updates)))
    # ====== placeholders ====== #
    # collect the placeholders feeding `X`
    inputs_plh = []
    for plh in X:
        for i in (K.ComputationGraph(plh).placeholders
                  if not K.is_placeholder(plh)
                  else as_tuple(plh)):
            inputs_plh.append(i)
    # collect the placeholders feeding `y_true` that are not inputs already
    outputs_plh = []
    for plh in y_true: # no duplicated inputs (e.g. autoencoder X == y)
        if not K.is_placeholder(plh):
            plh = K.ComputationGraph(plh).placeholders
        for i in as_tuple(plh):
            if i not in inputs_plh:
                outputs_plh.append(i)
    inputs = inputs_plh + outputs_plh
    # ====== initialize variables ====== #
    if bool(init_vars):
        K.initialize_all_variables()
    # ====== creating function ====== #
    # training function
    f_train = K.function(inputs=inputs,
                         outputs=objectives + training_metrics,
                         updates=updates, training=True)
    # scoring function
    f_score = None
    if len(metrics) > 0:
        f_score = K.function(inputs=inputs, outputs=metrics,
                             training=False)
    # prediction function, it only needs the input placeholders
    f_pred = K.function(inputs=inputs_plh,
                        outputs=y_pred[0] if len(y_pred) == 1 else y_pred,
                        training=False)
    # ====== preprocessing data ====== #
    train_data, valid_data = _preprocessing_data(train_data, valid_data)
    # print some debug information if necessary
    if verbose >= 4:
        print("%s %s %s" % (
            ctext("============", 'cyan'),
            ctext("Prepare for Training", 'red'),
            ctext("============", 'cyan')))
        print(ctext("Input placeholders:", 'yellow'))
        for i in inputs_plh:
            print(" * ", str(i))
        print(ctext("Output placeholders:", 'yellow'))
        for i in outputs_plh:
            print(" * ", str(i))
        print(ctext("Parameters:", 'yellow'))
        for p in parameters:
            print(" * ", p.name, '-', p.shape, ';', p.dtype.name)
        print(ctext("Optimizer:", 'yellow'))
        print(" * ", str(optimizer))
        print(" * Optimizer kwargs:", optz_kwargs)
        print(" * L1:", l1_regu)
        print(" * L2:", l2_regu)
        print(ctext("Training:", 'yellow'))
        print(" * Valid freq:", valid_freq)
        print(" * Patience:", patience)
        print(" * Threshold:", threshold)
        print(" * Rollback:", rollback)
        print(" * Batch size:", batch_size)
        print(" * Epoch:", epochs)
        print(" * Shuffle:", shuffle)
        print(" * Seed:", seed)
        print(ctext("Objectives:", 'yellow'))
        for o in objectives:
            print(" * ", str(o))
        print(ctext("Weights:", 'yellow'))
        print(" * Prior:", str(prior_weights))
        print(" * Sample:", str(sample_weights))
        print(ctext("Metrics:", 'yellow'))
        for m in metrics:
            print(" * ", str(m))
        print(ctext("Training metrics:", 'yellow'))
        for t in training_metrics:
            print(" * ", str(t))
        print(ctext("Training Data:", 'yellow'), str(train_data))
        print(ctext("Validating Data:", 'yellow'), str(valid_data))
        print(ctext("Labels:", 'yellow'), labels)
    # ====== create trainer ====== #
    callback_log = True if verbose > 0 else False
    trainer = MainLoop(batch_size=batch_size,
                       seed=seed if shuffle else None,
                       shuffle_level=2 if shuffle else 0,
                       allow_rollback=rollback,
                       verbose=verbose, labels=labels)
    trainer.set_checkpoint(path=None, obj=None,
                           variables=parameters)
    # create callback
    callbacks = [NaNDetector(patience=patience, log=callback_log)]
    # early stopping requires both validating data and a scoring function
    if valid_data is not None and f_score is not None:
        callbacks.append(
            EarlyStopGeneralizationLoss(task_name='valid', output_name=metrics[0],
                                        threshold=threshold, patience=patience,
                                        log=callback_log, get_value=get_value))
    trainer.set_callbacks(callbacks)
    # set the tasks
    trainer.set_train_task(func=f_train, data=train_data,
                           epoch=epochs, name='train')
    if valid_data is not None and f_score is not None:
        trainer.set_valid_task(func=f_score, data=valid_data,
                               freq=Timer(percentage=valid_freq),
                               name='valid')
    # running
    trainer.run()
    return f_pred
def __init__(self, nb_classes, l1=0., l2=0.,
             fit_intercept=True, confusion_matrix=True,
             tol=1e-4, patience=3, rollback=True,
             batch_size=1024, max_epoch=100, max_iter=None,
             optimizer='adadelta', learning_rate=1.0, class_weight=None,
             dtype='float32', seed=5218,
             verbose=False, path=None, name=None):
    """Store the configuration of the classifier; nothing is trained here.

    Parameters
    ----------
    nb_classes : int, or tuple/list/ndarray of labels
        a number `n` creates the labels '0' ... 'n-1'; a sequence is
        used as the list of labels (converted to string) and its length
        is the number of classes.
    l1, l2 : float
        stored as `self.l1` and `self.l2`
        (presumably regularization weights - confirm where they are used).
    fit_intercept, confusion_matrix : bool
        flags stored on the instance.
    tol, patience, rollback
        stop-training settings, stored as float, int and bool.
    batch_size, max_epoch, max_iter
        training settings, `batch_size` is converted to int.
    optimizer : str
        case-insensitive key of `_optimizer_list`.
    learning_rate : float
        given to the optimizer constructor as `lr`.
    class_weight : None, number, or array
        None creates a vector of ones, a number creates a constant vector,
        anything else is stored unchanged.
    dtype : str or numpy.dtype
    seed : int
        seed of the internal `numpy.random.RandomState`.
    verbose : bool or int
    path, name : None or str
        if `name` is None, 'LogisticRegression_<random id>' is used.

    Raises
    ------
    ValueError
        if `nb_classes` is neither a number nor a sequence of labels, or
        `optimizer` is not a string found in `_optimizer_list`.
    """
    super(LogisticRegression, self).__init__()
    # ====== basic dimensions ====== #
    if isinstance(nb_classes, (tuple, list, np.ndarray)):
        self._labels = tuple([str(i) for i in nb_classes])
        self._nb_classes = len(nb_classes)
    elif is_number(nb_classes):
        # convert first, `range` does not accept float-typed numbers
        n_classes = int(nb_classes)
        self._labels = tuple([str(i) for i in range(n_classes)])
        self._nb_classes = n_classes
    else:
        # fail here instead of leaving `_labels`/`_nb_classes` undefined
        raise ValueError(
            "`nb_classes` must be a number or a list of labels, "
            "but given: %s" % str(type(nb_classes)))
    self._feat_dim = None
    self._dtype = np.dtype(dtype)
    # ====== preprocessing class weight ====== #
    # NOTE(review): `self.nb_classes` and `self.dtype` are not defined in
    # this method, presumably properties of `_nb_classes` and `_dtype`.
    if class_weight is None:
        class_weight = np.ones(shape=(self.nb_classes,), dtype=self.dtype)
    elif is_number(class_weight):
        class_weight = np.zeros(shape=(self.nb_classes,),
                                dtype=self.dtype) + class_weight
    self._class_weight = class_weight
    # ====== flags ====== #
    self.l1 = float(l1)
    self.l2 = float(l2)
    self.fit_intercept = bool(fit_intercept)
    self.confusion_matrix = bool(confusion_matrix)
    # ====== internal states ====== #
    self._is_fitted = False
    # ====== others ====== #
    if name is None:
        name = uuid(length=8)
        self._name = 'LogisticRegression_%s' % name
    else:
        self._name = str(name)
    self._path = path
    # ====== training ====== #
    self.batch_size = int(batch_size)
    self.max_epoch = max_epoch
    self.max_iter = max_iter
    if not is_string(optimizer):
        raise ValueError(
            "`optimizer` must be a string, one of the following: %s" %
            str(list(_optimizer_list.keys())))
    optimizer = optimizer.lower()
    if optimizer not in _optimizer_list:
        raise ValueError("`optimizer` must be one of the following: %s" %
                         str(list(_optimizer_list.keys())))
    self._optimizer = _optimizer_list[optimizer](lr=float(learning_rate))
    self._optimizer_name = optimizer
    self._optimizer_lr = learning_rate
    # ====== stop training ====== #
    self.tol = float(tol)
    self.patience = int(patience)
    self.rollback = bool(rollback)
    # ====== others ====== #
    self._train_history = []
    self._valid_history = []
    self._rand_state = np.random.RandomState(seed=int(seed))
    self.verbose = int(verbose)
def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
    """Convert documents to padded index sequences or document-term matrix.

    Parameters
    ----------
    texts: iterator of unicode
        iterator, generator or list (e.g. [u'a', u'b', ...])
        of unicode documents.
    mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
        'seq', sequences of token indices, padded by `pad_sequences`
        'binary', 1 at the column of every token found in the document
        'count', number of occurrences of each token in the document
        'freq', 'count' divided by the length of the document
        'tfidf', (1 + log(count)) * log(1 + nb_docs / (1 + docs_freq))
    dtype, padding, truncating, value:
        only used in 'seq' mode, forwarded to `pad_sequences`
    end_document: None, string, or number
        if not None, this token index is appended to the end of every
        document (a string is converted by `dictionary.index`)
    maxlen: None or int
        only used in 'seq' mode, `self.longest_document_length` if None
    token_not_found: 'ignore', 'raise', a token string, an integer
        what to do with a token which is not in the dictionary:
        'ignore' skips it, 'raise' raises RuntimeError, an integer is
        used directly as the replacement index, any other string is a
        token whose dictionary index is used as the replacement.

    Returns
    -------
    the output of `pad_sequences` in 'seq' mode, otherwise a
    numpy.ndarray of shape (number of documents, `self.nb_words`)
    """
    # ====== check arguments ====== #
    texts = self._validate_texts(texts)
    # ====== check mode ====== #
    mode = str(mode)
    if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
        raise ValueError('The "mode" argument must be: "seq", "binary", '
                         '"count", "freq", or "tfidf".')
    # ====== check token_not_found ====== #
    # The number must be tested first: previously every value different
    # from 'ignore'/'raise' (numbers included) was looked up in the
    # dictionary and the integer branch could never be reached.
    if is_number(token_not_found):
        token_not_found = int(token_not_found)
    elif is_string(token_not_found):
        if token_not_found not in ('ignore', 'raise'):
            token_not_found = int(self.dictionary[token_not_found])
    else:
        raise ValueError('token_not_found can be: "ignore", "raise"'
                         ', an integer of token index, or a string '
                         'represented a token.')
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    else:
        # otherwise `processor` is unbound and fails later in the loop
        raise RuntimeError("Unknown engine: %s" % str(self.__engine))
    # ====== Initialize variables ====== #
    dictionary = self.dictionary
    results = []
    # ====== preprocess arguments ====== #
    # NOTE(review): `.index` is used here but `.get` below, confirm the
    # type of `self.dictionary` supports both.
    if isinstance(end_document, str):
        end_document = dictionary.index(end_document)
    elif is_number(end_document):
        end_document = int(end_document)
    # ====== processing ====== #
    if hasattr(texts, '__len__'):
        target_len = len(texts)
        auto_adjust_len = False
    else:
        # unknown length, start from a guess and extend it when needed
        target_len = 1208
        auto_adjust_len = True
    prog = Progbar(target=target_len, name="Tokenize Transform",
                   print_report=True, print_summary=True)
    for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
        vec = []
        for x in doc:
            idx = dictionary.get(x, -1)
            # found the word in dictionary
            if idx >= 0:
                vec.append(idx)
            # not found the token in dictionary
            elif token_not_found == 'ignore':
                continue
            elif token_not_found == 'raise':
                raise RuntimeError(
                    'Cannot find token: "%s" in dictionary' % x)
            elif isinstance(token_not_found, int):
                vec.append(token_not_found)
        # append ending document token
        if end_document is not None:
            vec.append(end_document)
        # add the final results
        results.append(vec)
        # print progress
        if self.print_progress:
            prog['#Docs'] = nb_docs
            prog.add(1)
            if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # ====== pad the sequence ====== #
    # just transform into sequence of tokens
    if mode == 'seq':
        maxlen = self.longest_document_length if maxlen is None \
            else int(maxlen)
        return pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
    # transform into document-term matrix
    X = np.zeros(shape=(len(results), self.nb_words))
    for i, seq in enumerate(results):
        if mode == 'binary':
            X[i, seq] = 1
        elif mode == 'freq':
            length = len(seq)
            for tok, n in freqcount(seq).items():
                X[i, tok] = n / float(length)
        elif mode == 'count':
            for tok, n in freqcount(seq).items():
                X[i, tok] = n
        elif mode == 'tfidf':
            for tok, n in freqcount(seq).items():
                tf = 1 + np.log(n)
                docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
                idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                X[i, tok] = tf * idf
    return X
def eval(x, feed_dict=None,
         update_before=None, update_after=None,
         options=None, run_metadata=None):
    """ Generalized version of code evaluation, it could evaluate python
    and tensorflow expression.

    Parameters
    ----------
    x : list, tuple, dictionary, string, `Tensor`
        tensorflow `Tensor` (or python expression as string) for
        evaluation; for list, tuple and dictionary every element is
        evaluated and the same structure is returned
    feed_dict : dict
        Input dictionary, mapping placeholder -> values
    update_before: {None, list, or dict}
        mapping from `Tensor` to its new value which is `Tensor` or
        real value, the updates are run before evaluating `x`
    update_after: {None, list, or dict}
        same as `update_before`, but run the updates after
        evaluating `x`
    options: tensorflow.RunOptions
        the options allow controlling the behavior of
        this particular step (e.g. turning tracing on).
    run_metadata: tensorflow.RunMetadata
        When appropriate, the non-Tensor output of this step
        will be collected there. For example,
        when users turn on tracing in options, the profiled info
        will be collected into this argument and passed back.

    Returns
    -------
    tuple for list/tuple input, dict for dictionary input, otherwise
    the evaluated value

    Raises
    ------
    RuntimeError
        if the tensors come from different graphs, or the type of `x`
        is not supported

    Example
    -------
    >>> import tensorflow as tf
    >>> from odin import backend as K
    >>> run_metadata = tf.RunMetadata()
    >>> K.eval(...,
    ...        options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
    ...                              output_partition_graphs=True),
    ...        run_metadata=run_metadata)
    >>> with open('log_path', 'w') as f:
    ...     f.write(str(run_metadata))

    Note
    ----
    If "Couldn't open CUDA library libcupti.so.8.0" appears when you
    adding RunOptions, try adding "/usr/local/cuda/extras/CUPTI/lib64/"
    to your LD_LIBRARY_PATH

    String inputs are executed by the python built-in `eval`, never
    give untrusted strings to this function.
    """
    results = ()
    update_before = _validate_updates(update_before)
    update_after = _validate_updates(update_after)
    # ====== run updates before ====== #
    if update_before is not None:
        get_session(update_before.graph).run(update_before,
            feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
    # ====== list of Tensor or string ====== #
    if isinstance(x, (tuple, list)):
        string_eval = []
        tensor_eval = []
        tensor_idx = set()
        # evaluate string expression
        for i, j in enumerate(x):
            if is_string(j):
                string_eval.append(builtins.eval(j))
            else:
                tensor_eval.append(j)
                tensor_idx.add(i)
        # evaluate tensor
        if len(tensor_eval) > 0:
            graph = [i.graph for i in tensor_eval]
            if len(set(graph)) > 1:
                raise RuntimeError(
                    "Cannot evaluate multiple `Tensor` come from "
                    "different `Graph`.")
            tensor_eval = get_session(graph[0]).run(tensor_eval,
                feed_dict=feed_dict, options=options,
                run_metadata=run_metadata)
        # merge both kinds of results back to the original order
        string_iter = iter(string_eval)
        tensor_iter = iter(tensor_eval)
        results = tuple([next(tensor_iter) if i in tensor_idx
                         else next(string_iter)
                         for i in range(len(x))])
    # ====== mapping ====== #
    elif isinstance(x, Mapping):
        results = {}
        tensor_eval_key = []
        tensor_eval_value = []
        for k, v in x.items():
            if is_string(v):
                results[k] = builtins.eval(v)
            else:
                tensor_eval_key.append(k)
                tensor_eval_value.append(v)
        # evaluate tensor
        # (this used to test `tensor_eval`, a name which only exists in
        # the list branch, hence NameError for every dictionary input)
        if len(tensor_eval_value) > 0:
            graph = [i.graph for i in tensor_eval_value]
            if len(set(graph)) > 1:
                raise RuntimeError(
                    "Cannot evaluate multiple `Tensor` come from "
                    "different `Graph`.")
            tensor_eval_value = get_session(graph[0]).run(tensor_eval_value,
                feed_dict=feed_dict, options=options,
                run_metadata=run_metadata)
        # update results
        for k, v in zip(tensor_eval_key, tensor_eval_value):
            results[k] = v
    # ====== just a string ====== #
    elif is_string(x):
        results = builtins.eval(x)
    # ====== just a Tensorflow object ====== #
    elif isinstance(x, tf.Operation) or \
            is_tensor(x, inc_distribution=True, inc_variable=True):
        results = get_session(x.graph).run(x,
            feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
    # ====== exception ====== #
    else:
        raise RuntimeError("Cannot evaluate object of type: %s" % type(x))
    # ====== run updates after ====== #
    if update_after is not None:
        get_session(update_after.graph).run(update_after,
            feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
    return results
def post_processing(result):
    """Store the features extracted from one file, return the file name.

    `result` is the dictionary returned for a single file. It relies on
    names of the enclosing scope: `self`, `stats`, `cache`, `databases`,
    `last_start`, `n_processed`, `cache_limit` and `flush_feature`.

    * features end with 'sum1' or 'sum2' are accumulated into `stats`
    * other numpy.ndarray are appended to `cache` and their
      (start, end) is recorded in `databases['indices_<feat_name>']`
    * everything else is stored in `databases[feat_name][file_name]`

    The cache is flushed by `flush_feature` after every `cache_limit`
    processed files.

    Raises
    ------
    RuntimeError
        if the identifier is missing, the file name is not a string,
        or a feature uses one of the reserved names.
    """
    # search for file name
    if self.identifier not in result:
        raise RuntimeError(
            "Cannot find identifier '%s' in returned dictionary" %
            self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
        raise RuntimeError(
            "Cannot find file name in returned features "
            "list, the file name can be specified in key: 'name', 'path' "
            "and the type of the value must be string. All available "
            "keys are: %s" % str(result.keys()))
    # store all new indices
    # mapping feat_name -> number of samples (i.e. X.shape[0])
    all_indices = {}
    # processing
    for feat_name, X in result.items():
        # some invalid feat_name
        if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
            raise RuntimeError(
                "Returned features' name cannot be one "
                "of the following: 'config', 'pipeline', 'sum1', 'sum2'.")
        # ignore some feat_name
        # (was `feat_name in ('name')`, a substring test on the string
        # 'name' which also dropped features like 'n', 'am' or 'me')
        if feat_name == 'name':
            continue
        is_sum1 = 'sum1' == feat_name[-4:]
        is_sum2 = 'sum2' == feat_name[-4:]
        # if numpy ndarray, save to MmapData
        if isinstance(X, np.ndarray) or is_sum1 or is_sum2:
            # save statistics instead
            if is_sum1:
                stats[feat_name[:-4]][0] += X
            elif is_sum2:
                stats[feat_name[:-4]][1] += X
            # save features array
            else:
                all_indices[feat_name] = X.shape[0]
                # cache data, only if we have more than 0 sample
                if X.shape[0] > 0:
                    cache[feat_name].append(X)
        # else all other kind of data save to MmapDict
        else:
            databases[feat_name][file_name] = X
    # ====== update indices ====== #
    for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        start = last_start[ids_name]
        databases[ids_name][file_name] = (start, start + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:
        for feat_name, X_cached in cache.items():
            flush_feature(feat_name, X_cached)
        cache.clear()
    # ====== update progress ====== #
    return file_name
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features
    at once.

    Parameters
    ----------
    dataset : string, Dataset, or FeatureProcessor
        path to a dataset folder, an opened `Dataset` (it is not closed
        by this function), or a `FeatureProcessor` (its `path` is used)
    feat_name : 'auto', string, or list of string
        'auto' selects every entry of the dataset which has `ndim == 2`
        and more than 1 column; otherwise the given names which are
        found in the dataset
    batch_size : int
        number of samples for each `partial_fit`
    override : bool
        if False and 'pca_<name>' is found in the dataset, that model is
        loaded and updated instead of creating a new one

    Note
    ----
    each model is pickled to the file 'pca_<name>' in the dataset folder

    Raises
    ------
    ValueError
        if `dataset` is not one of the supported inputs
    """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" %
                         str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [name
                     for name in as_tuple(feat_name, t=str)
                     if name in dataset]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # total number of samples, used as target of the progress bar
    nb_samples = sum(dataset[feat].shape[0] for feat in feat_name)
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        X = dataset[name]
        pca_name = 'pca_' + name
        # found exist pca model
        # (this used the variable `feat` leaked from the loop of the
        # enclosing function, i.e. always the LAST selected feature)
        if pca_name in dataset and not override:
            pca = dataset[pca_name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None, whiten=False,
                               copy=True, batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            # number of processed samples, for updating the progress
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, pca_name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name, func=map_pca,
              ncpu=None, batch=1, hwm=12082518,
              backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples, print_summary=True,
                   print_report=True, name='PCA')
    for n in mpi:
        # a string is the name of a finished feature
        if is_string(n):
            remain_features.remove(n)
            finished_features.append(n)
        # otherwise, the number of samples just processed
        else:
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
    """Check the consistency of an extracted dataset and save the reports.

    The log is redirected to '<path>/log.txt' during the validation,
    the figures of random samples and of the statistics are saved to
    `path` as PDF files.

    Parameters
    ----------
    ds_or_processor : FeatureProcessor, string, or Dataset
        a `FeatureProcessor` (its `path` is used), path to the dataset,
        or an opened `Dataset` (it is not closed by this function)
    path : string
        output folder, it is created if not exist
    nb_samples : int
        number of random files for plotting, also the number (and the
        size) of random chunks for checking PCA
    override : bool
        if True, remove existed `path`, otherwise raise ValueError
    seed : int
        seed of `numpy.random` for selecting the plotted files
    fig_width : int
        forwarded to `plot_multiple_features`

    Raises
    ------
    ValueError
        unsupported input type, or `path` exists and `override=False`
    RuntimeError
        the dataset does not contain `config`
    AssertionError
        one of the consistency checks of indices or dictionaries failed
    """
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext(' *', 'cyan'),
              ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))
    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        # `ds` is not assigned in this branch, report the given input
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" %
                         str(type(ds_or_processor)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    stdio_redirected = False
    prev_stdio = None
    # everything is wrapped, so a failed check does not leave the stdio
    # redirected to the log file, or the dataset opened.
    try:
        # ====== extract the config of the dataset ====== #
        if 'config' not in ds:
            raise RuntimeError(
                "The `Dataset` must be generated by `FeatureProcessor` "
                "which must contain `config` MmapDict of extracted "
                "features configuration.")
        # ====== output path ====== #
        path = str(path)
        if not os.path.exists(path):
            os.mkdir(path)
        elif override:
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path)
            os.mkdir(path)
        else:
            raise ValueError("`path`=%s exists, cannot override." % path)
        prev_stdio = get_stdio_path()
        stdio(path=os.path.join(path, 'log.txt'))
        stdio_redirected = True
        nb_samples = int(nb_samples)
        # ====== get all features ====== #
        all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
        # store all features (included the features in external_indices)
        all_features = []
        # the external indices can be: indices_mfcc_bnf
        external_indices = flatten_list([
            k.split('_')[1:]
            for k in all_keys
            if 'indices' in k and k != 'indices'])
        # ====== checking indices ====== #
        main_indices = {name: (start, end)
                        for name, (start, end) in ds['indices'].items()}
        for ids_name in (k for k in all_keys if 'indices' in k):
            ids = sorted([(name, start, end)
                          for name, (start, end) in ds[ids_name].items()],
                         key=lambda x: x[1])
            for entry in ids:
                assert entry[2] - entry[1] > 0, "Zero length in indices"
            for prev, now in zip(ids, ids[1:]):
                assert prev[2] == now[1], \
                    "Indices are not contiguous: '%s'" % ids_name
            # The end of the last entry is taken from `ids` directly:
            # the loop variable `now` used before is unbound (or stale
            # from the previous indices) if there are less than 2 entries
            last_end = ids[-1][2] if len(ids) > 0 else 0
            # final length match length of Data
            if ids_name != 'indices':
                for feat_name in ids_name.split('_')[1:]:
                    assert last_end == len(ds[feat_name]), \
                        "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                        (ids_name, feat_name)
                    all_features.append(feat_name)
            else:
                for feat_name in all_keys:
                    if feat_name not in external_indices and \
                            'sum1' != feat_name[-4:] and \
                            'sum2' != feat_name[-4:] and \
                            'mean' != feat_name[-4:] and \
                            'std' != feat_name[-3:] and \
                            isinstance(ds[feat_name], MmapData):
                        assert last_end == len(ds[feat_name]), \
                            "Length of indices and actual data mismatch, " + \
                            ids_name + ':' + feat_name
                        all_features.append(feat_name)
            # logging
            logger("Checked all:", ids_name, True)
        # ====== check all dictionary types ====== #
        for name in all_keys:
            if isinstance(ds[name], MmapDict) and 'indices' not in name:
                data = ds[name]
                # special cases
                if name == 'sr':
                    checking_func = lambda x: x > 0  # for sr
                else:
                    checking_func = lambda x: True
                # check
                for key, val in data.items():
                    assert key in main_indices, \
                        "Dictionary with name:'%s' has key not found in indices." % name
                    assert checking_func(val)
                logger("Checked dictionary: ", name, True)
        # ====== checking each type of data ====== #
        # get all stats name
        all_stats = defaultdict(list)
        for k in all_keys:
            if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
                    'mean' == k[-4:] or 'std' == k[-3:]:
                all_stats[k[:-4].split('_')[0]].append(k)
        # get all pca name, '<feat>_pca' is searched first, then
        # 'pca_<feat>' which is the name saved by `calculate_pca`
        all_pca = {}
        for i in all_features:
            for pca_name in (i + '_pca', 'pca_' + i):
                if pca_name in ds:
                    all_pca[i] = pca_name
                    break
        # checking one-by-one numpy.ndarray features array
        for feat_name in all_features:
            dtype = str(ds[feat_name].dtype)
            # checking all data
            indices = ds.find_prefix(feat_name, 'indices')
            prog = Progbar(target=len(indices), interval=0.1,
                           print_report=True,
                           name='Checking: %s(%s)' % (feat_name, dtype))
            # start iterating over all data file
            fail_test = False
            for file_name, (start, end) in indices:
                dat = ds[feat_name][start:end]
                # No NaN value
                if np.any(np.isnan(dat)):
                    logger("NaN values", file_name + ':' + feat_name, False)
                    fail_test = True
                # not all value closed to zeros
                if np.all(np.isclose(dat, 0.)):
                    logger("All-closed-zeros values",
                           file_name + ':' + feat_name, False)
                    fail_test = True
                prog['Name'] = file_name
                prog.add(1)
            if not fail_test:
                logger("Check data incredibility for: ", feat_name, True)
            # checking statistics
            if feat_name in all_stats:
                fail_test = False
                for stat_name in all_stats[feat_name]:
                    X = ds[stat_name]
                    if X.ndim >= 1:
                        X = X[:]
                    if np.any(np.isnan(X)):
                        logger("NaN values",
                               feat_name + ':' + stat_name, False)
                        fail_test = True
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values",
                               feat_name + ':' + stat_name, False)
                        fail_test = True
                if not fail_test:
                    logger("Check statistics for: ", feat_name, True)
            # check PCA
            if feat_name in all_pca:
                pca = ds[all_pca[feat_name]]
                n = ds[feat_name].shape[0]
                nb_feats = ds[feat_name].shape[-1]
                fail_test = False
                # performing PCA on random samples
                for i in range(nb_samples):
                    # upper bound is at least 1, so features with few
                    # samples do not make `randint` fail
                    start = np.random.randint(0, max(n - nb_samples, 1))
                    X = pca.transform(
                        ds[feat_name][start:(start + nb_samples)],
                        n_components=max(nb_feats // 2, 1))
                    if np.any(np.isnan(X)):
                        logger("NaN values in PCA", feat_name, False)
                        fail_test = True
                        break
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values in PCA",
                               feat_name, False)
                        fail_test = True
                        break
                if not fail_test:
                    logger("Check PCA for: ", feat_name, True)
        # ====== Do sampling ====== #
        np.random.seed(seed)  # seed for reproducibility
        all_files = list(ds['indices'].keys())
        # cannot sample (without replacement) more than available files
        all_samples = np.random.choice(all_files,
                                       size=min(nb_samples, len(all_files)),
                                       replace=False)
        # plotting all samples
        for file_name in all_samples:
            X = {}
            for feat_name in all_features:
                start, end = ds.find_prefix(feat_name, 'indices')[file_name]
                feat = ds[feat_name][start:end]
                X[feat_name] = feat
                # some special handling, an error here is only logged
                try:
                    _special_cases(X=feat, feat_name=feat_name,
                                   file_name=file_name, ds=ds, path=path)
                except Exception as e:
                    logger("Special case error: %s" % str(e),
                           file_name + ':' + feat_name, False)
            plot_multiple_features(X, title=file_name, fig_width=fig_width)
            figure_path = os.path.join(
                path, '%s.pdf' % _escape_file_name(file_name))
            plot_save(figure_path, log=False, clear_all=True)
            logger("Sample figure saved at: ", figure_path, True)
        # plotting the statistic
        figure_path = os.path.join(path, 'stats.pdf')
        for feat_name, stat_name in all_stats.items():
            X = {name: ds[name][:]
                 for name in stat_name
                 if ds[name].ndim >= 1}
            if len(X) > 0:
                plot_multiple_features(X, title=feat_name,
                                       fig_width=fig_width)
        plot_save(figure_path, log=False, clear_all=True)
        logger("Stats figure save at: ", figure_path, True)
        logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    finally:
        if stdio_redirected:
            stdio(path=prev_stdio)
        if should_close_ds:
            ds.close()
def save_cache(self, path, name=None, dtype=None, batch_size=1024):
    """ Save all preprocessed data to a Dataset

    Parameters
    ----------
    path: string
        path to a folder
    name: None, or list of string
        specific name for each returned `numpy.ndarray` during iteration,
        'X0', 'X1', ... are used if None
    dtype: None, or list of dtype, or single dtype
        specific dtype for all or each of returned `numpy.ndarray`
        during iteration
    batch_size: int
        amount of samples for each batch (higher the faster iteration)

    Returns
    -------
    self

    Raises
    ------
    ValueError
        if `path` is not a string, or is an existing file

    Note
    ----
    Only returned `numpy.ndarray` are saved, the dataset at `path` is
    created with `override=True`, and the string representation of this
    object is written to '<path>/README'
    """
    from odin.fuel.dataset import Dataset
    if not is_string(path):
        raise ValueError("`path` must be string path to a folder.")
    if os.path.exists(path) and os.path.isfile(path):
        raise ValueError("`path` is a file, required a folder for "
                         "saving all cache data.")
    # ====== start caching ====== #
    prog = Progbar(target=len(self),
                   name='Saving cache of preprocessed data',
                   print_report=True, print_summary=True)
    ds = Dataset(path, override=True)
    with self.set_batch_context(batch_size=int(batch_size), seed=None,
                                start=0, end=-1, shuffle_level=0):
        for X in self:
            if not isinstance(X, (tuple, list)):
                X = (X,)
            n = 0
            i = 0
            # saving preprocessed data
            for x in X:
                if isinstance(x, np.ndarray):
                    # checking name
                    x_name = ('X%d' % i) if name is None else name[i]
                    # checking dtype
                    if isinstance(dtype, (tuple, list)):
                        x = x.astype(dtype[i])
                    elif dtype is not None:
                        x = x.astype(dtype)
                    # saving to the dataset
                    if x_name in ds:
                        ds[x_name].append(x)
                    else:
                        ds[(x_name, 'memmap')] = x
                    # update samples count, and data count
                    n = x.shape[0]
                    i += 1
            # print progress
            prog.add(n)
    # ====== flush and close everything ====== #
    ds.flush()
    ds.close()
    # text mode: writing `str` to a file opened with 'wb' raises
    # TypeError on python 3
    with open(os.path.join(path, 'README'), 'w') as f:
        f.write(str(self))
    # ====== check one more time ====== #
    ds = Dataset(path, read_only=True)
    print(ds)
    print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
    ds.close()
    return self
def _to_numpy_array(self, x):
    """Merge a list of cached items into a single numpy.ndarray.

    The items are concatenated along the first axis if the first item
    is not a string and all items have the same shape after the first
    dimension, otherwise they are given to `np.array` as they are.
    """
    if is_string(x[0]):
        return np.array(x)
    trailing_shapes = set(item.shape[1:] for item in x)
    if len(trailing_shapes) != 1:
        return np.array(x)
    return np.concatenate(x, axis=0)
def __getitem__(self, key):
    """Get data by name (string) or by any key supported by the parent.

    A string key is replaced by the slice created from
    `self.indices[key]` before calling `__getitem__` of the parent class.
    """
    if is_string(key):
        bounds = self.indices[key]
        key = slice(*bounds)
    parent = super(IndexedData, self)
    return parent.__getitem__(key)
def _transform(self, X):
    """Run `SMILExtract` on one audio input, return post-processed output.

    Parameters
    ----------
    X : Mapping, string, or numpy.ndarray
        a Mapping may contain 'path' (audio file), 'sr' (sample rate)
        and 'raw' (array); a string is the path to the audio file;
        an array is the raw signal. If the audio file is not found, the
        raw array is written to a temporary wav file (required `self.sr`)

    Returns
    -------
    the dictionary returned by `self._post_processing`

    Raises
    ------
    ValueError
        unsupported input type, sample rate is different from `self.sr`,
        or `_post_processing` does not return a dictionary
    RuntimeError
        unknown sample rate, or neither audio file nor raw array is given
    """
    # ====== file input file ====== #
    raw = None
    path = None
    if isinstance(X, Mapping):
        if 'path' in X:
            path = X['path']
        if 'sr' in X:
            if self.sr is None:
                self.sr = X['sr']
                self._update_config()
                self._first_config_generated = True
            elif self.sr != X['sr']:
                raise ValueError(
                    "Given sample rate: %d, but the audio file has "
                    "sample rate: %d" % (self.sr, X['sr']))
        if 'raw' in X:
            raw = X['raw']
    elif is_string(X):
        path = X
    elif isinstance(X, np.ndarray):
        raw = X
    else:
        raise ValueError("openSMILE extractor require path to audio file.")
    # no sample rate specified, cannot generate appropriate config
    if self.sr is None:
        raise RuntimeError("Cannot acquire sample rate for the input.")
    # ====== first time generate config ====== #
    if not self._first_config_generated:
        self._first_config_generated = True
        self._update_config()
    # ====== extract SAD ====== #
    # integer bound: float arguments (10e8) are rejected by
    # `random.randint` since python 3.12
    unique_id = os.getpid() + random.randint(0, 10 ** 9)
    inpath = os.path.join(
        get_logpath(), '%s%d.wav' % (self.__class__.__name__, unique_id))
    outpath = os.path.join(
        get_logpath(), '%s%d.csv' % (self.__class__.__name__, unique_id))
    try:
        if path is None or not os.path.exists(path):
            if raw is None:
                raise RuntimeError(
                    "openSMILE require input audio file, since "
                    "we cannot find any audio file, it is required to provide "
                    "raw array and sample rate, so the audio file will be cached.")
            from soundfile import write
            write(inpath, data=raw, samplerate=self.sr)
            path = inpath
        # list of arguments without shell: paths which contain spaces or
        # shell special characters are given to SMILExtract unchanged.
        # As before, the exit code is ignored, a failed extraction is
        # detected when reading the output file.
        import subprocess
        command = ['SMILExtract',
                   '-loglevel', '%d' % self._log_level,
                   '-C', str(self.config_path),
                   '-I', str(path),
                   '-O', str(outpath)]
        subprocess.call(command)
        results = np.genfromtxt(outpath, dtype='float32',
                                delimiter=',', skip_header=0)
    except Exception:
        import traceback
        traceback.print_exc()
        # bare raise keeps the original traceback
        raise
    finally:
        # the temporary files are always removed
        if os.path.exists(inpath):
            os.remove(inpath)
        if os.path.exists(outpath):
            os.remove(outpath)
    # ====== post-processing ====== #
    X_update = self._post_processing(results)
    if not isinstance(X_update, dict):
        raise ValueError("_post_processing must return a dictionary.")
    return X_update