def _split_list(x, rng, train=0.6, idfunc=None, inc_test=True):
  # ====== shuffle input ====== #
  if idfunc is not None:
    x_id = defaultdict(list)
    for i in x:
      x_id[idfunc(i)].append(i)
  else:
    x_id = {i: [j] for i, j in enumerate(x)}
  # shuffle ID(s)
  id_list = list(x_id.keys())
  rng.shuffle(id_list)
  # ====== split ====== #
  N = len(id_list)
  if N == 1:
    raise ValueError("Only found 1 sample, cannot split")
  train = int(np.floor(float(train) * N))
  if train >= N:
    raise ValueError("`train` proportion must be larger than 0 and smaller than 1.")
  valid = (N - train) // (2 if inc_test else 1)
  # ====== return splitted ====== #
  rets = (flatten_list((x_id[i] for i in id_list[:train]), level=1),
          flatten_list((x_id[i] for i in id_list[train:train + valid]), level=1))
  if inc_test:
    rets += (flatten_list((x_id[i] for i in id_list[train + valid:]), level=1),)
  else:
    rets += ([],)
  assert sum(len(r) for r in rets) == len(x), \
      "Number of returned data inconsistent with original data, %d != %d" % \
      (sum(len(r) for r in rets), len(x))
  return rets
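# --- Illustrative usage sketch (an assumption, not part of the original source) --- #
# A minimal check of `_split_list`, assuming `numpy` is available as `np` and
# `flatten_list` is importable in this module; items sharing the same `idfunc`
# key always land in the same partition.
def _example_split_list():  # hypothetical helper, for illustration only
  rng = np.random.RandomState(1234)
  files = ['spk1_a', 'spk1_b', 'spk2_a', 'spk2_b', 'spk3_a', 'spk3_b']
  train, valid, test = _split_list(files, rng, train=0.6,
                                   idfunc=lambda name: name.split('_')[0])
  # every file is assigned to exactly one partition
  assert sorted(train + valid + test) == sorted(files)
  return train, valid, test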
def __init__(self, *posteriors, verbose=True):
  super(ResultsSheet, self).__init__()
  posteriors = flatten_list(posteriors)
  assert len(posteriors) > 0, "No posteriors for analysis!"
  assert all(isinstance(i, Posterior) for i in posteriors)
  self._posteriors = posteriors
  self.verbose = bool(verbose)
def __init__(self, inputs, outputs, updates=[], defaults={},
             training=None, batch_size=None, batch_vars=[], strict=False):
  self.training = training
  self.batch_size = batch_size
  self.batch_vars = list(batch_vars)
  self._strict = bool(strict)
  # ====== validate input ====== #
  if isinstance(inputs, Mapping):
    self.inputs_name = inputs.keys()
    inputs = inputs.values()
  elif not isinstance(inputs, (tuple, list)):
    inputs = [inputs]
  self.inputs = flatten_list(inputs, level=None)
  if not hasattr(self, 'inputs_name'):
    self.inputs_name = [i.name.split(':')[0] for i in self.inputs]
  # ====== defaults ====== #
  defaults = dict(defaults)
  self.defaults = defaults
  # ====== validate outputs ====== #
  return_list = True
  if not isinstance(outputs, (tuple, list)):
    outputs = (outputs,)
    return_list = False
  self.outputs = flatten_list(list(outputs), level=None)
  self._return_list = return_list
  # ====== validate updates ====== #
  if isinstance(updates, Mapping):
    updates = updates.items()
  with tf.control_dependencies(self.outputs):
    # create updates ops
    if not isinstance(updates, tf.Operation):
      updates_ops = []
      for update in updates:
        if isinstance(update, (tuple, list)):
          p, new_p = update
          updates_ops.append(tf.assign(p, new_p))
        else:  # assumed already an assign op
          updates_ops.append(update)
      self.updates_ops = tf.group(*updates_ops)
    else:  # already a tensorflow Op
      self.updates_ops = updates
  # ====== cached shape ====== #
  self._input_shape = [tuple(i.shape.as_list()) for i in self.inputs]
  self._output_shape = [tuple(i.shape.as_list()) for i in self.outputs]
def __init__(self, outputs=None, trace_up=False):
  # it is important not to have duplicated outputs,
  # otherwise this can go into an infinite loop
  outputs = list(set([o for o in flatten_list(as_list(outputs), level=None)
                      if o is not None]))
  self.outputs = outputs
  self._trace_up = trace_up
  self._get_variables()
def set_recipes(self, *recipes):
  # filter out None values
  recipes = flatten_list(as_tuple(recipes))
  recipes = [rcp for rcp in recipes
             if rcp is not None and isinstance(rcp, FeederRecipe)]
  # ====== set the new recipes ====== #
  if len(recipes) > 0:
    self._recipes = recipes
    for rcp in self._recipes:
      rcp.set_feeder_info(self.nb_desc)
  return self
def set_extractor_debug(extractors, debug):
  # ====== prepare ====== #
  if isinstance(extractors, (tuple, list)):
    extractors = [i for i in flatten_list(extractors)
                  if isinstance(i, Extractor)]
  elif isinstance(extractors, Pipeline):
    extractors = [i[-1] for i in extractors.steps]
  elif isinstance(extractors, Mapping):
    extractors = [i[-1] for i in extractors.items()]
  else:
    raise ValueError("No support for `extractors` type: %s" % type(extractors))
  # ====== set the value ====== #
  for i in extractors:
    i._debug = bool(debug)
  return extractors
def _initialize(self):
  param_names = flatten_list([('b%d' % i, 'w%d' % i)
                              for i in range(self.nb_layers)])
  weights = self.get_loaded_param(param_names)
  # ====== create ====== #
  layers = []
  for i in range(self.nb_layers):
    b = weights[i * 2].ravel()
    W = weights[i * 2 + 1].T
    num_units = b.shape[0]
    if i == self.nb_layers - 1:
      name = 'Bottleneck'
      nonlinearity = K.linear
    else:
      name = "Layer%d" % (i + 1)
      nonlinearity = K.relu
    op = Dense(num_units=num_units, W_init=W, b_init=b,
               activation=nonlinearity, name=name)
    layers.append(op)
  self.layers = layers
def role_scope(*roles):
  """
  Example
  -------
  >>> X = K.variable(np.random.rand(12, 8))
  >>> with role_scope(Weight, Variational, VariationalMean):
  ...   add_roles(X)
  >>> print(X.tag.roles)
  ... # [<class 'odin.basic.Weight'>, <class 'odin.basic.VariationalMean'>]
  """
  roles = [r for r in flatten_list(roles, level=None)
           if isinstance(r, type) and issubclass(r, Role)]
  # ====== shrink the roles so there is NO subrole ====== #
  roles = __ROLE_STACK[-1] + roles
  roles = [r for r in roles
           if not any(r != r0 and issubclass(r0, r) for r0 in roles)]
  __ROLE_STACK.append(roles)
  yield roles
  __ROLE_STACK.pop()
def fast_pca(*x, n_components=None, algo='rpca', y=None,
             batch_size=1024, return_model=False, random_state=5218):
  """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithms:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    labels, required in case of `sppca`
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must not be None if `algo='sppca'`")
  x = flatten_list(x, level=None)
  x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2:  # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None:  # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because it is
    # significantly faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components, iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False, print_summary=False,
                   name="Fitting PCA")
    for start, end in batching(batch_size=batch_size, n=x_train.shape[0],
                               seed=5218):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] + x_test) if not return_model \
      else tuple([pca, x_train] + x_test)
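# --- Illustrative usage sketch (an assumption, not part of the original source) --- #
# How `fast_pca` might be called on plain numpy arrays: the projection is
# fitted on the first matrix only and applied to the rest, optionally
# returning the fitted model as the first value.
def _example_fast_pca():  # hypothetical helper, for illustration only
  X_train = np.random.rand(500, 64)
  X_test = np.random.rand(200, 64)
  # transform both matrices using a PCA fitted on `X_train` only
  Z_train, Z_test = fast_pca(X_train, X_test, n_components=8, algo='pca')
  # ask for the fitted model as well
  pca, Z_train = fast_pca(X_train, n_components=8, algo='pca',
                          return_model=True)
  return pca, Z_train, Z_test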
def copy(self, destination,
         indices_filter=None, data_filter=None,
         override=False):
  """ Copy the dataset to a new folder and close the old dataset """
  from distutils.dir_util import copy_tree
  read_only = self.read_only
  # indices
  if indices_filter is not None and \
      not is_callable(indices_filter) and \
      not isinstance(indices_filter, (tuple, list)):
    raise ValueError('`indices_filter` must be callable, tuple, list or None')
  if isinstance(indices_filter, (tuple, list)):
    tmp = tuple(indices_filter)
    indices_filter = lambda x: x in tmp
  # data name
  if data_filter is not None and \
      not is_callable(data_filter) and \
      not isinstance(data_filter, (tuple, list)):
    raise ValueError('`data_filter` must be callable, tuple, list or None')
  if isinstance(data_filter, (tuple, list)):
    tmp = tuple(data_filter)
    data_filter = lambda x: x in tmp
  # ====== other files which are not Data ====== #
  other_files = [i for i in os.listdir(self.path) if i not in self]
  # ====== preprocessing ====== #
  destination = os.path.abspath(str(destination))
  if not os.path.exists(destination):
    os.mkdir(destination)
  elif not os.path.isdir(destination):
    raise ValueError('path at "%s" must be a folder' % destination)
  elif override:
    shutil.rmtree(destination)
    os.mkdir(destination)
  else:
    raise ValueError("A folder exists at path: '%s', cannot be overridden." %
                     destination)
  # ====== copy everything ====== #
  if indices_filter is None and data_filter is None:
    print("Copying %s files from '%s' to '%s' ..." %
          (ctext(len(self), 'cyan'),
           ctext(self.path, 'yellow'),
           ctext(destination, 'yellow')))
    copy_tree(self.path, destination)
  # ====== only data_filter ====== #
  elif indices_filter is None:
    data_list = [i for i in self.keys() if data_filter(i)]
    # copy all the data
    for name in data_list:
      org_path = os.path.join(self.path, name)
      dst_path = os.path.join(destination, name)
      print("Copying from '%s' to '%s' ..." %
            (ctext(org_path, 'yellow'), ctext(dst_path, 'yellow')))
      shutil.copy2(org_path, dst_path)
    # copy all the related indices
    for name in self.keys():
      org_path = os.path.join(self.path, name)
      dst_path = os.path.join(destination, name)
      if not os.path.exists(dst_path) and \
          ('indices' == name or
           any(i in data_list for i in name.split('_')[1:])):
        print("Copying Indices from '%s' to '%s'" %
              (ctext(org_path, 'cyan'), ctext(dst_path, 'cyan')))
        shutil.copy2(org_path, dst_path)
  # ====== use indices_filter and data_filter ====== #
  else:
    if data_filter is None:
      all_data = list(self.keys())
    else:
      all_data = [i for i in self.keys() if data_filter(i)]
    # list of data with separated indices
    separated_data = flatten_list(
        [k.split('_')[1:] for k in self.keys() if 'indices_' == k[:8]])
    # iterate over indices and copy data one by one
    for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
      indices = [(n, (s, e)) for n, (s, e) in self[ids_name]
                 if indices_filter(n)]
      # no matching indices, skip
      if len(indices) == 0:
        continue
      nb_samples = sum(e - s for n, (s, e) in indices)
      # get all data assigned to the given indices
      data = ids_name.split('_')[1:]
      if len(data) == 0:
        data = [i for i in all_data if i not in separated_data]
      else:
        data = [i for i in data if i in all_data]
      # if still no data found, skip
      if len(data) == 0:
        continue
      # copy each data
      for data_name in data:
        X = self[data_name]
        # copy big MmapDict
        if isinstance(X, MmapDict) and len(X) == len(self[ids_name]):
          new_path = os.path.join(destination, os.path.basename(X.path))
          print("Copying MmapDict from '%s' to '%s'" %
                (ctext(X.path, 'cyan'), ctext(new_path, 'cyan')))
          new_dict = MmapDict(new_path, cache_size=80000, read_only=False)
          for n, (s, e) in indices:
            new_dict[n] = X[n]
          new_dict.flush(save_all=True)
          new_dict.close()
        # copy MmapData
        elif isinstance(X, MmapData):
          Y = MmapData(path=os.path.join(destination, data_name),
                       dtype=X.dtype, shape=(0,) + X.shape[1:],
                       read_only=False)
          prog = Progbar(target=nb_samples,
                         print_report=True, print_summary=True,
                         name="Copying data: '%s' to path:'%s'" %
                         (ctext(data_name, 'yellow'),
                          ctext(Y.data_info, 'cyan')))
          for n, (s, e) in indices:
            Y.append(X[s:e])
            prog.add(e - s)
        # unknown data-type
        else:
          org_path = os.path.join(self.path, data_name)
          new_path = os.path.join(destination, data_name)
          # just copy the files directly
          if os.path.isfile(org_path) or not os.path.exists(new_path):
            shutil.copy2(org_path, new_path)
            print("Copying '%s' to '%s' ..." %
                  (ctext(org_path, 'cyan'), ctext(new_path, 'yellow')))
          else:
            wprint("Cannot copy: '%s' - %s" %
                   (ctext(data_name, 'cyan'),
                    ctext(type(self[data_name]), 'yellow')))
      # copy the indices
      new_indices = MmapDict(os.path.join(destination, ids_name),
                             cache_size=80000, read_only=False)
      start = 0
      for n, (s, e) in indices:
        size = e - s
        new_indices[n] = (start, start + size)
        start += size
      new_indices.flush(save_all=True)
      new_indices.close()
  # ====== copy other files ====== #
  for f in other_files:
    org_path = os.path.join(self.path, f)
    dst_path = os.path.join(destination, f)
    if not os.path.exists(dst_path):
      if os.path.isdir(org_path):  # directory
        copy_tree(org_path, dst_path)
      else:  # single file
        shutil.copy2(org_path, dst_path)
  # ====== readme ====== #
  readme_name = os.path.basename(self._readme_path)
  dst_path = os.path.join(destination, readme_name)
  if not os.path.exists(dst_path):
    shutil.copy2(self._readme_path, dst_path)
  return Dataset(destination, read_only=read_only)
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext(' *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" %
                     str(type(ds_or_processor)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuous indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final length must match length of Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
            'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
            'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
            isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
              "Length of indices and actual data mismatch, " + \
              ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0  # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
            "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats names
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca names
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data files
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # no NaN values
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name, False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Checked data integrity for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name, False)
          fail_test = True
      if not fail_test:
        logger("Checked statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                          n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Checked PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples, replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path,
                               '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:] for name in stat_name
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
def __call__(self, *inputs, **kwargs):
  show_progress = kwargs.pop('show_progress', False)
  # dictionary as inputs
  if len(kwargs) == len(self.inputs_name):
    inputs = [kwargs[i] for i in self.inputs_name]
  # ====== delete un-matched inputs ====== #
  inputs_new = []
  tmp = list(inputs)
  shapes = list(self._input_shape)
  # this process iteratively removes inputs whose shape does not
  # match the currently given input
  for s in shapes:
    for i in tuple(tmp):
      if len(i.shape) != len(s) or \
          any(a is not None and a > 0 and a != b
              for a, b in zip(s, i.shape)):  # different ndim, or shape
        tmp.remove(i)
      else:
        inputs_new.append(i)
        tmp.remove(i)
        break
  if len(inputs_new) != len(self.inputs):
    raise ValueError("Given inputs have shape: %s, cannot match the shape of "
                     "defined inputs: %s" %
                     ('; '.join([str(i.shape) for i in inputs]),
                      '; '.join([str(i) for i in self.input_shape])))
  if not self._strict:
    inputs = inputs_new
  # ====== create feed_dict ====== #
  feed_dict = {}
  inputs = flatten_list(inputs, level=None)
  for tensor, value in zip(self.inputs, inputs):
    feed_dict[tensor] = value
  feed_dict.update(self.defaults)
  # check if modifying training mode
  if self.training is None:
    pass
  elif self.training:
    feed_dict.update({is_training(): True})
  else:
    feed_dict.update({is_training(): False})
  session = get_session()
  outputs = None
  # ====== mini-batches ====== #
  if self.batch_size is not None:
    batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                  if len(self.batch_vars) == 0 else self.batch_vars)
    batch_vars = [i for i in batch_vars
                  if i in feed_dict and hasattr(feed_dict[i], 'shape')]
    n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
    assert len(n_samples) == 1, \
        "Data have multiple batching dimensions: %s" % str(n_samples)
    n_samples = n_samples[0]
    # only continue if we have more samples than `batch_size`
    if n_samples > self.batch_size:
      n_output = len(self.outputs)
      outputs = []
      all_batches = []
      # (optional) showing progress
      if show_progress:
        prog = Progbar(target=n_samples, print_report=False,
                       print_summary=False, name='')
      for s, e in batching(batch_size=int(self.batch_size), n=n_samples):
        if show_progress:
          prog.add(e - s)
        all_batches.append(e - s)
        feed_dict_minibatch = OrderedDict([(k, v[s:e]) if k in batch_vars
                                           else (k, v)
                                           for k, v in feed_dict.items()])
        updated = session.run(self.outputs + [self.updates_ops],
                              feed_dict=feed_dict_minibatch)
        updated = updated[:n_output]
        if not self._return_list:
          updated = updated[0]
        outputs.append(updated)
      ## concatenate all outputs
      if not self._return_list:
        o_ndim = outputs[0].ndim
        if o_ndim == 0:  # returned scalars
          outputs = np.array(outputs)
        else:  # returned arrays
          for o_axis in range(o_ndim):
            all_n = [o.shape[o_axis] for o in outputs]
            if all_n == all_batches:
              break
          outputs = np.concatenate(outputs, axis=o_axis)
      ## returning a list of outputs
      else:
        new_outputs = []
        for output_idx in range(len(outputs[0])):
          o = [x[output_idx] for x in outputs]
          o_ndim = o[0].ndim
          if o_ndim == 0:  # returned scalars
            o = np.array(o)
          else:  # returned arrays
            for o_axis in range(o[0].ndim):
              all_n = [val.shape[o_axis] for val in o]
              if all_n == all_batches:
                break
            o = np.concatenate(o, axis=o_axis)
          new_outputs.append(o)
        outputs = new_outputs
  # ====== single batch ====== #
  if outputs is None:
    updated = session.run(self.outputs + [self.updates_ops],
                          feed_dict=feed_dict)
    outputs = updated[:len(self.outputs)]
    if not self._return_list:
      outputs = outputs[0]
  # ====== return final output ====== #
  return outputs