def send_notification(self, msg):
    """Log *msg* to the notification stream, tagged with this class's name.

    The tag is only emitted when logging is enabled (``self._log``).
    Returns ``self`` so calls can be chained fluently.
    """
    if self._log:
        tag = ctext(self.__class__.__name__, 'magenta')
        add_notification('[%s] %s' % (tag, msg))
    return self
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features at once.

    Parameters
    ----------
    dataset : str (path to a dataset directory), Dataset, or FeatureProcessor
        Source of the 2-D feature matrices to fit PCA on.
    feat_name : 'auto' or str or iterable of str
        'auto' selects every 2-D feature with more than one column;
        otherwise only the named features present in `dataset` are used.
    batch_size : int
        Number of rows fed to each `partial_fit` call.
    override : bool
        If True, refit from scratch even when a saved 'pca_<name>' model
        already exists in the dataset.

    Raises
    ------
    ValueError
        If `dataset` cannot be resolved to a Dataset instance.
    """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        # caller owns the dataset; do not close it on exit
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [name for name in as_tuple(feat_name, t=str)
                     if name in dataset]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # total number of rows across all selected features; used as the
    # progress-bar target below
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        """Fit (or resume) PCA for one feature; yields per-batch row counts
        for progress reporting, then the feature name when done."""
        X = dataset[name]
        # found exist pca model
        # BUG FIX: the original checked 'pca_' + feat, where `feat` was the
        # stale loop variable from the nb_samples loop above (always the
        # last feature), so saved models were looked up under the wrong key.
        # The save path below has always used 'pca_' + name.
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None, whiten=False,
                               copy=True, batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name, func=map_pca, ncpu=None, batch=1,
              hwm=12082518, backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True, print_report=True, name='PCA')
    for n in mpi:
        if is_string(n):
            # a feature name signals that its PCA finished
            remain_features.remove(n)
            finished_features.append(n)
        else:
            # otherwise `n` is a batch row count for the progress bar
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()
def _show_noti(self, msg):
    """Emit *msg* as a notification at high verbosity.

    Only fires when ``self._verbose`` is above 1 and not exactly 3
    (level 3 presumably reserves output for another channel —
    NOTE(review): confirm against the verbosity convention used here).
    """
    verbose = self._verbose
    if verbose <= 1 or verbose == 3:
        return
    add_notification(msg)
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
    """ Using parallel MiniBatchPCA to do PCA for multiple features at once.

    Parameters
    ----------
    dataset : str (path to a dataset directory), Dataset, or FeatureProcessor
        Source of the 2-D feature matrices to fit PCA on.
    feat_name : 'auto' or str or iterable of str
        'auto' selects every 2-D feature with more than one column;
        otherwise only the named features present in `dataset` are used.
    batch_size : int
        Number of rows fed to each `partial_fit` call.
    override : bool
        If True, refit from scratch even when a saved 'pca_<name>' model
        already exists in the dataset.

    Raises
    ------
    ValueError
        If `dataset` cannot be resolved to a Dataset instance.
    """
    # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
    # add reading data from indices also
    # ====== check input dataset ====== #
    own_dataset = True
    if is_string(dataset) and os.path.isdir(dataset):
        dataset = Dataset(dataset, read_only=True)
    elif isinstance(dataset, Dataset):
        # caller owns the dataset; do not close it on exit
        own_dataset = False
    elif isinstance(dataset, FeatureProcessor):
        dataset = Dataset(dataset.path, read_only=True)
    else:
        raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset))
    # ====== extract all feat_name ====== #
    if is_string(feat_name) and feat_name == 'auto':
        feat_name = []
        for k in dataset.keys():
            X = dataset[k]
            if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
                feat_name.append(k)
    else:
        feat_name = [name for name in as_tuple(feat_name, t=str)
                     if name in dataset]
    # ====== load PCA ====== #
    from odin.ml import MiniBatchPCA
    # total number of rows across all selected features; used as the
    # progress-bar target below
    nb_samples = 0
    for feat in feat_name:
        nb_samples += dataset[feat].shape[0]
    # ====== prepare MPI PCA ====== #
    add_notification("Selected features for PCA: " +
                     ctext(', '.join(feat_name), 'yellow'))

    def map_pca(name):
        """Fit (or resume) PCA for one feature; yields per-batch row counts
        for progress reporting, then the feature name when done."""
        X = dataset[name]
        # found exist pca model
        # BUG FIX: the original checked 'pca_' + feat, where `feat` was the
        # stale loop variable from the nb_samples loop above (always the
        # last feature), so saved models were looked up under the wrong key.
        # The save path below has always used 'pca_' + name.
        if 'pca_' + name in dataset and not override:
            pca = dataset['pca_' + name]
        # create new PCA
        else:
            pca = MiniBatchPCA(n_components=None, whiten=False,
                               copy=True, batch_size=None)
        # No shuffling make iter much faster
        for x in X.set_batch(batch_size=batch_size, seed=None,
                             shuffle_level=0):
            pca.partial_fit(x)
            yield x.shape[0]
        # save PCA model
        with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
            cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
        # finish return feature name
        yield name

    mpi = MPI(jobs=feat_name, func=map_pca, ncpu=None, batch=1,
              hwm=12082518, backend='python')
    # ====== running the MPI ====== #
    remain_features = list(feat_name)
    finished_features = []
    prog = Progbar(target=nb_samples,
                   print_summary=True, print_report=True, name='PCA')
    for n in mpi:
        if is_string(n):
            # a feature name signals that its PCA finished
            remain_features.remove(n)
            finished_features.append(n)
        else:
            # otherwise `n` is a batch row count for the progress bar
            prog['Remain'] = ', '.join(remain_features)
            prog['Finished'] = ', '.join(finished_features)
            prog.add(n)
    # ====== return ====== #
    if own_dataset:
        dataset.close()