def transform_mpi(self, X, keep_order=True, ncpu=4, n_components=None):
  """ Same as `transform` but using multiprocessing """
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  batch_list = [(i, min(i + batch_size, n))
                for i in range(0, n + batch_size, batch_size) if i < n]

  # ====== run MPI jobs ====== #
  def map_func(batch):
    start, end = batch
    x = super(MiniBatchPCA, self).transform(X=X[start:end])
    # doing dim reduction here saves a lot of memory for
    # inter-process transfer
    if n_components is not None:
      x = x[:, :n_components]
    # just need to return the start index for ordering
    yield start, x

  mpi = MPI(batch_list, func=map_func, ncpu=ncpu, batch=1,
            hwm=ncpu * 12, backend='python')
  # ====== process the return ====== #
  X_transformed = []
  for start, x in mpi:
    X_transformed.append((start, x))
  if keep_order:
    X_transformed = sorted(X_transformed, key=lambda x: x[0])
  X_transformed = np.concatenate([x[-1] for x in X_transformed], axis=0)
  return X_transformed
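# A minimal usage sketch (not part of the original source) for the method above,
# assuming `MiniBatchPCA` from `odin.ml` exposes the scikit-learn style
# `partial_fit` API used elsewhere in this file; the data here is synthetic.
import numpy as np
from odin.ml import MiniBatchPCA

X = np.random.randn(10000, 128).astype('float32')
pca = MiniBatchPCA(n_components=None, whiten=False, copy=True, batch_size=None)
for start in range(0, X.shape[0], 1000):
  pca.partial_fit(X[start:start + 1000])
# parallel transform, keeping the original row order and truncating
# to the first 32 principal components
X_pca = pca.transform_mpi(X, keep_order=True, ncpu=4, n_components=32)
print(X_pca.shape)  # expected: (10000, 32)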
def mutual_info_estimate(representations: np.ndarray,
                         factors: np.ndarray,
                         continuous_representations: bool = True,
                         continuous_factors: bool = False,
                         n_neighbors: int = 3,
                         n_cpu: int = 1,
                         seed: int = 1,
                         verbose: bool = False):
  r""" Nonparametric estimation of mutual information based on entropy
  estimates from k-nearest-neighbors distances
  (note: this implementation uses multiprocessing)

  Parameters
  -----------
  representations : `np.ndarray`, shape `[num_samples, num_latents]`
  factors : `np.ndarray`, shape `[num_samples, num_factors]`
  continuous_representations : treat the representations as continuous values
  continuous_factors : treat the factors as continuous values
  n_neighbors : number of neighbors for the k-NN estimator
  n_cpu : number of parallel processes (runs sequentially if < 2)
  seed : random seed for the estimator
  verbose : show a progress bar

  Return
  --------
  matrix `[num_latents, num_factors]`, estimated mutual information between
    each representation and each factor

  References
  ------------
  A. Kraskov, H. Stogbauer and P. Grassberger, “Estimating mutual
    information”. Phys. Rev. E 69, 2004.
  B. C. Ross, “Mutual Information between Discrete and Continuous Data
    Sets”. PLoS ONE 9(2), 2014.
  L. F. Kozachenko, N. N. Leonenko, “Sample Estimate of the Entropy of a
    Random Vector”, Probl. Peredachi Inf., 23:2 (1987), 9-16.
  """
  from sklearn.feature_selection import (mutual_info_classif,
                                         mutual_info_regression)
  mutual_info = mutual_info_regression if continuous_factors else \
    mutual_info_classif
  num_latents = representations.shape[1]
  num_factors = factors.shape[1]
  # estimate the mutual information of every (latent, factor) pair
  mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

  # repeat for each factor
  def func(idx):
    mi = mutual_info(representations,
                     factors[:, idx],
                     discrete_features=not continuous_representations,
                     n_neighbors=n_neighbors,
                     random_state=seed)
    return idx, mi

  jobs = list(range(num_factors))
  if n_cpu < 2:
    it = (func(i) for i in jobs)
  else:
    it = MPI(jobs=jobs, func=func, ncpu=n_cpu, batch=1)
  if verbose:
    from tqdm import tqdm
    it = tqdm(it, desc='Estimating mutual information', total=len(jobs))
  for i, mi in it:
    mi_matrix[:, i] = mi
  return mi_matrix
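# A minimal usage sketch (not part of the original source), assuming the
# `mutual_info_estimate` signature above; the data here is synthetic.
import numpy as np

z = np.random.randn(1000, 10)            # representations [num_samples, num_latents]
f = np.random.randint(0, 5, (1000, 3))   # discrete factors [num_samples, num_factors]
mi = mutual_info_estimate(z, f,
                          continuous_representations=True,
                          continuous_factors=False,
                          n_cpu=4, verbose=False)
print(mi.shape)  # expected: (10, 3)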
def test_mpi(self):
  X = batching(n=512, batch_size=np.random.randint(low=12000, high=80000))

  def map_func(batch):
    for b in batch:
      yield b

  mpi = MPI(X, map_func=map_func, ncpu=12, buffer_size=8,
            maximum_queue_size=12 * 8)
  Y = [i for i in mpi]
  self.assertEqual(len(X), len(Y))
  self.assertEqual(sum(j - i for i, j in X), sum(j - i for i, j in Y))
  self.assertTrue(all(i == j for i, j in zip(
      sorted(X, key=lambda x: x[0]),
      sorted(Y, key=lambda x: x[0])
  )))
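# A minimal round-trip sketch (not part of the original source) of the
# `odin.utils.mpi.MPI` helper, using the `jobs`/`func`/`ncpu`/`batch` keyword
# style that the newer snippets in this file rely on. Results arrive in
# arbitrary order, so each job returns its own index for re-sorting.
from odin.utils.mpi import MPI

def square(i):
  return i, i * i

results = sorted(MPI(jobs=list(range(100)), func=square, ncpu=4, batch=1))
assert [r[1] for r in results] == [i * i for i in range(100)]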
def mutual_info_estimate(representations,
                         factors,
                         continuous_representations=True,
                         continuous_factors=False,
                         n_neighbors=3,
                         random_state=1234):
  r""" Nonparametric estimation of mutual information based on entropy
  estimates from k-nearest-neighbors distances
  (note: this implementation uses multiprocessing)

  Return:
    matrix `[num_latents, num_factors]`, estimated mutual information between
      each representation and each factor

  References:
    A. Kraskov, H. Stogbauer and P. Grassberger, “Estimating mutual
      information”. Phys. Rev. E 69, 2004.
    B. C. Ross, “Mutual Information between Discrete and Continuous Data
      Sets”. PLoS ONE 9(2), 2014.
    L. F. Kozachenko, N. N. Leonenko, “Sample Estimate of the Entropy of a
      Random Vector”, Probl. Peredachi Inf., 23:2 (1987), 9-16.
  """
  from sklearn.feature_selection import (mutual_info_classif,
                                         mutual_info_regression)
  mutual_info = mutual_info_regression if continuous_factors else \
    mutual_info_classif
  num_latents = representations.shape[1]
  num_factors = factors.shape[1]
  # estimate the mutual information of every (latent, factor) pair
  mi_matrix = np.empty(shape=(num_latents, num_factors), dtype=np.float64)

  # repeat for each factor
  def func(idx):
    mi = mutual_info(representations,
                     factors[:, idx],
                     discrete_features=not continuous_representations,
                     n_neighbors=n_neighbors,
                     random_state=random_state)
    return idx, mi

  for i, mi in MPI(jobs=list(range(num_factors)),
                   func=func,
                   ncpu=min(max(1, get_cpu_count() - 1), 10),
                   batch=1):
    mi_matrix[:, i] = mi
  return mi_matrix
def get_pdf_text(path: str) -> dict:
  from PyPDF2 import PdfFileReader

  from odin.utils.mpi import MPI

  def read_text(fpath):
    # legacy PyPDF2 (< 3.0) API: PdfFileReader / numPages / getPage / extractText
    with open(fpath, 'rb') as f:
      f = PdfFileReader(f)
      text = []
      for i in range(f.numPages):
        page = f.getPage(i)
        text.append(page.extractText())
    return (fpath, text)

  results = dict()
  for filepath, text in MPI(jobs=_to_files(path),
                            func=read_text,
                            ncpu=4,
                            batch=1):
    results[filepath] = text
  return results
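# A minimal usage sketch (not part of the original source). `_to_files` is the
# helper assumed by `get_pdf_text` to expand a path into a list of PDF files;
# its implementation is not shown here, so only the call site is illustrated.
texts = get_pdf_text('~/Documents/papers')
for filepath, pages in texts.items():
  print(filepath, '->', len(pages), 'pages')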
def launch(self, job_overrides):
  setup_globals()
  configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
  sweep_dir = self.config.hydra.sweep.dir
  Path(str(sweep_dir)).mkdir(parents=True, exist_ok=True)
  LOGGER.info("Launching {} jobs locally".format(len(job_overrides)))

  def run_task(job):
    idx, overrides = job
    LOGGER.info("\t#{} : {}".format(
        idx, " ".join(filter_overrides(overrides))))
    sweep_config = self.config_loader.load_sweep_config(
        self.config, list(overrides))
    with open_dict(sweep_config):
      # id is concatenated overrides here
      sweep_config.hydra.job.id = '_'.join(sorted(overrides))
      sweep_config.hydra.job.num = idx
    HydraConfig().set_config(sweep_config)
    ret = run_job(
        config=sweep_config,
        task_function=self.task_function,
        job_dir_key="hydra.sweep.dir",
        job_subdir_key="hydra.sweep.subdir",
    )
    configure_log(self.config.hydra.hydra_logging, self.config.hydra.verbose)
    return (idx, ret)

  if self.ncpu > 1:
    jobs = list(enumerate(job_overrides))
    runs = sorted([
        ret for ret in MPI(jobs=jobs, func=run_task, ncpu=int(self.ncpu),
                           batch=1)
    ])
    runs = [i[1] for i in runs]
  else:
    runs = [run_task(job)[1] for job in enumerate(job_overrides)]
  return runs
def fast_tsne(*X, n_components=2, n_samples=None, perplexity=30.0, early_exaggeration=8.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, random_state=1234, method='barnes_hut', angle=0.5, n_jobs=4): """ Parameters ---------- n_components : int, optional (default: 2) Dimension of the embedded space. n_samples : {int, None} if given, downsampling the data to given number of sample perplexity : float, optional (default: 30) The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter. early_exaggeration : float, optional (default: 8.0) Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high. learning_rate : float, optional (default: 200.0) The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the learning rate is too high, the data may look like a 'ball' with any point approximately equidistant from its nearest neighbours. If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. If the cost function gets stuck in a bad local minimum increasing the learning rate may help. n_iter : int, optional (default: 1000) Maximum number of iterations for the optimization. Should be at least 250. n_iter_without_progress : int, optional (default: 300) Maximum number of iterations without progress before we abort the optimization, used after 250 initial iterations with early exaggeration. Note that progress is only checked every 50 iterations so this value is rounded to the next multiple of 50. min_grad_norm : float, optional (default: 1e-7) If the gradient norm is below this threshold, the optimization will be stopped. metric : string or callable, optional The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is "precomputed", X is assumed to be a distance matrix. Alternatively, if metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays from X as input and return a value indicating the distance between them. The default is "euclidean" which is interpreted as squared euclidean distance. init : string or numpy array, optional (default: "random") Initialization of embedding. Possible options are 'random', 'pca', and a numpy array of shape (n_samples, n_components). PCA initialization cannot be used with precomputed distances and is usually more globally stable than random initialization. verbose : int, optional (default: 0) Verbosity level. 
random_state : int, RandomState instance or None, optional (default: None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Note that different initializations might result in different local minima of the cost function. method : string (default: 'barnes_hut') By default the gradient calculation algorithm uses Barnes-Hut approximation running in O(NlogN) time. method='exact' will run on the slower, but exact, algorithm in O(N^2) time. The exact algorithm should be used when nearest-neighbor errors need to be better than 3%. However, the exact method cannot scale to millions of examples. angle : float (default: 0.5) Only used if method='barnes_hut' This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. 'angle' is the angular size (referred to as theta in [3]) of a distant node as measured from a point. If this size is below 'angle' then it is used as a summary node of all points contained within it. This method is not very sensitive to changes in this parameter in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing computation time and angle greater 0.8 has quickly increasing error. """ assert len(X) > 0, "No input is given!" if isinstance(X[0], (tuple, list)): X = X[0] if not all(isinstance(x, np.ndarray) for x in X): raise ValueError( "`X` can only be list of numpy.ndarray or numpy.ndarray") # ====== kwarg for creating T-SNE class ====== # kwargs = dict(locals()) del kwargs['X'] n_samples = kwargs.pop('n_samples', None) # ====== downsampling ====== # if n_samples is not None: n_samples = int(n_samples) assert n_samples > 0 new_X = [] rand = random_state if isinstance(random_state, np.random.RandomState) else \ np.random.RandomState(seed=random_state) for x in X: if x.shape[0] > n_samples: ids = rand.permutation(x.shape[0])[:n_samples] x = x[ids] new_X.append(x) X = new_X # ====== import proper T-SNE ====== # tsne_version = None try: from tsnecuda import TSNE from tsnecuda.NaiveTSNE import NaiveTSNE as _exact_TSNE tsne_version = 'cuda' except ImportError: # wprint("Install CUDA-TSNE from `https://github.com/CannyLab/tsne-cuda` " # "for significant speed up.") try: from MulticoreTSNE import MulticoreTSNE as TSNE tsne_version = 'multicore' except ImportError: wprint( "Install MulticoreTSNE from `pip install git+https://github.com/DmitryUlyanov/Multicore-TSNE.git`" ' to accelerate the T-SNE on multiple CPU cores.') try: from sklearn.manifold import TSNE tsne_version = 'sklearn' except Exception as e: raise e # ====== modify kwargs ====== # if tsne_version == 'cuda': kwargs['random_seed'] = kwargs['random_state'] kwargs['theta'] = angle if method == 'exact': TSNE = _exact_TSNE del kwargs['theta'] del kwargs['random_state'] del kwargs['n_jobs'] del kwargs['angle'] del kwargs['method'] elif tsne_version == 'multicore': pass else: del kwargs['n_jobs'] # ====== getting cached values ====== # results = [] X_new = [] for i, x in enumerate(X): md5 = md5_checksum(x) key = _create_key(kwargs, md5) if key in _cached_values: results.append((i, _cached_values[key])) else: X_new.append((i, md5, x)) # ====== perform T-SNE ====== # def apply_tsne(j): idx, md5, x = j tsne = TSNE(**kwargs) return (idx, md5, tsne.fit_transform(x)) # only 1 X, no need for MPI if len(X_new) == 1: idx, md5, x = apply_tsne(X_new[0]) results.append((idx, x)) _cached_values[_create_key(kwargs, md5)] = x else: mpi = 
MPI(jobs=X_new, func=apply_tsne, batch=1, ncpu=min(len(X_new), cpu_count() - 1)) for idx, md5, x in mpi: results.append((idx, x)) _cached_values[_create_key(kwargs, md5)] = x # ====== return and clean ====== # results = sorted(results, key=lambda a: a[0]) results = [r[1] for r in results] return results[0] if len(results) == 1 else results
def run(self): njobs = len(self.jobs) dataset = Dataset(self.path) if self.n_cache <= 1: cache_limit = max(2, int(0.12 * njobs)) else: cache_limit = int(self.n_cache) # ====== indices ====== # databases = defaultdictkey( lambda key: MmapDict(path=os.path.join(dataset.path, key), cache_size=10000, read_only=False)) last_start = defaultdict(int) # ====== statistic ====== # # load old statistics stats = defaultdict(lambda: [0, 0]) # name -> (sum1, sum2) for key in dataset.keys(): if 'sum1' == key[-4]: stats[key[:-4]][0] = dataset[key][:] elif 'sum2' == key[-4:]: stats[key[:-4]][1] = dataset[key][:] # all data are cached for periodically flushed cache = defaultdict(list) n_processed = [0] # store the value as reference # ====== helper ====== # def flush_feature(feat_name, X_cached): if len(X_cached) > 0: X_cached = np.concatenate(X_cached, 0) # flush data if feat_name in dataset: dataset[feat_name].append(X_cached) else: dataset[(feat_name, 'memmap')] = X_cached # ====== repeated for each result returned ====== # def post_processing(result): # search for file name if self.identifier not in result: raise RuntimeError( "Cannot find identifier '%s' in returned dictionary" % self.identifier) file_name = result[self.identifier] # invalid file_name if not is_string(file_name): raise RuntimeError( "Cannot find file name in returned features " "list, the file name can be specified in key: 'name', 'path' " "and the type of the value must be string. All available " "keys are: %s" % str(result.keys())) # store all new indices # mapping [X.shape[0]] -> [feat_name, feat_name, ...] all_indices = {} # processing for feat_name, X in result.items(): # some invalid feat_name if feat_name in ('config', 'pipeline', 'sum1', 'sum2'): raise RuntimeError( "Returned features' name cannot be one " "of the following: 'config', 'pipeline', 'sum1', 'sum2'." 
) # ignore some feat_name if feat_name in ('name'): continue # if numpy ndarray, save to MmapData if isinstance(X, np.ndarray) or \ 'sum1' == feat_name[-4:] or \ 'sum2' == feat_name[-4:]: # save statistics instead if 'sum1' == feat_name[-4:]: stats[feat_name[:-4]][0] += X elif 'sum2' == feat_name[-4:]: stats[feat_name[:-4]][1] += X # save features array else: all_indices[feat_name] = X.shape[0] # cache data, only if we have more than 0 sample if X.shape[0] > 0: cache[feat_name].append(X) # else all other kind of data save to MmapDict else: databases[feat_name][file_name] = X # remove data del X # ====== update indices ====== # if len(all_indices) > 0: for feat_name, n in all_indices.items(): ids_name = 'indices_%s' % feat_name databases[ids_name][file_name] = (last_start[ids_name], last_start[ids_name] + n) last_start[ids_name] += n # ====== flush cache ====== # n_processed[0] += 1 if n_processed[0] % cache_limit == 0: # 12 + 8 for feat_name, X_cached in cache.items(): flush_feature(feat_name, X_cached) cache.clear() # ====== update progress ====== # return file_name # ====== mapping function ====== # def _map_func(dat): try: ret = self.extractor.transform(dat) except Exception as e: # Non-handled exception ret = '\n========\n' ret += 'Time : `%s`\n' % str( get_formatted_datetime(only_number=False)) ret += 'Error : `%s`\n' % str(e) ret += 'Input : `%s`\n' % str(dat) import traceback etype, value, tb = sys.exc_info() for line in traceback.TracebackException( type(value), value, tb, limit=None).format(chain=True): ret += line return ret # ====== processing ====== # mpi = MPI(jobs=self.jobs, func=_map_func, ncpu=self.n_cpu, batch=1, hwm=self.n_cpu * 3, backend='python') # initialize prog = Progbar(target=njobs, name=self.path, interval=0.12, print_report=True, print_summary=True) start_time = time.time() last_time = time.time() last_count = 0 with open(self._log_path, 'w') as flog: # writing the log head flog.write('============================\n') flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False)) flog.write('Outpath : %s\n' % self.path) flog.write('Extractor : %s\n' % '->'.join( [s[-1].__class__.__name__ for s in self.extractor.steps])) flog.write('#Jobs : %d\n' % njobs) flog.write('#CPU : %d\n' % self.n_cpu) flog.write('#Cache : %d\n' % cache_limit) flog.write('============================\n') flog.flush() # start processing the file list for count, result in enumerate(mpi): # Non-handled exception if isinstance(result, string_types): flog.write(result) flog.flush() self._error_log.append(result) if self.stop_on_failure: raise RuntimeError(result) # some error might happened elif isinstance(result, ExtractorSignal): flog.write(str(result)) flog.flush() if result.action == 'error': prog.add_notification(str(result)) raise RuntimeError( "ExtractorSignal requests terminating processor!") elif result.action == 'warn': prog.add_notification(str(result)) elif result.action == 'ignore': self._error_log.append(result) else: raise RuntimeError( "Unknown action from ExtractorSignal: %s" % result.action) prog['File'] = '%-48s' % result.message[:48] # otherwise, no error happened, do post-processing else: name = post_processing(result) prog['File'] = '%-48s' % str(name)[:48] # update progress prog.add(1) # manually write to external log file if (count + 1) % max(1, int(0.01 * njobs)) == 0: curr_time = time.time() elap = curr_time - start_time avg_speed = (count + 1) / elap cur_speed = (count + 1 - last_count) / (curr_time - last_time) avg_est = (njobs - count - 1) / avg_speed 
cur_est = (njobs - count - 1) / cur_speed flog.write( '[%s] Processed: %d(files) Remain: %d(files) Elap.: %.2f(secs)\n' ' Avg.Spd: %.2f(obj/sec) Avg.Est.: %.2f(secs)\n' ' Cur.Spd: %.2f(obj/sec) Cur.Est.: %.2f(secs)\n' % (get_formatted_datetime(only_number=False), count + 1, njobs - count - 1, elap, avg_speed, avg_est, cur_speed, cur_est)) flog.flush() last_time = curr_time last_count = count + 1 # ====== end, flush the last time ====== # for feat_name, X_cached in cache.items(): flush_feature(feat_name, X_cached) cache.clear() cache = None dataset.flush() prog.add_notification("Flushed all data to disk") # ====== saving indices ====== # for name, db in databases.items(): db.flush(save_all=True) db_size = len(db) db.close() prog.add_notification( 'Flush MmapDict "%s" to disk, size: %s' % (ctext(name, 'yellow'), ctext(str(db_size), 'yellow'))) # ====== save mean and std ====== # def save_mean_std(sum1, sum2, name): N = dataset[name.split('_')[0]].shape[0] mean = sum1 / N std = np.sqrt(sum2 / N - np.power(mean, 2)) if np.any(np.isnan(mean)): wprint('Mean contains NaN, name: %s' % name) if np.any(np.isnan(std)): wprint('Std contains NaN, name: %s' % name) dataset[name + 'sum1'] = sum1 dataset[name + 'sum2'] = sum2 dataset[name + 'mean'] = mean dataset[name + 'std'] = std # save all stats if len(stats) > 0: for feat_name, (sum1, sum2) in stats.items(): save_mean_std(sum1, sum2, feat_name) prog.add_notification( 'Saved statistics of: %s, shape: %s' % (ctext(feat_name.split('_')[0], 'yellow'), ctext(str(sum1.shape), 'yellow'))) # ====== dataset flush() ====== # dataset.flush() dataset.close() # ====== saving the extractor ====== # # not good idea to save the extractor all the time # pipeline_path = os.path.join(dataset.path, 'pipeline') # with open(pipeline_path, 'wb') as f: # cPickle.dump(self.extractor, f, protocol=2) # prog.add_notification("Saved Extractor pipeline at: %s" % # ctext(pipeline_path, 'yellow')) # ====== saving the configuration ====== # config_path = os.path.join(dataset.path, 'config') config = MmapDict(config_path) config['__configuration_time__'] = time.time() config['__processor__'] = self.path for i in dir(self): if _default_module.match(i) is not None: continue j = getattr(self, i) if isinstance(j, (Number, string_types, bool)): config[i] = j config.flush(save_all=True) self.config = {i: j for i, j in config} config.close() prog.add_notification("Saved configuration at: %s" % ctext(config_path, 'yellow')) # ====== final notification ====== # prog.add_notification("Closed all dataset.") prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
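# A standalone sketch (not part of the original source) of the running-statistics
# trick used by the processor above: accumulate sum(x) and sum(x**2) per batch,
# then recover mean = sum1 / N and std = sqrt(sum2 / N - mean**2) at the end.
import numpy as np

X = np.random.randn(1000, 40)
sum1 = np.zeros(40)
sum2 = np.zeros(40)
for start in range(0, X.shape[0], 128):
  batch = X[start:start + 128]
  sum1 += batch.sum(axis=0)
  sum2 += np.power(batch, 2).sum(axis=0)
N = X.shape[0]
mean = sum1 / N
std = np.sqrt(sum2 / N - np.power(mean, 2))
assert np.allclose(mean, X.mean(axis=0)) and np.allclose(std, X.std(axis=0))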
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False):
  """ Using parallel MiniBatchPCA to do PCA for multiple features at once. """
  # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec)
  # add reading data from indices also
  # ====== check input dataset ====== #
  own_dataset = True
  if is_string(dataset) and os.path.isdir(dataset):
    dataset = Dataset(dataset, read_only=True)
  elif isinstance(dataset, Dataset):
    own_dataset = False
  elif isinstance(dataset, FeatureProcessor):
    dataset = Dataset(dataset.path, read_only=True)
  else:
    raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset))
  # ====== extract all feat_name ====== #
  if is_string(feat_name) and feat_name == 'auto':
    feat_name = []
    for k in dataset.keys():
      X = dataset[k]
      if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1:
        feat_name.append(k)
  else:
    feat_name = [name for name in as_tuple(feat_name, t=str)
                 if name in dataset]
  # ====== load PCA ====== #
  from odin.ml import MiniBatchPCA
  # init PCA
  nb_samples = 0
  for feat in feat_name:
    nb_samples += dataset[feat].shape[0]
  # ====== prepare MPI PCA ====== #
  add_notification("Selected features for PCA: " +
                   ctext(', '.join(feat_name), 'yellow'))

  def map_pca(name):
    X = dataset[name]
    # found existing pca model (note: check `name`, not the leaked loop
    # variable `feat`, otherwise every job would check the same feature)
    if 'pca_' + name in dataset and not override:
      pca = dataset['pca_' + name]
    # create new PCA
    else:
      pca = MiniBatchPCA(n_components=None, whiten=False, copy=True,
                         batch_size=None)
    # No shuffling makes the iteration much faster
    for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0):
      pca.partial_fit(x)
      yield x.shape[0]
    # save PCA model
    with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f:
      cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL)
    # finished, return the feature name
    yield name

  mpi = MPI(jobs=feat_name, func=map_pca, ncpu=None, batch=1,
            hwm=12082518, backend='python')
  # ====== running the MPI ====== #
  remain_features = list(feat_name)
  finished_features = []
  prog = Progbar(target=nb_samples, print_summary=True, print_report=True,
                 name='PCA')
  for n in mpi:
    if is_string(n):
      remain_features.remove(n)
      finished_features.append(n)
    else:
      prog['Remain'] = ', '.join(remain_features)
      prog['Finished'] = ', '.join(finished_features)
      prog.add(n)
  # ====== return ====== #
  if own_dataset:
    dataset.close()
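# A minimal usage sketch (not part of the original source), assuming an existing
# odin Dataset folder whose 2-D features should get a MiniBatchPCA model saved
# next to them as 'pca_<feature-name>'; the path and feature names are made up.
calculate_pca('/path/to/dataset', feat_name=('mspec', 'mfcc'),
              batch_size=5218, override=False)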
                      batch_size=batch_size,
                      semi_weight=10,
                      verbose=False)
  except Exception as e:
    print("Error:", e)
    print("Error Config:", name)
    return
  print("Finish training %-4s layer:%d hdim:%-3d zdim:%d in %.2f(s)" %
        (model.id, n, h, z, time.time() - start_time))
  with open(os.path.join(path, name), 'wb') as f:
    pickle.dump(model, f)


if not no_train:
  mpi = MPI(jobs=jobs, func=run_training, ncpu=ncpu, batch=1)
  for i, j in enumerate(mpi):
    if i % 5 == 0:
      print(" == Training %d/%d jobs ==" % (i + 1, len(jobs)))


# ===========================================================================
# Generate scores file for all models
# ===========================================================================
def run_scoring(args):
  n, h, z, model = args
  name = model.id + '_%d_%d_%d' % (n, h, z)
  with open(os.path.join(path, name), 'rb') as f:
    model = pickle.load(f)
  start_time = time.time()
def unsupervised_clustering_scores(factors: np.ndarray, representations: Optional[np.ndarray] = None, predictions: Optional[np.ndarray] = None, algorithm: str = 'both', random_state: int = 1, n_cpu: int = 1, verbose: bool = True) -> Dict[str, float]: r""" Calculating the unsupervised clustering Scores: - ASW : silhouette_score ([-1, 1], higher is better) is calculated using the mean intra-cluster distance and the mean nearest-cluster distance (b) for each sample. Values near 0 indicate overlapping clusters - ARI : adjusted_rand_score ([-1, 1], higher is better) A similarity measure between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. Similarity score between -1.0 and 1.0. Random labelings have an ARI close to 0.0. 1.0 stands for perfect match. - NMI : normalized_mutual_info_score ([0, 1], higher is better) Normalized Mutual Information between two clusterings. 1.0 stands for perfectly complete labeling - UCA : unsupervised_clustering_accuracy ([0, 1], higher is better) accuracy of the linear assignment between predicted labels and ground-truth labels. - HOS : homogeneity_score ([0, 1], higher is better) A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class. 1.0 stands for perfectly homogeneous - COS : completeness_score ([0, 1], higher is better) A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. 1.0 stands for perfectly complete labeling Arguments: factors : a Matrix. Categorical factors (i.e. one-hot encoded), or multiple factors. algorithm : {'kmeans', 'gmm', 'both'}. The clustering algorithm for assigning the cluster from representations Return: Dict mapping score alias to its scalar value Note: The time complexity is exponential as the number of labels increasing """ if factors.ndim == 1: factors = np.expand_dims(factors, axis=-1) assert representations is not None or predictions is not None, \ "either representations or predictions must be provided" ### preprocessing factors # multinomial : # binary : # multibinary : factor_type = 'multinomial' if np.all(np.unique(factors) == [0., 1.]): if np.all(np.sum(factors, axis=1) == 1.): factor_type = 'binary' else: factor_type = 'multibinary' # start scoring if factor_type == 'binary': return _clustering_scores(X=representations, z=predictions, y=np.argmax(factors, axis=1), algo=algorithm, random_state=random_state) if factor_type in ('multinomial', 'multibinary'): def _get_scores(idx): y = factors[:, idx] if factor_type == 'multinomial': uni = {v: i for i, v in enumerate(sorted(np.unique(y)))} y = np.array([uni[i] for i in y]) else: y = y.astype(np.int32) return _clustering_scores(X=representations, z=predictions, y=y, algo=algorithm, random_state=random_state) scores = defaultdict(list) if factors.shape[1] == 1: verbose = False prog = tqdm(desc="Scoring clusters", total=factors.shape[1], disable=not verbose) if n_cpu == 1: it = (_get_scores(idx) for idx in range(factors.shape[1])) else: it = MPI(jobs=list(range(factors.shape[1])), func=_get_scores, batch=1, ncpu=n_cpu) for s in it: prog.update(1) for k, v in s.items(): scores[k].append(v) return {k: np.mean(v) for k, v in scores.items()}
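# A minimal usage sketch (not part of the original source) with synthetic data:
# one-hot categorical factors plus a latent representation, clustered with both
# KMeans and GMM; the returned keys follow the aliases listed in the docstring.
import numpy as np

n = 500
labels = np.random.randint(0, 4, size=n)
factors = np.eye(4)[labels]              # one-hot categorical factors
representations = np.random.randn(n, 10)
scores = unsupervised_clustering_scores(factors=factors,
                                        representations=representations,
                                        algorithm='both',
                                        n_cpu=1,
                                        verbose=False)
print(scores)  # e.g. ASW, ARI, NMI, UCA, HOS, COS -> scalar values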
def run(self): if self.pca: from odin.ml import MiniBatchPCA if not hasattr(self, 'jobs'): raise Exception( 'the Processor must has "jobs" attribute, which is ' 'the list of all jobs.') njobs = len(self.jobs) if self.njobs == 0 else self.njobs prog = Progbar(target=njobs) dataset = self.dataset datatype = self.datatype if self.ncpu is None: # auto select number of CPU ncpu = min(njobs, int(1.2 * cpu_count())) else: ncpu = self.ncpu # ====== indices ====== # indices = defaultdict(list) # ====== MmapDict ====== # dicts = {} for name, dtype, stats in self.features_properties: if 'dict' in str(dtype).lower(): dicts[name] = MmapDict(os.path.join(dataset.path, name)) # ====== statistic ====== # statistic_able = {i[0]: i[-1] for i in self.features_properties} sum1 = defaultdict(int) sum2 = defaultdict(int) # init PCA pca = defaultdict(lambda *args, **kwargs: MiniBatchPCA( n_components=None, whiten=self.pca_whiten, copy=True, batch_size=None) if self.pca else None) # all data are cached for periodically flushed cache = defaultdict(list) if self.ncache <= 1: cache_limit = max(2, int(0.12 * njobs)) else: cache_limit = int(self.ncache) ref_vars = {'start': defaultdict(int), 'processed_count': 0} # ====== helper ====== # def flush_feature(name, cache_data): if len(cache_data) > 0: cache_data = np.concatenate(cache_data, 0) # NOTE: if nb_samples < nb_features, fitting PCA # will course error if self.pca and statistic_able[name]: pca[name].partial_fit(cache_data) # flush data if name in dataset: dataset[name].append(cache_data) else: dataset[(name, datatype)] = cache_data def wrapped_reduce(result): name, data = result ref_vars['processed_count'] += 1 # check data if not isinstance(data, (tuple, list)): data = (data, ) length = [] # store length of all data for validation # processing for prop, d in zip(self.features_properties, data): n, t, s = prop # data-type-name, dtype, stats # mmapdict type: if 'dict' in str(t).lower(): dicts[n][name] = d.tolist() if isinstance( d, np.ndarray) else d del d continue # auto-create new indices if len(d) not in length: length.append(len(d)) indices[n].append([ name, ref_vars['start'][n], ref_vars['start'][n] + len(d) ]) ref_vars['start'][n] += len(d) # cache data, only if we have more than 0 sample if len(d) > 0: cache[n].append(d.astype(t)) if self.save_stats and s: # save stats sum1[n] += np.sum(d, axis=0, dtype='float64') sum2[n] += np.sum(np.power(d, 2), axis=0, dtype='float64') del d # ====== flush cache ====== # if ref_vars['processed_count'] % cache_limit == 0: # 12 + 8 for i, j in cache.iteritems(): flush_feature(i, j) cache.clear() # ====== update progress ====== # return name # ====== processing ====== # mpi = MPI(self.jobs, self.map, wrapped_reduce, ncpu=ncpu, buffer_size=1, maximum_queue_size=ncpu * 3) for name in mpi: prog.title = '%-20s' % name prog.add(1) # ====== end, flush the last time ====== # for i, j in cache.iteritems(): flush_feature(i, j) cache = None dataset.flush() # ====== saving indices ====== # for n, ids in indices.iteritems(): outpath = os.path.join( dataset.path, 'indices' if n in self.primary_indices else 'indices_%s' % n) _ = MmapDict(outpath) for name, start, end in ids: _[name] = (int(start), int(end)) _.flush() _.close() # ====== save mean and std ====== # def save_mean_std(sum1, sum2, pca, name, dataset): N = dataset[name].shape[0] mean = sum1 / N std = np.sqrt(sum2 / N - mean**2) if self.substitute_nan is not None: mean = np.where(np.isnan(mean), self.substitute_nan, mean) std = np.where(np.isnan(std), self.substitute_nan, std) 
else: assert not np.any( np.isnan(mean)), 'Mean contains NaN, %s' % name assert not np.any(np.isnan(std)), 'Std contains NaN, %s' % name dataset[name + '_sum1'] = sum1 dataset[name + '_sum2'] = sum2 dataset[name + '_mean'] = mean dataset[name + '_std'] = std dataset[name + '_pca'] = pca # save all stats if self.save_stats: print('Saving statistics of each data ...') for n, d, s in self.features_properties: if s: # save stats print(' * Name:', n) s1, s2, pca_ = sum1[n], sum2[n], pca[n] save_mean_std(s1, s2, pca_, n, dataset) # ====== dataset flush() ====== # dataset.flush() dataset.close() # ====== all MmapDict flush() ====== # for d in dicts.itervalues(): d.flush() d.close()
def count_frames(specifiers: List[str],
                 is_matrix: bool = False,
                 is_bool_index: bool = True,
                 progressbar: bool = False,
                 num_workers: int = 3,
                 concat_char: str = '&') -> List[int]:
  """
  Parameters
  ----------
  specifiers : list of `str`
    list of specifiers `["raw_mfcc_voxceleb.1.ark:42", ...]`
  is_matrix : `bool` (default=`False`)
    input data is matrix or vector
  is_bool_index : `bool` (default=`True`)
    if `True`, the loaded data is a boolean index of speech activity
    detection, and the length of the audio file is calculated by summing
    the index array.
  concat_char : `str` (default='&')
    by concatenating multiple specifiers using the given character, multiple
    utterances can be sequentially loaded and concatenated.
    (e.g. 'raw_mfcc_sre18_dev.1.ark:3018396&raw_mfcc_sre18_dev.1.ark:5516398')

  Return
  ------
  List of integers (i.e. the frame counts)
  """
  _check_pykaldi()
  import kaldi.util.io as kio

  frame_counts = []
  fn_read = kio.read_matrix if bool(is_matrix) else kio.read_vector
  progress = tqdm(total=len(specifiers),
                  desc="Kaldi counting frame",
                  disable=not progressbar,
                  mininterval=0.0,
                  maxinterval=10.0)

  def _count(specs):
    res = []
    for idx, spec in specs:
      n = 0
      for s in spec.split(concat_char):
        # if both the features and the VAD are provided, use the VAD only
        dat = fn_read(s).numpy()
        if is_bool_index:  # sum of all True values
          n += np.sum(dat)
        else:  # just get the first dimension
          n += len(dat)
      # (utt_id, frame_count)
      res.append((int(idx), n))
    return res

  jobs = np.array_split([(i, s) for i, s in enumerate(specifiers)],
                        num_workers * 25)
  if num_workers == 1:
    for j in jobs:
      for r in _count(j):
        frame_counts.append(r)
      progress.update(n=len(j))
  else:
    from odin.utils.mpi import MPI
    for r in MPI(jobs=jobs, func=_count, ncpu=num_workers, batch=1):
      progress.update(n=len(r))
      frame_counts.extend(r)
  return [i[1] for i in sorted(frame_counts)]
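# A minimal usage sketch (not part of the original source); it assumes pykaldi
# is installed and that the .ark files and byte offsets below actually exist.
specs = [
    "raw_mfcc_voxceleb.1.ark:42",
    "raw_mfcc_sre18_dev.1.ark:3018396&raw_mfcc_sre18_dev.1.ark:5516398",
]
n_frames = count_frames(specs, is_matrix=False, is_bool_index=True,
                        progressbar=True, num_workers=3)
print(list(zip(specs, n_frames)))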
def __iter__(self): # ====== check ====== # if self.__recipes is None: raise ValueError('You must "set_recipes" first') # ====== get start and end for indices ====== # n = self._indices.shape[0] start = _apply_approx(n, self._start) end = _apply_approx(n, self._end) indices = self._indices[start:end] outtype = self._outtype # ====== shuffle the indices ====== # rng = None if self._seed is not None: rng = np.random.RandomState(self._seed) indices = indices[rng.permutation(indices.shape[0])] # reset the seed self._seed = None # ====== create iter and its identity ====== # process_func = self.__recipes.process group_func = self.__recipes.group self.__recipes.prepare( batch_size=self._batch_size, seed=rng.randint(10e6) if rng is not None else None, shuffle_level=self._shuffle_level, ) # ====== create wrapped functions ====== # def map_func(jobs): batch = [] for name, start, end in jobs: start = int(start) end = int(end) # data can be list of Data, or just 1 Data if outtype is not None: x = [ np.array(d[start:end], dtype=t) for d, t in zip(self._data, outtype) ] else: x = [np.array(d[start:end]) for d in self._data] x = process_func(name, x) if x is not None: batch.append(x) return group_func(batch) def reduce_func(results): # perform batch level permutation if rng is not None and self._shuffle_level > 1: permutation = rng.permutation(results[0].shape[0]) # different shape NO shuffle results = [r[permutation] for r in results] # convert batch to tuple object if possible if isinstance(results, (tuple, list)) and len(results) == 1: results = results[0] elif isinstance(results, list): results = tuple(results) return results # ====== track and return ====== # it = MPI(indices, map_func, reduce_func, ncpu=self.ncpu, buffer_size=self.buffer_size, maximum_queue_size=self.maximum_queue_size) self.__running_iter.append(it) return it
def __iter__(self): # ====== get start and end for indices ====== # start = _apply_approx(self.nb_files, self._start) end = _apply_approx(self.nb_files, self._end) all_keys = self.indices_keys[start:end] # ====== shuffle the indices ====== # rng = None shuffle_level = self._shuffle_level if self._seed is not None: rng = np.random.RandomState(self._seed) all_keys = all_keys[rng.permutation(self.nb_files)] if shuffle_level < 1: rng = None # reset the seed self._seed = None batch_size = self._batch_size batch_filter = self._batch_filter process_func = self._recipes.process # ====== prepare data, indices and dtype ====== # data_indices_dtype = [] i = 0 for dat in self._data: for d in dat._data: data_indices_dtype.append( (d, dat.indices, self._output_dtype[i])) i += 1 # ====== create wrapped functions ====== # def map_func(jobs): if self.buffer_size == 1: jobs = [jobs] # calculating batch results batch = [] for name in jobs: X = [] for dat, ids, dtype in data_indices_dtype: start, end = ids[name] # data can be list of Data, or just 1 Data dat = dat[start:end] if dat.dtype != dtype: dat = dat.astype(dtype) X.append(dat) X = process_func(name, X) # ignore None returned result if X is not None: batch.append(X) # choose grouping function if self._batch_mode == 'batch': X = _batch_grouping(batch, batch_size, rng, batch_filter) elif self._batch_mode == 'file': X = _file_grouping(batch, batch_size, rng, batch_filter) return X # ====== track and return ====== # it = MPI(jobs=all_keys, func=map_func, ncpu=self.ncpu, batch=self.buffer_size, hwm=self.hwm, backend=self.mpi_backend) self._running_iter.append(it) return iter(it)
def __init__(self, path="~/tensorflow_datasets/lego_faces", image_size=64, background_threshold=255): super().__init__() path = os.path.abspath(os.path.expanduser(path)) if not os.path.exists(path): os.makedirs(path) ### download metadata meta_path = os.path.join(path, 'meta.csv') if not os.path.exists(meta_path): print("Download lego faces metadata ...") meta_path, _ = urlretrieve(url=LegoFaces.METADATA, filename=meta_path) import pandas as pd metadata = pd.read_csv(meta_path) metadata = metadata[metadata["Category Name"] == "Minifigure, Head"] ### check downloaded images image_folder = os.path.join(path, "dataset") if os.path.exists(image_folder): if md5_folder(image_folder) != LegoFaces.MD5: shutil.rmtree(image_folder) ### download data zip_path = os.path.join(path, "dataset.zip") if not os.path.exists(zip_path): print("Download zip lego faces dataset ...") zip_path, _ = urlretrieve(url=LegoFaces.DATASET, filename=zip_path) if not os.path.exists(image_folder): with zipfile.ZipFile(zip_path, mode="r") as f: print("Extract all lego faces images ...") f.extractall(path) ### load all images, downsample if necessary images = glob.glob(image_folder + '/*.jpg', recursive=True) if image_size != 128: image_folder = image_folder + '_%d' % int(image_size) if not os.path.exists(image_folder): os.mkdir(image_folder) if len(os.listdir(image_folder)) != len(images): shutil.rmtree(image_folder) os.mkdir(image_folder) from tqdm import tqdm images = [ i for i in tqdm(MPI(jobs=images, func=partial(_resize, image_size=image_size, outpath=image_folder), ncpu=3, batch=1), total=len(images), desc="Resizing images to %d" % image_size) ] else: images = glob.glob(image_folder + '/*.jpg', recursive=True) ### extract the heuristic factors metadata = { part_id: desc for part_id, desc in zip(metadata["Number"], metadata["Name"]) } images_desc = {} for path in images: name = os.path.basename(path)[:-4] if name in metadata: desc = metadata[name] else: name = name.split('_') desc = metadata[name[0]] images_desc[path] = _process_desc(desc) ### tokenizing the description from PIL import Image def imread(p): img = Image.open(p, mode='r') arr = np.array(img, dtype=np.uint8) del img return arr self.image_size = image_size self.images = np.stack( [i for i in MPI(jobs=images, func=imread, ncpu=2, batch=1)]) self.factors = _extract_factors(list(images_desc.keys()), list(images_desc.values())) ### remove images with background ids = np.array([ True if np.min(i) <= int(background_threshold) else False for i in self.images ]) self.images = self.images[ids] self.factors = self.factors[ids] ### split the dataset rand = np.random.RandomState(seed=1) n = len(self.images) ids = rand.permutation(n) self.train = (self.images[:int(0.8 * n)], self.factors[:int(0.8 * n)]) self.valid = (self.images[int(0.8 * n):int(0.9 * n)], self.factors[int(0.8 * n):int(0.9 * n)]) self.test = (self.images[int(0.9 * n):], self.factors[int(0.9 * n):])
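# A hypothetical sketch (not the library's actual implementation) of the
# `_resize` helper that the dataset class above maps over image paths via MPI;
# it is assumed to take an input path, resize the image, save it into `outpath`,
# and return the new file path, matching how its return value is used above.
import os
from PIL import Image

def _resize(path, image_size, outpath):
  img = Image.open(path).convert('RGB')
  img = img.resize((int(image_size), int(image_size)))
  new_path = os.path.join(outpath, os.path.basename(path))
  img.save(new_path, "JPEG", quality=90)
  return new_path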
def scrap_lego_faces(metadata, path, resize=64, n_processes=4): r""" This function does not filter out bad images """ from tqdm import tqdm from PIL import Image def _download_image(meta, conn): part_id, desc = meta desc = desc.replace("Minifigure, ", "") return_path = [] with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=InsecureRequestWarning) response = conn.request( "GET", f"https://www.bricklink.com/v2/catalog/catalogitem.page?P={part_id}", preload_content=False) img_url = re.search( rf"\bimg\.bricklink\.com\/ItemImage\/[A-Z]+\/[0-9]+\/{part_id}\.png\b", str(response.read(), 'utf-8'), ) if img_url is not None: img_url = img_url.group(0) img_response = conn.request("GET", f"https://{img_url}", preload_content=False) image_path = f"{path}/{part_id}" # convert to jpg with white background image = Image.open(img_response).convert("RGBA") background = Image.new("RGBA", image.size, (255, 255, 255)) image = Image.alpha_composite(background, image).convert("RGB") del background width, height = image.size ratio = width / height # split the image if ratio >= 1.6 or part_id: im = np.array(image) M = im.shape[0] N = im.shape[1] // 2 halves = [ im[x:x + M, y:y + N] for x in range(0, im.shape[0], M) for y in range(0, im.shape[1], N) ] image = [Image.fromarray(half, "RGB") for half in halves[:2]] else: image = [image] # crop to square image for idx, im in enumerate(image): width, height = im.size new_len = min(width, height) left = (width - new_len) / 2 top = (height - new_len) / 2 right = (width + new_len) / 2 bottom = (height + new_len) / 2 im = im.crop((left, top, right, bottom)) # resize the image if resize is not None: im = im.resize((int(resize), int(resize))) # save image out = image_path + ('.jpg' if idx == 0 else ('_%d.jpg' % idx)) im.save(out, "JPEG", quality=90) return_path.append(out) del im return return_path conn = PoolManager( num_pools=2, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0" }, maxsize=100, cert_reqs='CERT_NONE') all_images = [] for image_path in tqdm(MPI( jobs=list(zip(metadata["Number"].values, metadata["Name"].values)), func=partial(_download_image, conn=conn), ncpu=max(1, int(n_processes)), batch=1, ), desc="Download lego faces", unit="image", total=metadata.shape[0]): all_images += image_path return np.array(all_images)
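# A minimal usage sketch (not part of the original source), assuming `metadata`
# is the same pandas DataFrame loaded from LegoFaces.METADATA in the class above
# (it must contain the "Number" and "Name" columns); the paths are made up.
import os
import pandas as pd

metadata = pd.read_csv('meta.csv')
metadata = metadata[metadata["Category Name"] == "Minifigure, Head"]
os.makedirs('/tmp/lego_faces', exist_ok=True)
image_paths = scrap_lego_faces(metadata, path='/tmp/lego_faces',
                               resize=64, n_processes=4)
print(len(image_paths), 'face images downloaded')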
def fast_tsne( *X, n_components: int = 2, max_samples: Optional[int] = None, perplexity: float = 30.0, early_exaggeration: float = 12.0, learning_rate: float = 200.0, n_iter: int = 1000, n_iter_without_progress: int = 300, exaggeration_iter: int = 250, perplexity_max_iter: int = 100, min_grad_norm: float = 1e-7, method: str = 'barnes_hut', metric: str = "euclidean", init: str = "random", angle: float = 0.5, n_jobs: Optional[int] = 4, merge_inputs: bool = True, pca_preprocessing: bool = True, return_model: bool = False, random_state: int = 1, verbose: int = 0, framework: Literal['auto', 'sklearn', 'cuml'] = 'auto', ): """ t-Stochastic Nearest Neighbors. If the algorithm take unexpected long time for running, lower the `exaggeration_iter`, or reduce the amount of samples by downsampling the dataset. Parameters ---------- n_components : int, optional (default: 2) Dimension of the embedded space. max_samples : {int, None} if given, downsampling the data to given number of sample perplexity : float, optional (default: 30) The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter. early_exaggeration : float, optional (default: 8.0) Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high. learning_rate : float, optional (default: 200.0) The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If the learning rate is too high, the data may look like a 'ball' with any point approximately equidistant from its nearest neighbours. If the learning rate is too low, most points may look compressed in a dense cloud with few outliers. If the cost function gets stuck in a bad local minimum increasing the learning rate may help. n_iter : int, optional (default: 1000) Maximum number of iterations for the optimization. Should be at least 250. n_iter_without_progress : int, optional (default: 300) Maximum number of iterations without progress before we abort the optimization, used after 250 initial iterations with early exaggeration. Note that progress is only checked every 50 iterations so this value is rounded to the next multiple of 50. perplexity_max_iter : int, (default 100) The number of epochs the best gaussian bands are found for. exaggeration_iter : int, (default 250) To promote the growth of clusters, set this higher. min_grad_norm : float, optional (default: 1e-7) If the gradient norm is below this threshold, the optimization will be stopped. metric : string or callable, optional The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is "precomputed", X is assumed to be a distance matrix. Alternatively, if metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. 
The callable should take two arrays from X as input and return a value indicating the distance between them. The default is "euclidean" which is interpreted as squared euclidean distance. init : string or numpy array, optional (default: "random") Initialization of embedding. Possible options are 'random', 'pca', and a numpy array of shape (n_samples, n_components). PCA initialization cannot be used with precomputed distances and is usually more globally stable than random initialization. verbose : int, optional (default: 0) Verbosity level, a number from 0 to 6. random_state : int, RandomState instance or None, optional (default: None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Note that different initializations might result in different local minima of the cost function. method : string (default: 'barnes_hut') By default the gradient calculation algorithm uses Barnes-Hut approximation running in O(NlogN) time. method='exact' will run on the slower, but exact, algorithm in O(N^2) time. The exact algorithm should be used when nearest-neighbor errors need to be better than 3%. However, the exact method cannot scale to millions of examples. angle : float (default: 0.5) Only used if method='barnes_hut' This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. 'angle' is the angular size (referred to as theta in [3]) of a distant node as measured from a point. If this size is below 'angle' then it is used as a summary node of all points contained within it. This method is not very sensitive to changes in this parameter in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing computation time and angle greater 0.8 has quickly increasing error. return_model : a Boolean, if `True`, return the trained t-SNE model merge_inputs : a Boolean, if `True`, merge all arrays into a single array for training t-SNE. """ assert len(X) > 0, "No input is given!" 
if isinstance(X[0], (tuple, list)): X = X[0] if not all(isinstance(x, np.ndarray) for x in X): raise ValueError( "`X` can only be list of numpy.ndarray or numpy.ndarray") # ====== kwarg for creating T-SNE class ====== # kwargs = dict(locals()) del kwargs['X'] kwargs.pop('merge_inputs') kwargs.pop('return_model') kwargs.pop('max_samples') kwargs.pop('framework') kwargs.pop('pca_preprocessing') # ====== downsampling ====== # if max_samples is not None: max_samples = int(max_samples) assert max_samples > 0 new_X = [] rand = random_state if isinstance(random_state, np.random.RandomState) else \ np.random.RandomState(seed=random_state) for x in X: if x.shape[0] > max_samples: ids = rand.permutation(x.shape[0])[:max_samples] x = x[ids] new_X.append(x) X = new_X # ====== import proper T-SNE ====== # tsne_version = None if framework != 'sklearn': try: from cuml.manifold import TSNE tsne_version = 'cuda' except ImportError: warnings.warn("Install RAPIDSAI cuML GPUs-accelerated t-SNE") try: from MulticoreTSNE import MulticoreTSNE as TSNE tsne_version = 'multicore' except ImportError: warnings.warn( "pip install " "git+https://github.com/DmitryUlyanov/Multicore-TSNE.git") if tsne_version is None: from sklearn.manifold import TSNE tsne_version = 'sklearn' # ====== modify kwargs ====== # if tsne_version == 'cuda': del kwargs['n_jobs'] elif tsne_version == 'multicore': del kwargs['perplexity_max_iter'] del kwargs['exaggeration_iter'] else: del kwargs['n_jobs'] del kwargs['perplexity_max_iter'] del kwargs['exaggeration_iter'] # ====== getting cached values ====== # results = [] X_new = [] X_size = [] if merge_inputs: X_size = [x.shape[0] for x in X] x = np.vstack(X) if len(X) > 1 else X[0] md5 = md5_checksum(x) key = _create_key(tsne_version, kwargs, md5) if key in _cached_values: results.append((0, _cached_values[key])) else: X_new.append((0, md5, x)) else: for i, x in enumerate(X): md5 = md5_checksum(x) key = _create_key(tsne_version, kwargs, md5) if key in _cached_values: results.append((i, _cached_values[key])) else: X_new.append((i, md5, x)) # ====== perform T-SNE ====== # def apply_tsne(j): idx, md5, x = j if pca_preprocessing: x = PCA(n_components=None, random_state=random_state).fit_transform(x) tsne = TSNE(**kwargs) return (idx, md5, tsne.fit_transform(x), tsne if return_model else None) # only 1 X, no need for MPI if len(X_new) == 1 or tsne_version in ('cuda', 'multicore'): for x in X_new: idx, md5, x, model = apply_tsne(x) results.append((idx, x)) _cached_values[_create_key(tsne_version, kwargs, md5)] = x else: mpi = MPI(jobs=X_new, func=apply_tsne, batch=1, ncpu=min(len(X_new), cpu_count() - 1)) model = [] for idx, md5, x, m in mpi: results.append((idx, x)) _cached_values[_create_key(tsne_version, kwargs, md5)] = x model.append(m) # ====== return and clean ====== # if merge_inputs and len(X_size) > 1: indices = [0] + np.cumsum(X_size).tolist() results = [results[0][1][s:e] for s, e in zip(indices, indices[1:])] else: results = sorted(results, key=lambda a: a[0]) results = [r[1] for r in results] results = results[0] if len(results) == 1 else results if return_model: return results, model del model return results
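# A minimal usage sketch (not part of the original source) of the `fast_tsne`
# wrapper defined above, using synthetic data; the backend (cuML, MulticoreTSNE
# or scikit-learn) is selected automatically at import time.
import numpy as np

X = np.random.randn(2000, 64).astype('float32')
emb = fast_tsne(X, n_components=2, perplexity=30.0, max_samples=1000,
                random_state=1, verbose=0)
print(emb.shape)  # expected: (1000, 2) after downsampling to max_samples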