def __init__(self, path="/tmp/ydisentanglement.npz"): path = os.path.abspath(os.path.expanduser(path)) if os.path.exists(path): if not os.path.isfile(path): raise ValueError( "path to '%s' is a folder, require path to a file" % path) if md5_checksum(path) != YDisentanglement.MD5: os.remove(path) # create new dataset if not exist if not os.path.exists(path): images_train, attributes_train = YDisentanglement.generate_data( training=True) images_test, attributes_test = YDisentanglement.generate_data( training=False) with open(path, 'wb') as f: np.savez(f, images_train=images_train, attributes_train=attributes_train, images_test=images_test, attributes_test=attributes_test) print(md5_checksum(path)) with open(path, 'rb') as f: data = np.load(f) self.images_train = data['images_train'] self.attributes_train = data['attributes_train'] self.images_test = data['images_test'] self.attributes_test = data['attributes_test']
def hash_config(cfg: DictConfig,
                exclude_keys: Optional[List[str]] = None,
                length=6) -> str:
  """Create a unique hash code from a `DictConfig`

  Arguments:
    cfg : {dict} the configuration to generate a unique identity for
    exclude_keys : list of string, given keys will be ignored from the
      configuration
    length : {int} maximum length of the hash code.

  Return:
    a hash string
  """
  assert isinstance(cfg, (DictConfig, dict))
  # process exclude_keys
  if exclude_keys is not None and len(exclude_keys) > 0:
    exclude_keys = as_tuple(exclude_keys, t=string_types)
    cfg = deepcopy(cfg)
    for key in exclude_keys:
      if '.' in key:  # nested key, e.g. 'model.optimizer.lr'
        key = key.split('.')
        attr = cfg
        for i in key[:-1]:
          attr = getattr(attr, i)
        del attr[key[-1]]
      else:
        del cfg[key]
  cfg = flatten_config(cfg, base='', max_depth=-1)
  return md5_checksum(cfg)[:int(length)]
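# A hedged usage sketch for `hash_config` above: two omegaconf configs that
# differ only in an excluded key should produce the same hash, assuming
# `flatten_config` and `md5_checksum` are deterministic over the remaining
# keys. The config values are illustrative.
from omegaconf import OmegaConf

cfg_a = OmegaConf.create({'model': {'lr': 1e-3, 'seed': 1}})
cfg_b = OmegaConf.create({'model': {'lr': 1e-3, 'seed': 2}})
assert hash_config(cfg_a, exclude_keys=['model.seed']) == \
    hash_config(cfg_b, exclude_keys=['model.seed'])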
def download_file(url, filename, override, md5=None):
  r""" Download a file and verify its MD5 checksum (if provided) """
  if md5 is None:
    md5 = r""
  from tqdm import tqdm
  if os.path.exists(filename) and os.path.isfile(filename):
    if override:
      os.remove(filename)
    elif len(md5) > 0:
      if md5 == md5_checksum(filename):
        return filename
      print(f"MD5 of file {filename} mismatch, re-downloading the file.")
      os.remove(filename)
    else:
      # no MD5 provided, skip the download since the file already exists
      return filename
  prog = [None]

  def progress(blocknum, blocksize, total):
    if prog[0] is None:
      prog[0] = tqdm(desc=f"Download {os.path.basename(filename)}",
                     total=int(total / 1024. / 1024.) + 1,
                     unit="MB")
    prog[0].update(int(blocknum * blocksize / 1024. / 1024.) - prog[0].n)

  urlretrieve(url=url, filename=filename, reporthook=progress)
  prog[0].clear()
  prog[0].close()
  print(f"File '{filename}' md5:{md5_checksum(filename)}")
  return filename
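# Hedged usage sketch for `download_file` above; the URL and checksum are
# placeholders, not a real artifact.
path = download_file(url="https://example.com/data.npz",
                     filename="/tmp/data.npz",
                     override=False,
                     md5="d41d8cd98f00b204e9800998ecf8427e")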
def hash_config(cfg: DictConfig, exclude_keys=[]) -> str:
  r"""
  cfg : dictionary of configuration to generate a unique identity for
  exclude_keys : list of string, given keys will be ignored from the
    configuration
  """
  assert isinstance(cfg, (DictConfig, dict))
  cfg = Experimenter.remove_keys(cfg, copy=True, keys=exclude_keys)
  return md5_checksum(cfg)[:8]
def get_md5_checksum(self, excluded_name=[]):
  md5_text = ''
  all_data_items = {
      i: j for i, j in self._data_map.items() if i not in excluded_name
  }
  for name, (dtype, shape, data, path) in sorted(all_data_items.items(),
                                                 key=lambda x: x[0]):
    md5_text += md5_checksum(path)
  return md5_text
def load_fsdd(self):
  r""" Free Spoken Digit Dataset

  A simple audio/speech dataset consisting of recordings of spoken digits
  in wav files at 8kHz. The recordings are trimmed so that they have near
  minimal silence at the beginnings and ends.

  Sample rate: 8,000

  Reference:
    Link: https://github.com/Jakobovski/free-spoken-digit-dataset
  """
  LINK = "https://github.com/Jakobovski/free-spoken-digit-dataset/archive/v1.0.8.zip"
  MD5 = "471b0df71a914629e2993300c1ccf33f"
  save_path = os.path.join(self.save_path, 'FSDD')
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  # ====== download zip dataset ====== #
  # the cache fingerprint is the MD5 of the sorted file names, not of the
  # audio contents
  if md5_checksum(''.join(sorted(os.listdir(save_path)))) != MD5:
    zip_path = os.path.join(save_path, 'FSDD.zip')
    urlretrieve(url=LINK, filename=zip_path)
    try:
      with ZipFile(zip_path, mode='r', compression=ZIP_DEFLATED) as zf:
        wav_files = [name for name in zf.namelist() if '.wav' == name[-4:]]
        for name in wav_files:
          data = zf.read(name)
          name = os.path.basename(name)
          with open(os.path.join(save_path, name), 'wb') as f:
            f.write(data)
    finally:
      os.remove(zip_path)
  # ====== get all records ====== #
  all_name = os.listdir(save_path)
  all_files = sorted([os.path.join(save_path, name) for name in all_name])
  all_speakers = list(set(i.split('_')[1] for i in all_name))
  # ====== splitting train, test ====== #
  rand = np.random.RandomState(seed=self.seed)
  rand.shuffle(all_speakers)
  train_spk = all_speakers[:-1]
  test_spk = all_speakers[-1:]
  train_files = [
      i for i in all_files if os.path.basename(i).split('_')[1] in train_spk
  ]
  test_files = [
      i for i in all_files if os.path.basename(i).split('_')[1] in test_spk
  ]
  rand.shuffle(train_files)
  rand.shuffle(test_files)
  return train_files, test_files
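# The cache check above hashes the sorted file listing rather than the audio
# itself, so it is cheap but only detects added, removed, or renamed files,
# not corrupted contents. The same idea in isolation, using the hypothetical
# `_md5_checksum_sketch` from the first sketch (or the library's own
# md5_checksum):
def folder_fingerprint(folder: str) -> str:
  return _md5_checksum_sketch(''.join(sorted(os.listdir(folder))))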
def read_compressed(in_file, md5_download=None, override=False,
                    verbose=False) -> dict:
  r"""
  Yield:
    a Dictionary : mapping from name of files to extracted buffer
  """
  if md5_download is None:
    md5_download = ''
  assert os.path.isfile(in_file)
  lower_name = in_file.lower()
  ext = os.path.splitext(lower_name)[-1]
  extracted_name = {}
  opened_files = []
  ### validate md5
  if len(md5_download) > 0 and md5_download != md5_checksum(in_file):
    raise RuntimeError(f"MD5 file {in_file} mismatch")
  ### '.tar' file
  if tarfile.is_tarfile(in_file):
    # os.path.splitext only returns the last extension ('.gz' for
    # '*.tar.gz'), so test the full suffix instead
    if lower_name.endswith('.tar.gz'):
      mode = 'r:gz'
    elif ext == '.tar':
      mode = 'r'
    else:
      raise NotImplementedError(f"No support for decompressing {in_file}")
    f = tarfile.open(in_file, mode=mode)
    opened_files.append(f)
    for info in f.getmembers():
      name = info.name
      data = f.extractfile(info)
      if os.path.splitext(name)[-1] == '.gz':
        data = gzip.GzipFile(mode='rb', fileobj=data)
        name = name[:-3]
      extracted_name[name] = data
  ### zip file
  elif zipfile.is_zipfile(in_file):
    raise NotImplementedError(in_file)
  ### '.gz' file
  elif ext == '.gz':
    name = os.path.basename(in_file)[:-3]  # strip the '.gz' suffix once
    f = gzip.open(in_file, mode='rb')
    extracted_name[name] = f
    opened_files.append(f)
  ### unknown extension
  else:
    raise NotImplementedError(in_file)
  ### yield a dictionary of file name to buffer, then clean up
  yield extracted_name
  for f in opened_files:
    try:
      f.close()
    except Exception:
      pass
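# `read_compressed` is a generator: it yields the name-to-buffer mapping
# once, then closes its file handles when iterated again. A hedged usage
# sketch with a placeholder archive path:
gen = read_compressed("/tmp/archive.tar.gz")
for name, buf in next(gen).items():
  print(name, len(buf.read()))
for _ in gen:  # exhaust the generator so the opened handles are closed
  pass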
def summary_indices(ids):
  datasets = defaultdict(int)
  speakers = defaultdict(list)
  text = ''
  for name in sorted(ids.keys()):
    text += name + str(ids[name])
    dsname = ds['dsname'][name]
    datasets[dsname] += 1
    speakers[dsname].append(ds['spkid'][name])
  for dsname in sorted(datasets.keys()):
    print(' %-18s: %s(utt) %s(spk)' %
          (dsname, ctext('%6d' % datasets[dsname], 'cyan'),
           ctext(len(set(speakers[dsname])), 'cyan')))
  print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))
def load_command(self):
  r""" Warden P. Speech Commands: A public dataset for single-word speech
  recognition, 2017. Available from
  http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz

  Sample rate: 16,000

  Example:
    ds = AudioFeatureLoader(sample_rate=16000,
                            frame_length=int(0.025 * 16000),
                            frame_step=int(0.005 * 16000))
    train, valid, test = ds.load_command()
    train = ds.create_dataset(train, max_length=40, return_path=True)
  """
  LINK = "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz"
  MD5 = "a08eb256cea8cbb427c6c0035fffd881"
  save_path = os.path.join(self.save_path, 'speech_commands')
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  audio_path = os.path.join(save_path, 'audio')
  audio_files = sorted(
      get_all_files(audio_path, filter_func=lambda x: '.wav' == x[-4:]))
  md5 = md5_checksum(''.join([os.path.basename(i) for i in audio_files]))
  # ====== Download and extract the data ====== #
  if md5 != MD5:
    zip_path = get_file(fname='speech_commands_v0.01.tar.gz',
                        origin=LINK,
                        outdir=save_path,
                        verbose=True)
    with tarfile.open(zip_path, 'r:gz') as tar:
      tar.extractall(audio_path)
    # refresh the file list after extraction, otherwise the first run
    # would return empty splits
    audio_files = sorted(
        get_all_files(audio_path, filter_func=lambda x: '.wav' == x[-4:]))
  # ====== processing the audio file list ====== #
  audio_files = [i for i in audio_files if '_background_noise_' not in i]
  with open(os.path.join(audio_path, 'validation_list.txt'), 'r') as f:
    valid_list = {i.strip(): 1 for i in f}
  with open(os.path.join(audio_path, 'testing_list.txt'), 'r') as f:
    test_list = {i.strip(): 1 for i in f}
  train_files = []
  valid_files = []
  test_files = []
  for f in audio_files:
    name = '/'.join(f.split('/')[-2:])
    if name in valid_list:
      valid_files.append(f)
    elif name in test_list:
      test_files.append(f)
    else:
      train_files.append(f)
  return train_files, valid_files, test_files
def __init__(self, dataset='5k', path="~/tensorflow_datasets/pbmc"):
  super().__init__()
  path = os.path.abspath(os.path.expanduser(path))
  self.dsname = dataset
  if not os.path.exists(path):
    os.makedirs(path)
  url = str(base64.decodebytes(PBMC._URL[str(dataset).lower().strip()]),
            'utf-8')
  name = os.path.basename(url)
  filename = os.path.join(path, name)
  # skip the download if the file is already cached
  if not os.path.exists(filename):
    urlretrieve(url,
                filename=filename,
                reporthook=lambda blocknum, bs, size: None)
  ### load the data
  data = np.load(filename, allow_pickle=True)
  self.x = data['x'].tolist().todense().astype(np.float32)
  self.y = data['y'].tolist().todense().astype(np.float32)
  # verify the arrays against the checksums stored inside the archive
  assert md5_checksum(self.x) == data['xmd5'].tolist(), \
    "MD5 for transcriptomic data mismatch"
  assert md5_checksum(self.y) == data['ymd5'].tolist(), \
    "MD5 for proteomic data mismatch"
  self.xvar = data['xvar']
  self.yvar = data['yvar']
  self.pairs = data['pairs']
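# The loader above verifies each array against an MD5 digest stored inside
# the same archive. A hedged sketch of producing such a self-verifying .npz
# (names illustrative, using the hypothetical `_md5_checksum_sketch`):
x = np.random.rand(100, 20).astype(np.float32)
np.savez("/tmp/checked.npz", x=x, xmd5=_md5_checksum_sketch(x))
data = np.load("/tmp/checked.npz", allow_pickle=True)
assert _md5_checksum_sketch(data['x']) == str(data['xmd5'])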
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  # guard the removal, rmtree raises if the folder does not exist yet
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        # use a distinct loop variable, the outer `name` is still needed
        # by the assertion message above
        for member in f.namelist():
          data = str(f.read(member), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            if 'Protein' in member:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
      "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = [True if "HUMAN_" in i else False for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)
    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
      "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
  prog.add(X.shape[0])
# ====== save validation set ====== #
for name, idx, X, y in valid.set_batch(batch_size=8000,
                                       batch_mode='file',
                                       seed=None):
  assert idx == 0
  y = np.argmax(y, axis=-1)
  assert len(set(y)) == 1
  y = y[0]
  z = np.mean(f_z(X), axis=0, keepdims=False).astype('float32')
  f_train.write(sep.join([str(y)] + [str(i) for i in z]) + '\n')
  prog.add(X.shape[0])
# ====== save test set ====== #
for name, (start, end) in sorted(test_ids.items(), key=lambda x: x[0]):
  y = test_dat[start:end]
  z = np.mean(f_z(y), axis=0, keepdims=False).astype('float32')
  f_test.write(sep.join([name] + [str(i) for i in z]) + '\n')
  prog.add(1)
# convert everything to matlab format
csv2mat(exp_dir=EXP_DIR)
# ===========================================================================
# Evaluate and save the log
# ===========================================================================
np.random.seed(52181208)
shape = inputs[0].shape
X = np.random.rand(64, shape[1].value, shape[2].value).astype('float32')
Z = f_z(X)
# ====== make sure model has the same identity ====== #
print(Z.shape, Z.sum(), (Z**2).sum(), Z.std())
print(ctext(crypto.md5_checksum(Z), 'cyan'))
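# The identity check above hashes the model's output on a fixed random input,
# so two checkpoints can be compared without diffing their weights. A hedged
# sketch of the same idea for a generic feature function `f_z` (the helper
# name and seed are illustrative):
def model_fingerprint(f_z, input_shape, seed=52181208) -> str:
  rng = np.random.RandomState(seed)
  X = rng.rand(64, *input_shape).astype('float32')
  return _md5_checksum_sketch(np.asarray(f_z(X)))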
def md5(self):
  md5_text = ''
  for name, (dtype, shape, data, path) in sorted(self._data_map.items(),
                                                 key=lambda x: x[0]):
    md5_text += md5_checksum(path)
  return md5_text
def fast_tsne(*X,
              n_components=2,
              n_samples=None,
              perplexity=30.0,
              early_exaggeration=8.0,
              learning_rate=200.0,
              n_iter=1000,
              n_iter_without_progress=300,
              min_grad_norm=1e-7,
              metric="euclidean",
              init="random",
              verbose=0,
              random_state=5218,
              method='barnes_hut',
              angle=0.5,
              n_jobs=4):
  """
  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.
  n_samples : {int, None}
      if given, downsample the data to the given number of samples
  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.
  early_exaggeration : float, optional (default: 8.0)
      Controls how tight natural clusters in the original space are in the
      embedded space and how much space will be between them. For larger
      values, the space between natural clusters will be larger in the
      embedded space. Again, the choice of this parameter is not very
      critical. If the cost function increases during initial optimization,
      the early exaggeration factor or the learning rate might be too high.
  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0].
      If the learning rate is too high, the data may look like a 'ball'
      with any point approximately equidistant from its nearest neighbours.
      If the learning rate is too low, most points may look compressed in a
      dense cloud with few outliers. If the cost function gets stuck in a
      bad local minimum increasing the learning rate may help.
  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.
  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations
      so this value is rounded to the next multiple of 50.
  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.
  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric
      is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on
      each pair of instances (rows) and the resulting value recorded. The
      callable should take two arrays from X as input and return a value
      indicating the distance between them. The default is "euclidean"
      which is interpreted as squared euclidean distance.
  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components). PCA
      initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.
  verbose : int, optional (default: 0)
      Verbosity level.
  random_state : int, RandomState instance or None, optional (default: None)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`. Note that different initializations might result in
      different local minima of the cost function.
  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact' will run on
      the slower, but exact, algorithm in O(N^2) time. The exact algorithm
      should be used when nearest-neighbor errors need to be better than
      3%. However, the exact method cannot scale to millions of examples.
  angle : float (default: 0.5)
      Only used if method='barnes_hut'. This is the trade-off between speed
      and accuracy for Barnes-Hut T-SNE. 'angle' is the angular size
      (referred to as theta in [3]) of a distant node as measured from a
      point. If this size is below 'angle' then it is used as a summary
      node of all points contained within it. This method is not very
      sensitive to changes in this parameter in the range of 0.2 - 0.8.
      Angle less than 0.2 has quickly increasing computation time and angle
      greater 0.8 has quickly increasing error.
  """
  assert len(X) > 0, "No input is given!"
  if isinstance(X[0], (tuple, list)):
    X = X[0]
  if not all(isinstance(x, np.ndarray) for x in X):
    raise ValueError("`X` can only be list of numpy.ndarray or numpy.ndarray")
  # ====== kwarg for creating T-SNE class ====== #
  kwargs = dict(locals())
  del kwargs['X']
  n_samples = kwargs.pop('n_samples', None)
  # ====== downsampling ====== #
  if n_samples is not None:
    n_samples = int(n_samples)
    assert n_samples > 0
    new_X = []
    rand = random_state if isinstance(random_state, np.random.RandomState) \
        else np.random.RandomState(seed=random_state)
    for x in X:
      if x.shape[0] > n_samples:
        ids = rand.permutation(x.shape[0])[:n_samples]
        x = x[ids]
      new_X.append(x)
    X = new_X
  # ====== import proper T-SNE ====== #
  tsne_version = None
  try:
    from tsnecuda import TSNE
    from tsnecuda.NaiveTSNE import NaiveTSNE as _exact_TSNE
    tsne_version = 'cuda'
  except ImportError:
    # wprint("Install CUDA-TSNE from `https://github.com/CannyLab/tsne-cuda` "
    #        "for significant speed up.")
    try:
      from MulticoreTSNE import MulticoreTSNE as TSNE
      tsne_version = 'multicore'
    except ImportError:
      wprint("Install MulticoreTSNE from "
             "`pip install git+https://github.com/DmitryUlyanov/Multicore-TSNE.git`"
             " to accelerate the T-SNE on multiple CPU cores.")
      try:
        from sklearn.manifold import TSNE
        tsne_version = 'sklearn'
      except Exception as e:
        raise e
  # ====== modify kwargs ====== #
  if tsne_version == 'cuda':
    kwargs['random_seed'] = kwargs['random_state']
    kwargs['theta'] = angle
    if method == 'exact':
      TSNE = _exact_TSNE
      del kwargs['theta']
    del kwargs['random_state']
    del kwargs['n_jobs']
    del kwargs['angle']
    del kwargs['method']
  elif tsne_version == 'multicore':
    pass
  else:
    del kwargs['n_jobs']
  # ====== getting cached values ====== #
  results = []
  X_new = []
  for i, x in enumerate(X):
    md5 = md5_checksum(x)
    key = _create_key(kwargs, md5)
    if key in _cached_values:
      results.append((i, _cached_values[key]))
    else:
      X_new.append((i, md5, x))

  # ====== perform T-SNE ====== #
  def apply_tsne(j):
    idx, md5, x = j
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x))

  # only 1 X, no need for MPI
  if len(X_new) == 1:
    idx, md5, x = apply_tsne(X_new[0])
    results.append((idx, x))
    _cached_values[_create_key(kwargs, md5)] = x
  else:
    mpi = MPI(jobs=X_new,
              func=apply_tsne,
              batch=1,
              ncpu=min(len(X_new), cpu_count() - 1))
    for idx, md5, x in mpi:
      results.append((idx, x))
      _cached_values[_create_key(kwargs, md5)] = x
  # ====== return and clean ====== #
  results = sorted(results, key=lambda a: a[0])
  results = [r[1] for r in results]
  return results[0] if len(results) == 1 else results
def download_and_extract(path, url, extract=True,
                         md5_download=None, md5_extract=None):
  r""" Download a file to the given path, then extract it

  Arguments:
    path : a String, path to a folder
    url : a String, the download URL
    extract : a Boolean, if True decompress the file
    md5_download : optional String, expected MD5 of the downloaded file
    md5_extract : optional String, expected MD5 of the extracted folder
  """
  from tqdm import tqdm
  path = os.path.abspath(os.path.expanduser(path))
  if not os.path.exists(path):
    os.makedirs(path)
  assert os.path.isdir(path), "path to '%s' is not a directory" % path
  ### file name
  filename = url.split('/')[-1]
  filepath = os.path.join(path, filename)
  ### download
  if os.path.exists(filepath) and md5_download is not None:
    md5 = md5_checksum(filepath)
    if md5 != md5_download:
      print("MD5 of downloaded file mismatch! downloaded:%s provided:%s" %
            (md5, md5_download))
      os.remove(filepath)
  if not os.path.exists(filepath):
    prog = tqdm(desc="Download '%s'" % filename, total=-1, unit="MB")

    def _progress(count, block_size, total_size):
      # convert to MB
      total_size = total_size / 1024. / 1024.
      block_size = block_size / 1024. / 1024.
      if prog.total < 0:
        prog.total = total_size
      prog.update(block_size)

    filepath, _ = urlretrieve(url, filepath, reporthook=_progress)
  ### no extraction needed
  if not extract:
    return filepath
  ### extract
  extract_path = os.path.join(path, os.path.basename(filename).split('.')[0])
  if os.path.exists(extract_path) and md5_extract is not None:
    md5 = md5_folder(extract_path)
    if md5 != md5_extract:
      print("MD5 extracted folder mismatch! extracted:%s provided:%s" %
            (md5, md5_extract))
      shutil.rmtree(extract_path)
  if not os.path.exists(extract_path):
    # .tar.gz
    if '.tar.gz' in filepath:
      with tarfile.open(filepath, 'r:gz') as f:
        print("Extracting files ...")
        f.extractall(path)
    # .zip
    elif '.zip' in filepath:
      # TODO
      raise NotImplementedError
    # unknown extension
    else:
      raise NotImplementedError("Cannot extract file: %s" % filepath)
  ### return
  return path, extract_path
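# Hedged usage sketch for `download_and_extract`; the URL is a placeholder.
# With extract=True it returns (folder, extracted_folder); with
# extract=False it returns only the downloaded file path. Checksum
# verification is skipped here.
folder, extracted = download_and_extract(path="~/datasets/demo",
                                         url="https://example.com/demo.tar.gz",
                                         extract=True,
                                         md5_download=None,
                                         md5_extract=None)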
def fast_tsne(
    *X,
    n_components: int = 2,
    max_samples: Optional[int] = None,
    perplexity: float = 30.0,
    early_exaggeration: float = 12.0,
    learning_rate: float = 200.0,
    n_iter: int = 1000,
    n_iter_without_progress: int = 300,
    exaggeration_iter: int = 250,
    perplexity_max_iter: int = 100,
    min_grad_norm: float = 1e-7,
    method: str = 'barnes_hut',
    metric: str = "euclidean",
    init: str = "random",
    angle: float = 0.5,
    n_jobs: Optional[int] = 4,
    merge_inputs: bool = True,
    pca_preprocessing: bool = True,
    return_model: bool = False,
    random_state: int = 1,
    verbose: int = 0,
    framework: Literal['auto', 'sklearn', 'cuml'] = 'auto',
):
  """ t-Stochastic Nearest Neighbors.

  If the algorithm takes an unexpectedly long time to run, lower
  `exaggeration_iter`, or reduce the amount of samples by downsampling
  the dataset.

  Parameters
  ----------
  n_components : int, optional (default: 2)
      Dimension of the embedded space.
  max_samples : {int, None}
      if given, downsample the data to the given number of samples
  perplexity : float, optional (default: 30)
      The perplexity is related to the number of nearest neighbors that
      is used in other manifold learning algorithms. Larger datasets
      usually require a larger perplexity. Consider selecting a value
      between 5 and 50. The choice is not extremely critical since t-SNE
      is quite insensitive to this parameter.
  early_exaggeration : float, optional (default: 12.0)
      Controls how tight natural clusters in the original space are in the
      embedded space and how much space will be between them. For larger
      values, the space between natural clusters will be larger in the
      embedded space. Again, the choice of this parameter is not very
      critical. If the cost function increases during initial optimization,
      the early exaggeration factor or the learning rate might be too high.
  learning_rate : float, optional (default: 200.0)
      The learning rate for t-SNE is usually in the range [10.0, 1000.0].
      If the learning rate is too high, the data may look like a 'ball'
      with any point approximately equidistant from its nearest neighbours.
      If the learning rate is too low, most points may look compressed in a
      dense cloud with few outliers. If the cost function gets stuck in a
      bad local minimum increasing the learning rate may help.
  n_iter : int, optional (default: 1000)
      Maximum number of iterations for the optimization. Should be at
      least 250.
  n_iter_without_progress : int, optional (default: 300)
      Maximum number of iterations without progress before we abort the
      optimization, used after 250 initial iterations with early
      exaggeration. Note that progress is only checked every 50 iterations
      so this value is rounded to the next multiple of 50.
  perplexity_max_iter : int, (default 100)
      The number of epochs over which the best Gaussian bandwidths are
      searched.
  exaggeration_iter : int, (default 250)
      Number of iterations of the early exaggeration phase; to promote the
      growth of clusters, set this higher.
  min_grad_norm : float, optional (default: 1e-7)
      If the gradient norm is below this threshold, the optimization will
      be stopped.
  metric : string or callable, optional
      The metric to use when calculating distance between instances in a
      feature array. If metric is a string, it must be one of the options
      allowed by scipy.spatial.distance.pdist for its metric parameter, or
      a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric
      is "precomputed", X is assumed to be a distance matrix.
      Alternatively, if metric is a callable function, it is called on
      each pair of instances (rows) and the resulting value recorded. The
      callable should take two arrays from X as input and return a value
      indicating the distance between them. The default is "euclidean"
      which is interpreted as squared euclidean distance.
  init : string or numpy array, optional (default: "random")
      Initialization of embedding. Possible options are 'random', 'pca',
      and a numpy array of shape (n_samples, n_components). PCA
      initialization cannot be used with precomputed distances and is
      usually more globally stable than random initialization.
  verbose : int, optional (default: 0)
      Verbosity level, a number from 0 to 6.
  random_state : int, RandomState instance or None, optional (default: None)
      If int, random_state is the seed used by the random number generator;
      If RandomState instance, random_state is the random number generator;
      If None, the random number generator is the RandomState instance used
      by `np.random`. Note that different initializations might result in
      different local minima of the cost function.
  method : string (default: 'barnes_hut')
      By default the gradient calculation algorithm uses Barnes-Hut
      approximation running in O(NlogN) time. method='exact' will run on
      the slower, but exact, algorithm in O(N^2) time. The exact algorithm
      should be used when nearest-neighbor errors need to be better than
      3%. However, the exact method cannot scale to millions of examples.
  angle : float (default: 0.5)
      Only used if method='barnes_hut'. This is the trade-off between speed
      and accuracy for Barnes-Hut T-SNE. 'angle' is the angular size
      (referred to as theta in [3]) of a distant node as measured from a
      point. If this size is below 'angle' then it is used as a summary
      node of all points contained within it. This method is not very
      sensitive to changes in this parameter in the range of 0.2 - 0.8.
      Angle less than 0.2 has quickly increasing computation time and angle
      greater 0.8 has quickly increasing error.
  return_model : a Boolean, if `True`, return the trained t-SNE model
  merge_inputs : a Boolean, if `True`, merge all arrays into a single array
      for training t-SNE.
  framework : one of 'auto', 'sklearn', 'cuml'
      select the t-SNE backend; 'auto' prefers GPU cuML, then
      MulticoreTSNE, falling back to sklearn.
  """
  assert len(X) > 0, "No input is given!"
  if isinstance(X[0], (tuple, list)):
    X = X[0]
  if not all(isinstance(x, np.ndarray) for x in X):
    raise ValueError("`X` can only be list of numpy.ndarray or numpy.ndarray")
  # ====== kwarg for creating T-SNE class ====== #
  kwargs = dict(locals())
  del kwargs['X']
  kwargs.pop('merge_inputs')
  kwargs.pop('return_model')
  kwargs.pop('max_samples')
  kwargs.pop('framework')
  kwargs.pop('pca_preprocessing')
  # ====== downsampling ====== #
  if max_samples is not None:
    max_samples = int(max_samples)
    assert max_samples > 0
    new_X = []
    rand = random_state if isinstance(random_state, np.random.RandomState) \
        else np.random.RandomState(seed=random_state)
    for x in X:
      if x.shape[0] > max_samples:
        ids = rand.permutation(x.shape[0])[:max_samples]
        x = x[ids]
      new_X.append(x)
    X = new_X
  # ====== import proper T-SNE ====== #
  tsne_version = None
  if framework != 'sklearn':
    try:
      from cuml.manifold import TSNE
      tsne_version = 'cuda'
    except ImportError:
      warnings.warn("Install RAPIDSAI cuML for GPU-accelerated t-SNE")
      try:
        from MulticoreTSNE import MulticoreTSNE as TSNE
        tsne_version = 'multicore'
      except ImportError:
        warnings.warn(
            "pip install "
            "git+https://github.com/DmitryUlyanov/Multicore-TSNE.git")
  if tsne_version is None:
    from sklearn.manifold import TSNE
    tsne_version = 'sklearn'
  # ====== modify kwargs ====== #
  if tsne_version == 'cuda':
    del kwargs['n_jobs']
  elif tsne_version == 'multicore':
    del kwargs['perplexity_max_iter']
    del kwargs['exaggeration_iter']
  else:
    del kwargs['n_jobs']
    del kwargs['perplexity_max_iter']
    del kwargs['exaggeration_iter']
  # ====== getting cached values ====== #
  results = []
  X_new = []
  X_size = []
  if merge_inputs:
    X_size = [x.shape[0] for x in X]
    x = np.vstack(X) if len(X) > 1 else X[0]
    md5 = md5_checksum(x)
    key = _create_key(tsne_version, kwargs, md5)
    if key in _cached_values:
      results.append((0, _cached_values[key]))
    else:
      X_new.append((0, md5, x))
  else:
    for i, x in enumerate(X):
      md5 = md5_checksum(x)
      key = _create_key(tsne_version, kwargs, md5)
      if key in _cached_values:
        results.append((i, _cached_values[key]))
      else:
        X_new.append((i, md5, x))

  # ====== perform T-SNE ====== #
  def apply_tsne(j):
    idx, md5, x = j
    if pca_preprocessing:
      x = PCA(n_components=None, random_state=random_state).fit_transform(x)
    tsne = TSNE(**kwargs)
    return (idx, md5, tsne.fit_transform(x), tsne if return_model else None)

  # initialize so the cleanup below is safe even when everything is cached
  model = None
  # only 1 X, no need for MPI
  if len(X_new) == 1 or tsne_version in ('cuda', 'multicore'):
    for x in X_new:
      idx, md5, x, model = apply_tsne(x)
      results.append((idx, x))
      _cached_values[_create_key(tsne_version, kwargs, md5)] = x
  else:
    mpi = MPI(jobs=X_new,
              func=apply_tsne,
              batch=1,
              ncpu=min(len(X_new), cpu_count() - 1))
    model = []
    for idx, md5, x, m in mpi:
      results.append((idx, x))
      _cached_values[_create_key(tsne_version, kwargs, md5)] = x
      model.append(m)
  # ====== return and clean ====== #
  if merge_inputs and len(X_size) > 1:
    # split the single merged embedding back into per-input chunks
    indices = [0] + np.cumsum(X_size).tolist()
    results = [results[0][1][s:e] for s, e in zip(indices, indices[1:])]
  else:
    results = sorted(results, key=lambda a: a[0])
    results = [r[1] for r in results]
  results = results[0] if len(results) == 1 else results
  if return_model:
    return results, model
  del model
  return results
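# Hedged usage sketch for the newer `fast_tsne`: with merge_inputs=True the
# arrays are stacked, embedded once, and split back per input. The shapes and
# the forced sklearn backend are illustrative.
a = np.random.rand(500, 32).astype('float32')
b = np.random.rand(300, 32).astype('float32')
za, zb = fast_tsne(a, b,
                   n_components=2,
                   merge_inputs=True,
                   framework='sklearn',
                   random_state=1)
assert za.shape == (500, 2) and zb.shape == (300, 2)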