def _create_it_func(X, y, batch_size, start, end):
  """ Return a lambda function that creates a new generator """
  nb_samples = end - start
  create_it = None
  # ====== y is None ====== #
  if y is None:
    if hasattr(X, 'set_batch'):
      create_it = lambda seed: (x for x in X.set_batch(
          batch_size=batch_size, start=start, end=end, seed=seed))
    elif hasattr(X, '__getitem__'):
      create_it = lambda seed: (X[start:end]
                                for start, end in batching(n=nb_samples,
                                                           batch_size=batch_size,
                                                           start=start,
                                                           end=end,
                                                           seed=seed))
  # ====== provided y ====== #
  else:
    if hasattr(X, 'set_batch') and hasattr(y, 'set_batch'):
      create_it = lambda seed: ((i, j) for i, j in zip(
          X.set_batch(batch_size=batch_size, start=start, end=end, seed=seed),
          y.set_batch(batch_size=batch_size, start=start, end=end, seed=seed)))
    elif hasattr(X, '__getitem__') and hasattr(y, '__getitem__'):
      create_it = lambda seed: ((X[start:end], y[start:end])
                                for start, end in batching(n=nb_samples,
                                                           batch_size=batch_size,
                                                           start=start,
                                                           end=end,
                                                           seed=seed))
  # ====== exception ====== #
  if create_it is None:
    raise ValueError(
        "`X` and `y` must have attribute 'set_batch' or '__getitem__'")
  return create_it
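# NOTE: the `batching` helper (from odin.utils) is used throughout this
# section but never defined in it. The sketch below is an ASSUMPTION, not the
# library implementation -- it is only consistent with how `batching` is
# called here: positional `batch_size` and `n`, optional `start`/`end`
# bounds, and a `seed` that reshuffles the order of the (start, end) pairs.
import numpy as np

def batching(batch_size, n, start=0, end=None, seed=None):
  """Yield (start, end) index pairs covering [start, end) in consecutive
  blocks of at most `batch_size` rows; `seed` shuffles the batch order."""
  end = n if end is None else min(end, n)
  pairs = [(s, min(s + batch_size, end))
           for s in range(start, end, batch_size)]
  if seed is not None:
    # reproducible shuffling of the batch order
    np.random.RandomState(seed).shuffle(pairs)
  for pair in pairs:
    yield pair

# e.g. list(batching(4, n=10)) -> [(0, 4), (4, 8), (8, 10)]
# Call sites that need len() or repeated iteration wrap the result in list().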
def transform(self, X, n_components=None):
  # ====== check number of components ====== #
  if n_components is not None:
    # fraction of explained variance
    if n_components < 1.:
      cum_var = np.cumsum(self.explained_variance_ratio_)
      n_components = (cum_var > n_components).nonzero()[0][0] + 1
    # specific number of components
    else:
      n_components = int(n_components)
  # ====== other info ====== #
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  # ====== start transforming ====== #
  X_transformed = []
  for start, end in batching(n=n, batch_size=batch_size):
    x = super(MiniBatchPCA, self).transform(X=X[start:end])
    if n_components is not None:
      x = x[:, :n_components]
    X_transformed.append(x)
  return np.concatenate(X_transformed, axis=0)
def describe(self) -> str:
  text = f"SingleCellOMICs: {self.name}"
  pad = "\n "
  for omic in self.omics:
    X = self.numpy(omic)
    # collect all non-zero entries block-by-block to stay memory-friendly
    all_nonzeros = []
    for s, e in batching(n=self.n_obs, batch_size=BATCH_SIZE):
      x = X[s:e]
      ids = np.nonzero(x)
      all_nonzeros.append(x[ids[0], ids[1]])
    all_nonzeros = np.concatenate(all_nonzeros)
    text += pad[:-1] + "OMIC: '%s' - dtype: '%s'" % (
        omic.name, "binary" if self.is_binary(omic) else "continuous")
    # NOTE: `describe(...)` below is the module-level statistics helper,
    # not a recursive call to this method
    text += pad + 'Sparsity : %.2f' % self.sparsity(omic)
    text += pad + 'Nonzeros : %s' % describe(
        all_nonzeros, shorten=True, float_precision=2)
    text += pad + 'Cell : %s' % describe(
        self.counts_per_cell(omic), shorten=True, float_precision=2)
    text += pad + 'Gene : %s' % describe(
        self.counts_per_gene(omic), shorten=True, float_precision=2)
    text += pad + 'LogCount : %s' % describe(
        self.log_counts(omic), shorten=True, float_precision=2)
    text += pad + 'LocalMean : %s' % describe(
        self.local_mean(omic), shorten=True, float_precision=2)
    text += pad + 'LocalVar : %s' % describe(
        self.local_var(omic), shorten=True, float_precision=2)
  return text
def counts_per_gene(self, omic=None):
  r""" Return total number of counts per gene. This method is scalable. """
  counts = 0
  X = self.numpy(omic)
  for s, e in batching(batch_size=BATCH_SIZE, n=X.shape[0]):
    counts += np.sum(X[s:e], axis=0)
  return counts
def sparsity_percentage(x, batch_size=1234):
  """ Return the fraction of zero entries in `x`, computed block-by-block
  to avoid loading the whole array into memory. """
  n_zeros = 0
  n_total = np.prod(x.shape)
  for start, end in batching(batch_size=batch_size, n=x.shape[0], seed=None):
    y = x[start:end]
    n_nonzeros = np.count_nonzero(y)
    n_zeros += np.prod(y.shape) - n_nonzeros
  return n_zeros / n_total
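# Example (hypothetical data): only `x.shape` and row slicing are required,
# so this works equally for in-memory numpy arrays and memory-mapped arrays.
rng_demo = np.random.RandomState(0)
x_demo = rng_demo.binomial(n=1, p=0.1, size=(10000, 500)).astype('float32')
print(sparsity_percentage(x_demo))  # ~0.90, the fraction of zero entries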
def validate_data(self, path=None):
  if path is None:
    path = self.path
  import h5py
  with h5py.File(path, 'r') as dataset:
    images1 = dataset['images']
    labels1 = dataset['labels']
    for start, end in tqdm(list(batching(8000, n=self.images.shape[0]))):
      assert np.all(self.images[start:end] == images1[start:end]) and \
          np.all(self.factors[start:end] == labels1[start:end])
  return self
def _file_grouping(batch, batch_size, rng, batch_filter):
  """ Return: [(name, index, data1, data2, ...), ...]
  NOTE: each element in batch is one file """
  # ====== shuffle the file ====== #
  if rng is not None:
    rng.shuffle(batch)
  # ====== return batched files with index for ordering ====== #
  for name, X in batch:
    n = X[0].shape[0]
    ret = list(X)
    for i, (start, end) in enumerate(batching(n=n, batch_size=batch_size)):
      r = [name, i] + [j[start:end] for j in ret]
      yield tuple(batch_filter(r))
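# Illustrative call (hypothetical data): two "files" of row-aligned arrays
# split into blocks of 3 rows, with an identity `batch_filter`.
files = [('utt1', [np.arange(5)[:, None]]),
         ('utt2', [np.arange(4)[:, None]])]
for name, idx, data in _file_grouping(files, batch_size=3, rng=None,
                                      batch_filter=lambda r: r):
  print(name, idx, data.ravel())
# utt1 0 [0 1 2]
# utt1 1 [3 4]
# utt2 0 [0 1 2]
# utt2 1 [3]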
def test_mpi(self):
  # materialize the batches so `X` has a length and can be iterated repeatedly
  X = list(batching(n=512, batch_size=np.random.randint(low=12000,
                                                        high=80000)))

  def map_func(batch):
    for b in batch:
      yield b

  mpi = MPI(X, map_func=map_func, ncpu=12, buffer_size=8,
            maximum_queue_size=12 * 8)
  Y = [i for i in mpi]
  self.assertEqual(len(X), len(Y))
  self.assertEqual(sum(j - i for i, j in X), sum(j - i for i, j in Y))
  self.assertTrue(all(i == j for i, j in zip(
      sorted(X, key=lambda x: x[0]),
      sorted(Y, key=lambda x: x[0]))))
def __init__(self, path='~/tensorflow_datasets/3dshapes.h5', cache_dir=None,
             seed=8):
  path = os.path.abspath(os.path.expanduser(path))
  assert os.path.exists(path), "Path to file %s must exist" % path
  self.path = path
  if cache_dir is None:
    cache_dir = os.path.dirname(path)
  if not os.path.exists(cache_dir):
    os.mkdir(cache_dir)
  image_path = os.path.join(cache_dir, '3dshapes.images')
  label_path = os.path.join(cache_dir, '3dshapes.labels')
  # ====== read the dataset and cache it again ====== #
  if not os.path.exists(image_path) or not os.path.exists(label_path):
    import h5py
    with h5py.File(path, 'r') as dataset:
      images = dataset['images']
      labels = dataset['labels']
      with MmapArrayWriter(image_path,
                           shape=images.shape,
                           dtype=images.dtype,
                           remove_exist=True) as img, \
           MmapArrayWriter(label_path,
                           shape=labels.shape,
                           dtype=labels.dtype,
                           remove_exist=True) as lab:
        for start, end in tqdm(list(batching(8000, n=images.shape[0])),
                               desc="Caching data"):
          img.write(images[start:end])
          lab.write(labels[start:end])
  # ====== load the data ====== #
  self.images = MmapArray(image_path)
  self.factors = MmapArray(label_path)
  # ====== split the dataset ====== #
  rand = np.random.RandomState(seed=seed)
  n = len(self.images)
  ids = rand.permutation(n)
  # train:85% valid:5% test:10%
  self.train_indices = ids[:int(0.85 * n)]
  self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
  self.test_indices = ids[int(0.9 * n):]
def _predict(self, X, f_pred):
  if not self.is_fitted:
    raise RuntimeError("LogisticRegression hasn't been initialized or "
                       "fitted.")
  if hasattr(X, 'set_batch'):
    it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
  elif hasattr(X, '__getitem__'):
    it = (X[start:end]
          for start, end in batching(batch_size=self.batch_size,
                                     n=X.shape[0]))
  else:
    raise ValueError("`X` must have attribute 'set_batch' or '__getitem__'")
  # ====== make prediction ====== #
  y = []
  prog = Progbar(target=X.shape[0],
                 print_report=True,
                 print_summary=False,
                 name="Predicting")
  for x in it:
    x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
    y.append(f_pred(x))
    prog.add(x.shape[0])
  return np.concatenate(y, axis=0)
def make_dnn_prediction(functions, X, batch_size=256, title=''):
  return_list = True
  if not isinstance(functions, (tuple, list)):
    functions = [functions]
    return_list = False
  n_functions = len(functions)
  results = [[] for _ in range(n_functions)]
  # ====== prepare progress bar ====== #
  n_samples = len(X)
  prog = Progbar(target=n_samples, print_summary=True,
                 name="Making prediction: %s" % str(title))
  # ====== for feeder ====== #
  if isinstance(X, F.Feeder):
    y_true = []
    for x, y in X.set_batch(batch_size=batch_size):
      for res, fn in zip(results, functions):
        res.append(fn(x))
      prog.add(x.shape[0])
      y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
    results = [np.concatenate(res, axis=0) for res in results]
    y_true = np.concatenate(y_true, axis=0)
    if return_list:
      return results, y_true
    return results[0], y_true
  # ====== for numpy array ====== #
  else:
    for start, end in batching(batch_size=batch_size, n=n_samples):
      y = X[start:end]
      for res, fn in zip(results, functions):
        res.append(fn(y))
      prog.add(end - start)
    results = [np.concatenate(res, axis=0) for res in results]
    if return_list:
      return results
    return results[0]
# Optimizing the network
# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
  # ====== training ====== #
  train_losses = []
  prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
  start_time = timeit.default_timer()
  for start, end in batching(batch_size=args.bs,
                             n=X_train.shape[0],
                             seed=K.get_rng().randint(int(10e8))):
    _ = K.eval(loss,
               feed_dict={X: X_train[start:end]},
               update_after=update_ops)
    prog.add(end - start)
    train_losses.append(_)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'),
        '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set] Loss: %.4f" % lo)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))
  record_valid_loss.append(lo)
  # ====== plotting ====== #
  if args.dim > 2:
def fast_kmeans(X,
                n_clusters=8,
                max_iter=300,
                tol=0.0001,
                n_init=10,
                random_state=1234,
                init='scalable-k-means++',
                oversampling_factor=2.0,
                max_samples_per_batch=32768,
                force_sklearn=False):
  r""" KMeans clustering

  Arguments:
    n_clusters : int (default = 8)
      The number of centroids or clusters you want.
    max_iter : int (default = 300)
      The more iterations of EM, the more accurate, but slower.
    tol : float64 (default = 1e-4)
      Stopping criterion when centroid means do not change much.
    random_state : int (default = 1234)
      If you want results to be the same when you restart Python,
      select a state.
    init : {'scalable-kmeans++', 'k-means||', 'random' or an ndarray}
        (default = 'scalable-k-means++')
      'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
      kmeans++ initialization.
      'random': Choose 'n_cluster' observations (rows) at random from data
      for the initial centroids.
      If an ndarray is passed, it should be of shape
      (n_clusters, n_features) and gives the initial centers.
    oversampling_factor : int (default = 2)
      The amount of points to sample in scalable k-means++ initialization
      for potential centroids. Increasing this value can lead to better
      initial centroids at the cost of memory. The total number of
      centroids sampled in scalable k-means++ is
      oversampling_factor * n_clusters * 8.
    max_samples_per_batch : int (default = 32768)
      The number of data samples to use for batches of the pairwise
      distance computation. This computation is done throughout both fit
      and predict. The default should suit most cases. The total number of
      elements in the batched pairwise distance computation is
      max_samples_per_batch * n_clusters. It might become necessary to
      lower this number when n_clusters becomes prohibitively large.
  """
  kwargs = dict(locals())
  X = kwargs.pop('X')
  kwargs.pop('force_sklearn')
  ## fine-tuning the kwargs
  cuml = _check_cuml(force_sklearn)
  if cuml:
    from cuml.cluster import KMeans
    kwargs.pop('n_init')
  else:
    from sklearn.cluster import MiniBatchKMeans
    kwargs.pop('oversampling_factor')
    kwargs.pop('max_samples_per_batch')
    if kwargs['init'] in ('scalable-k-means++', 'k-means||'):
      kwargs['init'] = 'k-means++'
  ## fitting
  if not cuml:
    from odin.utils import batching
    kmean = MiniBatchKMeans(**kwargs)
    for s, e in batching(int(max_samples_per_batch),
                         n=X.shape[0],
                         seed=random_state):
      kmean.partial_fit(X[s:e])
  else:
    kmean = KMeans(verbose=False, **kwargs)
    kmean.fit(X)
  return kmean
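# Usage sketch (hypothetical data): `force_sklearn=True` keeps everything on
# the CPU MiniBatchKMeans path, so no GPU/cuML install is required.
X_demo = np.random.RandomState(1).randn(1000, 16).astype('float32')
km = fast_kmeans(X_demo, n_clusters=4, force_sklearn=True)
labels = km.predict(X_demo)  # one cluster id per row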
def fast_pca(*x,
             n_components=None,
             algo='rpca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
  """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will be used
    for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithms:
    'ipca' - IncrementalPCA,
    'ppca' - Probabilistic PCA,
    'sppca' - Supervised Probabilistic PCA,
    'plda' - Probabilistic LDA,
    'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    required labels when `algo` is 'sppca' or 'plda'
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" %
                     algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
  x = flatten_list(x, level=None)
  x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2:  # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None:  # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components,
                         random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because it is
    # significantly faster than PCA(svd_solver='randomized')
    pca = RandomizedPCA(n_components=n_components,
                        iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False,
                   print_summary=False,
                   name="Fitting PCA")
    for start, end in batching(batch_size=batch_size,
                               n=x_train.shape[0],
                               seed=1234):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] + x_test) if not return_model \
      else tuple([pca, x_train] + x_test)
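# Usage sketch (hypothetical data): the first matrix fits the model, every
# following matrix is transformed with the same projection.
rng_pca = np.random.RandomState(1234)
x_tr, x_te = rng_pca.randn(500, 64), rng_pca.randn(100, 64)
z_tr, z_te = fast_pca(x_tr, x_te, n_components=2, algo='pca')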
def __call__(self, *inputs, **kwargs):
  show_progress = kwargs.pop('show_progress', False)
  # dictionary as inputs
  if len(kwargs) == len(self.inputs_name):
    inputs = [kwargs[i] for i in self.inputs_name]
  # ====== delete un-matched inputs ====== #
  inputs_new = []
  tmp = list(inputs)
  shapes = list(self._input_shape)
  # this process iteratively removes inputs whose shape mismatches
  # the currently given input
  for s in shapes:
    for i in tuple(tmp):
      if len(i.shape) != len(s) or \
          any(a is not None and a > 0 and a != b
              for a, b in zip(s, i.shape)):  # different ndim, or shape
        tmp.remove(i)
      else:
        inputs_new.append(i)
        tmp.remove(i)
        break
  if len(inputs_new) != len(self.inputs):
    raise ValueError(
        "Given inputs have shape: %s, cannot match the shape of "
        "defined inputs: %s" %
        ('; '.join([str(i.shape) for i in inputs]),
         '; '.join([str(i) for i in self.input_shape])))
  if not self._strict:
    inputs = inputs_new
  # ====== create feed_dict ====== #
  feed_dict = {}
  inputs = flatten_list(inputs, level=None)
  for tensor, value in zip(self.inputs, inputs):
    feed_dict[tensor] = value
  feed_dict.update(self.defaults)
  # check if modifying training mode
  if self.training is None:
    pass
  elif self.training:
    feed_dict.update({is_training(): True})
  else:
    feed_dict.update({is_training(): False})
  session = get_session()
  outputs = None
  # ====== mini-batches ====== #
  if self.batch_size is not None:
    batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                  if len(self.batch_vars) == 0 else self.batch_vars)
    batch_vars = [i for i in batch_vars
                  if i in feed_dict and hasattr(feed_dict[i], 'shape')]
    n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
    assert len(n_samples) == 1, \
        "Data have multiple batching dimensions: %s" % str(n_samples)
    n_samples = n_samples[0]
    # only continue if we have more samples than `batch_size`
    if n_samples > self.batch_size:
      n_output = len(self.outputs)
      outputs = []
      all_batches = []
      # (optional) showing progress
      if show_progress:
        prog = Progbar(target=n_samples,
                       print_report=False,
                       print_summary=False,
                       name='')
      for s, e in batching(batch_size=int(self.batch_size), n=n_samples):
        if show_progress:
          prog.add(e - s)
        all_batches.append(e - s)
        feed_dict_minibatch = OrderedDict(
            [(k, v[s:e]) if k in batch_vars else (k, v)
             for k, v in feed_dict.items()])
        updated = session.run(self.outputs + [self.updates_ops],
                              feed_dict=feed_dict_minibatch)
        updated = updated[:n_output]
        if not self._return_list:
          updated = updated[0]
        outputs.append(updated)
      ## concatenate all outputs
      if not self._return_list:
        o_ndim = outputs[0].ndim
        if o_ndim == 0:  # returned scalars
          outputs = np.array(outputs)
        else:  # returned array
          for o_axis in range(o_ndim):
            all_n = [o.shape[o_axis] for o in outputs]
            if all_n == all_batches:
              break
          outputs = np.concatenate(outputs, axis=o_axis)
      ## returning a list of outputs
      else:
        new_outputs = []
        for output_idx in range(len(outputs[0])):
          o = [x[output_idx] for x in outputs]
          o_ndim = o[0].ndim
          if o_ndim == 0:  # returned scalars
            o = np.array(o)
          else:  # returned array
            for o_axis in range(o[0].ndim):
              all_n = [val.shape[o_axis] for val in o]
              if all_n == all_batches:
                break
            o = np.concatenate(o, axis=o_axis)
          new_outputs.append(o)
        outputs = new_outputs
  # ====== single batch ====== #
  if outputs is None:
    updated = session.run(self.outputs + [self.updates_ops],
                          feed_dict=feed_dict)
    outputs = updated[:len(self.outputs)]
    if not self._return_list:
      outputs = outputs[0]
  # ====== return final output ====== #
  return outputs
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
  r""" Predefined procedure for downloading and preprocessing a 10x dataset
  into `SingleCellOMIC`, i.e. a scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
  """
  ### prepare the URL
  name = str(name).lower().strip()
  spec = 'filtered' if filtered_cells else 'raw'
  flatten_datasets = [(exp, version, dsname)
                      for exp, i in all_datasets.items()
                      for version, j in i.items()
                      for dsname in j]
  found = []
  for exp, version, dsname in flatten_datasets:
    if name == dsname:
      found.append((exp, version, dsname))
  if not found:
    raise ValueError(f"Cannot find data with name {name}, "
                     f"all available datasets are: {flatten_datasets}")
  if len(found) > 1:
    raise RuntimeError(f"Found multiple datasets {found} with name='{name}'")
  exp, version, name = found[0]
  dataset_name = name + '_' + spec
  url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
  ### prepare the output path
  filename = os.path.basename(url)
  # download path
  download_path = os.path.join(DOWNLOAD_DIR, exp, version)
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  # preprocessing path
  preprocessed_path = os.path.join(DATA_DIR,
                                   f'10x_{exp}_{name}_{spec}_preprocessed')
  if override and os.path.exists(preprocessed_path):
    if verbose:
      print("Overriding path: %s" % preprocessed_path)
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if len(os.listdir(preprocessed_path)) == 0:
    if verbose:
      print("Dataset10X:")
      print(" Meta :", found)
      print(" File :", filename)
      print(" URL :", url)
      print(" Download :", download_path)
      print(" Preprocess :", preprocessed_path)
    ### download the tar file
    path = download_file(url=url,
                         filename=os.path.join(download_path, filename),
                         override=False,
                         md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                      None))
    if not tarfile.is_tarfile(path):
      raise RuntimeError("Expecting tarfile but received: %s" % path)
    contents = {}
    with tarfile.open(path, mode="r:gz") as f:
      all_files = [(path, info.name, info.size, verbose)
                   for info in f
                   if info.isfile()]
    for name, data in MPI(jobs=all_files,
                          func=_read_tarinfo,
                          batch=1,
                          ncpu=4):
      contents[name] = data
    # cell barcodes
    barcodes = contents['barcodes']
    ### cell-atac
    if exp == 'cell-atac':
      n_top_genes = 20000  # this is an ad-hoc value
      X = contents['matrix'].T.todense()
      peaks = contents['peaks']
      X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
          np.float32)
      X_col_name = np.array([':'.join(i) for i in peaks])
      save_data = [(OMIC.atac.name, X)]
      save_metadata = dict(main_omic=OMIC.atac.name,
                           barcodes=barcodes,
                           chromatin_var=X_col_name)
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.atac,
                           name=name)
    ### cell-exp and cell-vdj
    elif exp in ('cell-exp', 'cell-vdj'):
      n_top_genes = 2000
      # feature (Id, Name, Type(antibody or gene-expression))
      X_col = (contents['features']
               if 'features' in contents else contents['genes'])
      # data matrix
      X = contents['matrix'].T
      if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
        X = X.tocsr()
      X = X.astype('float32')
      assert X.shape[0] == barcodes.shape[0] and \
          X.shape[1] == X_col.shape[0]
      # antibody and gene are provided
      prot_ids = []
      pmhc_ids = []
      gene_ids = []
      if X_col.shape[1] == 3:
        for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
          if feat_type == 'Antibody Capture':
            if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
              pmhc_ids.append(idx)
            else:
              prot_ids.append(idx)
          elif feat_type == 'Gene Expression':
            gene_ids.append(idx)
          else:
            raise ValueError(
                f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}")
      elif X_col.shape[1] == 2:
        gene_ids = slice(None, None)
      else:
        raise ValueError(f"No support for features matrix\n{X_col}")
      # Antibody ID, Antibody Name
      y = X[:, prot_ids]
      y_col = X_col[prot_ids][:, 0]  # the id
      y_col_name = X_col[prot_ids][:, 1]  # the name
      # pMHC peptide
      if len(pmhc_ids) > 0:
        z = X[:, pmhc_ids]
        z_col = X_col[pmhc_ids][:, 0]  # the id
        z_col_name = X_col[pmhc_ids][:, 1]  # the name
      # Gene ID, Gene Name
      X = X[:, gene_ids].todense()
      X_col_name = X_col[gene_ids][:, 1]  # the name
      X_col = X_col[gene_ids][:, 0]  # the id
      assert np.min(X) >= 0 and np.max(X) < 65000, \
          f"Only support uint16 data type, given data with max={np.max(X)}"
      # data and metadata
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.transcriptomic,
                           name=name)
      save_data = [(OMIC.transcriptomic.name, X), (OMIC.proteomic.name, y)]
      save_metadata = {
          'main_omic': OMIC.transcriptomic.name,
          'barcodes': barcodes,
          f"{OMIC.transcriptomic.name}_var": X_col_name,
          f"{OMIC.proteomic.name}_var": y_col_name
      }
      if len(pmhc_ids) > 0:
        save_data.append((OMIC.pmhc.name, z))
        save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
    ### others
    else:
      raise NotImplementedError(f"No support for experiment: {exp}")
    ### save data and metadata
    for name, data in save_data:
      outpath = os.path.join(preprocessed_path, name)
      n_samples, n_features = data.shape
      if n_samples == 0 or n_features == 0:
        continue
      with MmapArrayWriter(outpath,
                           shape=(0, n_features),
                           dtype=np.uint16,
                           remove_exist=True) as f:
        if verbose:
          # `desc` keyword, otherwise the string is treated as the iterable
          prog = tqdm(desc=f"Saving {outpath}",
                      total=n_samples,
                      unit='samples')
        for s, e in batching(batch_size=5120, n=n_samples):
          x = data[s:e]
          if hasattr(x, 'todense'):
            x = x.todense()
          f.write(x)
          if verbose:
            prog.update(e - s)
        if verbose:
          prog.clear()
          prog.close()
    # save metadata
    outpath = os.path.join(preprocessed_path, 'metadata')
    with open(outpath, 'wb') as f:
      pickle.dump(save_metadata, f)
    if verbose:
      print(f"Saved metadata to path {outpath}")
    ### filter genes, follow 10x and use the Cell Ranger recipe,
    # this is copied from Scanpy
    n_genes = sco.shape[1]
    sc.pp.filter_genes(sco, min_counts=1)
    # normalize with total UMI count per cell
    sc.pp.normalize_total(sco, key_added='n_counts_all')
    filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                  flavor='cell_ranger',
                                                  n_top_genes=n_top_genes,
                                                  log=False)
    gene_subset = filter_result.gene_subset
    indices = sco.get_var_indices()
    markers = (MARKER_GENES
               if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
    for name in markers:
      idx = indices.get(name, None)
      if idx is not None:
        gene_subset[idx] = True
    sco._inplace_subset_var(gene_subset)  # filter genes
    if verbose:
      print(f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(sco.var_names.values, f)
  # ******************** load and return the dataset ******************** #
  omics = [
      name for name in os.listdir(preprocessed_path)
      if name not in ('metadata', 'top_genes') and '_' not in name
  ]
  with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
    metadata = pickle.load(f)
  with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
    top_genes = pickle.load(f)
  data = {
      name: MmapArray(os.path.join(preprocessed_path,
                                   name)).astype(np.float32)
      for name in omics
  }
  main_omic = metadata['main_omic']
  X = data[main_omic]
  var_names = metadata[f'{main_omic}_var']
  if filtered_genes:
    var_ids = {j: i for i, j in enumerate(var_names)}
    ids = [var_ids[i] for i in top_genes]
    X = X[:, ids]
    var_names = var_names[ids]
  sco = SingleCellOMIC(
      X,
      cell_id=metadata['barcodes'],
      gene_id=var_names,
      omic=main_omic,
      name=f"{dataset_name}{'' if filtered_genes else 'all'}")
  for o in omics:
    if o != main_omic:
      sco.add_omic(omic=o,
                   X=data[o],
                   var_names=np.asarray(metadata[f'{o}_var']))
  return sco
def read_centenarian(override=False, verbose=False):
  r""" Data used in:
    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells
    in supercentenarians" | bioRxiv [WWW Document], n.d.
    URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).
  """
  download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
  if not os.path.exists(download_path):
    os.mkdir(download_path)
  preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    labels = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[2])),
        url=_URL[2],
    )
    data = []
    with gzip.open(labels, mode='rb') as f:
      for line in f:
        line = str(line, 'utf-8').strip().split('\t')
        assert line[1][:2] == line[2]
        data.append(line)
    labels = np.array(data)
    y_col = sorted(set(labels[:, 1]))
    y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                len(y_col)).astype('float32')
    y_col = np.array(y_col)
    # raw UMI counts
    raw = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[0])),
        url=_URL[0],
    )
    if verbose:
      print("Unzip and reading raw UMI ...")
    X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
    # log-normalized UMI counts
    norm = download_file(
        outpath=os.path.join(download_path, os.path.basename(_URL[1])),
        url=_URL[1],
    )
    if verbose:
      print("Unzip and reading log-norm UMI ...")
    X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
    # validate that cell and gene annotations match across the files
    assert np.all(cell_id1 == cell_id2) and \
        np.all(labels[:, 0] == cell_id1) and \
        np.all(gene_id1 == gene_id2)
    assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
        X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
    # save to disk
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X=X_raw,
                    X_col=gene_id1,
                    y=y,
                    y_col=y_col,
                    rowname=cell_id1,
                    print_log=verbose)
    with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                         shape=(0, X_norm.shape[1]),
                         dtype='float32',
                         remove_exist=True) as f:
      for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
        f.write(X_norm[s:e])
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return ds