Example 1
def _create_it_func(X, y, batch_size, start, end):
  """ Return a lambda function that creates a new generator """
  nb_samples = end - start
  create_it = None
  # ====== y is None ====== #
  if y is None:
    if hasattr(X, 'set_batch'):
      create_it = lambda seed: (x for x in X.set_batch(
          batch_size=batch_size,
          start=start, end=end,
          seed=seed))
    elif hasattr(X, '__getitem__'):
      create_it = lambda seed: (X[start:end]
                        for start, end in batching(n=nb_samples,
                                                   batch_size=batch_size,
                                                   start=start, end=end,
                                                   seed=seed))
  # ====== provided y ====== #
  else:
    if hasattr(X, 'set_batch') and hasattr(y, 'set_batch'):
      create_it = lambda seed: ((i, j) for i, j in zip(
          X.set_batch(batch_size=batch_size, start=start, end=end, seed=seed),
          y.set_batch(batch_size=batch_size, start=start, end=end, seed=seed)))
    elif hasattr(X, '__getitem__') and hasattr(y, '__getitem__'):
      create_it = lambda seed: ((X[start:end], y[start:end])
        for start, end in batching(n=nb_samples,
                                   batch_size=batch_size,
                                   start=start, end=end,
                                   seed=seed))
  # ====== exception ====== #
  if create_it is None:
    raise ValueError("`X` and `y` must have attributes 'set_batch' or '__getitem__'")
  return create_it
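Throughout these examples, `batching` yields `(start, end)` index pairs and `X[start:end]` is one mini-batch, while the `create_it` returned above is called with a seed to obtain a fresh generator each time. A minimal, hypothetical usage sketch (the toy array, batch size, and seeds below are illustrative assumptions, not part of the original project):

import numpy as np

X = np.random.rand(100, 8).astype('float32')  # toy data, illustrative only
create_it = _create_it_func(X, y=None, batch_size=32, start=0, end=100)
for seed in (1, 2, 3):  # e.g. one seed per epoch
    for batch in create_it(seed):  # a brand-new generator on every call
        assert batch.shape[1] == 8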
Example 2
 def transform(self, X, n_components=None):
   # ====== check number of components ====== #
   # specified percentage of explained variance
   if n_components is not None:
     # percentage of variances
     if n_components < 1.:
       _ = np.cumsum(self.explained_variance_ratio_)
       n_components = (_ > n_components).nonzero()[0][0] + 1
     # specific number of components
     else:
       n_components = int(n_components)
   # ====== other info ====== #
   n = X.shape[0]
   if self.batch_size is None:
     batch_size = 12 * len(self.mean_)
   else:
     batch_size = self.batch_size
   # ====== start transforming ====== #
   X_transformed = []
   for start, end in batching(n=n, batch_size=batch_size):
     x = super(MiniBatchPCA, self).transform(X=X[start:end])
     if n_components is not None:
       x = x[:, :n_components]
     X_transformed.append(x)
   return np.concatenate(X_transformed, axis=0)
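A hedged usage sketch of the two `n_components` modes above (assumes a fitted `MiniBatchPCA` instance named `pca` and a 2-D data matrix `X`; the names and threshold are illustrative):

# keep enough components to explain roughly 95% of the variance,
# transforming X in mini-batches under the hood
Z = pca.transform(X, n_components=0.95)
# or request an explicit number of components
Z10 = pca.transform(X, n_components=10)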
Example 3
 def describe(self) -> str:
     text = f"SingleCellOMICs: {self.name}"
     pad = "\n     "
     for omic in self.omics:
         X = self.numpy(omic)
         all_nonzeros = []
         for s, e in batching(n=self.n_obs, batch_size=BATCH_SIZE):
             x = X[s:e]
             ids = np.nonzero(x)
             all_nonzeros.append(x[ids[0], ids[1]])
         all_nonzeros = np.concatenate(all_nonzeros)
         text += pad[:-1] + "OMIC: '%s' - dtype: '%s'" % (
             omic.name, "binary" if self.is_binary(omic) else "continuous")
         text += pad + 'Sparsity  : %.2f' % self.sparsity(omic)
         text += pad + 'Nonzeros  : %s' % describe(
             all_nonzeros, shorten=True, float_precision=2)
         text += pad + 'Cell      : %s' % describe(
             self.counts_per_cell(omic), shorten=True, float_precision=2)
         text += pad + 'Gene      : %s' % describe(
             self.counts_per_gene(omic), shorten=True, float_precision=2)
         text += pad + 'LogCount  : %s' % describe(
             self.log_counts(omic), shorten=True, float_precision=2)
         text += pad + 'LocalMean : %s' % describe(
             self.local_mean(omic), shorten=True, float_precision=2)
         text += pad + 'LocalVar  : %s' % describe(
             self.local_var(omic), shorten=True, float_precision=2)
     return text
Example 4
 def transform(self, X, n_components=None):
     # ====== check number of components ====== #
     # specified percentage of explained variance
     if n_components is not None:
         # percentage of variances
         if n_components < 1.:
             _ = np.cumsum(self.explained_variance_ratio_)
             n_components = (_ > n_components).nonzero()[0][0] + 1
         # specific number of components
         else:
             n_components = int(n_components)
     # ====== other info ====== #
     n = X.shape[0]
     if self.batch_size is None:
         batch_size = 12 * len(self.mean_)
     else:
         batch_size = self.batch_size
     # ====== start transforming ====== #
     X_transformed = []
     for start, end in batching(n=n, batch_size=batch_size):
         x = super(MiniBatchPCA, self).transform(X=X[start:end])
         if n_components is not None:
             x = x[:, :n_components]
         X_transformed.append(x)
     return np.concatenate(X_transformed, axis=0)
Example 5
 def counts_per_gene(self, omic=None):
     r""" Return total number of counts per gene. This method
     is scalable. """
     counts = 0
     X = self.numpy(omic)
     for s, e in batching(batch_size=BATCH_SIZE, n=X.shape[0]):
         counts += np.sum(X[s:e], axis=0)
     return counts
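For an in-memory array, the batched accumulation above should agree with a single column-wise sum. A small sanity check of that invariant (a sketch assuming `batching` is importable from `odin.utils`, as in the `fast_kmeans` example below; the toy count matrix is illustrative):

import numpy as np
from odin.utils import batching

X = np.random.poisson(lam=1.0, size=(1000, 50))  # toy count matrix
counts = 0
for s, e in batching(batch_size=256, n=X.shape[0]):
    counts += np.sum(X[s:e], axis=0)
np.testing.assert_allclose(counts, X.sum(axis=0))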
Example 6
def sparsity_percentage(x, batch_size=1234):
    n_zeros = 0
    n_total = np.prod(x.shape)
    for start, end in batching(batch_size=batch_size, n=x.shape[0], seed=None):
        y = x[start:end]
        n_nonzeros = np.count_nonzero(y)
        n_zeros += np.prod(y.shape) - n_nonzeros
    return n_zeros / n_total
Example 7
File: stats.py Project: imito/odin
def sparsity_percentage(x, batch_size=5218):
  n_zeros = 0
  n_total = np.prod(x.shape)
  for start, end in batching(batch_size=batch_size, n=x.shape[0],
                             seed=None):
    y = x[start:end]
    n_nonzeros = np.count_nonzero(y)
    n_zeros += np.prod(y.shape) - n_nonzeros
  return n_zeros / n_total
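For a dense array, the batched count above reduces to a one-liner over the whole matrix; a quick equivalence check (the toy binary matrix is illustrative):

import numpy as np

x = np.random.binomial(n=1, p=0.1, size=(500, 40)).astype('float32')
assert np.isclose(sparsity_percentage(x, batch_size=128),
                  1.0 - np.count_nonzero(x) / x.size)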
Example 8
 def validate_data(self, path=None):
   if path is None:
     path = self.path
   import h5py
   with h5py.File(path, 'r') as dataset:
     images1 = dataset['images']
     labels1 = dataset['labels']
     for start, end in tqdm(list(batching(8000, n=self.images.shape[0]))):
       assert np.all(self.images[start:end] == images1[start:end]) and \
         np.all(self.factors[start:end] == labels1[start:end])
   return self
Example 9
def _file_grouping(batch, batch_size, rng, batch_filter):
    """ Return: [(name, index, data1, data2, ...), ...]
      NOTE: each element in batch is one file
    """
    # ====== shuffle the file ====== #
    if rng is not None:
        rng.shuffle(batch)
    # ====== return batched files with index for ordering ====== #
    for name, X in batch:
        n = X[0].shape[0]
        ret = list(X)
        for i, (start, end) in enumerate(batching(n=n, batch_size=batch_size)):
            r = [name, i] + [j[start:end] for j in ret]
            yield tuple(batch_filter(r))
Example 10
File: feeder.py Project: imito/odin
def _file_grouping(batch, batch_size, rng, batch_filter):
  """ Return: [(name, index, data1, data2, ...), ...]
      NOTE: each element in batch is one file
  """
  # ====== shuffle the file ====== #
  if rng is not None:
    rng.shuffle(batch)
  # ====== return batched files with index for ordering ====== #
  for name, X in batch:
    n = X[0].shape[0]
    ret = list(X)
    for i, (start, end) in enumerate(batching(n=n, batch_size=batch_size)):
      r = [name, i] + [j[start:end] for j in ret]
      yield tuple(batch_filter(r))
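A hedged sketch of driving the generator above (the identity `batch_filter`, `rng=None`, and the two toy 'utterances' are assumptions made purely for illustration):

import numpy as np

batch = [('utt1', [np.random.rand(50, 13)]),
         ('utt2', [np.random.rand(30, 13)])]
for name, idx, feat in _file_grouping(batch, batch_size=20,
                                      rng=None, batch_filter=lambda r: r):
    print(name, idx, feat.shape)  # e.g. utt1 0 (20, 13)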
Example 11
def _create_it_func(X, y, batch_size, start, end):
    """ Return a lambda function that creates a new generator """
    nb_samples = end - start
    create_it = None
    # ====== y is None ====== #
    if y is None:
        if hasattr(X, 'set_batch'):
            create_it = lambda seed: (x for x in X.set_batch(
                batch_size=batch_size, start=start, end=end, seed=seed))
        elif hasattr(X, '__getitem__'):
            create_it = lambda seed: (X[start:end] for start, end in batching(
                n=nb_samples,
                batch_size=batch_size,
                start=start,
                end=end,
                seed=seed))
    # ====== provided y ====== #
    else:
        if hasattr(X, 'set_batch') and hasattr(y, 'set_batch'):
            create_it = lambda seed: ((i, j) for i, j in zip(
                X.set_batch(
                    batch_size=batch_size, start=start, end=end, seed=seed),
                y.set_batch(
                    batch_size=batch_size, start=start, end=end, seed=seed)))
        elif hasattr(X, '__getitem__') and hasattr(y, '__getitem__'):
            create_it = lambda seed: (
                (X[start:end], y[start:end])
                for start, end in batching(n=nb_samples,
                                           batch_size=batch_size,
                                           start=start,
                                           end=end,
                                           seed=seed))
    # ====== exception ====== #
    if create_it is None:
        raise ValueError(
            "`X` and `y` must have attributes 'set_batch' or '__getitem__'")
    return create_it
Example 12
  def test_mpi(self):
    X = batching(n=512, batch_size=np.random.randint(low=12000, high=80000))

    def map_func(batch):
      for b in batch:
        yield b
    mpi = MPI(X, map_func=map_func, ncpu=12, buffer_size=8,
        maximum_queue_size=12 * 8)

    Y = [i for i in mpi]
    self.assertEqual(len(X), len(Y))
    self.assertEqual(sum(j - i for i, j in X), sum(j - i for i, j in Y))
    self.assertTrue(all(i == j for i, j in zip(
        sorted(X, key=lambda x: x[0]),
        sorted(Y, key=lambda x: x[0])
    )))
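The test above leans on `batching` producing non-overlapping `(start, end)` pairs that exactly tile `[0, n)`. A standalone check of that property (a sketch, assuming `batching` is importable from `odin.utils` as in the other examples; the emission order of the pairs is not assumed):

from odin.utils import batching

n, batch_size = 512, 100
pairs = sorted(batching(n=n, batch_size=batch_size))
assert pairs[0][0] == 0 and pairs[-1][1] == n
assert all(prev_end == next_start
           for (_, prev_end), (next_start, _) in zip(pairs, pairs[1:]))
assert sum(end - start for start, end in pairs) == n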
Example 13
 def __init__(self,
              path='~/tensorflow_datasets/3dshapes.h5',
              cache_dir=None,
              seed=8):
     path = os.path.abspath(os.path.expanduser(path))
     assert os.path.exists(path), "Path to file %s must exist" % path
     self.path = path
     if cache_dir is None:
         cache_dir = os.path.dirname(path)
     if not os.path.exists(cache_dir):
         os.mkdir(cache_dir)
     image_path = os.path.join(cache_dir, '3dshapes.images')
     label_path = os.path.join(cache_dir, '3dshapes.labels')
     # ====== read the dataset and cache it again ====== #
     if not os.path.exists(image_path) or not os.path.exists(label_path):
         import h5py
         with h5py.File(path, 'r') as dataset:
             images = dataset['images']
             labels = dataset['labels']
             with MmapArrayWriter(image_path,
                                  shape=images.shape,
                                  dtype=images.dtype,
                                  remove_exist=True) as img, \
               MmapArrayWriter(label_path,
                               shape=labels.shape,
                               dtype=labels.dtype,
                               remove_exist=True) as lab:
                 for start, end in tqdm(list(
                         batching(8000, n=images.shape[0])),
                                        desc="Caching data"):
                     img.write(images[start:end])
                     lab.write(labels[start:end])
     # ====== load the data ====== #
     self.images = MmapArray(image_path)
     self.factors = MmapArray(label_path)
     # ====== split the dataset ====== #
     rand = np.random.RandomState(seed=seed)
     n = len(self.images)
     ids = rand.permutation(n)
     # train:85% valid:5% test:10%
     self.train_indices = ids[:int(0.85 * n)]
     self.valid_indices = ids[int(0.85 * n):int(0.9 * n)]
     self.test_indices = ids[int(0.9 * n):]
Example 14
 def _predict(self, X, f_pred):
   if not self.is_fitted:
     raise RuntimeError("LogisticRegression hasn't been initialized or "
                        "fitted.")
   if hasattr(X, 'set_batch'):
     it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
   elif hasattr(X, '__getitem__'):
     it = (X[start:end]
           for start, end in batching(batch_size=self.batch_size,
                                      n=X.shape[0]))
   else:
     raise ValueError("`X` must have attribute 'set_batch' or '__getitem__'")
   # ====== make prediction ====== #
   y = []
   prog = Progbar(target=X.shape[0], print_report=True,
                  print_summary=False, name="Predicting")
   for x in it:
     x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
     y.append(f_pred(x))
     prog.add(x.shape[0])
   return np.concatenate(y, axis=0)
Example 15
File: utils.py Project: imito/odin
def make_dnn_prediction(functions, X, batch_size=256, title=''):
  return_list = True
  if not isinstance(functions, (tuple, list)):
    functions = [functions]
    return_list = False
  n_functions = len(functions)
  results = [[] for i in range(n_functions)]
  # ====== prepare progress bar ====== #
  n_samples = len(X)
  prog = Progbar(target=n_samples, print_summary=True,
                 name="Making prediction: %s" % str(title))
  # ====== for feeder ====== #
  if isinstance(X, F.Feeder):
    y_true = []
    for x, y in X.set_batch(batch_size=batch_size):
      for res, fn in zip(results, functions):
        res.append(fn(x))
      prog.add(x.shape[0])
      y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
    results = [np.concatenate(res, axis=0)
               for res in results]
    y_true = np.concatenate(y_true, axis=0)
    if return_list:
      return results, y_true
    return results[0], y_true
  # ====== for numpy array ====== #
  else:
    for start, end in batching(batch_size=batch_size, n=n_samples):
      y = X[start:end]
      for res, fn in zip(results, functions):
        res.append(fn(y))
      prog.add(end - start)
    results = [np.concatenate(res, axis=0)
               for res in results]
    if return_list:
      return results
    return results[0]
Example 16
def make_dnn_prediction(functions, X, batch_size=256, title=''):
    return_list = True
    if not isinstance(functions, (tuple, list)):
        functions = [functions]
        return_list = False
    n_functions = len(functions)
    results = [[] for i in range(n_functions)]
    # ====== prepare progress bar ====== #
    n_samples = len(X)
    prog = Progbar(target=n_samples,
                   print_summary=True,
                   name="Making prediction: %s" % str(title))
    # ====== for feeder ====== #
    if isinstance(X, F.Feeder):
        y_true = []
        for x, y in X.set_batch(batch_size=batch_size):
            for res, fn in zip(results, functions):
                res.append(fn(x))
            prog.add(x.shape[0])
            y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
        results = [np.concatenate(res, axis=0) for res in results]
        y_true = np.concatenate(y_true, axis=0)
        if return_list:
            return results, y_true
        return results[0], y_true
    # ====== for numpy array ====== #
    else:
        for start, end in batching(batch_size=batch_size, n=n_samples):
            y = X[start:end]
            for res, fn in zip(results, functions):
                res.append(fn(y))
            prog.add(end - start)
        results = [np.concatenate(res, axis=0) for res in results]
        if return_list:
            return results
        return results[0]
Example 17
 def _predict(self, X, f_pred):
     if not self.is_fitted:
         raise RuntimeError("LogisticRegression hasn't been initialized or "
                            "fitted.")
     if hasattr(X, 'set_batch'):
         it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
     elif hasattr(X, '__getitem__'):
         it = (X[start:end]
               for start, end in batching(batch_size=self.batch_size,
                                          n=X.shape[0]))
     else:
         raise ValueError(
             "`X` must have attribute 'set_batch' or '__getitem__'")
     # ====== make prediction ====== #
     y = []
     prog = Progbar(target=X.shape[0],
                    print_report=True,
                    print_summary=False,
                    name="Predicting")
     for x in it:
         x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
         y.append(f_pred(x))
         prog.add(x.shape[0])
     return np.concatenate(y, axis=0)
Example 18
# Optimizing the network
# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
  # ====== training ====== #
  train_losses = []
  prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
  start_time = timeit.default_timer()
  for start, end in batching(batch_size=args.bs, n=X_train.shape[0],
                             seed=K.get_rng().randint(10e8)):
    _ = K.eval(loss, feed_dict={X: X_train[start:end]},
               update_after=update_ops)
    prog.add(end - start)
    train_losses.append(_)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'), '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set]    Loss: %.4f" % lo)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))
  record_valid_loss.append(lo)
  # ====== plotting ====== #
  if args.dim > 2:
Example 19
def fast_kmeans(X,
                n_clusters=8,
                max_iter=300,
                tol=0.0001,
                n_init=10,
                random_state=1234,
                init='scalable-k-means++',
                oversampling_factor=2.0,
                max_samples_per_batch=32768,
                force_sklearn=False):
    r""" KMeans clustering

  Arguments:
    n_clusters : int (default = 8)
        The number of centroids or clusters you want.
    max_iter : int (default = 300)
        The more iterations of EM, the more accurate, but slower.
    tol : float64 (default = 1e-4)
        Stopping criterion when centroid means do not change much.
    random_state : int (default = 1)
        If you want results to be the same when you restart Python, select a
        state.
    init : {'scalable-k-means++', 'k-means||', 'random' or an ndarray}
           (default = 'scalable-k-means++')
        'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
        k-means++ initialization.
        'random': Choose 'n_cluster' observations (rows) at random from data
        for the initial centroids.
        If an ndarray is passed, it should be of
        shape (n_clusters, n_features) and gives the initial centers.
    max_samples_per_batch : int maximum number of samples to use for each batch
                                of the pairwise distance computation.
    oversampling_factor : int (default = 2) The amount of points to sample
        in scalable k-means++ initialization for potential centroids.
        Increasing this value can lead to better initial centroids at the
        cost of memory. The total number of centroids sampled in scalable
        k-means++ is oversampling_factor * n_clusters * 8.
    max_samples_per_batch : int (default = 32768) The number of data
        samples to use for batches of the pairwise distance computation.
        This computation is done throughout both fit predict. The default
        should suit most cases. The total number of elements in the batched
        pairwise distance computation is max_samples_per_batch * n_clusters.
        It might become necessary to lower this number when n_clusters
        becomes prohibitively large.
  """
    kwargs = dict(locals())
    X = kwargs.pop('X')
    kwargs.pop('force_sklearn')
    ## fine-tuning the kwargs
    cuml = _check_cuml(force_sklearn)
    if cuml:
        from cuml.cluster import KMeans
        kwargs.pop('n_init')
    else:
        from sklearn.cluster import MiniBatchKMeans
        kwargs.pop('oversampling_factor')
        kwargs.pop('max_samples_per_batch')
        if kwargs['init'] in ('scalable-k-means++', 'k-means||'):
            kwargs['init'] = 'k-means++'
    ## fitting
    if not cuml:
        from odin.utils import batching
        kmean = MiniBatchKMeans(**kwargs)
        for s, e in batching(int(max_samples_per_batch),
                             n=X.shape[0],
                             seed=random_state):
            kmean.partial_fit(X[s:e])
    else:
        kmean = KMeans(verbose=False, **kwargs)
        kmean.fit(X)
    return kmean
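A hedged usage sketch (toy data; `force_sklearn=True` is assumed to select the batched `MiniBatchKMeans` path shown above, and every keyword value is illustrative):

import numpy as np

X = np.random.rand(10000, 16).astype('float32')
kmean = fast_kmeans(X, n_clusters=8, max_samples_per_batch=2048,
                    force_sklearn=True)
labels = kmean.predict(X)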
Example 20
# Optimizing the network
# ===========================================================================
update_ops = K.optimizers.Adam(lr=0.001).minimize(loss)
K.initialize_all_variables()
# ====== initialize ====== #
record_train_loss = []
record_valid_loss = []
patience = 3
epoch = 0
# We want the rate to go up but the distortion to go down
while True:
  # ====== training ====== #
  train_losses = []
  prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch)
  start_time = timeit.default_timer()
  for start, end in batching(batch_size=args.bs, n=X_train.shape[0],
                             seed=K.get_rng().randint(10e8)):
    _ = K.eval(loss, feed_dict={X: X_train[start:end]},
               update_after=update_ops)
    prog.add(end - start)
    train_losses.append(_)
  # ====== training log ====== #
  print(ctext("[Epoch %d]" % epoch, 'yellow'), '%.2f(s)' % (timeit.default_timer() - start_time))
  print("[Training set] Loss: %.4f" % np.mean(train_losses))
  # ====== validation set ====== #
  code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid})
  print("[Valid set]    Loss: %.4f" % lo)
  # ====== record the history ====== #
  record_train_loss.append(np.mean(train_losses))
  record_valid_loss.append(lo)
  # ====== plotting ====== #
  if args.dim > 2:
Example 21
def fast_pca(*x,
             n_components=None,
             algo='rpca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
    """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    required for labels in case of `sppca`
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
    batch_size = int(batch_size)
    algo = str(algo).lower()
    if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
        raise ValueError(
            "`algo` must be one of the following: 'pca', 'ipca', "
            "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
    if algo in ('sppca', 'plda') and y is None:
        raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
    x = flatten_list(x, level=None)
    x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
    # ====== check input ====== #
    x_train = x[0]
    x_test = x[1:]
    input_shape = None
    if x_train.ndim > 2:  # only 2D for PCA
        input_shape = (-1, ) + x_train.shape[1:]
        new_shape = (-1, np.prod(input_shape[1:]))
        x_train = np.reshape(x_train, new_shape)
        x_test = [np.reshape(x, new_shape) for x in x_test]
        if n_components is not None:  # no need to reshape back
            input_shape = None
    # ====== train PCA ====== #
    if algo == 'sppca':
        pca = SupervisedPPCA(n_components=n_components,
                             random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'plda':
        from odin.ml import PLDA
        pca = PLDA(n_phi=n_components, random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'pca':
        pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    elif algo == 'rpca':
        # we copy the implementation of RandomizedPCA because
        # it is significantly faster than PCA(svd_solver='randomize')
        pca = RandomizedPCA(n_components=n_components,
                            iterated_power=2,
                            random_state=random_state)
        pca.fit(x_train)
    elif algo == 'ipca':
        pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        prog = Progbar(target=x_train.shape[0],
                       print_report=False,
                       print_summary=False,
                       name="Fitting PCA")
        for start, end in batching(batch_size=batch_size,
                                   n=x_train.shape[0],
                                   seed=1234):
            pca.partial_fit(x_train[start:end], check_input=False)
            prog.add(end - start)
    elif algo == 'ppca':
        pca = PPCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    # ====== transform ====== #
    x_train = pca.transform(x_train)
    x_test = [pca.transform(x) for x in x_test]
    # reshape back to original shape if necessary
    if input_shape is not None:
        x_train = np.reshape(x_train, input_shape)
        x_test = [np.reshape(x, input_shape) for x in x_test]
    # return the results
    if len(x_test) == 0:
        return x_train if not return_model else (pca, x_train)
    return tuple([x_train] +
                 x_test) if not return_model else tuple([pca, x_train] +
                                                        x_test)
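A hedged usage sketch (illustrative shapes; `algo='ipca'` exercises the batched `IncrementalPCA` loop above, and `return_model=True` puts the fitted model first in the returned tuple):

import numpy as np

x_train = np.random.rand(5000, 64)
x_test = np.random.rand(1000, 64)
z_train, z_test = fast_pca(x_train, x_test, n_components=8,
                           algo='ipca', batch_size=512)
pca, z_train = fast_pca(x_train, n_components=8, algo='pca',
                        return_model=True)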
Example 22
 def __call__(self, *inputs, **kwargs):
   show_progress = kwargs.pop('show_progress', False)
   # dictionary as inputs
   if len(kwargs) == len(self.inputs_name):
     inputs = [kwargs[i] for i in self.inputs_name]
   # ====== delete un-matched inputs ====== #
   inputs_new = []
   tmp = list(inputs)
   shapes = list(self._input_shape)
   # this process iteratively remove inputs with mismatch shape
   # to current given input
   for s in shapes:
     for i in tuple(tmp):
       if len(i.shape) != len(s) or \
       any(a is not None and a > 0 and a != b
               for a, b in zip(s, i.shape)): # different ndim, or shape
         tmp.remove(i)
       else:
         inputs_new.append(i)
         tmp.remove(i)
         break
   if len(inputs_new) != len(self.inputs):
     raise ValueError("Given inputs have shape: %s, cannot match the shape of "
                      "defined inputs: %s" %
                      ('; '.join([str(i.shape) for i in inputs]),
                       '; '.join([str(i) for i in self.input_shape])))
   if not self._strict:
     inputs = inputs_new
   # ====== create feed_dict ====== #
   feed_dict = {}
   inputs = flatten_list(inputs, level=None)
   for tensor, value in zip(self.inputs, inputs):
     feed_dict[tensor] = value
   feed_dict.update(self.defaults)
   # check if modifying training mode
   if self.training is None:
     pass
   elif self.training:
     feed_dict.update({is_training(): True})
   else:
     feed_dict.update({is_training(): False})
   session = get_session()
   outputs = None
   # ====== mini-batches ====== #
   if self.batch_size is not None:
     batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                   if len(self.batch_vars) == 0 else self.batch_vars)
     batch_vars = [i for i in batch_vars
                   if i in feed_dict and hasattr(feed_dict[i], 'shape')]
     n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
     assert len(n_samples) == 1, \
     "Data have multiple batching dimension: %s" % str(n_samples)
     n_samples = n_samples[0]
     # only continue if we have more samples than `batch_size`
     if n_samples > self.batch_size:
       n_output = len(self.outputs)
       outputs = []
       all_batches = []
       # (optional) showing progress
       if show_progress:
         prog = Progbar(target=n_samples,
                        print_report=False, print_summary=False,
                        name='')
       for s, e in batching(batch_size=int(self.batch_size),
                            n=n_samples):
         if show_progress:
           prog.add(e - s)
         all_batches.append(e - s)
         feed_dict_minibatch = OrderedDict([(k, v[s:e])
                                            if k in batch_vars else (k, v)
                                            for k, v in feed_dict.items()])
         updated = session.run(self.outputs + [self.updates_ops],
                               feed_dict=feed_dict_minibatch)
         updated = updated[:n_output]
         if not self._return_list:
           updated = updated[0]
         outputs.append(updated)
       ## concatenate all outputs
       if not self._return_list:
         o_ndim = outputs[0].ndim
         if o_ndim == 0: # returned scalars
           outputs = np.array(outputs)
         else: # returned array
           for o_axis in range(o_ndim):
             all_n = [o.shape[o_axis] for o in outputs]
             if all_n == all_batches:
               break
           outputs = np.concatenate(outputs, axis=o_axis)
       ## returning a list of outputs
       else:
         new_outputs = []
         for output_idx in range(len(outputs[0])):
           o = [x[output_idx] for x in outputs]
           o_ndim = o[0].ndim
           if o_ndim == 0: # returned scalars
             o = np.array(o)
           else: # returned array
             for o_axis in range(o[0].ndim):
               all_n = [val.shape[o_axis] for val in o]
               if all_n == all_batches:
                 break
             o = np.concatenate(o, axis=o_axis)
           new_outputs.append(o)
         outputs = new_outputs
   # ====== single batch ====== #
   if outputs is None:
     updated = session.run(self.outputs + [self.updates_ops],
                           feed_dict=feed_dict)
     outputs = updated[:len(self.outputs)]
     if not self._return_list:
       outputs = outputs[0]
   # ====== return final output ====== #
   return outputs
Example 23
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
  `SingleCellOMIC` i.e. scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

  """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = []
    for exp, version, dsname in flatten_datasets:
        if name == dsname:
            found.append((exp, version, dsname))
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        for name, data in MPI(jobs=all_files,
                              func=_read_tarinfo,
                              batch=1,
                              ncpu=4):
            contents[name] = data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for name, data in save_data:
            outpath = os.path.join(preprocessed_path, name)
            n_samples, n_features = data.shape
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    prog = tqdm(f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        for name in markers:
            idx = indices.get(name, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    omics = [
        name for name in os.listdir(preprocessed_path)
        if name not in ('metadata', 'top_genes') and '_' not in name
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        name: MmapArray(os.path.join(preprocessed_path,
                                     name)).astype(np.float32)
        for name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
Example 24
def read_centenarian(override=False, verbose=False):
    r""" Data used in:

    "Single-cell transcriptomics reveals expansion of cytotoxic CD4 T-cells in
    supercentenarians" | bioRxiv [WWW Document], n.d.
      URL https://www.biorxiv.org/content/10.1101/643528v1 (accessed 5.21.20).

  """
    download_path = os.path.join(DOWNLOAD_DIR, "SuperCentenarian_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(DATA_DIR, 'SuperCentenarian_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        labels = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[2])),
            url=_URL[2],
        )
        data = []
        with gzip.open(labels, mode='rb') as f:
            for line in f:
                line = str(line, 'utf-8').strip().split('\t')
                assert line[1][:2] == line[2]
                data.append(line)
        labels = np.array(data)
        y_col = sorted(set(labels[:, 1]))
        y = one_hot(np.array([y_col.index(i) for i in labels[:, 1]]),
                    len(y_col)).astype('float32')
        y_col = np.array(y_col)
        #
        raw = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[0])),
            url=_URL[0],
        )
        if verbose:
            print("Unzip and reading raw UMI ...")
        X_raw, cell_id1, gene_id1 = read_gzip_csv(raw)
        #
        norm = download_file(
            outpath=os.path.join(download_path, os.path.basename(_URL[1])),
            url=_URL[1],
        )
        if verbose:
            print("Unzip and reading log-norm UMI ...")
        X_norm, cell_id2, gene_id2 = read_gzip_csv(norm)
        #
        assert np.all(cell_id1 == cell_id2) and np.all(labels[:, 0] == cell_id1) and \
          np.all(gene_id1 == gene_id2)
        assert X_raw.shape[0] == X_norm.shape[0] == len(cell_id1) and \
          X_raw.shape[1] == X_norm.shape[1] == len(gene_id1)
        #
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X=X_raw,
                        X_col=gene_id1,
                        y=y,
                        y_col=y_col,
                        rowname=cell_id1,
                        print_log=verbose)
        with MmapArrayWriter(os.path.join(preprocessed_path, 'X_log'),
                             shape=(0, X_norm.shape[1]),
                             dtype='float32',
                             remove_exist=True) as f:
            for s, e in batching(batch_size=2048, n=X_norm.shape[0]):
                f.write(X_norm[s:e])
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Example 25
def fast_pca(*x, n_components=None, algo='rpca', y=None,
             batch_size=1024, return_model=False,
             random_state=5218):
  """ A shortcut for many different PCA algorithms

  Parameters
  ----------
  x : {list, tuple}
    list of matrices for transformation, the first matrix will
    be used for training
  n_components : {None, int}
    number of PCA components
  algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
    different PCA algorithm:
      'ipca' - IncrementalPCA,
      'ppca' - Probabilistic PCA,
      'sppca' - Supervised Probabilistic PCA,
      'plda' - Probabilistic LDA,
      'rpca' - randomized PCA using randomized SVD
  y : {numpy.ndarray, None}
    required for labels in case of `sppca`
  batch_size : int (default: 1024)
    batch size, only used for IncrementalPCA
  return_model : bool (default: False)
    if True, return the trained PCA model as the FIRST return
  """
  batch_size = int(batch_size)
  algo = str(algo).lower()
  if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
    raise ValueError("`algo` must be one of the following: 'pca', 'ipca', "
                     "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
  if algo in ('sppca', 'plda') and y is None:
    raise RuntimeError("`y` must not be None if `algo` is 'sppca' or 'plda'")
  x = flatten_list(x, level=None)
  x = [i[:] if i.__class__.__name__ == 'MmapData' else i
       for i in x]
  # ====== check input ====== #
  x_train = x[0]
  x_test = x[1:]
  input_shape = None
  if x_train.ndim > 2: # only 2D for PCA
    input_shape = (-1,) + x_train.shape[1:]
    new_shape = (-1, np.prod(input_shape[1:]))
    x_train = np.reshape(x_train, new_shape)
    x_test = [np.reshape(x, new_shape) for x in x_test]
    if n_components is not None: # no need to reshape back
      input_shape = None
  # ====== train PCA ====== #
  if algo == 'sppca':
    pca = SupervisedPPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'plda':
    from odin.ml import PLDA
    pca = PLDA(n_phi=n_components, random_state=random_state)
    pca.fit(x_train, y)
  elif algo == 'pca':
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  elif algo == 'rpca':
    # we copy the implementation of RandomizedPCA because
    # it is significantly faster than PCA(svd_solver='randomize')
    pca = RandomizedPCA(n_components=n_components, iterated_power=2,
                        random_state=random_state)
    pca.fit(x_train)
  elif algo == 'ipca':
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    prog = Progbar(target=x_train.shape[0],
                   print_report=False, print_summary=False, name="Fitting PCA")
    for start, end in batching(batch_size=batch_size, n=x_train.shape[0],
                               seed=5218):
      pca.partial_fit(x_train[start:end], check_input=False)
      prog.add(end - start)
  elif algo == 'ppca':
    pca = PPCA(n_components=n_components, random_state=random_state)
    pca.fit(x_train)
  # ====== transform ====== #
  x_train = pca.transform(x_train)
  x_test = [pca.transform(x) for x in x_test]
  # reshape back to original shape if necessary
  if input_shape is not None:
    x_train = np.reshape(x_train, input_shape)
    x_test = [np.reshape(x, input_shape) for x in x_test]
  # return the results
  if len(x_test) == 0:
    return x_train if not return_model else (pca, x_train)
  return tuple([x_train] + x_test) if not return_model else tuple([pca, x_train] + x_test)