Code example #1
    def mutual_info_gap(
        self,
        n_bins: int = 10,
        strategy: Literal['uniform', 'quantile', 'kmeans', 'gmm'] = 'uniform',
    ) -> float:
        """Mutual Information Gap

    Parameters
    ----------
    n_bins : int, optional
        number of bins for discretizing the latents, by default 10
    strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}
        Strategy used to define the widths of the bins.
        'uniform' - All bins in each feature have identical widths.
        'quantile' - All bins in each feature have the same number of points.
        'kmeans' - Values in each bin have the same nearest center of a 1D cluster.
        , by default 'uniform'

    Returns
    -------
    float
        mutual information gap score
    """
        z = self.dist_to_tensor(self.latents).numpy()
        if n_bins > 1:
            z = discretizing(z,
                             independent=True,
                             n_bins=n_bins,
                             strategy=strategy)
        f = self.factors
        return mutual_info_gap(z, f)
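The method above delegates the actual scoring to a `mutual_info_gap` function. As a rough illustration of what that score computes, the sketch below implements the MIG metric (Chen et al., 2018) over already-discretized latents and factors; `mig_score` and its arguments are hypothetical names, and sklearn's `mutual_info_score` is used as an assumed stand-in for the project's own mutual-information estimator.

import numpy as np
from sklearn.metrics import mutual_info_score

def mig_score(z: np.ndarray, f: np.ndarray) -> float:
    """Mutual Information Gap for discrete latents z [N, D] and factors f [N, K]."""
    n_latents, n_factors = z.shape[1], f.shape[1]
    mi = np.array([[mutual_info_score(f[:, k], z[:, d])
                    for k in range(n_factors)]
                   for d in range(n_latents)])          # [D, K] MI matrix (nats)
    h = np.array([mutual_info_score(f[:, k], f[:, k])   # H(v_k) = I(v_k; v_k)
                  for k in range(n_factors)])
    top2 = np.sort(mi, axis=0)[-2:, :]                  # two largest MI values per factor
    return float(np.mean((top2[1] - top2[0]) / h))      # normalized gap, averaged over factors

The gap is large only when each factor's information is concentrated in a single latent dimension, which is why the latents are discretized first.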
Code example #2
 def __init__(
     self,
     factors: Union[tf.Tensor, np.ndarray, DatasetV2],
     factor_names: Optional[List[str]] = None,
     categorical: Union[bool, List[bool]] = False,
     n_bins: Optional[Union[int, List[int]]] = None,
     strategy: Literal['uniform', 'quantile', 'kmeans', 'gmm'] = 'uniform',
 ):
     if isinstance(factors, tf.data.Dataset):
         factors = tf.stack([x for x in factors])
     if tf.is_tensor(factors):
         factors = factors.numpy()
     factors = np.atleast_2d(factors)
     if factors.ndim != 2:
         raise ValueError(
             "factors must be a matrix [n_observations, n_factor], "
             f"but given shape:{factors.shape}")
     # check factors is one-hot encoded
     if np.all(np.sum(factors, axis=-1) == 1):
         factors = np.argmax(factors, axis=1)[:, np.newaxis]
         categorical = True
     n_factors = factors.shape[1]
     # discretizing
     factors_original = np.array(factors)
     n_bins = as_tuple(n_bins, N=n_factors)
     strategy = as_tuple(strategy, N=n_factors, t=str)
     for i, (b, s) in enumerate(zip(n_bins, strategy)):
         if b is not None:
             factors[:, i] = discretizing(factors[:, i][:, np.newaxis],
                                          n_bins=b,
                                          strategy=s).ravel()
     factors = factors.astype(np.int64)
     # factor_names
     if factor_names is None:
         factor_names = [f'F{i}' for i in range(n_factors)]
     else:
         factor_names = [str(i) for i in tf.nest.flatten(factor_names)]
     assert len(factor_names) == n_factors, \
       f'Given {n_factors} but only {len(factor_names)} names'
     # store the attributes
     self.factors = factors
     self.factors_original = factors_original
     self.discretizer = list(zip(n_bins, strategy))
     self.categorical = as_tuple(categorical, N=n_factors, t=bool)
     self.names = factor_names
     self.labels = [np.unique(x) for x in factors.T]
     self.sizes = [len(lab) for lab in self.labels]
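For readers unfamiliar with the preprocessing in this constructor, its two branches (collapsing one-hot factors to a categorical column, then per-column discretization) can be mimicked with sklearn. This is only a sketch: `KBinsDiscretizer` is assumed here to behave like the project's `discretizing` helper for the 'uniform', 'quantile' and 'kmeans' strategies, and all data is made up.

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

rng = np.random.default_rng(1)
factors = rng.normal(size=(500, 3))                  # three continuous ground-truth factors

# a one-hot matrix collapses to a single categorical column, as in the constructor
onehot = np.eye(4)[rng.integers(0, 4, size=500)]
assert np.all(onehot.sum(axis=-1) == 1)
categorical = np.argmax(onehot, axis=1)[:, np.newaxis]

# per-column discretization with (n_bins, strategy) pairs
binned = np.empty(factors.shape, dtype=np.int64)
for i, (b, s) in enumerate(zip((10, 10, 10), ('uniform', 'quantile', 'kmeans'))):
    enc = KBinsDiscretizer(n_bins=b, encode='ordinal', strategy=s)
    binned[:, i] = enc.fit_transform(factors[:, i:i + 1]).ravel()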
Code example #3
File: evaluation.py, Project: YashinaTatiana/odin-ai
  def plot_uncertainty_scatter(self, factors=None, n_samples=2, algo='tsne'):
    r""" Plotting the scatter points of the mean and sampled latent codes,
    colored by the factors.

    Arguments:
      factors : list of Integer or String. The indices or names of the factors
        taken into account for the analysis.
      n_samples : Integer. Number of latent codes sampled per observation.
      algo : String. Dimension reduction algorithm (e.g. 'tsne').
    """
    factors = self._check_factors(factors)
    # the stored representations hold train and test data separately; concatenate them
    z_mean = np.concatenate(self.representations_mean)
    z_var = np.concatenate(
        [np.mean(var, axis=1) for var in self.representations_variance])
    z_samples = [
        z for z in np.concatenate(self.representations_sample(int(n_samples)),
                                  axis=1)
    ]
    F = np.concatenate(self.original_factors, axis=0)[:, factors]
    labels = self.factors_name[factors]
    # preprocessing
    inputs = tuple([z_mean] + z_samples)
    Z = dimension_reduce(*inputs,
                         algo=algo,
                         n_components=2,
                         return_model=False,
                         combined=True,
                         random_state=self.randint)
    V = utils.discretizing(z_var[:, np.newaxis], n_bins=10).ravel()
    # the figure
    nrow = 3
    ncol = int(np.ceil(len(labels) / nrow))
    fig = vs.plot_figure(nrow=nrow * 4, ncol=ncol * 4, dpi=80)
    for idx, (name, y) in enumerate(zip(labels, F.T)):
      ax = vs.plot_subplot(nrow, ncol, idx + 1)
      for i, x in enumerate(Z):
        kw = dict(val=y,
                  color="coolwarm",
                  ax=ax,
                  x=x,
                  grid=False,
                  legend_enable=False,
                  centroids=True,
                  fontsize=12)
        if i == 0:  # the mean value
          vs.plot_scatter(size=V,
                          size_range=(8, 80),
                          alpha=0.3,
                          linewidths=0,
                          cbar=True,
                          cbar_horizontal=True,
                          title=name,
                          **kw)
        else:  # the samples
          vs.plot_scatter_text(size=8,
                               marker='x',
                               alpha=0.8,
                               weight='light',
                               **kw)
    # fig.tight_layout()
    self.add_figure("uncertainty_scatter_%s" % algo, fig)
    return self
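A much-reduced version of the same idea, projecting the latent means to 2-D with t-SNE and sizing each marker by its (binned) posterior variance, can be written directly with sklearn and matplotlib. The variable names and the quantile binning below are illustrative stand-ins for `dimension_reduce`, `utils.discretizing` and `vs.plot_scatter`, using toy data.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

rng = np.random.default_rng(0)
z_mean = rng.normal(size=(300, 16))                 # toy latent means
z_var = np.abs(rng.normal(size=300))                # toy per-point posterior variance
y = rng.integers(0, 4, size=300)                    # toy factor labels

Z2d = TSNE(n_components=2, random_state=0).fit_transform(z_mean)
# bin the variance into 10 quantiles so marker sizes stay in a bounded range
edges = np.quantile(z_var, np.linspace(0, 1, 11)[1:-1])
sizes = 8 + 8 * np.digitize(z_var, edges)

plt.scatter(Z2d[:, 0], Z2d[:, 1], s=sizes, c=y, cmap='coolwarm', alpha=0.3, lw=0)
plt.title('latent means, sized by posterior variance')
plt.show()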
Code example #4
    def sample_batch(self,
                     inputs=None,
                     latents=None,
                     factors=None,
                     n_bins=5,
                     strategy=None,
                     factor_names=None,
                     train_percent=0.8,
                     n_samples=[2000, 1000],
                     batch_size=64,
                     verbose=True):
        r""" Sample a batch of training and testing for evaluation of VAE

    Arguments:
      inputs : list of `ndarray` or `tensorflow.data.Dataset`.
        Inputs to the model, note all data will be loaded in-memory
      latents : list of `Distribution`
        distribution of learned representation
      factors : a `ndarray` or `tensorflow.data.Dataset`.
        a matrix of groundtruth factors, note all data will be loaded in-memory
      n_bins : int or array-like, shape (n_features,) (default=5)
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.
      strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default=None)
        Strategy used to define the widths of the bins.
        `None` - No discretization performed
        uniform - All bins in each feature have identical widths.
        quantile - All bins in each feature have the same number of points.
        kmeans - Values in each bin have the same nearest center of a 1D
          k-means cluster.
        gmm - using the components (in sorted order of mean) of Gaussian
          mixture to label.
      factor_names : list of String (optional).
        Names for each ground-truth factor.
      train_percent : a Scalar in (0, 1) (default=0.8).
        Fraction of the data used for the training split.
      n_samples : int or pair of int (default=[2000, 1000]).
        Number of samples drawn for the train and test sets.
      batch_size : int (default=64).
        Batch size used when encoding and decoding the sampled inputs.

    Returns:
      `Criticizer` with sampled data
    """
        from odin.bay.helpers import concat_distributions
        inputs, latents, factors = prepare_inputs_factors(inputs,
                                                          latents,
                                                          factors,
                                                          verbose=verbose)
        n_samples = as_tuple(n_samples, t=int, N=2)
        n_inputs = factors.shape[0]
        # ====== split train test ====== #
        if inputs is None:
            latents = latents[self._latent_indices]
            split = int(n_inputs * train_percent)
            train_ids = slice(None, split)
            test_ids = slice(split, None)
            train_latents = [z[train_ids] for z in latents]
            test_latents = [z[test_ids] for z in latents]
            if len(latents) == 1:
                train_latents = train_latents[0]
                test_latents = test_latents[0]
            else:
                self._is_multi_latents = len(latents)
                train_latents = CombinedDistribution(train_latents,
                                                     name="Latents")
                test_latents = CombinedDistribution(test_latents,
                                                    name="Latents")
        else:
            ids = self.random_state.permutation(n_inputs)
            split = int(train_percent * n_inputs)
            train_ids, test_ids = ids[:split], ids[split:]
            train_inputs = [i[train_ids] for i in inputs]
            test_inputs = [i[test_ids] for i in inputs]
        # ====== create discretized factors ====== #
        f_original = (factors[train_ids], factors[test_ids])
        # discretizing the factors
        if strategy is not None:
            if verbose:
                print(f"Discretizing factors: {n_bins} - {strategy}")
            factors = utils.discretizing(factors,
                                         n_bins=int(n_bins),
                                         strategy=strategy)
        # check for singular factor and ignore it
        ids = []
        for i, (name, f) in enumerate(zip(factor_names, factors.T)):
            c = Counter(f)
            if len(c) < 2:
                warnings.warn(
                    f"Ignore factor with name '{name}', singular data: {f}")
            else:
                ids.append(i)
        if len(ids) != len(factor_names):
            f_original = (f_original[0][:, ids], f_original[1][:, ids])
            factor_names = factor_names[ids]
            factors = factors[:, ids]
        # create the factor class for sampling
        train_factors = Factor(factors[train_ids],
                               factor_names=factor_names,
                               random_state=self.randint)
        test_factors = Factor(factors[test_ids],
                              factor_names=factor_names,
                              random_state=self.randint)

        # ====== sampling ====== #
        def sampling(inputs_, factors_, nsamples, title):
            Xs = [list() for _ in range(len(inputs))]  # inputs
            Ys = []  # factors
            Zs = []  # latents
            Os = []  # outputs
            indices = []
            n = 0
            if verbose:
                prog = tqdm(desc='Sampling %s' % title, total=nsamples)
            while n < nsamples:
                batch = min(batch_size, nsamples - n, factors_.shape[0])
                if verbose:
                    prog.update(int(batch))
                # factors
                y, ids = factors_.sample_factors(num=batch,
                                                 return_indices=True)
                indices.append(ids)
                Ys.append(y)
                # inputs
                inps = []
                for x, i in zip(Xs, inputs_):
                    i = i[ids, :]
                    x.append(i)
                    inps.append(i)
                # latents representation
                z = self.encode(inps, sample_shape=())
                o = tf.nest.flatten(self.decode(z))
                if isinstance(z, (tuple, list)):
                    z = z[self._latent_indices]
                    if len(z) == 1:
                        z = z[0]
                    else:
                        self._is_multi_latents = len(z)
                Os.append(o)
                Zs.append(z)
                # update the counter
                n += len(y)
            # end progress
            if verbose:
                prog.clear()
                prog.close()
            # aggregate all data
            Xs = [np.concatenate(x, axis=0) for x in Xs]
            Ys = np.concatenate(Ys, axis=0)
            if self.is_multi_latents:
                Zs = CombinedDistribution(
                    [
                        concat_distributions(
                            [z[zi] for z in Zs],
                            name="Latents%d" % zi,
                        ) for zi in range(self.is_multi_latents)
                    ],
                    name="Latents",
                )
            else:
                Zs = concat_distributions(Zs, name="Latents")
            Os = [
                concat_distributions(
                    [j[i] for j in Os],
                    name="Output%d" % i,
                ) for i in range(len(Os[0]))
            ]
            return Xs, Ys, Zs, Os, np.concatenate(indices, axis=0)

        # perform sampling
        if inputs is not None:
            train = sampling(inputs_=train_inputs,
                             factors_=train_factors,
                             nsamples=n_samples[0],
                             title="Train")
            test = sampling(inputs_=test_inputs,
                            factors_=test_factors,
                            nsamples=n_samples[1],
                            title="Test ")
            ids_train = train[4]
            ids_test = test[4]
            # assign the variables
            self._inputs = (train[0], test[0])
            self._factors = (train[1], test[1])
            self._representations = (train[2], test[2])
            self._reconstructions = (train[3], test[3])
            self._original_factors = (f_original[0][ids_train],
                                      f_original[1][ids_test])
        else:
            self._inputs = (None, None)
            self._factors = (train_factors.factors, test_factors.factors)
            self._representations = (train_latents, test_latents)
            self._reconstructions = (None, None)
            self._original_factors = (f_original[0], f_original[1])
        self._factor_names = train_factors.factor_names
        # concatenated
        self._representations_full = concat_distributions(self.representations)
        self._factors_full = np.concatenate(self.factors, axis=0)
        self._original_factors_full = np.concatenate(self.original_factors,
                                                     axis=0)
        return self
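The "singular factor" guard in the middle of this method (dropping any ground-truth column with fewer than two distinct values before sampling) is easy to miss. The standalone sketch below reproduces just that filtering step on made-up data.

from collections import Counter
import numpy as np

factors = np.array([[0, 1, 0],
                    [1, 1, 0],
                    [0, 1, 1],
                    [1, 1, 1]])
factor_names = np.array(['shape', 'scale', 'color'])

# keep only columns with at least two distinct values; 'scale' is singular here
keep = [i for i in range(factors.shape[1]) if len(Counter(factors[:, i])) >= 2]
factors, factor_names = factors[:, keep], factor_names[keep]
print(factor_names)  # ['shape' 'color']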
Code example #5
    def plot_correlation_scatter(self,
                                 omic1=OMIC.transcriptomic,
                                 omic2=OMIC.proteomic,
                                 var_names1='auto',
                                 var_names2='auto',
                                 is_marker_pairs=True,
                                 log1=True,
                                 log2=True,
                                 max_scatter_points=200,
                                 top=3,
                                 bottom=3,
                                 title='',
                                 return_figure=False):
        r""" Mapping from omic1 to omic2

    Arguments:
      omic1, omic2 : instance of OMIC.
        `omic1` is plotted on the x-axis and `omic2` on the y-axis.
      var_names1, var_names2 : lists of variable names for `omic1` and `omic2`,
        or 'auto' to use the marker variables of each OMIC.
    """
        omic1 = OMIC.parse(omic1)
        omic2 = OMIC.parse(omic2)
        if isinstance(var_names1, string_types) and var_names1 == 'auto':
            var_names1 = omic1.markers
        if isinstance(var_names2, string_types) and var_names2 == 'auto':
            var_names2 = omic2.markers
        if var_names1 is None or var_names2 is None:
            is_marker_pairs = False
        max_scatter_points = int(max_scatter_points)
        # get all correlations
        corr = self.get_correlation(omic1, omic2)
        corr_map = {(x[0], x[1]): (0 if np.isnan(x[2]) else x[2],
                                   0 if np.isnan(x[3]) else x[3])
                    for x in corr}
        om1_names = self.get_var_names(omic1)
        om2_names = self.get_var_names(omic2)
        om1_idx = {j: i for i, j in enumerate(om1_names)}
        om2_idx = {j: i for i, j in enumerate(om2_names)}
        # extract the data and normalization
        X1 = self.numpy(omic1)
        library = np.sum(X1, axis=1, keepdims=True)
        library = discretizing(library, n_bins=10, strategy='quantile').ravel()
        if log1:
            s = np.sum(X1, axis=1, keepdims=True)
            X1 = np.log1p(X1 / s * np.median(s))
        X2 = self.numpy(omic2)
        if log2:
            s = np.sum(X2, axis=1, keepdims=True)
            X2 = np.log1p(X2 / s * np.median(s))
        ### getting the marker pairs
        all_pairs = []
        # coordinate marker pairs
        if is_marker_pairs:
            pairs = [(i1, i2) for i1, i2 in zip(var_names1, var_names2)
                     if i1 in om1_idx and i2 in om2_idx]
            var_names1 = [i for i, _ in pairs]
            var_names2 = [i for _, i in pairs]
        # filter omic2
        if var_names2 is not None:
            var_names2 = [i for i in var_names2 if i in om2_names]
        else:
            var_names2 = om2_names
        assert len(var_names2) > 0, \
          (f"None of the variables {var_names2} is contained in variable list "
           f"of OMIC {omic2.name}")
        nrow = len(var_names2)
        # filter omic1
        if var_names1 is not None:
            var_names1 = [i for i in var_names1 if i in om1_names]
            ncol = len(var_names1)
            assert len(var_names1) > 0, \
              (f"None of the variables {var_names1} is contained in variable list "
               f"of OMIC {omic1.name}")
            for name2 in var_names2:
                for name1 in var_names1:
                    all_pairs.append((om1_idx[name1], om2_idx[name2]))
        else:
            # top and bottom correlation pairs
            top = int(top)
            bottom = int(bottom)
            ncol = top + bottom
            # pick all top and bottom of omic1 coordinated to omic2
            for name in var_names2:
                i2 = om2_idx[name]
                pairs = sorted([[sum(corr_map[(i1, i2)]), i1]
                                for i1 in range(len(om1_names))])
                for _, i1 in pairs[-top:][::-1] + pairs[:bottom][::-1]:
                    all_pairs.append((i1, i2))
        ### downsampling scatter points
        if max_scatter_points > 0:
            ids = np.random.permutation(len(X1))[:max_scatter_points]
        else:
            ids = np.arange(len(X1), dtype=np.int32)
        ### plotting
        fig = plt.figure(figsize=(ncol * 2, nrow * 2 + 2), dpi=80)
        for i, pair in enumerate(all_pairs):
            ax = plt.subplot(nrow, ncol, i + 1)
            p, s = corr_map[pair]
            idx1, idx2 = pair
            x1 = X1[:, idx1]
            x2 = X2[:, idx2]
            crow = i // ncol
            ccol = i % ncol
            if is_marker_pairs:
                color = 'salmon' if crow == ccol else 'blue'
            else:
                color = 'salmon' if ccol < top else 'blue'
            vs.plot_scatter(x=x1[ids],
                            y=x2[ids],
                            color=color,
                            ax=ax,
                            size=library[ids],
                            size_range=(6, 30),
                            legend_enable=False,
                            linewidths=0.,
                            cbar=False,
                            alpha=0.3)
            # per-subplot title: the omic1 variable name with correlation statistics
            ax.set_title(f"{om1_names[idx1]}\n$p={p:.2g}$ $s={s:.2g}$",
                         fontsize=8)
            # beginning of every column
            if i % ncol == 0:
                ax.set_ylabel(f"{om2_names[idx2]}", fontsize=8, weight='bold')
        ## big title
        plt.suptitle(f"[x:{omic1.name}_y:{omic2.name}]{title}", fontsize=10)
        fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
        ### store and return
        if return_figure:
            return fig
        self.add_figure(
            f"corr_{omic1.name}{'log' if log1 else 'raw'}_"
            f"{omic2.name}{'log' if log2 else 'raw'}", fig)
        return self
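The `log1` / `log2` branches above apply a library-size normalization before plotting: each row is rescaled to the median library size and then log1p-transformed. In isolation, with a toy count matrix standing in for the OMIC data, that step looks like this:

import numpy as np

rng = np.random.default_rng(0)
X = rng.poisson(2.0, size=(100, 50)).astype(float)   # toy count matrix [cells, genes]

s = X.sum(axis=1, keepdims=True)                      # per-cell library size
X_norm = np.log1p(X / s * np.median(s))               # rescale to the median library, then log1p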
Code example #6
    def sample_batch(self,
                     inputs,
                     factors=None,
                     discretizing=False,
                     n_bins=5,
                     strategy='quantile',
                     factors_name=None,
                     train_percent=0.8,
                     n_samples=[2000, 1000],
                     batch_size=32,
                     verbose=True):
        r""" Sample a batch of training and testing for evaluation of VAE

    Arguments:
      inputs : list of `ndarray` or `tensorflow.data.Dataset`.
        Inputs to the model, note all data will be loaded in-memory
      factors : a `ndarray` or `tensorflow.data.Dataset`.
        a matrix of groundtruth factors, note all data will be loaded in-memory
      discretizing : Boolean. If True, discretize continuous factors into
        ``n_bins`` bins using the given ``strategy``.
      n_bins : int or array-like, shape (n_features,) (default=5)
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.
      strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default='quantile')
        Strategy used to define the widths of the bins.
        uniform - All bins in each feature have identical widths.
        quantile - All bins in each feature have the same number of points.
        kmeans - Values in each bin have the same nearest center of a 1D
          k-means cluster.
        gmm - using the components (in sorted order of mean) of Gaussian
          mixture to label.
      factors_name : list of String (optional).
        Names for each ground-truth factor.
      train_percent : a Scalar in (0, 1) (default=0.8).
        Fraction of the data used for the training split.
      n_samples : int or pair of int (default=[2000, 1000]).
        Number of samples drawn for the train and test sets.
      batch_size : int (default=32).
        Batch size used when encoding and decoding the sampled inputs.

    Returns:
      `Criticizer` with sampled data
    """
        ### inputs is a tensorflow Dataset, convert everything to numpy
        if isinstance(inputs, tf.data.Dataset):
            struct = tf.nest.flatten(
                tf.data.experimental.get_structure(inputs))
            n_inputs = len(struct)
            if verbose:
                inputs = tqdm(inputs, desc="Reading data")
            if factors is None:  # include factors
                assert n_inputs >= 2, \
                  "factors are not included in the dataset: %s" % str(inputs)
                x, y = [list() for _ in range((n_inputs - 1))], []
                for data in inputs:
                    for i, j in enumerate(data[:-1]):
                        x[i].append(j)
                    y.append(data[-1])
                inputs = [tf.concat(i, axis=0).numpy() for i in x]
                if n_inputs == 2:
                    inputs = inputs[0]
                factors = tf.concat(y, axis=0).numpy()
            else:  # factors separated
                x = [list() for _ in range(n_inputs)]
                for data in inputs:
                    for i, j in enumerate(tf.nest.flatten(data)):
                        x[i].append(j)
                inputs = [tf.concat(i, axis=0).numpy() for i in x]
                if n_inputs == 1:
                    inputs = inputs[0]
                if isinstance(factors, tf.data.Dataset):
                    if verbose:
                        factors = tqdm(factors, desc="Reading factors")
                    factors = tf.concat([i for i in factors], axis=0)
        # post-processing
        is_list_inputs = isinstance(inputs, (tuple, list))
        inputs = tf.nest.flatten(inputs)
        assert len(factors.shape) == 2, "factors must be a matrix"
        # ====== split train test ====== #
        ids = self.random_state.permutation(factors.shape[0])
        split = int(train_percent * factors.shape[0])
        train_ids, test_ids = ids[:split], ids[split:]
        train_inputs = [i[train_ids] for i in inputs]
        test_inputs = [i[test_ids] for i in inputs]
        n_samples = as_tuple(n_samples, t=int, N=2)
        # ====== create discretized factors ====== #
        f_original = (factors[train_ids], factors[test_ids])
        if discretizing:
            if verbose:
                print("Discretizing factors:", int(n_bins), '-', strategy)
            factors = utils.discretizing(factors,
                                         n_bins=int(n_bins),
                                         strategy=strategy)
        train_factors = Factor(factors[train_ids],
                               factors_name=factors_name,
                               random_state=self.randint)
        test_factors = Factor(factors[test_ids],
                              factors_name=factors_name,
                              random_state=self.randint)

        # ====== sampling ====== #
        def sampling(inputs_, factors_, nsamples, title):
            Xs = [list() for _ in range(len(inputs))]  # inputs
            Ys = []  # factors
            Zs = []  # latents
            Os = []  # outputs
            indices = []
            n = 0
            if verbose:
                prog = tqdm(desc='Sampling %s' % title, total=nsamples)
            while n < nsamples:
                batch = min(batch_size, nsamples - n, factors_.shape[0])
                if verbose:
                    prog.update(int(batch))
                # factors
                y, ids = factors_.sample_factors(num=batch,
                                                 return_indices=True)
                indices.append(ids)
                Ys.append(y)
                # inputs
                inps = []
                for x, i in zip(Xs, inputs_):
                    i = i[ids, :]
                    x.append(i)
                    inps.append(i)
                # latents representation
                z = self.encode(inps, sample_shape=(), first_latent=False)
                Os.append(tf.nest.flatten(self.decode(z)))
                Zs.append(z[0] if isinstance(z, (tuple, list)) else z)
                # update the counter
                n += len(y)
            # aggregate all data
            Xs = [np.concatenate(x, axis=0) for x in Xs]
            Ys = np.concatenate(Ys, axis=0)
            Zs = concat_distribution(Zs, name="Latents")
            Os = [
                concat_distribution(
                    [j[i] for j in Os],
                    name="Output%d" % i,
                ) for i in range(len(Os[0]))
            ]
            return Xs, Ys, Zs, Os, np.concatenate(indices, axis=0)

        # perform sampling
        train = sampling(inputs_=train_inputs,
                         factors_=train_factors,
                         nsamples=n_samples[0],
                         title="Train")
        test = sampling(inputs_=test_inputs,
                        factors_=test_factors,
                        nsamples=n_samples[1],
                        title="Test ")
        ids_train = train[4]
        ids_test = test[4]
        # assign the variables
        self._is_list_inputs = is_list_inputs
        self._inputs = (train[0], test[0])
        self._factors = (train[1], test[1])
        self._factors_name = train_factors.factors_name
        self._representations = (train[2], test[2])
        self._reconstructions = (train[3], test[3])
        self._original_factors = (f_original[0][ids_train],
                                  f_original[1][ids_test])
        return self
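The first branch of this method materializes a `tf.data.Dataset` into in-memory numpy arrays before splitting. A minimal sketch of that pattern, assuming (as in the `factors is None` branch) that the dataset yields `(inputs, factors)` pairs; the toy data below is purely illustrative.

import numpy as np
import tensorflow as tf

# toy dataset yielding (inputs, factors) batches
ds = tf.data.Dataset.from_tensor_slices(
    (np.random.randn(256, 8).astype('float32'),
     np.random.randint(0, 3, size=(256, 4)))).batch(32)

xs, ys = [], []
for x, y in ds:
    xs.append(x)
    ys.append(y)
inputs = tf.concat(xs, axis=0).numpy()    # shape (256, 8)
factors = tf.concat(ys, axis=0).numpy()   # shape (256, 4)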