Example #1
0
 def plot_histogram(
     self,
     histogram_bins: int = 120,
     original_factors: bool = True,
     return_figure: bool = False,
 ):
     r""" Plot one histogram for every factor, followed by one for every
     latent.

     Arguments:
       histogram_bins : number of bins of each histogram (cast to `int`).
       original_factors : if True the factors are read from
         `self.factors_original`, otherwise from `self.factors`.
       return_figure : if True, return the figure instead of storing it.

     Returns:
       the figure when `return_figure=True`, otherwise the value returned
       by `self.add_figure`.
     """
     latents = self.dist_to_tensor(self.latents).numpy()
     if original_factors:
         factors = self.factors_original
     else:
         factors = self.factors
     # iterating the transposes yields one series per factor / latent
     # (assumes both are 2-D `[n_samples, n_variables]` -- TODO confirm)
     series = list(factors.T) + list(latents.T)
     names = self.factor_names + self.latent_names
     # subplot grid, one column wider than the square root of the plot count
     n_columns = 1 + int(np.ceil(np.sqrt(len(series))))
     n_rows = int(np.ceil(len(series) / n_columns))
     fig = vs.plot_figure(nrow=12, ncol=20, dpi=100)
     for position, (values, name) in enumerate(zip(series, names), start=1):
         vs.plot_histogram(values,
                           ax=(n_rows, n_columns, position),
                           bins=int(histogram_bins),
                           title=name,
                           alpha=0.8,
                           color='blue',
                           fontsize=16)
     fig.tight_layout()
     if not return_figure:
         return self.add_figure(
             f"histogram_{'original' if original_factors else 'discretized'}",
             fig)
     return fig
Example #2
0
 def plot_histogram(self, histogram_bins=120, original_factors=True):
   r""" Plot one histogram for every factor, followed by one for every code,
   and store the figure with `self.add_figure`.

   Arguments:
     histogram_bins : number of bins of each histogram (cast to `int`).
     original_factors : if True use the original factors (before they were
       discretized by `Criticizer`), otherwise the discretized ones.

   Returns:
     `self`
   """
   self.assert_sampled()
   from matplotlib import pyplot as plt
   ## gather the data, one series per factor then one per code
   codes = np.concatenate(self.representations_mean, axis=0)
   if original_factors:
     factor_parts = self.original_factors
   else:
     factor_parts = self.factors
   factors = np.concatenate(factor_parts, axis=0)
   series = list(factors.T) + list(codes.T)
   names = self.factors_name.tolist() + self.codes_name.tolist()
   ## subplot grid, one column wider than the square root of the plot count
   n_columns = 1 + int(np.ceil(np.sqrt(len(series))))
   n_rows = int(np.ceil(len(series) / n_columns))
   fig = vs.plot_figure(nrow=18, ncol=25, dpi=80)
   for position, (values, name) in enumerate(zip(series, names), start=1):
     vs.plot_histogram(values,
                       ax=(n_rows, n_columns, position),
                       bins=int(histogram_bins),
                       title=name,
                       alpha=0.8,
                       color='blue',
                       fontsize=16)
   plt.tight_layout()
   kind = "original" if original_factors else "discretized"
   self.add_figure("histogram_%s" % kind, fig)
   return self
 def plot_histogram(self,
                    omic=OMIC.proteomic,
                    bins=80,
                    log_norm=True,
                    var_names=None,
                    max_plots=100,
                    fig=None,
                    return_figure=False):
     r""" Plot a histogram of the non-zero values of each variable of the
     given OMIC type.

     Arguments:
       omic : OMIC type, parsed by `OMIC.parse`.
       bins : number of histogram bins, capped at half the number of rows
         of the data.
       log_norm : if True, `log1p` is applied to the non-zero values.
       var_names : names of the variables to plot; names that are not
         known for this OMIC are ignored. If None, all variables are used.
       max_plots : when more variables are selected, a fixed-seed random
         subset of this size is plotted.
       fig : an existing figure; a new one is created when None.
       return_figure : if True, return the figure instead of storing it.

     Returns:
       the figure when `return_figure=True`, otherwise the value returned
       by `self.add_figure`.
     """
     omic = OMIC.parse(omic)
     data = self.numpy(omic)
     n_bins = min(int(bins), data.shape[0] // 2)
     plot_limit = int(max_plots)
     ### select the variables
     name_to_index = self.get_var_indices(omic)
     candidates = name_to_index.keys() if var_names is None else var_names
     selected = np.array([v for v in candidates if v in name_to_index])
     assert len(selected) > 0, \
       f"No matching variables found for {omic.name}"
     # too many variables: keep a reproducible random subset
     if len(selected) > plot_limit:
         rng = np.random.RandomState(seed=1)
         selected = selected[rng.permutation(len(selected))[:plot_limit]]
     data = data[:, [name_to_index[v] for v in selected]]
     ### the figure
     n_columns = 8
     n_rows = int(np.ceil(data.shape[1] / n_columns))
     if fig is None:
         fig = vs.plot_figure(nrow=n_rows * 2, ncol=n_columns * 3, dpi=80)
     ### one subplot per variable
     for position, (column, name) in enumerate(zip(data.T, selected),
                                               start=1):
         # sparsity is measured before the zeros are removed
         sparsity = sparsity_percentage(column, batch_size=2048)
         nonzero = column[column != 0.]
         values = np.log1p(nonzero) if log_norm else nonzero
         vs.plot_histogram(x=values,
                           bins=n_bins,
                           alpha=0.8,
                           ax=(n_rows, n_columns, position),
                           title=f"{name}\n({sparsity*100:.1f}% zeros)")
         fig.gca().tick_params(axis='y', labelleft=False)
     ### adjust and return
     fig.suptitle(f"{omic.name}")
     fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
     if not return_figure:
         return self.add_figure(f"histogram_{omic.name}", fig)
     return fig
Example #4
0
def _draw_hist(x, ax, title, n_bins, show_yticks=True):
    """Draw a histogram of `x` on `ax` and set evenly spaced tick marks.

    The x axis is limited to `[min(x), max(x)]` with 5 ticks; the y axis
    gets 5 integer ticks spanning the bin counts, or no ticks at all when
    `show_yticks` is False.

    Returns the `(count, bins)` pair produced by `plot_histogram`.
    """
    count, bins = plot_histogram(x=x,
                                 bins=n_bins,
                                 ax=ax,
                                 normalize=False,
                                 kde=False,
                                 range_0_1=False,
                                 covariance_factor=0.25,
                                 centerlize=False,
                                 fontsize=8,
                                 title=title)
    lower, upper = np.min(x), np.max(x)
    plt.xlim((lower, upper))
    x_ticks = np.linspace(start=lower, stop=upper, num=5, dtype='float32')
    plt.xticks(x_ticks, fontsize=6)
    if not show_yticks:
        plt.yticks([], [])
        return count, bins
    y_ticks = np.linspace(start=np.min(count),
                          stop=np.max(count),
                          num=5,
                          dtype='int32')
    plt.yticks(y_ticks, fontsize=5)
    return count, bins
Example #5
0
 def plot_hist(hist, ax, name):
     """Draw a histogram on `ax` titled `name`, with 8 integer ticks on
     each axis."""
     # NOTE(review): `hist` is never read; the histogram is built from
     # `true`, which (like `nbins` and `fontsize`) must come from an
     # enclosing scope that is not visible here -- confirm this is intended.
     count, bins = plot_histogram(true,
                                  bins=nbins,
                                  ax=ax,
                                  title=name,
                                  fontsize=fontsize)
     left, right = np.min(bins), np.max(bins)
     plt.xlim((left, right))
     x_ticks = np.linspace(start=left, stop=right, num=8, dtype='int32')
     plt.xticks(x_ticks, fontsize=6)
     y_ticks = np.linspace(start=np.min(count),
                           stop=np.max(count),
                           num=8,
                           dtype='int32')
     plt.yticks(y_ticks, fontsize=6)
 def plot_percentile_histogram(self,
                               omic=OMIC.transcriptomic,
                               n_hist=10,
                               title="",
                               outlier=0.001,
                               non_zeros=False,
                               fig=None):
     r""" Split the value range of the data into `n_hist` equal-width
     intervals and plot one histogram per interval.

     NOTE: despite the name, the intervals are evenly spaced between the
     minimum and the maximum value (`np.linspace`), they are not
     statistical percentiles. Both ends of an interval are inclusive, so a
     value lying exactly on an interior boundary is counted in the two
     neighbouring histograms.

     Arguments:
       omic : OMIC type, parsed by `OMIC.parse`.
       n_hist : number of intervals (and histograms).
       title : figure title, only drawn when non-empty.
       outlier : an interval holding less than this fraction of all values
         is drawn in red instead of blue.
       non_zeros : if True, zero entries are dropped before plotting.
       fig : an existing figure; a new one is created when None.

     Returns:
       `self`; the figure is stored with `self.add_figure`.
     """
     omic = OMIC.parse(omic)
     arr = self.numpy(omic)
     if non_zeros:
         arr = arr[arr != 0]
     n_col = 5
     n_row = int(np.ceil(n_hist / n_col))
     if fig is None:
         fig = vs.plot_figure(nrow=int(n_row * 1.5), ncol=20)
     self.assert_figure(fig)
     # `n_hist` intervals need `n_hist + 1` edges
     edges = np.linspace(start=np.min(arr), stop=np.max(arr), num=n_hist + 1)
     # The boolean mask below selects individual elements, so the fraction
     # must be relative to the total element count; `len(arr)` would only
     # be the number of rows when `arr` is multi-dimensional.
     n_values = arr.size
     for i, (p_min, p_max) in enumerate(zip(edges, edges[1:])):
         mask = np.logical_and(arr >= p_min, arr <= p_max)
         a = arr[mask]
         _, bins = vs.plot_histogram(
             a,
             bins=120,
             ax=(n_row, n_col, i + 1),
             fontsize=8,
             color='red' if len(a) / n_values < outlier else 'blue',
             title=f"{len(a)}(samples)  Range:[{p_min:.2g},{p_max:.2g}]")
         plt.gca().set_xticks(np.linspace(np.min(bins), np.max(bins),
                                          num=8))
     if title:
         plt.suptitle(title)
     plt.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
     self.add_figure(f'histogram{n_hist}_{omic.name}', fig)
     return self
Example #7
0
def plot_histogram(series, ax, title):
    """Plot the histogram of `series` on `ax` after it has been passed
    through `clipping_quartile`; the bin count is taken from `n_bin`."""
    clipped = clipping_quartile(series)
    V.plot_histogram(x=clipped, bins=n_bin, ax=ax, title=title, fontsize=4)
Example #8
0
    def plot_disentanglement(
        self,
        factor_indices: Optional[Union[int, str, List[Union[int,
                                                            str]]]] = None,
        n_bins_factors: int = 15,
        n_bins_codes: int = 80,
        corr_type: Union[Literal['spearman', 'pearson', 'lasso', 'average',
                                 'mi'], ndarray] = 'average',
        original_factors: bool = True,
        show_all_codes: bool = False,
        sort_pairs: bool = True,
        title: str = '',
        return_figure: bool = False,
        seed: int = 1,
    ):
        r""" To illustrate the disentanglement of the codes, the codes' histogram
        bars are colored by the value of factors.

        The figure has one row per selected factor: the first plot of the row
        is the histogram of the factor itself, the remaining plots are the
        histograms of the selected latents colored by that factor.

        Arguments:
          factor_indices : an Integer, a String, or a list of them.
            Index or name (looked up in `self.factor_names`) of the factors
            used for visualization. If None, all factors are used.
          n_bins_factors : an Integer.
            Forwarded to the histogram plots as `bins_color`.
          n_bins_codes : an Integer.
            Forwarded to the histogram plots as `bins`.
          corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi'} or matrix
            Type of correlation, with special case 'mi' for mutual
            information. If an array is given it is used directly as the
            score matrix and must have shape `[n_factors, n_latents]`
            (`[n_latents, n_factors]` is transposed automatically when the
            two sizes differ).
          original_factors : a Boolean.
            If True use `self.factors_original`, otherwise `self.factors`.
          show_all_codes : a Boolean.
            If False, only the first `len(factor_indices)` latents (after
            the optional sorting) are shown, otherwise all latents are shown
            for each factor.
          sort_pairs : a Boolean.
            If True, the latents are reordered by
            `diagonal_linear_assignment` on the absolute scores; otherwise
            the original latent order is kept.
          title : a String appended to the figure title.
          return_figure : a Boolean.
            If True, return the figure instead of storing it.
          seed : an Integer forwarded to the correlation / mutual-information
            estimators.

        Returns:
          the figure when `return_figure=True`, otherwise the value returned
          by `self.add_figure`.

        Raises:
          ValueError : if `corr_type` is neither a string nor an array.
          AssertionError : if the given matrix has an unexpected shape.
        """
        ### prepare styled plot
        styles = dict(fontsize=12,
                      cbar_horizontal=False,
                      bins_color=int(n_bins_factors),
                      bins=int(n_bins_codes),
                      color='bwr',
                      alpha=0.8)
        # get all relevant factors, names are converted to indices
        if factor_indices is None:
            factor_indices = list(range(self.n_factors))
        factor_indices = [
            int(i) if isinstance(i, Number) else self.factor_names.index(i)
            for i in as_tuple(factor_indices)
        ]
        ### correlation
        if isinstance(corr_type, string_types):
            if corr_type == 'mi':
                corr = self.mutualinfo_matrix(
                    convert_to_tensor=self.dist_to_tensor, seed=seed)
                score_type = 'mutual-info'
            else:
                corr = self.correlation_matrix(
                    convert_to_tensor=self.dist_to_tensor,
                    method=corr_type,
                    seed=seed)
                score_type = corr_type
            # [n_factors, n_codes]
            corr = corr.T[factor_indices]
        ### directly give the correlation matrix
        elif isinstance(corr_type, ndarray):
            corr = corr_type
            # the orientation is only unambiguous for a non-square matrix
            if self.n_latents != self.n_factors and corr.shape[
                    0] == self.n_latents:
                corr = corr.T
            # the message reports the same attributes the check compares
            assert corr.shape == (self.n_factors, self.n_latents), \
              (f"Correlation matrix expect shape (n_factors={self.n_factors}, "
               f"n_latents={self.n_latents}) but given shape: {corr.shape}")
            score_type = 'score'
            corr = corr[factor_indices]
        ### exception
        else:
            raise ValueError(
                f"corr_type must be a string or a matrix but given: {type(corr_type)}"
            )
        ### sorting the latents
        if sort_pairs:
            latent_indices = diagonal_linear_assignment(np.abs(corr),
                                                        nan_policy=0)
        else:
            latent_indices = np.arange(self.n_latents, dtype=np.int32)
        if not show_all_codes:
            latent_indices = latent_indices[:len(factor_indices)]
        corr = corr[:, latent_indices]
        ### prepare the data
        # factors
        F = (self.factors_original
             if original_factors else self.factors)[:, factor_indices]
        factor_names = np.asarray(self.factor_names)[factor_indices]
        # codes
        Z = self.dist_to_tensor(self.latents).numpy()[:, latent_indices]
        latent_names = np.asarray(self.latent_names)[latent_indices]
        ### create the figure
        nrow = F.shape[1]
        ncol = Z.shape[1] + 1
        fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=100)
        count = 1
        for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
            # the first plot show how the factor clustered
            ax, _, _ = vs.plot_histogram(x=f,
                                         color_val=f,
                                         ax=(nrow, ncol, count),
                                         cbar=False,
                                         title=f"{fname}",
                                         **styles)
            ax.tick_params(axis='y', labelleft=False)
            count += 1
            # the rest of the row show how the codes align with the factor
            for zidx, (score, z,
                       zname) in enumerate(zip(corr[fidx], Z.T, latent_names)):
                # the latent at the same position as the factor is highlighted
                is_pair = fidx == zidx
                text = "*" if is_pair else ""
                ax, _, _ = vs.plot_histogram(
                    x=z,
                    color_val=f,
                    ax=(nrow, ncol, count),
                    cbar=False,
                    title=f"{text}{fname}-{zname} (${score:.2f}$)",
                    bold_title=is_pair,
                    **styles)
                ax.tick_params(axis='y', labelleft=False)
                count += 1
        ### fine tune the plot
        fig.suptitle(f"[{score_type}]{title}", fontsize=12)
        fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
        if return_figure:
            return fig
        return self.add_figure(
            f"disentanglement_{'original' if original_factors else 'discretized'}",
            fig)
Example #9
0
  def plot_histogram_heatmap(self,
                             factors=None,
                             factor_bins=15,
                             histogram_bins=80,
                             n_codes_per_factor=6,
                             corr_method='average',
                             original_factors=True):
    r""" The histogram bars are colored by the value of factors

    The figure has one row per selected factor: the histogram of the factor
    itself, followed by the histograms of the codes with the highest and
    the lowest correlation to that factor.

    Arguments:
      factors : which factors will be used
      factor_bins : an Integer, forwarded to the histogram plots as
        `val_bins`.
      histogram_bins : an Integer, number of bins of each histogram.
      n_codes_per_factor : an Integer or None.
        Number of codes shown for each factor, half of them taken from the
        top and half from the bottom of the correlation ranking. If None,
        or not smaller than the number of codes, all codes are shown.
      corr_method : correlation method given to
        `self.cal_correlation_matrix`.
      original_factors : optional original factors before discretized by
        `Criticizer`

    Returns:
      `self`; the figure is stored with `self.add_figure`.
    """
    self.assert_sampled()
    import seaborn as sns
    sns.set()
    if n_codes_per_factor is None:
      n_codes_per_factor = self.n_codes
    else:
      # never ask for more codes than exist, otherwise codes are plotted
      # twice and the rows of the subplot grid get out of step
      n_codes_per_factor = min(int(n_codes_per_factor), self.n_codes)
    styles = dict(fontsize=12,
                  val_bins=int(factor_bins),
                  color='bwr',
                  bins=int(histogram_bins),
                  alpha=0.8)
    ## correlation, averaged over the train and test matrices
    train_corr, test_corr = self.cal_correlation_matrix(mean=True,
                                                        method=corr_method,
                                                        decode=False)
    corr = (train_corr + test_corr) / 2
    ## prepare the data
    factors = self._check_factors(factors)
    # keep only the columns of the selected factors, so that column `fidx`
    # below always belongs to the `fidx`-th plotted factor
    # (assumes `corr` is `[n_codes, n_factors]` -- TODO confirm)
    corr = corr[:, factors]
    Z = np.concatenate(self.representations_mean, axis=0)
    F = np.concatenate(
        self.original_factors if original_factors else self.factors,
        axis=0)[:, factors]
    # annotations
    factors_name = self.factors_name[factors]
    codes_name = self.codes_name
    # create the figure
    nrow = F.shape[1]
    ncol = int(1 + n_codes_per_factor)
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 3, dpi=80)
    plot_count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factors_name)):
      c = corr[:, fidx]
      vs.plot_histogram(f,
                        val=f,
                        ax=(nrow, ncol, plot_count),
                        cbar=True,
                        cbar_horizontal=False,
                        title=fname,
                        **styles)
      plot_count += 1
      # all codes are visualized
      if n_codes_per_factor == self.n_codes:
        all_codes = range(self.n_codes)
      # most correlated codes first, then the least correlated ones
      else:
        zids = np.argsort(c)
        bottom = zids[:n_codes_per_factor // 2]
        top = zids[-(n_codes_per_factor - n_codes_per_factor // 2):]
        all_codes = (top.tolist()[::-1] + bottom.tolist()[::-1])
      for i in all_codes:
        z = Z[:, i]
        zname = codes_name[i]
        vs.plot_histogram(z,
                          val=f,
                          ax=(nrow, ncol, plot_count),
                          title='[%.2g]%s' % (c[i], zname),
                          **styles)
        plot_count += 1
    fig.tight_layout()
    # NOTE(review): the figure is stored under a generic "histogram_*" key;
    # confirm it cannot collide with figures stored by other plot methods.
    self.add_figure(
        "histogram_%s" % ("original" if original_factors else "discretized"),
        fig)
    return self
# TensorFlow environment flags (going by their names: native log verbosity
# and on-demand GPU memory growth).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# fixed seeds so the random samples below are reproducible
tf.random.set_seed(8)
np.random.seed(8)
sns.set()

shape = (1024, 1)
# one histogram of the raw samples + 4 strategies x 2 bin counts
total_figures = 1 + 4 * 2
# smallest square grid that fits all the subplots
ncol = nrow = int(np.ceil(np.sqrt(total_figures)))
hist_bins = 120
# one figure per sampling distribution
for dist, fn in [('uniform', np.random.rand), ('normal', np.random.randn)]:
    x = fn(*shape)
    vs.plot_figure(nrow=12, ncol=12, dpi=120)
    # first subplot: the raw samples
    ax = vs.subplot(nrow, ncol, 1)
    ax = vs.plot_histogram(x, bins=hist_bins, title=dist, ax=ax)
    # remaining subplots: every (strategy, n_bins) combination
    idx = 2
    for strategy in ('gmm', 'uniform', 'quantile', 'kmeans'):
        for n_bins in (5, 10):
            y = discretizing(x, n_bins=n_bins, strategy=strategy)
            title = '%s-%d' % (strategy, n_bins)
            ax = vs.subplot(nrow, ncol, idx)
            vs.plot_histogram(y, bins=hist_bins, ax=ax, title=title)
            idx += 1
    plt.tight_layout()

# ====== special case: GMM discretizing ====== #
# NOTE: `x` still holds the samples of the last loop iteration ('normal')
vs.plot_figure()
y, gmm = discretizing(x, n_bins=2, strategy='gmm', return_model=True)
# keep only the first returned model
# (presumably one model per feature column -- TODO confirm)
gmm = gmm[0]
vs.plot_gaussian_mixture(x,
Example #11
0
  def plot_disentanglement(self,
                           factor_names=None,
                           n_bins_factors=15,
                           n_bins_codes=80,
                           corr_type='average',
                           original_factors=True,
                           show_all_codes=False,
                           title='',
                           return_figure=False):
    r""" To illustrate the disentanglement of the codes, the codes' histogram
    bars are colored by the value of factors.

    The figure has one row per selected factor: the first plot of the row is
    the histogram of the factor itself, the remaining plots are the
    histograms of the selected codes colored by that factor.

    Arguments:
      factor_names : list of String or Integer.
        Name or index of which factors will be used for visualization.
      n_bins_factors : an Integer.
        Forwarded to the histogram plots as `bins_color`.
      n_bins_codes : an Integer.
        Forwarded to the histogram plots as `bins`.
      corr_type : {'spearman', 'pearson', 'lasso', 'average', 'mi', None, matrix}
        Type of correlation, with special case 'mi' for mutual information.
          - If None, the codes keep their original order and the Spearman
            correlation is only used for the scores shown in the titles.
          - If an array, the array must have shape `[n_factors, n_codes]`
            (`[n_codes, n_factors]` is transposed automatically when the
            two sizes differ).
      original_factors : optional original factors before discretized by
        `Criticizer`
      show_all_codes : a Boolean.
        if False, only show most correlated codes-factors, otherwise,
        all codes are shown for each factor.
        This option only in effect when `corr_type` is not `None`.
      title : a String appended to the figure title.
      return_figure : a Boolean.
        If True, return the figure instead of storing it.

    Returns:
      the figure when `return_figure=True`, otherwise the value returned by
      `self.add_figure`.

    Raises:
      ValueError : if `corr_type` is not a string, None or an array.
      AssertionError : if the given matrix has an unexpected shape.
    """
    self.assert_sampled()
    ### prepare styled plot
    from matplotlib import pyplot as plt
    import seaborn as sns
    sns.set()
    styles = dict(fontsize=12,
                  cbar_horizontal=False,
                  bins_color=int(n_bins_factors),
                  bins=int(n_bins_codes),
                  color='bwr',
                  alpha=0.8)
    # get all relevant factors
    factor_ids = self._check_factors(factor_names)
    ### score matrix of shape [n_factors, n_codes]
    sort_codes = True
    if isinstance(corr_type, string_types):
      if corr_type == 'mi':
        train_corr, test_corr = self.create_mutualinfo_matrix(mean=True)
        score_type = 'mutual-info'
      else:
        train_corr, test_corr = self.create_correlation_matrix(mean=True,
                                                               method=corr_type)
        score_type = corr_type
      corr = ((train_corr + test_corr) / 2.).T
    # directly give the correlation matrix
    elif isinstance(corr_type, np.ndarray):
      corr = corr_type
      # the orientation is only unambiguous for a non-square matrix
      if self.n_codes != self.n_factors and corr.shape[0] == self.n_codes:
        corr = corr.T
      assert corr.shape == (self.n_factors, self.n_codes), \
        (f"Correlation matrix expect shape (n_factors={self.n_factors}, "
         f"n_codes={self.n_codes}) but given shape: {corr.shape}")
      score_type = 'score'
    # no correlation provided: scores are still shown, codes are not sorted
    elif corr_type is None:
      train_corr, test_corr = self.create_correlation_matrix(mean=True,
                                                             method='spearman')
      score_type = 'spearman'
      corr = ((train_corr + test_corr) / 2.).T
      sort_codes = False
    # exception
    else:
      raise ValueError(
          f"corr_type could be string, None or a matrix but given: {type(corr_type)}"
      )
    # Keep only the rows of the selected factors. This must happen for every
    # `corr_type` (including None), because the plotting loop below reads
    # `corr[fidx]` for the `fidx`-th selected factor.
    corr = corr[factor_ids]
    if sort_codes:
      code_ids = diagonal_linear_assignment(np.abs(corr), nan_policy=0)
      if not show_all_codes:
        code_ids = code_ids[:len(factor_ids)]
    else:
      code_ids = np.arange(self.n_codes, dtype=np.int32)
    # applying the indexing
    corr = corr[:, code_ids]
    ### prepare the data
    # factors
    F = np.concatenate(
        self.original_factors if original_factors else self.factors,
        axis=0,
    )[:, factor_ids]
    factor_names = self.factor_names[factor_ids]
    # codes
    Z = np.concatenate(self.representations_mean, axis=0)[:, code_ids]
    code_names = self.code_names[code_ids]
    ### create the figure
    nrow = F.shape[1]
    ncol = Z.shape[1] + 1
    fig = vs.plot_figure(nrow=nrow * 3, ncol=ncol * 2.8, dpi=80)
    count = 1
    for fidx, (f, fname) in enumerate(zip(F.T, factor_names)):
      # the first plot show how the factor clustered
      vs.plot_histogram(x=f,
                        color_val=f,
                        ax=(nrow, ncol, count),
                        cbar=False,
                        title=f"{fname}",
                        **styles)
      plt.gca().tick_params(axis='y', labelleft=False)
      count += 1
      # the rest of the row show how the codes align with the factor
      for zidx, (score, z, zname) in enumerate(zip(corr[fidx], Z.T,
                                                   code_names)):
        # the code at the same position as the factor is highlighted
        is_pair = fidx == zidx
        text = "*" if is_pair else ""
        vs.plot_histogram(x=z,
                          color_val=f,
                          ax=(nrow, ncol, count),
                          cbar=False,
                          title=f"{text}{fname}-{zname} (${score:.2f}$)",
                          bold_title=is_pair,
                          **styles)
        plt.gca().tick_params(axis='y', labelleft=False)
        count += 1
    ### fine tune the plot
    fig.suptitle(f"[{score_type}]{title}", fontsize=12)
    fig.tight_layout(rect=[0.0, 0.03, 1.0, 0.97])
    if return_figure:
      return fig
    return self.add_figure(
        f"disentanglement_{'original' if original_factors else 'discretized'}",
        fig)
Example #12
0
def plot_histogram(series, ax, title):
  """Plot the histogram of `series` on `ax` after it has been passed
  through `clipping_quartile`; the bin count is taken from `n_bin`."""
  clipped = clipping_quartile(series)
  V.plot_histogram(x=clipped,
                   bins=n_bin,
                   ax=ax,
                   title=title,
                   fontsize=4)