def mutual_info_gap(
    self,
    n_bins: int = 10,
    strategy: Literal['uniform', 'quantile', 'kmeans', 'gmm'] = 'uniform',
) -> float:
    """Mutual Information Gap between the latent codes and the factors.

    Parameters
    ----------
    n_bins : int, optional
        number of bins used for discretizing the latents, by default 10.
        The latents are passed on without discretization when
        ``n_bins <= 1``.
    strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}
        Strategy used to define the widths of the bins, by default 'uniform'.
        'uniform' - All bins in each feature have identical widths.
        'quantile' - All bins in each feature have the same number of points.
        'kmeans' - Values in each bin have the same nearest center of a 1D
        cluster.
        'gmm' - accepted by the signature and forwarded as-is to
        `discretizing`; see that helper for its exact behaviour.

    Returns
    -------
    float
        mutual information gap score
    """
    codes = self.dist_to_tensor(self.latents).numpy()
    needs_binning = n_bins > 1
    if needs_binning:
        codes = discretizing(
            codes,
            independent=True,
            n_bins=n_bins,
            strategy=strategy,
        )
    # NOTE: inside a method body the bare name below resolves to the
    # module-level `mutual_info_gap` function, not to this method.
    return mutual_info_gap(codes, self.factors)
def __init__(
    self,
    factors: Union[tf.Tensor, np.ndarray, DatasetV2],
    factor_names: Optional[List[str]] = None,
    categorical: Union[bool, List[bool]] = False,
    n_bins: Optional[Union[int, List[int]]] = None,
    strategy: Literal['uniform', 'quantile', 'kmeans', 'gmm'] = 'uniform',
):
    """Store a matrix of ground-truth factors together with its metadata.

    Parameters
    ----------
    factors : tf.Tensor, np.ndarray or tf.data.Dataset
        matrix of shape ``[n_observations, n_factor]``. A dataset is stacked
        into a single tensor, and tensors are converted to numpy. If every
        row sums to exactly 1 the matrix is treated as one-hot encoded and
        collapsed to a single column of class indices (``categorical`` is
        then forced to True).
    factor_names : list of str, optional
        one name per factor, by default ``F0, F1, ...``
    categorical : bool or list of bool
        flag (or one flag per factor) stored in ``self.categorical``.
    n_bins : int or list of int, optional
        number of bins per factor; a factor whose entry is None is not
        discretized. By default None (no discretization).
    strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}
        discretization strategy (or one per factor) forwarded to
        `discretizing`, by default 'uniform'.

    Raises
    ------
    ValueError
        if ``factors`` is not 2-D after ``np.atleast_2d``.
    AssertionError
        if the number of names differs from the number of factors.
    """
    if isinstance(factors, tf.data.Dataset):
        factors = tf.stack(list(factors))
    if tf.is_tensor(factors):
        factors = factors.numpy()
    factors = np.atleast_2d(factors)
    if factors.ndim != 2:
        raise ValueError(
            "factors must be a matrix [n_observations, n_factor], "
            f"but given shape:{factors.shape}")
    # check factors is one-hot encoded
    # NOTE(review): any matrix whose rows all sum to 1 passes this test
    # (e.g. a single column of ones) - confirm this is intended.
    if np.all(np.sum(factors, axis=-1) == 1):
        factors = np.argmax(factors, axis=1)[:, np.newaxis]
        categorical = True
    n_factors = factors.shape[1]
    # discretizing
    factors_original = np.array(factors)
    n_bins = as_tuple(n_bins, N=n_factors)
    strategy = as_tuple(strategy, N=n_factors, t=str)
    if any(b is not None for b in n_bins):
        # `np.atleast_2d` hands back the caller's own array for 2-D ndarray
        # input; work on a private copy so the in-place column assignment
        # below never overwrites the caller's data.
        factors = np.array(factors)
    for i, (b, s) in enumerate(zip(n_bins, strategy)):
        if b is not None:
            factors[:, i] = discretizing(factors[:, i][:, np.newaxis],
                                         n_bins=b,
                                         strategy=s).ravel()
    factors = factors.astype(np.int64)
    # factor_names
    if factor_names is None:
        factor_names = [f'F{i}' for i in range(n_factors)]
    else:
        factor_names = [str(i) for i in tf.nest.flatten(factor_names)]
    assert len(factor_names) == n_factors, \
        f'Given {n_factors} but only {len(factor_names)} names'
    # store the attributes
    self.factors = factors
    self.factors_original = factors_original
    self.discretizer = list(zip(n_bins, strategy))
    self.categorical = as_tuple(categorical, N=n_factors, t=bool)
    self.names = factor_names
    # unique values of each factor column and how many there are
    self.labels = [np.unique(x) for x in factors.T]
    self.sizes = [len(lab) for lab in self.labels]
def plot_uncertainty_scatter(self, factors=None, n_samples=2, algo='tsne'):
    r""" Scatter plot of the mean latent codes and of sampled latent codes,
    one subplot per factor, colored by the factor value.

    The marker size of the mean codes encodes the (discretized) average
    variance of each latent code.

    Arguments:
      factors : list of Integer or String. The index or name of factors
        taken into account for analyzing.
      n_samples : number of latent samples requested from
        `representations_sample` (cast to `int`).
      algo : name of the dimension reduction algorithm forwarded to
        `dimension_reduce`.

    Returns:
      `self`, after the figure is registered with `add_figure`.
    """
    factors = self._check_factors(factors)
    # the attributes below hold train and test data separately, hence the
    # concatenation
    mean_codes = np.concatenate(self.representations_mean)
    per_code_variance = [
        np.mean(var, axis=1) for var in self.representations_variance
    ]
    mean_variance = np.concatenate(per_code_variance)
    sampled = np.concatenate(self.representations_sample(int(n_samples)),
                             axis=1)
    sampled_codes = list(sampled)
    factor_values = np.concatenate(self.original_factors, axis=0)[:, factors]
    labels = self.factors_name[factors]
    # preprocessing: project the mean and every sample set to 2-D together
    projections = dimension_reduce(*(mean_codes, *sampled_codes),
                                   algo=algo,
                                   n_components=2,
                                   return_model=False,
                                   combined=True,
                                   random_state=self.randint)
    sizes = utils.discretizing(mean_variance[:, np.newaxis], n_bins=10).ravel()
    # the figure
    nrow = 3
    ncol = int(np.ceil(len(labels) / nrow))
    fig = vs.plot_figure(nrow=nrow * 4, ncol=ncol * 4, dpi=80)
    for plot_idx, (name, values) in enumerate(zip(labels, factor_values.T)):
        ax = vs.plot_subplot(nrow, ncol, plot_idx + 1)
        for order, points in enumerate(projections):
            shared = {
                'val': values,
                'color': "coolwarm",
                'ax': ax,
                'x': points,
                'grid': False,
                'legend_enable': False,
                'centroids': True,
                'fontsize': 12,
            }
            if order != 0:
                # the samples
                vs.plot_scatter_text(size=8,
                                     marker='x',
                                     alpha=0.8,
                                     weight='light',
                                     **shared)
                continue
            # the mean value
            vs.plot_scatter(size=sizes,
                            size_range=(8, 80),
                            alpha=0.3,
                            linewidths=0,
                            cbar=True,
                            cbar_horizontal=True,
                            title=name,
                            **shared)
    self.add_figure("uncertainty_scatter_%s" % algo, fig)
    return self
def sample_batch(self,
                 inputs=None,
                 latents=None,
                 factors=None,
                 n_bins=5,
                 strategy=None,
                 factor_names=None,
                 train_percent=0.8,
                 n_samples=[2000, 1000],
                 batch_size=64,
                 verbose=True):
    r""" Sample a batch of training and testing for evaluation of VAE

    Arguments:
      inputs : list of `ndarray` or `tensorflow.data.Dataset`.
        Inputs to the model, note all data will be loaded in-memory
      latents : list of `Distribution`
        distribution of learned representation, used directly (no encoding
        or sampling is performed) when `inputs` is None
      factors : a `ndarray` or `tensorflow.data.Dataset`.
        a matrix of groundtruth factors, note all data will be loaded
        in-memory
      n_bins : int (default=5)
        The number of bins to produce, cast with `int()` before use.
      strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default=None)
        Strategy used to define the widths of the bins.
        `None` - No discretization performed
        uniform - All bins in each feature have identical widths.
        quantile - All bins in each feature have the same number of points.
        kmeans - Values in each bin have the same nearest center of a 1D
          k-means cluster.
        gmm - using the components (in sorted order of mean) of Gaussian
          mixture to label.
      factor_names : sequence of names, one per factor column (optional).
        When None, placeholder names `F0, F1, ...` are used only inside the
        warnings below and `None` is forwarded to `Factor`.
      train_percent : fraction of the observations placed in the train split
      n_samples : pair (train, test) of the number of samples to draw
      batch_size : maximum number of samples drawn per sampling step
      verbose : print the discretization setting and show progress bars

    Returns:
      `Criticizer` with sampled data
    """
    from odin.bay.helpers import concat_distributions
    inputs, latents, factors = prepare_inputs_factors(inputs,
                                                      latents,
                                                      factors,
                                                      verbose=verbose)
    # `n_samples` is only read (never mutated), so the list default is safe
    n_samples = as_tuple(n_samples, t=int, N=2)
    n_inputs = factors.shape[0]
    # ====== split train test ====== #
    if inputs is None:
        # only latents are available: sequential (non-shuffled) split
        latents = latents[self._latent_indices]
        split = int(n_inputs * train_percent)
        train_ids = slice(None, split)
        test_ids = slice(split, None)
        train_latents = [z[train_ids] for z in latents]
        test_latents = [z[test_ids] for z in latents]
        if len(latents) == 1:
            train_latents = train_latents[0]
            test_latents = test_latents[0]
        else:
            self._is_multi_latents = len(latents)
            train_latents = CombinedDistribution(train_latents,
                                                 name="Latents")
            test_latents = CombinedDistribution(test_latents, name="Latents")
    else:
        ids = self.random_state.permutation(n_inputs)
        split = int(train_percent * n_inputs)
        train_ids, test_ids = ids[:split], ids[split:]
        train_inputs = [i[train_ids] for i in inputs]
        test_inputs = [i[test_ids] for i in inputs]
    # ====== create discretized factors ====== #
    f_original = (factors[train_ids], factors[test_ids])
    # discretizing the factors
    if strategy is not None:
        if verbose:
            print(f"Discretizing factors: {n_bins} - {strategy}")
        factors = utils.discretizing(factors,
                                     n_bins=int(n_bins),
                                     strategy=strategy)
    # check for singular factor and ignore it
    # `factor_names` defaults to None, which can neither be zipped nor
    # measured with len(); fall back to placeholder names for this check.
    if factor_names is None:
        names = [f"F{i}" for i in range(factors.shape[1])]
    else:
        names = factor_names
    kept = []
    for i, (name, f) in enumerate(zip(names, factors.T)):
        c = Counter(f)
        if len(c) < 2:
            warnings.warn(
                f"Ignore factor with name '{name}', singular data: {f}")
        else:
            kept.append(i)
    if len(kept) != len(names):
        f_original = (f_original[0][:, kept], f_original[1][:, kept])
        if factor_names is not None:
            # np.asarray keeps ndarray input untouched and makes a plain
            # list of names indexable by a list of positions
            factor_names = np.asarray(factor_names)[kept]
        factors = factors[:, kept]
    # create the factor class for sampling
    train_factors = Factor(factors[train_ids],
                           factor_names=factor_names,
                           random_state=self.randint)
    test_factors = Factor(factors[test_ids],
                          factor_names=factor_names,
                          random_state=self.randint)

    # ====== sampling ====== #
    def sampling(inputs_, factors_, nsamples, title):
        # draw factor rows, gather matching inputs, then encode and decode
        Xs = [list() for _ in range(len(inputs))]  # inputs
        Ys = []  # factors
        Zs = []  # latents
        Os = []  # outputs
        indices = []
        n = 0
        if verbose:
            prog = tqdm(desc='Sampling %s' % title, total=nsamples)
        while n < nsamples:
            batch = min(batch_size, nsamples - n, factors_.shape[0])
            if verbose:
                prog.update(int(batch))
            # factors
            y, ids = factors_.sample_factors(num=batch, return_indices=True)
            indices.append(ids)
            Ys.append(y)
            # inputs
            inps = []
            for x, i in zip(Xs, inputs_):
                i = i[ids, :]
                x.append(i)
                inps.append(i)
            # latents representation
            z = self.encode(inps, sample_shape=())
            o = tf.nest.flatten(self.decode(z))
            if isinstance(z, (tuple, list)):
                z = z[self._latent_indices]
                if len(z) == 1:
                    z = z[0]
                else:
                    self._is_multi_latents = len(z)
            Os.append(o)
            Zs.append(z)
            # update the counter
            n += len(y)
        # end progress
        if verbose:
            prog.clear()
            prog.close()
        # aggregate all data
        Xs = [np.concatenate(x, axis=0) for x in Xs]
        Ys = np.concatenate(Ys, axis=0)
        if self.is_multi_latents:
            Zs = CombinedDistribution(
                [
                    concat_distributions(
                        [z[zi] for z in Zs],
                        name="Latents%d" % zi,
                    ) for zi in range(self.is_multi_latents)
                ],
                name="Latents",
            )
        else:
            Zs = concat_distributions(Zs, name="Latents")
        Os = [
            concat_distributions(
                [j[i] for j in Os],
                name="Output%d" % i,
            ) for i in range(len(Os[0]))
        ]
        return Xs, Ys, Zs, Os, np.concatenate(indices, axis=0)

    # perform sampling
    if inputs is not None:
        train = sampling(inputs_=train_inputs,
                         factors_=train_factors,
                         nsamples=n_samples[0],
                         title="Train")
        test = sampling(inputs_=test_inputs,
                        factors_=test_factors,
                        nsamples=n_samples[1],
                        title="Test ")
        ids_train = train[4]
        ids_test = test[4]
        # assign the variables
        self._inputs = (train[0], test[0])
        self._factors = (train[1], test[1])
        self._representations = (train[2], test[2])
        self._reconstructions = (train[3], test[3])
        self._original_factors = (f_original[0][ids_train],
                                  f_original[1][ids_test])
    else:
        self._inputs = (None, None)
        self._factors = (train_factors.factors, test_factors.factors)
        self._representations = (train_latents, test_latents)
        self._reconstructions = (None, None)
        self._original_factors = (f_original[0], f_original[1])
    self._factor_names = train_factors.factor_names
    # concatenated
    self._representations_full = concat_distributions(self.representations)
    self._factors_full = np.concatenate(self.factors, axis=0)
    self._original_factors_full = np.concatenate(self.original_factors,
                                                 axis=0)
    return self
def plot_correlation_scatter(self,
                             omic1=OMIC.transcriptomic,
                             omic2=OMIC.proteomic,
                             var_names1='auto',
                             var_names2='auto',
                             is_marker_pairs=True,
                             log1=True,
                             log2=True,
                             max_scatter_points=200,
                             top=3,
                             bottom=3,
                             title='',
                             return_figure=False):
    r""" Mapping from omic1 to omic2

    Draws a grid of scatter plots, one per (omic1 variable, omic2 variable)
    pair, annotated with the two correlation values of that pair.

    Arguments:
      omic1, omic2 : instance of OMIC. With `omic1` represent the x-axis, and
        `omic2` represent the y-axis.
      var_names1 : list of all variable name for `omic1`, 'auto' to use
        `omic1.markers`, or None to pick the `top` and `bottom` correlated
        variables of `omic1` for every `omic2` variable.
      var_names2 : list of all variable name for `omic2`, 'auto' to use
        `omic2.markers`, or None to use every variable of `omic2`.
      is_marker_pairs : if True, `var_names1` and `var_names2` are zipped
        into coordinated pairs; forced to False when either list is None.
      log1, log2 : if True, scale each row by its sum times the median of
        the row sums, then apply `log1p` to the corresponding omic.
      max_scatter_points : maximum number of randomly chosen points per
        subplot; a non-positive value keeps all points.
      top, bottom : number of highest / lowest correlated `omic1` variables
        shown per `omic2` variable (only used when `var_names1` is None).
      title : text appended to the figure title.
      return_figure : if True return the figure instead of storing it.

    Returns:
      the matplotlib figure if `return_figure`, otherwise `self`.
    """
    omic1 = OMIC.parse(omic1)
    omic2 = OMIC.parse(omic2)
    if isinstance(var_names1, string_types) and var_names1 == 'auto':
        var_names1 = omic1.markers
    if isinstance(var_names2, string_types) and var_names2 == 'auto':
        var_names2 = omic2.markers
    if var_names1 is None or var_names2 is None:
        is_marker_pairs = False
    # remember what was asked for, so the assertion messages below can
    # report it instead of the (empty) filtered list
    requested1, requested2 = var_names1, var_names2
    max_scatter_points = int(max_scatter_points)
    # get all correlations
    # each entry is (index1, index2, value, value); NaN is replaced by 0.
    # NOTE(review): the two values are shown as `p` and `s` in the subplot
    # title - presumably Pearson and Spearman, confirm in `get_correlation`.
    corr = self.get_correlation(omic1, omic2)
    corr_map = {(x[0], x[1]):
                (0 if np.isnan(x[2]) else x[2], 0 if np.isnan(x[3]) else x[3])
                for x in corr}
    om1_names = self.get_var_names(omic1)
    om2_names = self.get_var_names(omic2)
    om1_idx = {j: i for i, j in enumerate(om1_names)}
    om2_idx = {j: i for i, j in enumerate(om2_names)}
    # extract the data and normalization
    X1 = self.numpy(omic1)
    # the row sums of omic1, binned into 10 quantiles, set the marker size
    library = np.sum(X1, axis=1, keepdims=True)
    library = discretizing(library, n_bins=10, strategy='quantile').ravel()
    if log1:
        s = np.sum(X1, axis=1, keepdims=True)
        X1 = np.log1p(X1 / s * np.median(s))
    X2 = self.numpy(omic2)
    if log2:
        s = np.sum(X2, axis=1, keepdims=True)
        X2 = np.log1p(X2 / s * np.median(s))
    ### getting the marker pairs
    all_pairs = []
    # coordinate marker pairs
    if is_marker_pairs:
        pairs = [(i1, i2)
                 for i1, i2 in zip(var_names1, var_names2)
                 if i1 in om1_idx and i2 in om2_idx]
        var_names1 = [i for i, _ in pairs]
        var_names2 = [i for _, i in pairs]
    # filter omic2 (dict lookup instead of scanning the name array)
    if var_names2 is not None:
        var_names2 = [i for i in var_names2 if i in om2_idx]
    else:
        var_names2 = om2_names
    assert len(var_names2) > 0, \
        (f"None of the variables {requested2} is contained in variable list "
         f"of OMIC {omic2.name}")
    nrow = len(var_names2)
    # filter omic1
    if var_names1 is not None:
        var_names1 = [i for i in var_names1 if i in om1_idx]
        ncol = len(var_names1)
        assert len(var_names1) > 0, \
            (f"None of the variables {requested1} is contained in variable list "
             f"of OMIC {omic1.name}")
        for name2 in var_names2:
            for name1 in var_names1:
                all_pairs.append((om1_idx[name1], om2_idx[name2]))
    else:
        # top and bottom correlation pairs
        top = int(top)
        bottom = int(bottom)
        ncol = top + bottom
        # pick all top and bottom of omic1 coordinated to omic2
        for name in var_names2:
            i2 = om2_idx[name]
            pairs = sorted([[sum(corr_map[(i1, i2)]), i1]
                            for i1 in range(len(om1_names))])
            # `pairs[-0:]` is the whole list, so top == 0 needs a guard
            best = pairs[-top:][::-1] if top > 0 else []
            worst = pairs[:bottom][::-1]
            for _, i1 in best + worst:
                all_pairs.append((i1, i2))
    ### downsampling scatter points
    if max_scatter_points > 0:
        ids = np.random.permutation(len(X1))[:max_scatter_points]
    else:
        ids = np.arange(len(X1), dtype=np.int32)
    ### plotting
    fig = plt.figure(figsize=(ncol * 2, nrow * 2 + 2), dpi=80)
    for i, pair in enumerate(all_pairs):
        ax = plt.subplot(nrow, ncol, i + 1)
        p, s = corr_map[pair]
        idx1, idx2 = pair
        x1 = X1[:, idx1]
        x2 = X2[:, idx2]
        crow = i // ncol
        ccol = i % ncol
        if is_marker_pairs:
            # the diagonal holds the coordinated marker pairs
            color = 'salmon' if crow == ccol else 'blue'
        else:
            color = 'salmon' if ccol < top else 'blue'
        vs.plot_scatter(x=x1[ids],
                        y=x2[ids],
                        color=color,
                        ax=ax,
                        size=library[ids],
                        size_range=(6, 30),
                        legend_enable=False,
                        linewidths=0.,
                        cbar=False,
                        alpha=0.3)
        # title of every subplot: omic1 variable and its correlation values
        ax.set_title(f"{om1_names[idx1]}\n$p={p:.2g}$ $s={s:.2g}$",
                     fontsize=8)
        # first subplot of every row carries the omic2 variable name
        if i % ncol == 0:
            ax.set_ylabel(f"{om2_names[idx2]}", fontsize=8, weight='bold')
    ## big title
    plt.suptitle(f"[x:{omic1.name}_y:{omic2.name}]{title}", fontsize=10)
    fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
    ### store and return
    if return_figure:
        return fig
    self.add_figure(
        f"corr_{omic1.name}{'log' if log1 else 'raw'}_"
        f"{omic2.name}{'log' if log2 else 'raw'}", fig)
    return self
def sample_batch(self,
                 inputs,
                 factors=None,
                 discretizing=False,
                 n_bins=5,
                 strategy='quantile',
                 factors_name=None,
                 train_percent=0.8,
                 n_samples=[2000, 1000],
                 batch_size=32,
                 verbose=True):
    r""" Sample a batch of training and testing for evaluation of VAE

    Arguments:
      inputs : list of `ndarray` or `tensorflow.data.Dataset`.
        Inputs to the model, note all data will be loaded in-memory.
        When `factors` is None, the last element of every dataset item is
        taken as the factors.
      factors : a `ndarray` or `tensorflow.data.Dataset`.
        a matrix of groundtruth factors, note all data will be loaded
        in-memory
      discretizing : if True, turn continuous factors into discrete
      n_bins : int (default=5)
        The number of bins to produce, cast with `int()` before use.
      strategy : {'uniform', 'quantile', 'kmeans', 'gmm'}, (default='quantile')
        Strategy used to define the widths of the bins.
        uniform - All bins in each feature have identical widths.
        quantile - All bins in each feature have the same number of points.
        kmeans - Values in each bin have the same nearest center of a 1D
          k-means cluster.
        gmm - using the components (in sorted order of mean) of Gaussian
          mixture to label.
      factors_name : names of the factors, forwarded to `Factor`
      train_percent : fraction of the observations placed in the train split
      n_samples : pair (train, test) of the number of samples to draw
      batch_size : maximum number of samples drawn per sampling step
      verbose : print the discretization setting and show progress bars

    Returns:
      `Criticizer` with sampled data
    """
    ### inputs is a tensorflow Dataset, convert everything to numpy
    if isinstance(inputs, tf.data.Dataset):
        dataset = inputs
        struct = tf.nest.flatten(tf.data.experimental.get_structure(dataset))
        n_inputs = len(struct)
        reader = tqdm(dataset, desc="Reading data") if verbose else dataset
        if factors is None:  # include factors
            # report the dataset itself, not the progress-bar wrapper
            assert n_inputs >= 2, \
                "factors are not included in the dataset: %s" % str(dataset)
            x, y = [list() for _ in range((n_inputs - 1))], []
            for data in reader:
                for i, j in enumerate(data[:-1]):
                    x[i].append(j)
                y.append(data[-1])
            inputs = [tf.concat(i, axis=0).numpy() for i in x]
            if n_inputs == 2:
                inputs = inputs[0]
            factors = tf.concat(y, axis=0).numpy()
        else:  # factors separated
            x = [list() for _ in range(n_inputs)]
            for data in reader:
                for i, j in enumerate(tf.nest.flatten(data)):
                    x[i].append(j)
            inputs = [tf.concat(i, axis=0).numpy() for i in x]
            if n_inputs == 1:
                inputs = inputs[0]
    if isinstance(factors, tf.data.Dataset):
        if verbose:
            factors = tqdm(factors, desc="Reading factors")
        # convert to numpy like every other branch: the permutation arrays
        # below cannot index a `tf.Tensor`
        factors = tf.concat([i for i in factors], axis=0).numpy()
    # post-processing
    is_list_inputs = isinstance(inputs, (tuple, list))
    inputs = tf.nest.flatten(inputs)
    assert len(factors.shape) == 2, "factors must be a matrix"
    # ====== split train test ====== #
    ids = self.random_state.permutation(factors.shape[0])
    split = int(train_percent * factors.shape[0])
    train_ids, test_ids = ids[:split], ids[split:]
    train_inputs = [i[train_ids] for i in inputs]
    test_inputs = [i[test_ids] for i in inputs]
    # `n_samples` is only read (never mutated), so the list default is safe
    n_samples = as_tuple(n_samples, t=int, N=2)
    # ====== create discretized factors ====== #
    f_original = (factors[train_ids], factors[test_ids])
    if discretizing:
        if verbose:
            print("Discretizing factors:", int(n_bins), '-', strategy)
        factors = utils.discretizing(factors,
                                     n_bins=int(n_bins),
                                     strategy=strategy)
    train_factors = Factor(factors[train_ids],
                           factors_name=factors_name,
                           random_state=self.randint)
    test_factors = Factor(factors[test_ids],
                          factors_name=factors_name,
                          random_state=self.randint)

    # ====== sampling ====== #
    def sampling(inputs_, factors_, nsamples, title):
        # draw factor rows, gather matching inputs, then encode and decode
        Xs = [list() for _ in range(len(inputs))]  # inputs
        Ys = []  # factors
        Zs = []  # latents
        Os = []  # outputs
        indices = []
        n = 0
        if verbose:
            prog = tqdm(desc='Sampling %s' % title, total=nsamples)
        while n < nsamples:
            batch = min(batch_size, nsamples - n, factors_.shape[0])
            if verbose:
                prog.update(int(batch))
            # factors
            y, ids = factors_.sample_factors(num=batch, return_indices=True)
            indices.append(ids)
            Ys.append(y)
            # inputs
            inps = []
            for x, i in zip(Xs, inputs_):
                i = i[ids, :]
                x.append(i)
                inps.append(i)
            # latents representation
            z = self.encode(inps, sample_shape=(), first_latent=False)
            Os.append(tf.nest.flatten(self.decode(z)))
            # only the first latent is kept when several are returned
            Zs.append(z[0] if isinstance(z, (tuple, list)) else z)
            # update the counter
            n += len(y)
        # release the progress bar once sampling is done
        if verbose:
            prog.close()
        # aggregate all data
        Xs = [np.concatenate(x, axis=0) for x in Xs]
        Ys = np.concatenate(Ys, axis=0)
        Zs = concat_distribution(Zs, name="Latents")
        Os = [
            concat_distribution(
                [j[i] for j in Os],
                name="Output%d" % i,
            ) for i in range(len(Os[0]))
        ]
        return Xs, Ys, Zs, Os, np.concatenate(indices, axis=0)

    # perform sampling
    train = sampling(inputs_=train_inputs,
                     factors_=train_factors,
                     nsamples=n_samples[0],
                     title="Train")
    test = sampling(inputs_=test_inputs,
                    factors_=test_factors,
                    nsamples=n_samples[1],
                    title="Test ")
    ids_train = train[4]
    ids_test = test[4]
    # assign the variables
    self._is_list_inputs = is_list_inputs
    self._inputs = (train[0], test[0])
    self._factors = (train[1], test[1])
    self._factors_name = train_factors.factors_name
    self._representations = (train[2], test[2])
    self._reconstructions = (train[3], test[3])
    self._original_factors = (f_original[0][ids_train],
                              f_original[1][ids_test])
    return self