def test_densmap_trustworthiness_on_iris(iris):
    """densMAP on iris must produce a trustworthy embedding and reject
    unsupported operations.

    Checks that:
    * trustworthiness of the embedding is at least 0.97;
    * ``transform`` on a densMAP model raises ``NotImplementedError``;
    * ``inverse_transform`` raises ``ValueError``;
    * supervised densMAP (``fit`` with ``y``) raises ``NotImplementedError``.
    """
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data)
    embedding = densmap_iris_model.embedding_
    # n_neighbors is keyword-only in modern scikit-learn versions of
    # trustworthiness; passing it positionally raises a TypeError there.
    trust = trustworthiness(iris.data, embedding, n_neighbors=10)
    # Fixed message: the original adjacent string literals concatenated to
    # "...embedding foriris dataset" (missing space between "for" and "iris").
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)
    )
    with pytest.raises(NotImplementedError):
        densmap_iris_model.transform(iris.data[:10])
    with pytest.raises(ValueError):
        densmap_iris_model.inverse_transform(embedding[:10])
    with pytest.raises(NotImplementedError):
        _ = UMAP(
            n_neighbors=10,
            min_dist=0.01,
            random_state=42,
            densmap=True,
            verbose=True,
        ).fit(iris.data, y=iris.target)
def test_umap_inverse_transform_fails_expectedly(sparse_spatial_data, nn_data):
    """``inverse_transform`` must raise ``ValueError`` for models where it is
    unsupported: a model fit on sparse input, and one using the 'dice' metric.
    """
    cases = (
        (UMAP(n_epochs=11), sparse_spatial_data[:100]),
        (UMAP(metric="dice", n_epochs=11), nn_data[:100]),
    )
    for model, data in cases:
        model.fit(data)
        with pytest.raises(ValueError):
            model.inverse_transform(model.embedding_[:10])
def perform_latent_walk_in_umap_space(domain_configs: List[DomainConfig],
                                      dataloader_type: str,
                                      random_state: int = 1234):
    """Walk a 10x10 grid over the joint UMAP space of two autoencoder domains.

    Encodes every sample from the "image" and "rna" domains into their
    respective autoencoder latent spaces, fits a single UMAP on the combined
    latents, inverse-transforms a regular 10x10 grid of UMAP coordinates back
    into latent space, and decodes each grid point through BOTH decoders.

    Parameters
    ----------
    domain_configs : List[DomainConfig]
        Exactly two configurations, one named "image" and one named "rna"
        (in either order); anything else raises RuntimeError.
    dataloader_type : str
        Key selecting which data loader to use from each domain's
        ``data_loader_dict`` (presumably "train"/"val"/"test" — confirm
        against callers).
    random_state : int
        Seed forwarded to UMAP for reproducibility.

    Returns
    -------
    dict
        Grid points, decoded grid images/sequences/geneset activities, the
        stacked latents, class labels, domain labels ("image"/"rna"), and
        cell ids, with image entries always preceding rna entries.

    Raises
    ------
    RuntimeError
        If not given exactly two configs, or their names are not
        "image" and "rna".
    """
    if len(domain_configs) != 2:
        raise RuntimeError(
            "Expects two domain configurations (image and sequencing domain)")
    # Accept the two configs in either order; normalize to named variables.
    if domain_configs[0].name == "image" and domain_configs[1].name == "rna":
        image_domain_config = domain_configs[0]
        rna_domain_config = domain_configs[1]
    elif domain_configs[0].name == "rna" and domain_configs[1].name == "image":
        image_domain_config = domain_configs[1]
        rna_domain_config = domain_configs[0]
    else:
        raise RuntimeError(
            "Expected domain configuration types are >image< and >rna<.")

    rna_data_loader = rna_domain_config.data_loader_dict[dataloader_type]
    image_data_loader = image_domain_config.data_loader_dict[dataloader_type]

    device = get_device()
    # Both models are put in eval mode; note there is no torch.no_grad()
    # here, so graphs are still built (tensors are detached per batch below).
    geneset_ae = rna_domain_config.domain_model_config.model.to(device).eval()
    image_ae = image_domain_config.domain_model_config.model.to(device).eval()

    all_rna_latents = []
    all_rna_labels = []
    all_image_latents = []
    all_image_labels = []
    grid_sequences = []
    grid_geneset_activities = []
    grid_images = []
    rna_cell_ids = []
    image_cell_ids = []

    # Encode every RNA sample; collect latents, labels and cell ids as
    # per-sample numpy arrays.
    for i, sample in enumerate(rna_data_loader):
        rna_inputs = sample[rna_domain_config.data_key].to(device)
        rna_labels = sample[rna_domain_config.label_key]
        rna_cell_ids.extend(sample["id"])

        geneset_ae_output = geneset_ae(rna_inputs)
        latents = geneset_ae_output["latents"]

        all_rna_latents.extend(list(latents.clone().detach().cpu().numpy()))
        all_rna_labels.extend(list(rna_labels.clone().detach().cpu().numpy()))

    # Same encoding pass for the image domain.
    for i, sample in enumerate(image_data_loader):
        image_inputs = sample[image_domain_config.data_key].to(device)
        image_labels = sample[image_domain_config.label_key].to(device)
        image_cell_ids.extend(sample["id"])

        image_ae_output = image_ae(image_inputs)
        latents = image_ae_output["latents"]

        all_image_latents.extend(list(latents.clone().detach().cpu().numpy()))
        all_image_labels.extend(
            list(image_labels.clone().detach().cpu().numpy()))

    # Stack both domains: image entries first, then rna — the domain-label
    # array below relies on this ordering.
    all_latents = np.concatenate(
        (np.array(all_image_latents), np.array(all_rna_latents)), axis=0)
    all_labels = np.concatenate(
        (np.array(all_image_labels), np.array(all_rna_labels)), axis=0)
    all_domain_labels = np.concatenate(
        (
            np.repeat("image", len(all_image_labels)),
            np.repeat("rna", len(all_rna_labels)),
        ),
        axis=0,
    )
    all_cell_ids = np.concatenate((image_cell_ids, rna_cell_ids), axis=0)

    # Fit one UMAP on the combined latent spaces of both domains.
    mapper = UMAP(random_state=random_state)
    transformed = mapper.fit_transform(all_latents)

    # Bounding box of the 2D embedding; the grid spans its full extent.
    min_umap_c1 = min(transformed[:, 0])
    max_umap_c1 = max(transformed[:, 0])
    min_umap_c2 = min(transformed[:, 1])
    max_umap_c2 = max(transformed[:, 1])

    # 10x10 grid via bilinear interpolation between the four corners of the
    # bounding box (x interpolates left-right, y interpolates top-bottom).
    test_pts = np.array([
        (np.array([min_umap_c1, max_umap_c2]) * (1 - x) +
         np.array([max_umap_c1, max_umap_c2]) * x) * (1 - y) +
        (np.array([min_umap_c1, min_umap_c2]) * (1 - x) +
         np.array([max_umap_c1, min_umap_c2]) * x) * y
        for y in np.linspace(0, 1, 10) for x in np.linspace(0, 1, 10)
    ])

    # Map the grid back from UMAP space into the shared latent space.
    inv_transformed_points = mapper.inverse_transform(test_pts)

    # Batch the reconstructed latents and run each batch through BOTH
    # decoders, so every grid point yields an image and a sequence.
    test_pts_ds = torch.utils.data.TensorDataset(
        torch.from_numpy(inv_transformed_points))
    test_pts_loader = torch.utils.data.DataLoader(test_pts_ds,
                                                  batch_size=64,
                                                  shuffle=False)
    for i, sample in enumerate(test_pts_loader):
        image_recons = image_ae.decode(sample[0].to(device))
        rna_recons, decoded_geneset_activities = geneset_ae.decode(
            sample[0].to(device))
        grid_images.extend(list(image_recons.clone().detach().cpu().numpy()))
        grid_sequences.extend(list(rna_recons.clone().detach().cpu().numpy()))
        grid_geneset_activities.extend(
            list(decoded_geneset_activities.clone().detach().cpu().numpy()))

    data_dict = {
        "grid_points": test_pts,
        "grid_images": grid_images,
        "grid_sequences": grid_sequences,
        "grid_geneset_activities": grid_geneset_activities,
        "all_latents": all_latents,
        "all_labels": all_labels,
        "all_domain_labels": all_domain_labels,
        "all_cell_ids": all_cell_ids,
    }
    return data_dict
class PCAUmap:
    """(Kernel)PCA + UMAP pipeline with optional scaling and KNN-imputation
    based data augmentation.

    ``fit`` augments the data (replicated copies with random dropout, then
    KNN-imputed), optionally standardizes, optionally projects through PCA,
    and fits UMAP on the result. ``transform``/``inverse_transform`` apply
    or invert the same chain.

    Fixes over the previous revision:
    * the ``scaler`` constructor argument was accepted but ignored (a
      ``StandardScaler`` was always created, so every ``scaler is None``
      branch was dead) — it is now honored: pass a falsy value to disable
      scaling;
    * ``transform``/``inverse_transform`` branched on ``self.pca is None``,
      which could never be true (the constructor always builds a PCA), while
      ``fit`` branched on ``self.use_pca is None`` — with ``use_pca=None``
      that crashed on a never-fitted PCA. All methods now branch
      consistently on ``self.use_pca``.
    """

    def __init__(
        self,
        n_neighbors=15,
        use_pca=1,
        kernel='linear',
        min_dist=0.1,
        n_components=2,
        random_state=None,
        transform_seed=None,
        scaler=True,
        metric="euclidean",
        augment_size=3,
        impute_rate=0.1,
    ):
        # 'linear' uses plain PCA (exact inverse); any other kernel needs
        # KernelPCA with its learned approximate inverse transform.
        if kernel == 'linear':
            self.pca = PCA()
        else:
            self.pca = KernelPCA(kernel=kernel, fit_inverse_transform=True)
        self.umap = UMAP(
            random_state=random_state,
            transform_seed=transform_seed,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric,
        )
        # None disables the PCA step; any other value enables it.
        self.use_pca = use_pca
        self.random_state = random_state
        # Honor the scaler flag (previously ignored): falsy -> no scaling.
        self.scaler = StandardScaler() if scaler else None
        self.data = None
        self.pca_features = None
        self.embedding = None
        self.imputer = KNNImputer()
        self.augment_size = augment_size
        self.impute_rate = impute_rate

    def fit(self, data):
        """Fit scaler/PCA/UMAP on augmented data; embed ``data`` itself."""
        self.data = pd.DataFrame(data)
        augmented_data = self.augumentation(self.augment_size,
                                            self.impute_rate)
        # The pipeline is FIT on the augmented set but the stored embedding
        # is computed for the original (unaugmented) data.
        if self.scaler is not None:
            augmented_data = self.scaler.fit_transform(augmented_data)
            reference = self.scaler.transform(data)
        else:
            reference = data
        if self.use_pca is None:
            self.umap.fit(augmented_data)
            self.embedding = self.umap.transform(reference)
        else:
            self.umap.fit(self.pca.fit_transform(augmented_data))
            self.pca_features = self.pca.transform(reference)
            self.embedding = self.umap.transform(self.pca_features)
        return self

    def transform(self, data):
        """Embed ``data`` through the fitted scaler -> PCA -> UMAP chain."""
        self.data = pd.DataFrame(data)
        features = data if self.scaler is None else self.scaler.transform(data)
        if self.use_pca is None:
            self.embedding = self.umap.transform(features)
        else:
            self.pca_features = self.pca.transform(features)
            self.embedding = self.umap.transform(self.pca_features)
        return self.embedding

    def fit_transform(self, data):
        """Fit the pipeline on ``data`` and return its embedding."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, embedded):
        """Map UMAP coordinates back to the original feature space,
        inverting each fitted stage in reverse order."""
        points = self.umap.inverse_transform(embedded)
        if self.use_pca is not None:
            points = self.pca.inverse_transform(points)
        if self.scaler is not None:
            points = self.scaler.inverse_transform(points)
        return points

    def pca_summary(self, c=None):
        """Plot PC1/PC2 scatter, loading scatter, and the cumulative
        explained-variance curve. Requires a prior PCA-enabled ``fit``."""
        plt.figure(figsize=(6, 6))
        if c is None:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5)
        else:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5,
                        c=c)
        plt.xlabel("PC1 ({}%)".format(
            int(self.pca.explained_variance_ratio_[0] * 100)))
        plt.ylabel("PC2 ({}%)".format(
            int(self.pca.explained_variance_ratio_[1] * 100)))
        plt.grid()
        plt.show()

        plt.figure(figsize=(6, 6))
        plt.scatter(self.pca.components_[0],
                    self.pca.components_[1],
                    alpha=0.5)
        plt.xlabel("loading 1")
        plt.ylabel("loading 2")
        plt.grid()
        plt.show()

        plt.figure(figsize=(6, 6))
        plt.plot([0] + list(np.cumsum(self.pca.explained_variance_ratio_)),
                 "-o")
        plt.xlabel("Number of principal components")
        plt.ylabel("Cumulative contribution ratio")
        plt.grid()
        plt.show()

    def map_predicted_values(
        self,
        model,
        c=None,
        alpha=0.5,
        edgecolors="k",
        figsize=(8, 6),
        h=0.2,
        cm=plt.cm.jet,
    ):
        """Contour-plot ``model``'s predictions over the 2D embedding.

        Builds a mesh over the embedding's bounding box (step ``h``),
        inverse-transforms each mesh point to feature space, scores it with
        ``model`` (``predict_proba`` > ``decision_function`` > ``predict``),
        and overlays the embedded samples.
        """
        x_min = self.embedding[:, 0].min() - 0.5
        x_max = self.embedding[:, 0].max() + 0.5
        y_min = self.embedding[:, 1].min() - 0.5
        y_max = self.embedding[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        plt.figure(figsize=figsize)
        # Mesh points live in UMAP space; score them in feature space.
        grid_features = self.inverse_transform(np.c_[xx.ravel(), yy.ravel()])
        if hasattr(model, "predict_proba"):
            Z = model.predict_proba(grid_features)[:, 1]
        elif hasattr(model, "decision_function"):
            Z = model.decision_function(grid_features)
        else:
            Z = model.predict(grid_features)
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=alpha, cmap=cm)
        plt.colorbar()
        if c is None:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                edgecolors=edgecolors,
            )
        else:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                c=c,
                edgecolors=edgecolors,
            )
        plt.grid()
        plt.show()

    # NOTE: method name kept (misspelled) for backward compatibility with
    # existing callers.
    def augumentation(self, augment_size, rate):
        """Return ``self.data`` plus ``augment_size`` noisy copies: values
        are randomly dropped at ``rate`` then KNN-imputed back in."""
        augmented_data = pd.concat([self.data] * augment_size).values
        augmented_data = fill_randomly(augmented_data, np.nan, rate)
        augmented_data = pd.DataFrame(
            self.imputer.fit_transform(augmented_data))
        augmented_data = pd.concat([self.data, augmented_data])
        return augmented_data