Ejemplo n.º 1
0
def test_densmap_trustworthiness_on_iris(iris):
    """Fit densMAP on iris, require a trustworthy embedding, and check that
    unsupported operations (transform, inverse_transform, supervised fit)
    raise the expected exceptions."""
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data)
    embedding = densmap_iris_model.embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    # BUG FIX: the original implicit string concatenation joined
    # "...for" + "iris..." into "foriris"; add the missing space.
    assert trust >= 0.97, (
        "Insufficiently trustworthy embedding for "
        "iris dataset: {}".format(trust)
    )

    # densMAP models do not support transforming new data.
    with pytest.raises(NotImplementedError):
        densmap_iris_model.transform(iris.data[:10])

    # ...nor inverting the embedding back to data space.
    with pytest.raises(ValueError):
        densmap_iris_model.inverse_transform(embedding[:10])

    # Supervised (y-labelled) densMAP fitting is not implemented either.
    with pytest.raises(NotImplementedError):
        _ = UMAP(
            n_neighbors=10,
            min_dist=0.01,
            random_state=42,
            densmap=True,
            verbose=True,
        ).fit(iris.data, y=iris.target)
Ejemplo n.º 2
0
def test_umap_inverse_transform_fails_expectedly(sparse_spatial_data, nn_data):
    """inverse_transform must raise ValueError for models where inversion
    is unsupported: sparse input data, and non-euclidean ('dice') metric."""
    cases = (
        (UMAP(n_epochs=11), sparse_spatial_data[:100]),
        (UMAP(metric="dice", n_epochs=11), nn_data[:100]),
    )
    for model, data in cases:
        model.fit(data)
        with pytest.raises(ValueError):
            model.inverse_transform(model.embedding_[:10])
Ejemplo n.º 3
0
def perform_latent_walk_in_umap_space(domain_configs: List[DomainConfig],
                                      dataloader_type: str,
                                      random_state: int = 1234) -> dict:
    """Jointly embed image and RNA autoencoder latents with UMAP, then walk a
    10x10 grid across the embedding and decode each grid point through both
    domain decoders.

    Args:
        domain_configs: Exactly two configurations, one named "image" and one
            named "rna" (order-independent).
        dataloader_type: Key selecting which loader to use from each domain's
            ``data_loader_dict`` (e.g. a train/val/test split name).
        random_state: Seed forwarded to UMAP for reproducibility.

    Returns:
        Dict with the grid points, their decoded images / sequences / geneset
        activities, the pooled latents, and per-sample labels, domain labels,
        and cell ids.

    Raises:
        RuntimeError: If not exactly two configs are given, or their names are
            not the expected "image"/"rna" pair.
    """
    if len(domain_configs) != 2:
        raise RuntimeError(
            "Expects two domain configurations (image and sequencing domain)")
    # Accept the two configs in either order; normalize to named variables.
    if domain_configs[0].name == "image" and domain_configs[1].name == "rna":
        image_domain_config = domain_configs[0]
        rna_domain_config = domain_configs[1]
    elif domain_configs[0].name == "rna" and domain_configs[1].name == "image":
        image_domain_config = domain_configs[1]
        rna_domain_config = domain_configs[0]
    else:
        raise RuntimeError(
            "Expected domain configuration types are >image< and >rna<.")

    rna_data_loader = rna_domain_config.data_loader_dict[dataloader_type]
    image_data_loader = image_domain_config.data_loader_dict[dataloader_type]
    device = get_device()

    # Put both autoencoders on the device in eval mode (inference only).
    geneset_ae = rna_domain_config.domain_model_config.model.to(device).eval()
    image_ae = image_domain_config.domain_model_config.model.to(device).eval()

    all_rna_latents = []
    all_rna_labels = []
    all_image_latents = []
    all_image_labels = []
    grid_sequences = []
    grid_geneset_activities = []
    grid_images = []
    rna_cell_ids = []
    image_cell_ids = []

    # Collect RNA latents, labels and ids for the whole loader.
    # NOTE(review): no torch.no_grad() here — gradients are discarded via
    # .detach(), but memory could be saved by wrapping in no_grad; confirm.
    for i, sample in enumerate(rna_data_loader):
        rna_inputs = sample[rna_domain_config.data_key].to(device)
        rna_labels = sample[rna_domain_config.label_key]
        rna_cell_ids.extend(sample["id"])

        geneset_ae_output = geneset_ae(rna_inputs)
        latents = geneset_ae_output["latents"]
        all_rna_latents.extend(list(latents.clone().detach().cpu().numpy()))
        all_rna_labels.extend(list(rna_labels.clone().detach().cpu().numpy()))

    # Same collection pass for the image domain.
    for i, sample in enumerate(image_data_loader):
        image_inputs = sample[image_domain_config.data_key].to(device)
        image_labels = sample[image_domain_config.label_key].to(device)
        image_cell_ids.extend(sample["id"])

        image_ae_output = image_ae(image_inputs)
        latents = image_ae_output["latents"]
        all_image_latents.extend(list(latents.clone().detach().cpu().numpy()))
        all_image_labels.extend(
            list(image_labels.clone().detach().cpu().numpy()))

    # Pool both domains (image first, then RNA) — the same ordering is used
    # for labels, domain labels, and cell ids below.
    all_latents = np.concatenate(
        (np.array(all_image_latents), np.array(all_rna_latents)), axis=0)
    all_labels = np.concatenate(
        (np.array(all_image_labels), np.array(all_rna_labels)), axis=0)
    all_domain_labels = np.concatenate(
        (
            np.repeat("image", len(all_image_labels)),
            np.repeat("rna", len(all_rna_labels)),
        ),
        axis=0,
    )
    all_cell_ids = np.concatenate((image_cell_ids, rna_cell_ids), axis=0)

    # Fit UMAP on the pooled latents and find the embedding's bounding box.
    mapper = UMAP(random_state=random_state)
    transformed = mapper.fit_transform(all_latents)
    min_umap_c1 = min(transformed[:, 0])
    max_umap_c1 = max(transformed[:, 0])
    min_umap_c2 = min(transformed[:, 1])
    max_umap_c2 = max(transformed[:, 1])

    # Bilinear interpolation over the bounding box: x interpolates along the
    # top and bottom edges, y blends between them -> a 10x10 point grid.
    test_pts = np.array([
        (np.array([min_umap_c1, max_umap_c2]) *
         (1 - x) + np.array([max_umap_c1, max_umap_c2]) * x) * (1 - y) +
        (np.array([min_umap_c1, min_umap_c2]) *
         (1 - x) + np.array([max_umap_c1, min_umap_c2]) * x) * y
        for y in np.linspace(0, 1, 10) for x in np.linspace(0, 1, 10)
    ])

    # Map grid points back to latent space, then batch them for decoding.
    inv_transformed_points = mapper.inverse_transform(test_pts)
    test_pts_ds = torch.utils.data.TensorDataset(
        torch.from_numpy(inv_transformed_points))
    test_pts_loader = torch.utils.data.DataLoader(test_pts_ds,
                                                  batch_size=64,
                                                  shuffle=False)

    # Decode each grid latent through BOTH decoders so every grid point has
    # an image reconstruction and an RNA/geneset reconstruction.
    for i, sample in enumerate(test_pts_loader):
        image_recons = image_ae.decode(sample[0].to(device))
        rna_recons, decoded_geneset_activities = geneset_ae.decode(
            sample[0].to(device))

        grid_images.extend(list(image_recons.clone().detach().cpu().numpy()))
        grid_sequences.extend(list(rna_recons.clone().detach().cpu().numpy()))
        grid_geneset_activities.extend(
            list(decoded_geneset_activities.clone().detach().cpu().numpy()))

    data_dict = {
        "grid_points": test_pts,
        "grid_images": grid_images,
        "grid_sequences": grid_sequences,
        "grid_geneset_activities": grid_geneset_activities,
        "all_latents": all_latents,
        "all_labels": all_labels,
        "all_domain_labels": all_domain_labels,
        "all_cell_ids": all_cell_ids,
    }

    return data_dict
Ejemplo n.º 4
0
class PCAUmap:
    """Pipeline of optional standard scaling -> optional (Kernel)PCA -> UMAP,
    fitted on imputation-augmented data, with inverse mapping back to the
    original feature space for decision-surface visualization.

    BUG FIXES relative to the original:
    - ``transform``/``inverse_transform`` previously branched on
      ``self.pca is None``, which is never true (``self.pca`` is always a PCA
      or KernelPCA instance), while ``fit`` branched on ``self.use_pca``.
      With ``use_pca=None`` this applied PCA in ``transform`` even though
      ``fit`` had skipped it. All methods now branch consistently on
      ``self.use_pca``.
    - The ``scaler`` constructor flag was silently ignored (a StandardScaler
      was always created). It is now honored; the default ``scaler=True``
      preserves the original default behavior.
    """

    def __init__(
        self,
        n_neighbors=15,
        use_pca=1,
        kernel='linear',
        min_dist=0.1,
        n_components=2,
        random_state=None,
        transform_seed=None,
        scaler=True,
        metric="euclidean",
        augment_size=3,
        impute_rate=0.1,
    ):
        """Build the (scaler, PCA, UMAP) pipeline components.

        Args:
            n_neighbors, min_dist, n_components, metric, random_state,
            transform_seed: Forwarded to UMAP.
            use_pca: If None, skip the PCA stage; any other value enables it.
            kernel: 'linear' uses plain PCA; anything else uses KernelPCA
                with an inverse transform fitted.
            scaler: If truthy, standard-scale the data before PCA/UMAP.
            augment_size: Number of noisy copies of the data used to augment
                the fit set (see ``augumentation``).
            impute_rate: Fraction of entries randomly masked and re-imputed
                in the augmented copies.
        """
        if kernel == 'linear':
            self.pca = PCA()
        else:
            # fit_inverse_transform is required for inverse_transform below.
            self.pca = KernelPCA(kernel=kernel, fit_inverse_transform=True)
        self.umap = UMAP(
            random_state=random_state,
            transform_seed=transform_seed,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric,
        )
        self.use_pca = use_pca
        self.random_state = random_state
        # Honor the flag; default True keeps the original behavior.
        self.scaler = StandardScaler() if scaler else None
        self.data = None
        self.pca_features = None
        self.embedding = None
        self.imputer = KNNImputer()
        self.augment_size = augment_size
        self.impute_rate = impute_rate

    def fit(self, data):
        """Fit scaler/PCA/UMAP on imputation-augmented data, then store the
        embedding of the ORIGINAL (non-augmented) data. Returns self."""
        self.data = pd.DataFrame(data)
        augmented_data = self.augumentation(self.augment_size,
                                            self.impute_rate)

        if self.scaler is None:
            if self.use_pca is None:
                self.umap.fit(augmented_data)
                self.embedding = self.umap.transform(data)
            else:
                self.umap.fit(self.pca.fit_transform(augmented_data))
                self.pca_features = self.pca.transform(data)
                self.embedding = self.umap.transform(self.pca_features)
        else:
            if self.use_pca is None:
                self.umap.fit(self.scaler.fit_transform(augmented_data))
                self.embedding = self.umap.transform(
                    self.scaler.transform(data))
            else:
                self.umap.fit(
                    self.pca.fit_transform(
                        self.scaler.fit_transform(augmented_data)))
                self.pca_features = self.pca.transform(
                    self.scaler.transform(data))
                self.embedding = self.umap.transform(self.pca_features)
        return self

    def transform(self, data):
        """Project new data through the fitted pipeline; returns and caches
        the UMAP embedding."""
        self.data = pd.DataFrame(data)
        # BUG FIX: branch on use_pca (as fit does), not `self.pca is None`.
        if self.scaler is None:
            if self.use_pca is None:
                self.embedding = self.umap.transform(data)
            else:
                self.pca_features = self.pca.transform(data)
                self.embedding = self.umap.transform(self.pca_features)
        else:
            if self.use_pca is None:
                self.embedding = self.umap.transform(
                    self.scaler.transform(data))
            else:
                self.pca_features = self.pca.transform(
                    self.scaler.transform(data))
                self.embedding = self.umap.transform(self.pca_features)
        return self.embedding

    def fit_transform(self, data):
        """Fit the pipeline on ``data`` and return its embedding."""
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, embedded):
        """Map UMAP-space points back to the original feature space by
        inverting each fitted stage in reverse order."""
        # BUG FIX: branch on use_pca for consistency with fit/transform.
        result = self.umap.inverse_transform(embedded)
        if self.use_pca is not None:
            result = self.pca.inverse_transform(result)
        if self.scaler is not None:
            result = self.scaler.inverse_transform(result)
        return result

    def pca_summary(self, c=None):
        """Plot PC1/PC2 scatter, the first two loadings, and the cumulative
        explained-variance curve.

        NOTE(review): relies on ``explained_variance_ratio_`` and
        ``components_``, which plain PCA provides but KernelPCA does not —
        only call this when kernel='linear'; confirm before extending.
        """
        plt.figure(figsize=(6, 6))
        if c is None:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5)
        else:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5,
                        c=c)
        plt.xlabel("PC1 ({}%)".format(
            int(self.pca.explained_variance_ratio_[0] * 100)))
        plt.ylabel("PC2 ({}%)".format(
            int(self.pca.explained_variance_ratio_[1] * 100)))
        plt.grid()
        plt.show()
        plt.figure(figsize=(6, 6))
        plt.scatter(self.pca.components_[0],
                    self.pca.components_[1],
                    alpha=0.5)
        plt.xlabel("loading 1")
        plt.ylabel("loading 2")
        plt.grid()
        plt.show()
        plt.figure(figsize=(6, 6))
        plt.plot([0] + list(np.cumsum(self.pca.explained_variance_ratio_)),
                 "-o")
        plt.xlabel("Number of principal components")
        plt.ylabel("Cumulative contribution ratio")
        plt.grid()
        plt.show()

    def map_predicted_values(
            self,
            model,
            c=None,
            alpha=0.5,
            edgecolors="k",
            figsize=(8, 6),
            h=0.2,
            cm=plt.cm.jet,
    ):
        """Draw ``model``'s decision surface over the 2D embedding: evaluate
        the model on a grid inverse-mapped to feature space, contour the
        predictions, and overlay the embedded points.

        Args:
            model: Fitted estimator; probed via predict_proba, then
                decision_function, then predict (first available wins).
            c: Optional per-point colors for the scatter overlay.
            h: Grid step in embedding units.
        """
        x_min = self.embedding[:, 0].min() - 0.5
        x_max = self.embedding[:, 0].max() + 0.5
        y_min = self.embedding[:, 1].min() - 0.5
        y_max = self.embedding[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        plt.figure(figsize=figsize)
        grid = self.inverse_transform(np.c_[xx.ravel(), yy.ravel()])
        if hasattr(model, "predict_proba"):
            Z = model.predict_proba(grid)[:, 1]
        elif hasattr(model, "decision_function"):
            Z = model.decision_function(grid)
        else:
            Z = model.predict(grid)

        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=alpha, cmap=cm)
        plt.colorbar()
        if c is None:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                edgecolors=edgecolors,
            )
        else:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                c=c,
                edgecolors=edgecolors,
            )
        plt.grid()
        plt.show()

    def augumentation(self, augment_size, rate):
        """Return ``self.data`` concatenated with ``augment_size`` noisy
        copies where a ``rate`` fraction of entries was masked (via the
        external ``fill_randomly`` helper) and KNN-imputed.

        NOTE: method name typo ("augumentation") kept for API compatibility.
        """
        augmented_data = pd.concat([self.data] * augment_size).values
        augmented_data = fill_randomly(augmented_data, np.nan, rate)
        augmented_data = pd.DataFrame(
            self.imputer.fit_transform(augmented_data))
        augmented_data = pd.concat([self.data, augmented_data])
        return augmented_data