def test_umap_transform_embedding_stability(iris, iris_selection):
    """Test that transforming data does not alter the learned embeddings

    Issue #217 describes how using transform to embed new data with a
    trained UMAP transformer causes the fitted embedding matrix to change
    when the new data has the same number of rows as the original
    training data.
    """

    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    original_embedding = fitter.embedding_.copy()

    # The important point is that the new data has the same number of rows
    # as the original fit data
    new_data = np.random.random(data.shape)
    _ = fitter.transform(new_data)

    assert_array_equal(
        original_embedding,
        fitter.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Example from issue #217
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    umap = UMAP()
    u1 = umap.fit_transform(a[:, :5])
    u1_orig = u1.copy()
    assert_array_equal(u1_orig, umap.embedding_)

    _ = umap.transform(b)
    assert_array_equal(u1_orig, umap.embedding_)
Example 2

def test_densmap_trustworthiness_on_iris(iris):
    densmap_iris_model = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        densmap=True,
        verbose=True,
    ).fit(iris.data)
    embedding = densmap_iris_model.embedding_
    trust = trustworthiness(iris.data, embedding, 10)
    assert (
        trust >= 0.97
    ), "Insufficiently trustworthy embedding for iris dataset: {}".format(trust)

    with pytest.raises(NotImplementedError):
        densmap_iris_model.transform(iris.data[:10])

    with pytest.raises(ValueError):
        densmap_iris_model.inverse_transform(embedding[:10])

    with pytest.raises(NotImplementedError):
        _ = UMAP(
            n_neighbors=10,
            min_dist=0.01,
            random_state=42,
            densmap=True,
            verbose=True,
        ).fit(iris.data, y=iris.target)
Example 3

def run_svc(train_pkl, val_pkl, test_pkl, series=False, outcome_col='Disease_State', num_random_search=0):
    train_methyl_array = MethylationArray.from_pickle(train_pkl)
    val_methyl_array = MethylationArray.from_pickle(val_pkl)
    test_methyl_array = MethylationArray.from_pickle(test_pkl)

    umap = UMAP(n_components=100)
    umap.fit(train_methyl_array.beta)
    train_methyl_array.beta = pd.DataFrame(umap.transform(train_methyl_array.beta.values),
                                           index=train_methyl_array.return_idx())
    val_methyl_array.beta = pd.DataFrame(umap.transform(val_methyl_array.beta),
                                         index=val_methyl_array.return_idx())
    test_methyl_array.beta = pd.DataFrame(umap.transform(test_methyl_array.beta),
                                          index=test_methyl_array.return_idx())

    model = SVC
    model = MachineLearning(model,
                            options={'penalty': 'l2', 'verbose': 3, 'n_jobs': 35,
                                     'class_weight': 'balanced'},
                            grid={'C': [1, 10, 100, 1000],
                                  'gamma': [1, 0.1, 0.001, 0.0001],
                                  'kernel': ['linear', 'rbf']},
                            n_eval=num_random_search,
                            series=series,
                            labelencode=True,
                            verbose=True)

    sklearn_model = model.fit(train_methyl_array, val_methyl_array, outcome_col)
    pickle.dump(sklearn_model, open('sklearn_model.p', 'wb'))

    y_pred = model.predict(test_methyl_array)
    pd.DataFrame(np.hstack((y_pred[:, np.newaxis],
                            test_methyl_array.pheno[outcome_col].values[:, np.newaxis])),
                 index=test_methyl_array.return_idx(),
                 columns=['y_pred', 'y_true']).to_csv('SklearnPredictions.csv')

    original, std_err, (low_ci, high_ci) = model.return_outcome_metric(
        test_methyl_array, outcome_col, accuracy_score, run_bootstrap=True)

    results = {'score': original, 'Standard Error': std_err,
               '0.95 CI Low': low_ci, '0.95 CI High': high_ci}

    print('\n'.join(['{}:{}'.format(k, v) for k, v in results.items()]))
Example 4

def run_umap(args):
    mapper = UMAP(n_neighbors=args.n_neighbors,
                  random_state=args.seed,
                  init="random")
    mapper.fit(X_train, y=y_train)
    Z = mapper.embedding_
    Z_test = mapper.transform(X_test)
    return Z, Z_test
Example 5

def do_cluster(ilayer):
    if ilayer in these_layers:
        fit = UMAP(n_components=cluster_ndims, verbose=3,
                   n_neighbors=umap_n_neighbors,
                   min_dist=umap_min_distance).fit(activations_tocluster[ilayer])
        return fit, fit.transform(activations_tocluster[ilayer])
    else:
        return None, None
Example 6

class Umap:
    """
    This transformer transforms all vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of UMAP. We're using the implementation in [umap-learn](https://umap-learn.readthedocs.io/en/latest/).

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the UMAP algorithm

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import Umap

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(Umap(3)).plot_interactive_matrix('umap_0', 'umap_1', 'umap_2')
    ```
    """
    def __init__(self, n_components=2, **kwargs):
        self.is_fitted = False
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = UMAP(n_components=n_components, **kwargs)

    def __call__(self, embset):
        if not self.is_fitted:
            self.fit(embset)
        return self.transform(embset)

    def fit(self, embset):
        names, X = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=UserWarning)
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            self.tfm.fit(X)
        self.is_fitted = True

    def transform(self, embset):
        names, X = embset_to_X(embset=embset)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=NumbaPerformanceWarning)
            new_vecs = self.tfm.transform(X)
        names_out = names + [f"umap_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict,
                            name=f"{embset.name}.umap_{self.n_components}()")
Example 7

def _umap_projection(embeddings, n_axes, **kwargs):
    embeddings_matrix = np.stack(embeddings.values())
    # Pass the requested number of axes through to UMAP (previously these
    # arguments were accepted but silently ignored).
    umap = UMAP(n_components=n_axes, **kwargs)

    umap.fit(embeddings_matrix)
    projected_matrix = umap.transform(embeddings_matrix)
    projected_embeddings = {
        embedding_id: projected_matrix[i, :]
        for i, embedding_id in enumerate(embeddings)
    }
    return projected_embeddings
Example 8

def test_umap_graph_layout():
    data, labels = make_blobs(n_samples=500, n_features=10, centers=5)
    model = UMAP(n_epochs=100, transform_mode="graph")
    graph = model.fit_transform(data)
    assert scipy.sparse.issparse(graph)
    nc, cl = scipy.sparse.csgraph.connected_components(graph)
    assert nc == 5

    new_graph = model.transform(data[:10] +
                                np.random.normal(0.0, 0.1, size=(10, 10)))
    assert scipy.sparse.issparse(new_graph)
    assert new_graph.shape[0] == 10
Example 9

def draw_umap(
    data,
    n_neighbors=15,
    min_dist=0.1,
    c=None,
    n_components=2,
    metric="euclidean",
    title="",
    plot=True,
    cmap=None,
    use_plotly=False,
    **kwargs,
):
    """Generate plot of UMAP algorithm results based on specified arguments
    """
    fit = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=42,
    )
    mapper = fit.fit(data)
    u = fit.transform(data)
    if plot:
        if use_plotly:
            fig = px.scatter(
                x=u[:, 0], y=u[:, 1], color=c, title=title, **kwargs
            )
            fig.update_layout(
                {
                    "plot_bgcolor": "rgba(0, 0, 0, 0)",
                    "paper_bgcolor": "rgba(0, 0, 0, 0)",
                }
            )
            fig.show()
        else:
            fig = plt.figure()
            if n_components == 1:
                ax = fig.add_subplot(111)
                ax.scatter(u[:, 0], range(len(u)), c=c)
            if n_components == 2:
                ax = fig.add_subplot(111)
                scatter = ax.scatter(u[:, 0], u[:, 1], c=c, label=c, cmap=cmap)
            if n_components == 3:
                ax = fig.add_subplot(111, projection="3d")
                ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=c, s=100)
            plt.title(title, fontsize=18)
            if n_components == 2:
                # `scatter` is only defined in the 2-D branch above
                legend = ax.legend(*scatter.legend_elements())
                ax.add_artist(legend)

    return u, mapper
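
A minimal usage sketch for the helper above (the dataset choice and argument
values are illustrative assumptions, not part of the original snippet; it also
assumes the same imports the function relies on, i.e. UMAP and matplotlib):

from sklearn.datasets import load_iris

iris = load_iris()
# 2-D embedding of iris, colored by class label (hypothetical example data)
u, mapper = draw_umap(iris.data, n_neighbors=20, c=iris.target,
                      title="Iris, n_neighbors=20")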
Example 10

def example_2():
    digits_dataset = load_digits()

    digits_data = digits_dataset['data']
    digits_target = digits_dataset['target']

    X_tr, X_ts, y_tr, y_ts = train_test_split(digits_data,
                                              digits_target,
                                              test_size=0.40,
                                              random_state=42,
                                              stratify=digits_target)

    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = KMeans()

    reval = FindBestClustCV(s=s, c=c, nfold=5, nclust_range=[2, 15], nrand=100)

    metrics, nclustbest, _ = reval.best_nclust(X_tr,
                                               iter_cv=10,
                                               strat_vect=y_tr)

    plot_metrics(metrics, title='Reval performance digits dataset')

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nclustbest}")
    print(f"Test set prediction ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nclustbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()
Example 11

def run_dimention_reduction(train_x, test_x, train_y):
    """
    Function that performs dimensionality reduction (PCA ⇒ UMAP).
    """
    # First, use PCA to reduce the dimensionality to half the original
    n_components = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=n_components).fit(train_x)
    reduced_train_x = pd.DataFrame(pca.transform(train_x))
    reduced_test_x = pd.DataFrame(pca.transform(test_x))

    # Reduce to 2 dimensions with UMAP
    reducer = UMAP(random_state=0)
    reducer.fit(reduced_train_x)
    reduced_train_x = pd.DataFrame(reducer.transform(reduced_train_x))
    reduced_test_x = pd.DataFrame(reducer.transform(reduced_test_x))

    # Standardize
    reduced_train_x = cf.standardize(reduced_train_x)
    reduced_test_x = cf.standardize(reduced_test_x)

    reduced_train_x.columns = ["umap_1", "umap_2"]
    reduced_test_x.columns = ["umap_1", "umap_2"]
    return reduced_train_x, reduced_test_x
Example 12

def run(n_init,
        max_features,
        umap_n_components,
        dataset,
        val_dataset,
        result_dir,
        random_state
        ):
    # Set random states
    np.random.seed(random_state)

    # load data
    train_df = pd.read_csv(dataset)

    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()

    val_df = pd.read_csv(val_dataset)

    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()

    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english')
    X_train = tfidf.fit_transform(train_texts)
    X_test = tfidf.transform(val_texts)

    umap = UMAP(n_components=umap_n_components)
    X_train = umap.fit_transform(X_train.toarray())
    X_test = umap.transform(X_test.toarray())

    kmeans = KMeans(n_init=n_init, n_clusters=len(np.unique(train_labels)))
    kmeans.fit(X_train)
    predicted_labels = kmeans.predict(X_test)

    best_matching, accuracy = cluster_accuracy(val_labels, predicted_labels)
    ari = adjusted_rand_score(val_labels, predicted_labels)
    nmi = normalized_mutual_info_score(val_labels, predicted_labels)
    purity = purity_score(y_true=val_labels, y_pred=predicted_labels)

    run_results = {}
    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results['purity'] = purity  # use purity to compare with microsoft paper

    os.makedirs(result_dir, exist_ok=True)
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20newsgroups-kmeans.csv'), index=False)
Example 13

def test_umap_transform_on_iris_modified_dtype(iris, iris_selection):
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(data)
    fitter.embedding_ = fitter.embedding_.astype(np.float64)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.8,
        "Insufficiently trustworthy transform for iris dataset: {}".format(
            trust),
    )
Example 14

def test_umap_transform_on_iris(iris, iris_selection):
    data = iris.data[iris_selection]
    fitter = UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200,
                  random_state=42).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert_greater_equal(
        trust,
        0.85,
        "Insufficiently trustworthy transform for"
        "iris dataset: {}".format(trust),
    )
Example 15

def umap_reduce(docvecs, label, umap_model, use_nn, use_umap, **kwargs):
    if not use_umap:
        return np.array(docvecs), None

    if not umap_model:
        print(f"Train UMAP...")
        umap_n_components = min(256, len(docvecs)-2) if use_nn else 1
        umap_model = UMAP(metric="cosine", set_op_mix_ratio=1.0,
                          n_components=umap_n_components, random_state=42,
                          verbose=False)
        umap_model = umap_model.fit(docvecs, y=label)
    dim_reduced_vecs = umap_model.transform(docvecs)
    if not use_nn:
        dim_reduced_vecs = dim_reduced_vecs.astype(float)
    return dim_reduced_vecs, umap_model
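
A minimal sketch of how this helper might be called (shapes, labels, and flag
values are assumptions for illustration, not from the original project):

import numpy as np

docvecs = np.random.rand(100, 64)
labels = np.random.randint(0, 2, size=100)

# use_nn=False yields a 1-component embedding cast to float;
# passing the returned model back in skips re-training UMAP.
vecs, model = umap_reduce(docvecs, labels, umap_model=None,
                          use_nn=False, use_umap=True)
vecs2, _ = umap_reduce(docvecs, labels, umap_model=model,
                       use_nn=False, use_umap=True)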
Example 16

def run_dimention_reduction(train_x, test_x, train_y):
    """
    Function that performs dimensionality reduction (PCA ⇒ UMAP).
    """
    # First, use PCA to reduce the dimensionality to half the original
    n_components = round(len(train_x.columns) * 0.5)
    pca = PCA(n_components=n_components).fit(train_x)
    reduced_train_x = pd.DataFrame(pca.transform(train_x))
    reduced_test_x = pd.DataFrame(pca.transform(test_x))

    # Reduce to 2 dimensions with UMAP
    reducer = UMAP(random_state=0)
    reducer.fit(reduced_train_x)
    reduced_train_x = pd.DataFrame(reducer.transform(reduced_train_x))
    reduced_test_x = pd.DataFrame(reducer.transform(reduced_test_x))
    reduced_train_x.columns = ["umap_1", "umap_2"]
    reduced_test_x.columns = ["umap_1", "umap_2"]

    # df = pd.concat([reduced_train_x, train_y], axis=1)
    # plt.figure()
    # plt.scatter(df.loc[:, 0], df.loc[:, 1], c=df.loc[:, "y"])
    # plt.colorbar()
    # plt.savefig(f"{DATA_DIR}/dimension_reduction.png")
    return reduced_train_x, reduced_test_x
Example 17

class UMAPRepresentation(Representation):

    @staticmethod
    def default_config():
        default_config = Dict()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.n_neighbors = 15
        default_config.parameters.metric = 'euclidean'
        default_config.parameters.init = 'spectral'
        default_config.parameters.random_state = None
        default_config.parameters.min_dist = 0.1

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config={}, **kwargs):
        Representation.__init__(self, config=config, **kwargs)

        # input size (flatten)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range
        self.feature_range = (0.0, 1.0)

        self.algorithm = UMAP()
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        ''' 
        X_train: array-like (n_samples, n_features)
        '''
        X_train = np.nan_to_num(X_train)
        if update_range:
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))  # save (min, max) for normalization
        X_train = (X_train - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        x = (x - self.feature_range[0]) / (self.feature_range[1] - self.feature_range[0])
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        self.algorithm.set_params(n_components=self.n_latents, **self.config.parameters, verbose=False)
Example 18

def draw_umap(data,
              n_neighbors=15,
              min_dist=0.1,
              c=None,
              n_components=2,
              metric='euclidean',
              title='',
              plot=True,
              cmap=None,
              use_plotly=False,
              **kwargs):
    fit = UMAP(n_neighbors=n_neighbors,
               min_dist=min_dist,
               n_components=n_components,
               metric=metric,
               random_state=42)
    mapper = fit.fit(data)
    u = fit.transform(data)
    if plot:
        if use_plotly:
            fig = px.scatter(x=u[:, 0],
                             y=u[:, 1],
                             color=c,
                             title=title,
                             **kwargs)
            fig.update_layout({
                'plot_bgcolor': 'rgba(0, 0, 0, 0)',
                'paper_bgcolor': 'rgba(0, 0, 0, 0)',
            })
            fig.show()
        else:
            fig = plt.figure()
            if n_components == 1:
                ax = fig.add_subplot(111)
                ax.scatter(u[:, 0], range(len(u)), c=c)
            if n_components == 2:
                ax = fig.add_subplot(111)
                scatter = ax.scatter(u[:, 0], u[:, 1], c=c, label=c, cmap=cmap)
            if n_components == 3:
                ax = fig.add_subplot(111, projection='3d')
                ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=c, s=100)
            plt.title(title, fontsize=18)
            if n_components == 2:
                # `scatter` is only defined in the 2-D branch above
                legend = ax.legend(*scatter.legend_elements())
                ax.add_artist(legend)

    return u, mapper
Example 19

def test_umap_transform_on_iris_w_pynndescent(iris, iris_selection):
    data = iris.data[iris_selection]
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        n_epochs=100,
        random_state=42,
        force_approximation_algorithm=True,
    ).fit(data)

    new_data = iris.data[~iris_selection]
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert (
        trust >= 0.85
    ), "Insufficiently trustworthy transform for iris dataset: {}".format(trust)
Example 20

def test_precomputed_sparse_transform_on_iris(iris, iris_selection):
    data = iris.data[iris_selection]
    distance_matrix = sparse.csr_matrix(squareform(pdist(data)))

    fitter = UMAP(n_neighbors=10,
                  min_dist=0.01,
                  random_state=42,
                  n_epochs=100,
                  metric='precomputed').fit(distance_matrix)

    new_data = iris.data[~iris_selection]
    new_distance_matrix = sparse.csr_matrix(cdist(new_data, data))
    embedding = fitter.transform(new_distance_matrix)

    trust = trustworthiness(new_data, embedding, 10)
    assert (
        trust >= 0.85
    ), "Insufficiently trustworthy transform for iris dataset: {}".format(trust)
Example 21

def relabel(dict_imp, plot_scatter=False):
    """
    Function that return the imputed datasets with new labels. Obtained by pasting the
    subdomain cluster labels with the domain cluster labels.

    :param dict_imp: imputed datasets
    :type dict_imp: dict
    :param plot_scatter: flag for a scatterplot with new labels
    :type plot_scatter: bool
    :return: imput dictionary with new cluster label column
    :rtype: dict
    """
    transform = UMAP(random_state=42, n_neighbors=30, min_dist=0.0)
    subdomain_feat = [c for c in dict_imp['P1'][0].columns if re.search('vscore', c) and not re.search('written', c)]
    domain_feat = [c for c in dict_imp['P1'][0].columns if re.search('totalb', c) and not re.search('composite', c)]
    col = subdomain_feat + domain_feat
    flatui = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2",
              "#7f7f7f", "#bcbd22", "#17becf", "#8c564b", "#a55194"]

    for p, df in dict_imp.items():
        df[0]['new_cluster'] = ['-'.join([str(a), str(b)]) for a, b in
                                zip(df[0]['cluster_subdomain'].tolist(),
                                    df[0]['cluster_domain'].tolist())]
        df[1]['new_cluster'] = ['-'.join([str(a), str(b)]) for a, b in
                                zip(df[1]['cluster_subdomain'].tolist(),
                                    df[1]['cluster_domain'].tolist())]
        X_tr = transform.fit_transform(df[0][col])
        X_ts = transform.transform(df[1][col])

        if plot_scatter:
            _scatter_plot(X_tr,
                          [(gui, cl) for gui, cl in zip(df[0].index, df[0].new_cluster)],
                          flatui,
                          10, 20,
                          {str(ncl): ' '.join(['cluster', str(ncl)]) for ncl in sorted(np.unique(df[0].new_cluster))},
                          title=f'New labels Vineland {p} training set')
            _scatter_plot(X_ts,
                          [(gui, cl) for gui, cl in zip(df[1].index, df[1].new_cluster)],
                          flatui,
                          10, 20,
                          {str(ncl): ' '.join(['cluster', str(ncl)]) for ncl in sorted(np.unique(df[1].new_cluster))},
                          title=f'New labels Vineland {p} test set')
    return dict_imp
Example 22

def umap_graph(data, plot=True):
    near_n = 15
    min_d = 0.1
    rand_s = np.random.RandomState(30)

    transformer = UMAP(n_neighbors=near_n, random_state=rand_s, min_dist=min_d).fit(data)
    raw_emb = transformer.transform(data)
    kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=rand_s)
    labels = kmeans.fit(raw_emb)

    node_l = labels.labels_

    sim_set = umap.umap_.fuzzy_simplicial_set(data, near_n, random_state=rand_s,
                                              metric="euclidean")
    sim_set = sim_set[0].toarray()
    if plot:
        nx.draw(nx.DiGraph(sim_set), arrows=False, node_size=30, width=0.1, node_color = node_l, cmap='viridis')

    return sim_set
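
For example (a hedged sketch; the blob parameters are arbitrary, and it assumes
the imports the function uses — numpy, umap, and scikit-learn — are in scope):

from sklearn.datasets import make_blobs

blob_data, _ = make_blobs(n_samples=200, n_features=8, centers=4, random_state=0)
sim = umap_graph(blob_data, plot=False)
print(sim.shape)  # dense (200, 200) fuzzy simplicial set adjacency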
Example 23

def test_ingest_map_embedding_umap():
    adata_ref = sc.AnnData(X)
    adata_new = sc.AnnData(T)

    sc.pp.neighbors(adata_ref,
                    method='umap',
                    use_rep='X',
                    n_neighbors=4,
                    random_state=0)
    sc.tl.umap(adata_ref, random_state=0)

    ing = sc.tl.Ingest(adata_ref)
    ing.fit(adata_new)
    ing.map_embedding(method='umap')

    reducer = UMAP(min_dist=0.5, random_state=0, n_neighbors=4)
    reducer.fit(X)
    umap_transformed_t = reducer.transform(T)

    assert np.allclose(ing._obsm['X_umap'], umap_transformed_t)
Example 24

def test_umap_sparse_transform_on_iris(iris, iris_selection):
    data = sparse.csr_matrix(iris.data[iris_selection])
    assert sparse.issparse(data)
    fitter = UMAP(
        n_neighbors=10,
        min_dist=0.01,
        random_state=42,
        n_epochs=100,
        # force_approximation_algorithm=True,
    ).fit(data)

    new_data = sparse.csr_matrix(iris.data[~iris_selection])
    assert sparse.issparse(new_data)
    embedding = fitter.transform(new_data)

    trust = trustworthiness(new_data, embedding, 10)
    assert (
        trust >= 0.80
    ), "Insufficiently trustworthy transform for iris dataset: {}".format(trust)
Example 25

def plot_umap(X,
              Y,
              validation_data=None,
              style='starplot',
              p_size=3.5,
              save_img=False,
              img_res=300,
              fig_res=72,
              random_state=None):
    from umap import UMAP

    if validation_data is None:
        validation_data = (X, Y)
    X_valid, Y_valid = validation_data
    if style == 'starplot':
        plt.style.use(['dark_background'])
        plt.rcParams['figure.figsize'] = (15, 15)
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.size'] = 14
        plt.rcParams['figure.dpi'] = fig_res
    umap = UMAP(n_neighbors=25, random_state=random_state)
    umap.fit(X, Y.ravel())
    embeddings = umap.transform(X_valid)
    embeddings = np.array(embeddings)
    size = p_size
    cmap = LinearSegmentedColormap.from_list("recy", ["magenta", "cyan"])
    for point in range(1, 10):
        plt.scatter(
            embeddings[:, 0],
            embeddings[:, 1],
            c=Y_valid.ravel(),
            cmap=cmap,
            s=5 * point**size,
            alpha=1 / (point**size),
            edgecolors='none',
        )
    file_name = './plots/s' + str(int(size)) + '_umap.png'
    if save_img:
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        plt.savefig(file_name, dpi=img_res, transparent=True)
    plt.show()
Example 26

    def umap_function(self, semi_super=True):
        umap = UMAP()
        if semi_super:
            X_train = umap.fit_transform(self.raw_data["X_train"],
                                         y=self.raw_data["y_train"])
        else:
            X_train = umap.fit_transform(self.raw_data["X_train"])
        X_test = umap.transform(self.raw_data["X_test"])

        d_data = {
            'X_train': X_train,
            'y_train': self.raw_data["y_train"],
            'X_test': X_test,
            'y_test': self.raw_data["y_test"],
        }

        if semi_super:
            self.umap_sup = umap
            self.umap_sup_data = d_data

        else:
            self.umap = umap
            self.umap_data = d_data
Example 27

class UmapTransformer(object):
    def __init__(self,
                 n_components=2,
                 embed_n_neighbors=10,
                 target_metric='categorical'):
        self.umap = UMAP(n_components=n_components,
                         n_neighbors=embed_n_neighbors,
                         target_metric=target_metric,
                         transform_seed=1)
        self.rf = RandomForestRegressor(n_estimators=300)

    def fit(self, x, y=None):
        self.umap.fit(x, y)
        x_low = self.umap.transform(x)
        self.rf.fit(x, x_low)
        return self

    def transform(self, x):
        return self.rf.predict(x)

    def fit_transform(self, x, y):
        return self.fit(x, y).transform(x)
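
A usage sketch for the wrapper above (synthetic data, purely illustrative). The
apparent design intent is to train the random forest as a surrogate that maps raw
features directly to UMAP coordinates, so transform() never calls UMAP at all:

import numpy as np

X = np.random.rand(300, 20)
y = np.random.randint(0, 3, size=300)  # hypothetical categorical targets

tfm = UmapTransformer()
x_low = tfm.fit_transform(X, y)                # UMAP on (X, y), then RF on (X, x_low)
x_new = tfm.transform(np.random.rand(10, 20))  # RF prediction only
print(x_low.shape, x_new.shape)                # (300, 2) (10, 2)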
Example 28

def predict_adata(model,
                  adata,
                  make_umap=True,
                  umap_fit_n=10000,
                  batch_size=1024):
    dl = get_prediction_dataloader(adata, model.genes, batch_size=batch_size)
    logging.info(f'starting prediction of {dl.dataset.adata.shape[0]} cells')
    emb, y_prob = predict_dl(dl, model)
    a = dl.dataset.adata
    a.obsm['X_emb'] = emb

    if make_umap:
        u = UMAP()
        idxs = np.random.choice(np.arange(a.shape[0]),
                                size=min(umap_fit_n, a.shape[0]),
                                replace=False)
        u.fit(emb[idxs])
        a.obsm['X_umap'] = u.transform(emb)

    a.obsm['prediction_probs'] = y_prob

    a.obs['y_pred'] = [np.argmax(probs) for probs in y_prob]
    a.obs['predicted_cell_type_probability'] = [
        np.max(probs) for probs in y_prob
    ]
    a.obs['predicted_cell_type'] = [
        model.classes[np.argmax(probs)] for probs in y_prob
    ]

    prob_df = pd.DataFrame(data=a.obsm['prediction_probs'],
                           columns=model.classes,
                           index=a.obs.index.to_list())
    prob_df.columns = [f'probability {c}' for c in prob_df.columns]

    a.obs = pd.concat((a.obs, prob_df), axis=1)

    return a
Example 29

class UMAPReducer(Reducer):
    """
    Simple wrapper for UMAP, used for API consistency.
    """
    def __init__(self, n_components=2, **kwargs):
        self.n_components = n_components
        kwargs = {**{'n_neighbors': 10, 'min_dist': 0.}, **kwargs}
        self.model = UMAP(n_components=n_components, **kwargs)

    def fit(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            self.model.fit(X)
        return self

    def transform(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', NumbaWarning)
            result = self.model.transform(X)
        return result

    def decrement_components(self):
        self.n_components -= 1
        self.model.n_components -= 1
Example 30

def get_similarity_matrix(data,
                          n_components_umap=2,
                          n_neighbors_knn=10,
                          random_state=None):
    """The similarity matrix is derived in an unsupervised way
    (e.g., UMAP projection of the data and k-nearest-neighbors or
    distance thresholding to define the adjacency matrix for the batch),
    but can also be used to include weakly-supervised information (e.g.,
    knowledge about diseased vs. non-diseased patients). If labels
    are available, the model could even be used to derive a latent
    representation with supervision. The similarity feature in MoE-Sim-VAE
    thus allows including prior knowledge about the best similarity
    measure on the data.
    """
    flat_data = data.reshape(len(data), -1)
    reducer = UMAP(n_components=n_components_umap,
                   random_state=random_state)
    reducer.fit(flat_data)
    embedding = reducer.transform(flat_data)
    neigh = NearestNeighbors(n_neighbors=n_neighbors_knn)
    neigh.fit(embedding)
    similarity = neigh.kneighbors_graph(embedding).toarray()
    similarity = similarity.astype(np.float32)
    return similarity, embedding
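
A small usage sketch (the batch shape is an assumption for illustration; it
assumes numpy, UMAP, and NearestNeighbors are imported as the function requires):

import numpy as np

batch = np.random.rand(100, 8, 8)  # hypothetical batch of 100 8x8 images
similarity, embedding = get_similarity_matrix(batch, random_state=0)
print(similarity.shape, embedding.shape)  # (100, 100) (100, 2)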
Example 31

from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# get the data
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))


umapper = UMAP(n_neighbors=5, n_components=10)
t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)
t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))