Example #1
def test_pairwise_distances_unsupported_metrics(metric):
    rng = np.random.RandomState(3)

    X = rng.random_sample((5, 4))

    with pytest.raises(ValueError):
        pairwise_distances(X, metric=metric)
Example #2
def test_pairwise_distances_sklearn_comparison(metric: str, matrix_size):
    # Test larger sizes against sklearn
    rng = np.random.RandomState(1)

    element_count = matrix_size[0] * matrix_size[1]

    X = rng.random_sample(matrix_size)
    Y = rng.random_sample(matrix_size)

    # For fp64, compare at 10 decimals, (5 places less than the ~15 max)
    compare_precision = 10

    # Compare to sklearn, fp64
    S = pairwise_distances(X, Y, metric=metric)

    if element_count <= 2000000:
        S2 = sklearn_pairwise_distances(X, Y, metric=metric)
        cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # For fp32, compare at 4 decimals, (3 places less than the ~7 max)
    compare_precision = 4

    X = np.asfarray(X, dtype=np.float32)
    Y = np.asfarray(Y, dtype=np.float32)

    # Compare to sklearn, fp32
    S = pairwise_distances(X, Y, metric=metric)

    if element_count <= 2000000:
        S2 = sklearn_pairwise_distances(X, Y, metric=metric)
        cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
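
The decimal argument used in these comparisons bounds the elementwise error at 1.5 * 10^-decimal, which is why the fp32 path compares fewer places than fp64. A minimal sketch of the tolerance semantics:

import cupy as cp

# decimal=4 tolerates elementwise differences up to 1.5e-4
cp.testing.assert_array_almost_equal(
    cp.array([1.00000]), cp.array([1.00009]), decimal=4)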
Example #3
def mmr(
    doc_embedding,
    word_embeddings,
    words,
    top_n=5,
    diversity=0.8,
):
    """
    Calculate Maximal Marginal Relevance (MMR)
    between candidate keywords and the document.
    MMR considers the similarity of keywords/keyphrases with the
    document, along with the similarity of already selected
    keywords and keyphrases. This results in a selection of keywords
    that are relevant to the document while remaining diverse among themselves.
    Arguments:
        doc_embedding: The document embeddings
        word_embeddings: The embeddings of the selected candidate keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyphrases to return
        diversity: How diverse the selected keywords/keyphrases are.
                   Values between 0 and 1 with 0 being not diverse at all
                   and 1 being most diverse.
    Returns:
         List[str]: The selected keywords/keyphrases
    """

    # Extract similarity within words, and between words and the document
    word_doc_similarity = 1 - pairwise_distances(
        word_embeddings, doc_embedding, metric="cosine")
    word_similarity = 1 - pairwise_distances(word_embeddings, metric="cosine")

    # Initialize candidates and choose the best keyword/keyphrase first
    keywords_idx = cp.argmax(word_doc_similarity)
    target = cp.take(keywords_idx, 0)
    candidates_idx = [i for i in range(len(words)) if i != target]
    for i in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        if i == 0:
            first_row = cp.reshape(
                word_similarity[candidates_idx][:, keywords_idx],
                (word_similarity[candidates_idx][:, keywords_idx].shape[0], 1))
            target_similarities = cp.max(first_row, axis=1)
        else:
            target_similarities = cp.max(
                word_similarity[candidates_idx][:, keywords_idx], axis=1)
        # Calculate MMR
        mmr = (
            1 - diversity
        ) * candidate_similarities - diversity * target_similarities.reshape(
            -1, 1)

        mmr_idx = cp.take(cp.array(candidates_idx), cp.argmax(mmr))

        # Update keywords & candidates
        keywords_idx = cp.append(keywords_idx, mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx.get()]
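
A hypothetical smoke test for mmr() (a sketch: the random vectors below stand in for real document and keyword embeddings, and pairwise_distances is assumed to be imported from cuml.metrics as in the source module):

import cupy as cp

rng = cp.random.RandomState(0)
doc_embedding = rng.random_sample((1, 128))     # one document vector
word_embeddings = rng.random_sample((20, 128))  # 20 candidate vectors
words = ["kw_%d" % i for i in range(20)]

# Returns the top-5 candidates, trading off relevance against diversity
print(mmr(doc_embedding, word_embeddings, words, top_n=5, diversity=0.5))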
Example #4
def test_pairwise_distances(metric: str, matrix_size, is_col_major):
    # Test the pairwise_distances helper function.
    rng = np.random.RandomState(0)

    def prep_array(array):
        return np.asfortranarray(array) if is_col_major else array

    # For fp64, compare at 10 decimals, (5 places less than the ~15 max)
    compare_precision = 10

    # Compare to sklearn, single input
    X = prep_array(rng.random_sample(matrix_size))
    S = pairwise_distances(X, metric=metric)
    S2 = sklearn_pairwise_distances(X, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, double input with same dimensions
    Y = X
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare single and double inputs to each other
    S = pairwise_distances(X, metric=metric)
    S2 = pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, with Y dim != X dim
    Y = prep_array(rng.random_sample((2, matrix_size[1])))
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Change precision of one parameter
    Y = np.asfarray(Y, dtype=np.float32)
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # For fp32, compare at 2 decimals, (5 places less than the ~7 max)
    compare_precision = 2

    # Change precision of both parameters to float
    X = np.asfarray(X, dtype=np.float32)
    Y = np.asfarray(Y, dtype=np.float32)
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Test sending an int type with convert_dtype=True
    Y = prep_array(rng.randint(10, size=Y.shape))
    S = pairwise_distances(X, Y, metric=metric, convert_dtype=True)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Test that uppercase on the metric name throws an error.
    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric=metric.capitalize())
Example #5
def test_pairwise_distances_one_dimension_order(metric: str):
    # Test the pairwise_distances helper function for 1-dimensional cases,
    # which can break down when using a size of 1 for either input
    rng = np.random.RandomState(2)

    Xc = rng.random_sample((1, 4))
    Yc = rng.random_sample((10, 4))
    Xf = np.asfortranarray(Xc)
    Yf = np.asfortranarray(Yc)

    # For fp64, compare at 13 decimals, (2 places less than the ~15 max)
    compare_precision = 13

    # Compare to sklearn, C/C order
    S = pairwise_distances(Xc, Yc, metric=metric)
    S2 = sklearn_pairwise_distances(Xc, Yc, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, C/F order
    S = pairwise_distances(Xc, Yf, metric=metric)
    S2 = sklearn_pairwise_distances(Xc, Yf, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, F/C order
    S = pairwise_distances(Xf, Yc, metric=metric)
    S2 = sklearn_pairwise_distances(Xf, Yc, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, F/F order
    S = pairwise_distances(Xf, Yf, metric=metric)
    S2 = sklearn_pairwise_distances(Xf, Yf, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Switch which input has the single dimension
    Xc = rng.random_sample((10, 4))
    Yc = rng.random_sample((1, 4))
    Xf = np.asfortranarray(Xc)
    Yf = np.asfortranarray(Yc)

    # Compare to sklearn, C/C order
    S = pairwise_distances(Xc, Yc, metric=metric)
    S2 = sklearn_pairwise_distances(Xc, Yc, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, C/F order
    S = pairwise_distances(Xc, Yf, metric=metric)
    S2 = sklearn_pairwise_distances(Xc, Yf, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, F/C order
    S = pairwise_distances(Xf, Yc, metric=metric)
    S2 = sklearn_pairwise_distances(Xf, Yc, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, F/F order
    S = pairwise_distances(Xf, Yf, metric=metric)
    S2 = sklearn_pairwise_distances(Xf, Yf, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
Example #6
def laplacian_kernel(X, Y, gamma=None):
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = -gamma * cp.asarray(pairwise_distances(X, Y, metric='manhattan'))
    cp.exp(K, K)
    return K
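
For reference, laplacian_kernel computes K(x, y) = exp(-gamma * ||x - y||_1). A quick spot check against a direct NumPy computation (a sketch; assumes pairwise_distances is imported from cuml.metrics as in the other examples):

import numpy as np
import cupy as cp

X = np.random.RandomState(0).random_sample((4, 3))
K = laplacian_kernel(X, X, gamma=0.5)

# Recompute exp(-gamma * ||x - y||_1) directly for one pair
expected = np.exp(-0.5 * np.abs(X[0] - X[1]).sum())
assert cp.allclose(K[0, 1], expected)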
Example #7
def test_pairwise_distances_output_types(input_type, output_type, use_global):
    # Test that pairwise_distances honors the requested output type
    rng = np.random.RandomState(5)

    X = rng.random_sample((100, 100))
    Y = rng.random_sample((100, 100))

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        Y = cudf.DataFrame(Y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        Y = cp.asarray(Y)

    # Set to None if we are using the global object
    output_type_param = None if use_global else output_type

    # Use the global manager object. Should do nothing unless use_global is set
    with cuml.using_output_type(output_type):

        # Compare to sklearn, fp64
        S = pairwise_distances(X, Y, metric="euclidean",
                               output_type=output_type_param)

        if output_type == "input":
            assert isinstance(S, type(X))
        elif output_type == "cudf":
            assert isinstance(S, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(S, np.ndarray)
        elif output_type == "cupy":
            assert isinstance(S, cp.ndarray)
Example #8
    def _compute_spearman_rho(self, fp_sample, Xt_sample, top_k=100):
        if hasattr(fp_sample, 'values'):
            fp_sample = fp_sample.values
        dist_array_tani = tanimoto_calculate(fp_sample, calc_distance=True)
        dist_array_eucl = pairwise_distances(Xt_sample)
        return cupy.nanmean(
            spearmanr(dist_array_tani, dist_array_eucl, top_k=top_k))
Example #9
def rbf_kernel(X, Y, gamma=None):
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    K = cp.asarray(pairwise_distances(X, Y, metric='sqeuclidean'))
    K *= -gamma
    cp.exp(K, K)
    return K
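
This is the standard RBF kernel, K(x, y) = exp(-gamma * ||x - y||^2), computed here via the 'sqeuclidean' metric. The same kind of spot check as for the Laplacian kernel above (a sketch):

import numpy as np
import cupy as cp

X = np.random.RandomState(1).random_sample((4, 3))
K = rbf_kernel(X, X, gamma=0.5)

# Recompute exp(-gamma * ||x - y||^2) directly for one pair
expected = np.exp(-0.5 * ((X[0] - X[1]) ** 2).sum())
assert cp.allclose(K[0, 1], expected)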
Example #10
    def _calculate_metric(self, embeddings, fingerprints, top_k=None):
        embeddings_dist = pairwise_distances(embeddings)
        del embeddings

        fingerprints_dist = tanimoto_calculate(fingerprints,
                                               calc_distance=True)
        del fingerprints

        corr = spearmanr(fingerprints_dist, embeddings_dist, top_k)
        return corr
Example #11
def test_run_spearman_rho(pca_approved_drugs_csv,
                          fingerprint_approved_drugs_csv, cluster_column,
                          n_dims_eucl_data, top_k):
    """Validate the spearman rho scoring"""

    # Load PCA data to use as Euclidean distances
    pca_data = pd.read_csv(pca_approved_drugs_csv).set_index('molregno').drop(
        cluster_column, axis=1)
    float_data = pca_data[pca_data.columns[:n_dims_eucl_data]]
    euclidean_dist = pairwise_distances(cupy.array(float_data))

    # Load fingerprints and calculate tanimoto distance
    fp_data = pd.read_csv(fingerprint_approved_drugs_csv).set_index('molregno')
    tanimoto_dist = tanimoto_calculate(cupy.array(fp_data), calc_distance=True)

    # Check all data compared to the CPU version
    all_data_gpu = spearmanr(tanimoto_dist, euclidean_dist)

    euclidean_dist_cpu = cupy.asnumpy(euclidean_dist)
    tanimoto_dist_cpu = cupy.asnumpy(tanimoto_dist)
    all_data_cpu = _rowwise_numpy_corr(tanimoto_dist_cpu, euclidean_dist_cpu,
                                       spearmanr_cpu)

    assert cupy.allclose(cupy.array(all_data_cpu),
                         all_data_gpu,
                         atol=0.005,
                         equal_nan=True)

    # Check using top k calculation compared to the CPU version
    top_k_data_gpu = spearmanr(tanimoto_dist,
                               euclidean_dist,
                               top_k=top_k,
                               axis=1)

    cupy.fill_diagonal(tanimoto_dist, cupy.NaN)
    kth_lim = get_kth_unique_value(tanimoto_dist, top_k, axis=1)
    mask = tanimoto_dist > kth_lim
    tanimoto_dist[mask] = cupy.NaN
    euclidean_dist[mask] = cupy.NaN
    euclidean_dist_cpu = cupy.asnumpy(euclidean_dist)
    tanimoto_dist_cpu = cupy.asnumpy(tanimoto_dist)
    top_k_data_cpu = _rowwise_numpy_corr(tanimoto_dist_cpu, euclidean_dist_cpu,
                                         spearmanr_cpu)

    assert cupy.allclose(cupy.array(top_k_data_cpu),
                         top_k_data_gpu,
                         atol=0.005,
                         equal_nan=True)
Example #12
def test_pairwise_distances_exceptions():

    rng = np.random.RandomState(4)

    X_int = rng.randint(10, size=(5, 4))
    X_double = rng.random_sample((5, 4))
    X_float = np.asfarray(X_double, dtype=np.float32)
    X_bool = rng.choice([True, False], size=(5, 4))

    # Test int inputs (only float/double accepted at this time)
    with pytest.raises(TypeError):
        pairwise_distances(X_int, metric="euclidean")

    # Test an int second input (no exception expected, since convert_dtype
    # defaults to True)
    pairwise_distances(X_double, X_int, metric="euclidean")

    # Test bool inputs (only float/double accepted at this time)
    with pytest.raises(TypeError):
        pairwise_distances(X_bool, metric="euclidean")

    # Test sending different types with convert_dtype=False
    with pytest.raises(TypeError):
        pairwise_distances(X_double, X_float, metric="euclidean",
                           convert_dtype=False)

    # Invalid metric name
    with pytest.raises(ValueError):
        pairwise_distances(X_double, metric="Not a metric")

    # Invalid dimensions
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((5, 7))

    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric="euclidean")
Example #13
def tree_epg(
    X,
    Nodes: Optional[int] = None,
    init: Optional[DataFrame] = None,
    lam: Optional[Union[float, int]] = 0.01,
    mu: Optional[Union[float, int]] = 0.1,
    trimmingradius: float = np.inf,
    initnodes: Optional[int] = None,
    device: str = "cpu",
    seed: Optional[int] = None,
    verbose: bool = True,
):

    try:
        import elpigraph

    except ImportError:
        warnings.warn('ElPiGraph package is not installed. Please use '
                      '"pip install git+https://github.com/j-bac/elpigraph-python.git" '
                      'to install it.')
    logg.hint("parameters used \n"
              "    " + str(Nodes) + " principal points, mu = " + str(mu) +
              ", lambda = " + str(lam))

    if seed is not None:
        np.random.seed(seed)

    if device == "gpu":
        import cupy as cp
        from cuml.metrics import pairwise_distances
        from .utils import cor_mat_gpu

        Tree = elpigraph.computeElasticPrincipalTree(
            X.values.astype(np.float64),
            NumNodes=Nodes,
            Do_PCA=False,
            InitNodes=initnodes,
            Lambda=lam,
            Mu=mu,
            TrimmingRadius=trimmingradius,
            GPU=True,
            verbose=verbose,
        )

        R = pairwise_distances(cp.asarray(X.values),
                               cp.asarray(Tree[0]["NodePositions"]))

        R = cp.asnumpy(R)
        # Hard assignment: one-hot matrix mapping each cell to its nearest node
        R = sparse.csr_matrix(
            (np.repeat(1, R.shape[0]), (range(R.shape[0]), R.argmin(axis=1))),
            R.shape).A

    else:
        from .utils import cor_mat_cpu
        from sklearn.metrics import pairwise_distances

        Tree = elpigraph.computeElasticPrincipalTree(
            X.values.astype(np.float64),
            NumNodes=Nodes,
            Do_PCA=False,
            InitNodes=initnodes,
            Lambda=lam,
            Mu=mu,
            TrimmingRadius=trimmingradius,
            verbose=verbose,
        )

        R = pairwise_distances(X.values, Tree[0]["NodePositions"])
        # Hard assignment: one-hot matrix mapping each cell to its nearest node
        R = sparse.csr_matrix(
            (np.repeat(1, R.shape[0]), (range(R.shape[0]), R.argmin(axis=1))),
            R.shape).A

    g = igraph.Graph(directed=False)
    g.add_vertices(np.unique(Tree[0]["Edges"][0].flatten().astype(int)))
    g.add_edges(
        pd.DataFrame(Tree[0]["Edges"][0]).astype(int).apply(tuple,
                                                            axis=1).values)

    # mat = np.asarray(g.get_adjacency().data)
    # mat = mat + mat.T - np.diag(np.diag(mat))
    # B=((mat>0).astype(int))

    B = np.asarray(g.get_adjacency().data)

    tips = np.argwhere(np.array(g.degree()) == 1).flatten()
    forks = np.argwhere(np.array(g.degree()) > 2).flatten()

    graph = {
        "B": B,
        "R": R,
        "F": Tree[0]["NodePositions"].T,
        "tips": tips,
        "forks": forks,
        "cells_fitted": X.index.tolist(),
        "metrics": "euclidean",
    }

    Tree[0]["Edges"] = list(Tree[0]["Edges"])

    return graph, Tree[0]
Example #14
def cosine_similarity(X, Y):
    K = 1.0 - cp.asarray(pairwise_distances(X, Y, metric='cosine'))
    return cp.nan_to_num(K, copy=False)
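
A brief sanity check for cosine_similarity (a sketch): the similarity of a set of vectors with itself has ones on the diagonal, and nan_to_num guards the all-zero-vector case:

import numpy as np
import cupy as cp

X = np.random.RandomState(2).random_sample((5, 3))
K = cosine_similarity(X, X)
assert cp.allclose(cp.diag(K), 1.0)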
Example #15
def dbscan_gpu(model,
               counts_per_word,
               embeddings=None,
               sim_thresh=0.8,
               min_samples=5,
               min_occs=1000,
               verbose=False,
               s2v=False):

    if embeddings is None:

        #print('COUNTS PER WORD:', counts_per_word[:, 1])

        # Keep only hashtags with more than min_occs occurrences
        nb_to_keep = np.argmax(counts_per_word[:, 1].astype(int) < min_occs)
        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurrences')
        else:
            pass
            #print(f'dbscan : Keeping {nb_to_keep} words with more than {min_occs} occurrences')

        # Create fit data
        #model_words = set(model.wv.vocab.keys())
        if not s2v:
            model_words = set(model.wv.index_to_key)
        else:
            model_words = set(model.keys())

        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])
        #print('1- len(words_kept) :', len(words_kept))
        X = cudf.DataFrame()

        if s2v:
            transposed = np.array([model[w] for w in words_kept]).transpose()
        else:
            transposed = np.array([model.wv[w]
                                   for w in words_kept]).transpose()

        for e, v in enumerate(transposed):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')

    else:
        X = cudf.DataFrame()
        for e, v in enumerate(embeddings.transpose()):
            X[e] = v
        X = pairwise_distances(X, metric='cosine')
        words_kept = np.arange(len(embeddings)).astype(str)
        #print('2- len(words_kept) :', len(words_kept))

    # cosine DBScan
    #clustering = DBSCAN(eps=1-sim_thresh, min_samples=min_samples, metric='cosine').fit(X)
    #clust_labels = clustering.labels_

    # Setup and fit clusters
    # Create and populate a GPU DataFrame
    #print('len(X):', len(X))
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric="precomputed").fit(X)
    clust_labels = clustering.labels_.to_array()
    #print('labels :', clust_labels)
    #.to_pandas().values
    #print('len(clust_labels) :', len(clust_labels))

    if verbose:

        print(np.bincount(clust_labels + 1)[1:])

        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
Example #16
def curve_epg(
    adata: AnnData,
    Nodes: Optional[int] = None,
    use_rep: str = None,
    ndims_rep: Optional[int] = None,
    init: Optional[DataFrame] = None,
    lam: Optional[Union[float, int]] = 0.01,
    mu: Optional[Union[float, int]] = 0.1,
    trimmingradius: float = np.inf,
    initnodes: Optional[int] = None,
    device: str = "cpu",
    seed: Optional[int] = None,
    verbose: bool = True,
):
    try:
        import elpigraph

    except ImportError:
        warnings.warn('ElPiGraph package is not installed. Please use '
                      '"pip install git+https://github.com/j-bac/elpigraph-python.git" '
                      'to install it.')

    X = get_data(adata, use_rep, ndims_rep)

    if seed is not None:
        np.random.seed(seed)

    if device == "gpu":
        import cupy as cp
        from .utils import cor_mat_gpu
        from cuml.metrics import pairwise_distances

        Curve = elpigraph.computeElasticPrincipalCurve(
            X.values.astype(np.float64),
            NumNodes=Nodes,
            Do_PCA=False,
            InitNodes=initnodes,
            Lambda=lam,
            Mu=mu,
            TrimmingRadius=trimmingradius,
            GPU=True,
            verbose=verbose,
        )

        R = pairwise_distances(cp.asarray(X.values),
                               cp.asarray(Curve[0]["NodePositions"]))

        R = cp.asnumpy(R)
        # Hard assignment: one-hot matrix mapping each cell to its nearest node
        R = sparse.csr_matrix(
            (np.repeat(1, R.shape[0]), (range(R.shape[0]), R.argmin(axis=1))),
            R.shape).A

    else:
        from .utils import cor_mat_cpu
        from sklearn.metrics import pairwise_distances

        Curve = elpigraph.computeElasticPrincipalCurve(
            X.values.astype(np.float64),
            NumNodes=Nodes,
            Do_PCA=False,
            InitNodes=initnodes,
            Lambda=lam,
            Mu=mu,
            TrimmingRadius=trimmingradius,
            verbose=verbose,
        )

        R = pairwise_distances(X.values, Curve[0]["NodePositions"])
        # Hard assignment: one-hot matrix mapping each cell to its nearest node
        R = sparse.csr_matrix(
            (np.repeat(1, R.shape[0]), (range(R.shape[0]), R.argmin(axis=1))),
            R.shape).A

    g = igraph.Graph(directed=False)
    g.add_vertices(np.unique(Curve[0]["Edges"][0].flatten().astype(int)))
    g.add_edges(
        pd.DataFrame(Curve[0]["Edges"][0]).astype(int).apply(tuple,
                                                             axis=1).values)

    # mat = np.asarray(g.get_adjacency().data)
    # mat = mat + mat.T - np.diag(np.diag(mat))
    # B=((mat>0).astype(int))

    B = np.asarray(g.get_adjacency().data)

    tips = np.argwhere(np.array(g.degree()) == 1).flatten()
    forks = np.argwhere(np.array(g.degree()) > 2).flatten()

    graph = {
        "B": B,
        "R": R,
        "F": Curve[0]["NodePositions"].T,
        "tips": tips,
        "forks": forks,
        "cells_fitted": X.index.tolist(),
        "metrics": "euclidean",
    }

    Curve[0]["Edges"] = list(Curve[0]["Edges"])

    adata.uns["graph"] = graph
    adata.uns["epg"] = Curve[0]

    logg.info("    finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        "    .uns['epg'] dictionnary containing inferred elastic curve generated from elpigraph.\n"
        "    .uns['graph']['B'] adjacency matrix of the principal points.\n"
        "    .uns['graph']['R'] hard assignment of cells to principal point in representation space.\n"
        "    .uns['graph']['F'], coordinates of principal points in representation space."
    )

    return adata
Example #17
    def score_samples(self, X):
        """Compute the log-likelihood of each sample under the model.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            An array of points to query.  Last dimension should match dimension
            of training data (n_features).

        Returns
        -------

        density : ndarray of shape (n_samples,)
            Log-likelihood of each sample in `X`. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.
        """
        if not hasattr(self, "X_"):
            raise NotFittedError()
        X_cuml = input_to_cuml_array(X)
        if self.metric_params:
            if len(self.metric_params) != 1:
                raise ValueError(
                    "Cuml only supports metrics with a single arg.")
            metric_arg = list(self.metric_params.values())[0]
            distances = pairwise_distances(X_cuml.array,
                                           self.X_,
                                           metric=self.metric,
                                           metric_arg=metric_arg)
        else:
            distances = pairwise_distances(X_cuml.array,
                                           self.X_,
                                           metric=self.metric)

        distances = cp.asarray(distances)

        h = self.bandwidth
        if self.kernel in log_probability_kernels_:
            distances = log_probability_kernels_[self.kernel](distances, h)
        else:
            raise ValueError("Unsupported kernel.")

        log_probabilities = cp.zeros(distances.shape[0])
        if self.sample_weight_ is not None:
            distances += cp.log(self.sample_weight_)

        logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                        log_probabilities)
        # Note that sklearn's user guide is wrong:
        # it says the (unnormalised) probability output for
        # the kernel density is sum(K(x,h)).
        # In fact what they implement is (1/n)*sum(K(x,h)).
        # Here we divide by n in normal probability space,
        # which becomes -log(n) in log probability space.
        sum_weights = (cp.sum(self.sample_weight_) if self.sample_weight_
                       is not None else distances.shape[1])
        log_probabilities -= np.log(sum_weights)

        # norm
        if len(X_cuml.array.shape) == 1:
            # if X is one dimensional, we have 1 feature
            dimension = 1
        else:
            dimension = X_cuml.array.shape[1]
        log_probabilities = norm_log_probabilities(log_probabilities,
                                                   self.kernel, h, dimension)

        return log_probabilities
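
The -log(sum_weights) correction above can be verified numerically: dividing by n in probability space is the same as subtracting log(n) after a log-sum-exp. A minimal NumPy illustration (not cuML code):

import numpy as np

log_K = np.log([0.2, 0.5, 0.3])  # log kernel values for one query point
n = log_K.shape[0]

# log((1/n) * sum(K)) computed in normal probability space ...
direct = np.log(np.mean(np.exp(log_K)))
# ... equals logsumexp(log K) - log(n) computed in log space
lse = np.log(np.sum(np.exp(log_K))) - np.log(n)
assert np.isclose(direct, lse)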