def get_kmer_embedding(
    counts: pd.DataFrame,
    cache_fpath: str,
    norm_method: str,
    pca_dimensions: int,
    embed_dimensions: int,
    embed_method: str,
) -> pd.DataFrame:
    """Return the kmer embedding for `counts`, reusing a cached table when one is available.

    The counts are normalized with `norm_method` and then handed to `kmers.embed`
    together with `pca_dimensions`, `embed_dimensions` and `embed_method`.
    When `cache_fpath` is given and already points to a non-empty file, that file
    is read and returned instead of recomputing the embedding.

    Parameters
    ----------
    counts : pd.DataFrame
        Kmer counts where index column is 'contig' and each column is a kmer count.

    cache_fpath : str
        Path used to cache the embedded kmers table for later look-up/inspection.
        A falsy value (e.g. empty string or None) disables caching entirely.

    norm_method : str
        Normalization transformation to use on kmer counts. Choices include 'am_clr', 'ilr' and 'clr'.
        See :func:kmers.normalize for more details.

    pca_dimensions : int
        Number of dimensions by which to initially reduce normalized kmer frequencies
        (Must be greater than `embed_dimensions`).

    embed_dimensions : int
        Embedding dimensions by which to reduce normalized PCA-transformed kmer frequencies
        (Must be less than `pca_dimensions`).

    embed_method : str
        Embedding method to use on normalized, PCA-transformed kmer frequencies.
        Choices include 'bhsne', 'sksne' and 'umap'. See :func:kmers.embed for more details.

    Returns
    -------
    pd.DataFrame
        The embedded kmers: either the table read back from `cache_fpath`
        (tab-separated, indexed by the 'contig' column) or the freshly computed
        result of `kmers.embed`.
    """
    caching_enabled = bool(cache_fpath)
    # A non-empty file at the cache path is treated as a previously stored embedding,
    # so we resume from it rather than recomputing.
    if (
        caching_enabled
        and os.path.exists(cache_fpath)
        and os.path.getsize(cache_fpath)
    ):
        logger.debug(f"Found cached embeddings {cache_fpath}. Reading...")
        return pd.read_csv(cache_fpath, sep="\t", index_col="contig")
    # Nothing usable on disk (or caching disabled): normalize, then embed.
    normalized_counts = kmers.normalize(counts, method=norm_method)
    embedding = kmers.embed(
        kmers=normalized_counts,
        embed_dimensions=embed_dimensions,
        pca_dimensions=pca_dimensions,
        method=embed_method,
    )
    if caching_enabled:
        # Persist the result so a later call with the same path can skip the computation.
        embedding.to_csv(cache_fpath, sep="\t", index=True, header=True)
        logger.debug(f"Cached embeddings to {cache_fpath}")
    return embedding
Exemple #2
0
def test_embed_dimensions(norm_df, embed_dimensions, tmp_path):
    """Check that the embedding has exactly `embed_dimensions` columns (bhsne, 5 PCA dims)."""
    embedding_fpath = tmp_path / "kmers.embed.tsv"
    embedding = kmers.embed(
        kmers=norm_df,
        out=embedding_fpath,
        force=False,
        embed_dimensions=embed_dimensions,
        pca_dimensions=5,
        method="bhsne",
        seed=42,
    )
    n_columns = embedding.shape[1]
    assert n_columns == embed_dimensions
Exemple #3
0
def test_embed_out_exists(norm_df, force, tmp_path):
    """Run `kmers.embed` with the provided `force` value and an output path under `tmp_path`.

    No assertion is made on the result; the test only requires that the call
    completes without raising.
    """
    embedding_fpath = tmp_path / "kmers.embed.tsv"
    df = kmers.embed(
        kmers=norm_df,
        out=embedding_fpath,
        force=force,
        embed_dimensions=2,
        pca_dimensions=3,
        method="bhsne",
        seed=42,
    )
Exemple #4
0
def test_embed_methods(norm_df, method, tmp_path):
    """Check that embedding with `method` yields a table with two columns."""
    expected_columns = 2
    embedding = kmers.embed(
        kmers=norm_df,
        out=tmp_path / "kmers.embed.tsv",
        force=False,
        embed_dimensions=expected_columns,
        pca_dimensions=3,
        method=method,
        seed=42,
    )
    assert embedding.shape[1] == expected_columns
Exemple #5
0
def test_embed_methods(norm_df, method, tmp_path):
    """Check the column count of the embedding produced by `method`.

    'densmap' is run with `output_dens=True` and is expected to return two
    columns beyond the requested embedding dimensions; every other method is
    expected to return exactly the requested number of columns.
    """
    embed_dimensions = 2
    is_densmap = method == "densmap"
    # 'sksne' is given an integer verbosity level; all other methods receive a boolean.
    method_kwargs = {"verbose": 1 if method == "sksne" else True}
    if is_densmap:
        method_kwargs["output_dens"] = True
    embedding = kmers.embed(
        kmers=norm_df,
        out=tmp_path / "kmers.embed.tsv",
        force=False,
        embed_dimensions=embed_dimensions,
        pca_dimensions=3,
        method=method,
        seed=42,
        **method_kwargs,
    )
    if is_densmap:
        expected_columns = embed_dimensions + 2
    else:
        expected_columns = embed_dimensions
    assert embedding.shape[1] == expected_columns
Exemple #6
0
def test_embed_empty_dataframe(tmp_path):
    """Check that embedding an empty DataFrame raises FileNotFoundError."""
    embedding_fpath = tmp_path / "kmers.embed.tsv"
    no_kmers = pd.DataFrame({})
    with pytest.raises(FileNotFoundError):
        kmers.embed(kmers=no_kmers, out=embedding_fpath, force=True)
Exemple #7
0
def test_embed_input_not_string_or_dataframe(tmp_path):
    """Check that passing a path object (neither str nor DataFrame) as `kmers` raises TypeError."""
    path_object = tmp_path / "kmers.embed.tsv"
    with pytest.raises(TypeError):
        kmers.embed(kmers=path_object)
Exemple #8
0
def test_embed_TableFormatError(invalid_df_fpath):
    """Check that `kmers.embed` raises TableFormatError when given `invalid_df_fpath`.

    NOTE(review): `invalid_df_fpath` is a fixture defined elsewhere; presumably it
    is a path to an improperly formatted kmers table -- confirm against the fixture.
    """
    with pytest.raises(TableFormatError):
        kmers.embed(kmers=invalid_df_fpath)