Example #1
0
def get_kmer_embedding(
    counts: pd.DataFrame,
    cache_fpath: str,
    norm_method: str,
    pca_dimensions: int,
    embed_dimensions: int,
    embed_method: str,
) -> pd.DataFrame:
    """Retrieve kmer embeddings for provided counts by first performing kmer normalization with `norm_method`
    then PCA down to `pca_dimensions` until the normalized kmer frequencies are embedded to `embed_dimensions` using `embed_method`.

    Parameters
    ----------
    counts : pd.DataFrame
        Kmer counts where index column is 'contig' and each column is a kmer count.

    cache_fpath : str
        Path to cache embedded kmers table for later look-up/inspection.
        If falsy, no cache is read or written.

    norm_method : str
        normalization transformation to use on kmer counts. Choices include 'am_clr', 'ilr' and 'clr'. See :func:kmers.normalize for more details.

    pca_dimensions : int
        Number of dimensions by which to initially reduce normalized kmer frequencies (Must be greater than `embed_dimensions`).

    embed_dimensions : int
        Embedding dimensions by which to reduce normalized PCA-transformed kmer frequencies (Must be less than `pca_dimensions`).

    embed_method : str
        Embedding method to use on normalized, PCA-transformed kmer frequencies. Choices include 'bhsne', 'sksne' and 'umap'. See :func:kmers.embed for more details.

    Returns
    -------
    pd.DataFrame
        Embedded kmer frequencies indexed by 'contig' with `embed_dimensions` columns
        (either freshly computed or read back from `cache_fpath`).
    """
    # Cache hit: a non-empty cache file exists, so read it back instead of recomputing.
    if cache_fpath and os.path.exists(cache_fpath) and os.path.getsize(cache_fpath):
        logger.debug(f"Found cached embeddings {cache_fpath}. Reading...")
        return pd.read_csv(cache_fpath, sep="\t", index_col="contig")
    # Cache miss (or caching disabled): normalize then embed.
    embedding = kmers.embed(
        kmers=kmers.normalize(counts, method=norm_method),
        embed_dimensions=embed_dimensions,
        pca_dimensions=pca_dimensions,
        method=embed_method,
    )
    # Persist for later look-up only when a cache path was provided.
    if cache_fpath:
        embedding.to_csv(cache_fpath, sep="\t", index=True, header=True)
        logger.debug(f"Cached embeddings to {cache_fpath}")
    return embedding
Example #2
0
def test_normalize(counts, method, tmp_path):
    """clr-family methods preserve the counts' shape; ilr drops one column."""
    outpath = tmp_path / "kmers.norm.tsv"
    norm_df = kmers.normalize(df=counts, method=method, out=outpath, force=False)
    if method in {"am_clr", "clr"}:
        assert norm_df.shape == counts.shape
    else:
        # ILR will reduce the columns by one.
        assert norm_df.shape[1] < counts.shape[1]
    assert outpath.exists()
Example #3
0
    def get_kmers(self, num_records: int = 5):
        """Prepare kmer-count test data and store JSON-serialized tables in self.data['kmers']."""
        if num_records < 5:
            raise ValueError(
                f"At least 5 records are required for embedding tests! provided: {num_records}"
            )
        logger.info("Preparing kmer counts test data...")
        # Count 5-mers (the default kmer size), then subsample to `num_records`
        # with a fixed seed so the test fixture is reproducible.
        sampled_counts = kmers.count(assembly=self.metagenome, size=5).sample(
            n=num_records, random_state=42
        )
        # Normalize with am_clr (the default method).
        normalized_counts = kmers.normalize(df=sampled_counts, method="am_clr")
        # Move the contig index into a regular column before serializing.
        sampled_counts.reset_index(inplace=True)
        normalized_counts.reset_index(inplace=True)
        self.data["kmers"] = {
            "counts": sampled_counts.to_json(),
            "am_clr_normalized_counts": normalized_counts.to_json(),
        }
Example #4
0
def test_normalize_wrong_method(counts, tmp_path):
    """An unrecognized normalization method must raise ValueError."""
    norm_out = tmp_path / "kmers.norm.tsv"
    with pytest.raises(ValueError):
        kmers.normalize(df=counts, method="am_ilr", out=norm_out, force=False)
Example #5
0
def test_normalize_out_exists(counts, norm_df, force, tmp_path):
    """normalize() should return a contig-indexed table of the counts' shape when `out` already exists."""
    out = tmp_path / "kmers.norm.tsv"
    # Pre-populate the output path so normalize() encounters an existing file.
    norm_df.to_csv(out, sep="\t", index=True, header=True)
    result = kmers.normalize(df=counts, method="am_clr", out=out, force=force)
    assert result.shape == counts.shape
    assert result.index.name == "contig"