def get_kmer_embedding(
    counts: pd.DataFrame,
    cache_fpath: str,
    norm_method: str,
    pca_dimensions: int,
    embed_dimensions: int,
    embed_method: str,
) -> pd.DataFrame:
    """Return k-mer embeddings for `counts`, reusing a cached table when possible.

    The counts are normalized with `norm_method` and then handed to
    :func:`kmers.embed` together with `pca_dimensions`, `embed_dimensions` and
    `embed_method`. When `cache_fpath` is given, a non-empty file at that path
    is read back instead of recomputing; otherwise the freshly computed
    embedding is written there before being returned.

    Parameters
    ----------
    counts : pd.DataFrame
        Kmer counts where index column is 'contig' and each column is a kmer
        count.
    cache_fpath : str
        Path used to cache the embedded kmers table for later
        look-up/inspection. A falsy value (e.g. an empty string) disables
        caching entirely.
    norm_method : str
        Normalization transformation to use on kmer counts.
        Choices include 'am_clr', 'ilr' and 'clr'.
        See :func:`kmers.normalize` for more details.
    pca_dimensions : int
        Number of dimensions by which to initially reduce normalized kmer
        frequencies (must be greater than `embed_dimensions`).
    embed_dimensions : int
        Embedding dimensions by which to reduce normalized PCA-transformed
        kmer frequencies (must be less than `pca_dimensions`).
    embed_method : str
        Embedding method to use on normalized, PCA-transformed kmer
        frequencies. Choices include 'bhsne', 'sksne' and 'umap'.
        See :func:`kmers.embed` for more details.

    Returns
    -------
    pd.DataFrame
        Either the table read from `cache_fpath` (tab-separated, indexed by
        its 'contig' column) or the object returned by :func:`kmers.embed`.
    """
    use_cache = bool(cache_fpath)

    # A non-empty cache file means a previous run already did the work.
    if use_cache and os.path.exists(cache_fpath) and os.path.getsize(cache_fpath):
        logger.debug(f"Found cached embeddings {cache_fpath}. Reading...")
        return pd.read_csv(cache_fpath, sep="\t", index_col="contig")

    # Nothing usable on disk (or caching disabled): normalize, then embed.
    normalized = kmers.normalize(counts, method=norm_method)
    embedded = kmers.embed(
        kmers=normalized,
        embed_dimensions=embed_dimensions,
        pca_dimensions=pca_dimensions,
        method=embed_method,
    )

    if use_cache:
        # Persist the result so a resumed run can skip the embedding step.
        embedded.to_csv(cache_fpath, sep="\t", index=True, header=True)
        logger.debug(f"Cached embeddings to {cache_fpath}")
    return embedded
def test_embed_dimensions(norm_df, embed_dimensions, tmp_path):
    """The embedded table must have exactly `embed_dimensions` columns."""
    embed_kwargs = {
        "kmers": norm_df,
        "out": tmp_path / "kmers.embed.tsv",
        "force": False,
        "embed_dimensions": embed_dimensions,
        "pca_dimensions": 5,
        "method": "bhsne",
        "seed": 42,
    }
    embedded = kmers.embed(**embed_kwargs)
    n_columns = embedded.shape[1]
    assert n_columns == embed_dimensions
def test_embed_out_exists(norm_df, force, tmp_path):
    """Smoke-test `kmers.embed` with the supplied `force` value.

    NOTE(review): nothing is asserted about the returned table or about the
    `out` path; this test only fails if the call raises.
    """
    kmers.embed(
        kmers=norm_df,
        out=tmp_path / "kmers.embed.tsv",
        force=force,
        embed_dimensions=2,
        pca_dimensions=3,
        method="bhsne",
        seed=42,
    )
def test_embed_methods(norm_df, method, tmp_path):
    """Each embedding `method` must yield a table with two columns.

    NOTE(review): another function named `test_embed_methods` is defined later
    in this file; if both live in the same module the later one shadows this
    one -- confirm which is intended.
    """
    n_embed_dims = 2
    embedded = kmers.embed(
        kmers=norm_df,
        out=tmp_path / "kmers.embed.tsv",
        force=False,
        embed_dimensions=n_embed_dims,
        pca_dimensions=3,
        method=method,
        seed=42,
    )
    n_columns = embedded.shape[1]
    assert n_columns == n_embed_dims
def test_embed_methods(norm_df, method, tmp_path):
    """Each embedding `method` must yield the expected number of columns.

    Method-specific keyword arguments are forwarded to `kmers.embed`:
    `verbose` is `1` for 'sksne' and `True` otherwise, and 'densmap'
    additionally receives `output_dens=True`. For 'densmap' the result is
    expected to carry two columns beyond `embed_dimensions`.

    NOTE(review): an earlier function in this file is also named
    `test_embed_methods`; this later definition shadows it if both live in
    the same module -- confirm which is intended.
    """
    is_densmap = method == "densmap"

    # Extra keyword arguments that depend on the embedding method.
    method_kwargs = {"verbose": 1 if method == "sksne" else True}
    if is_densmap:
        method_kwargs["output_dens"] = True

    embed_dimensions = 2
    embedded = kmers.embed(
        kmers=norm_df,
        out=tmp_path / "kmers.embed.tsv",
        force=False,
        embed_dimensions=embed_dimensions,
        pca_dimensions=3,
        method=method,
        seed=42,
        **method_kwargs,
    )

    # densmap output is expected to be two columns wider than the embedding.
    if is_densmap:
        expected_columns = embed_dimensions + 2
    else:
        expected_columns = embed_dimensions
    assert embedded.shape[1] == expected_columns
def test_embed_empty_dataframe(tmp_path):
    """Passing an empty DataFrame to `kmers.embed` must raise FileNotFoundError."""
    embed_kwargs = {
        "kmers": pd.DataFrame({}),
        "out": tmp_path / "kmers.embed.tsv",
        "force": True,
    }
    with pytest.raises(FileNotFoundError):
        kmers.embed(**embed_kwargs)
def test_embed_input_not_string_or_dataframe(tmp_path):
    """A path object built from `tmp_path` as `kmers` must raise TypeError."""
    # Built with the `/` operator, so this is a path object rather than a str.
    not_str_or_df = tmp_path / "kmers.embed.tsv"
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        kmers.embed(kmers=not_str_or_df)
def test_embed_TableFormatError(invalid_df_fpath):
    """`kmers.embed` must raise TableFormatError for the `invalid_df_fpath` input."""
    expect_format_error = pytest.raises(TableFormatError)
    with expect_format_error:
        kmers.embed(kmers=invalid_df_fpath)