def get_kmer_embedding(
    counts: pd.DataFrame,
    cache_fpath: str,
    norm_method: str,
    pca_dimensions: int,
    embed_dimensions: int,
    embed_method: str,
) -> pd.DataFrame:
    """Retrieve kmer embeddings for provided counts by first performing kmer
    normalization with `norm_method` then PCA down to `pca_dimensions` until
    the normalized kmer frequencies are embedded to `embed_dimensions` using
    `embed_method`.

    If `cache_fpath` is provided and already holds a non-empty table, the
    cached embedding is read back instead of being recomputed.

    Parameters
    ----------
    counts : pd.DataFrame
        Kmer counts where index column is 'contig' and each column is a kmer count.

    cache_fpath : str
        Path to cache embedded kmers table for later look-up/inspection.

    norm_method : str
        normalization transformation to use on kmer counts. Choices include 'am_clr',
        'ilr' and 'clr'. See :func:kmers.normalize for more details.

    pca_dimensions : int
        Number of dimensions by which to initially reduce normalized kmer frequencies
        (Must be greater than `embed_dimensions`).

    embed_dimensions : int
        Embedding dimensions by which to reduce normalized PCA-transformed kmer
        frequencies (Must be less than `pca_dimensions`).

    embed_method : str
        Embedding method to use on normalized, PCA-transformed kmer frequencies.
        Choices include 'bhsne', 'sksne' and 'umap'. See :func:kmers.embed for more
        details.

    Returns
    -------
    pd.DataFrame
        Embedded kmer frequencies indexed by 'contig' with one column per
        embedding dimension (`embed_dimensions` columns).
    """
    # Try the cache first: a path was given and a non-empty file exists there
    # (e.g. we are resuming a previous run).
    if cache_fpath and os.path.exists(cache_fpath) and os.path.getsize(cache_fpath):
        logger.debug(f"Found cached embeddings {cache_fpath}. Reading...")
        return pd.read_csv(cache_fpath, sep="\t", index_col="contig")
    # No usable cache: normalize then embed (single code path for both the
    # "no cache dir provided" and "cache miss" cases).
    embedding = kmers.embed(
        kmers=kmers.normalize(counts, method=norm_method),
        embed_dimensions=embed_dimensions,
        pca_dimensions=pca_dimensions,
        method=embed_method,
    )
    # Persist for later look-up/inspection only when a cache path was provided.
    if cache_fpath:
        embedding.to_csv(cache_fpath, sep="\t", index=True, header=True)
        logger.debug(f"Cached embeddings to {cache_fpath}")
    return embedding
def test_normalize(counts, method, tmp_path):
    """Normalization writes an output table and yields the expected shape per method."""
    out_fpath = tmp_path / "kmers.norm.tsv"
    normalized = kmers.normalize(df=counts, method=method, out=out_fpath, force=False)
    if method not in {"am_clr", "clr"}:
        # The ILR transform drops one column relative to the input counts.
        assert normalized.shape[1] < counts.shape[1]
    else:
        # CLR-style transforms keep the dimensionality of the counts table.
        assert normalized.shape == counts.shape
    assert out_fpath.exists()
def get_kmers(self, num_records: int = 5):
    """Prepare kmer counts and am_clr-normalized counts test data from the assembly.

    Stores both tables (JSON-serialized) under ``self.data["kmers"]``.
    """
    if num_records < 5:
        raise ValueError(
            f"At least 5 records are required for embedding tests! provided: {num_records}"
        )
    logger.info("Preparing kmer counts test data...")
    # Count kmers at the default size (5), then subsample down to
    # `num_records` rows with a fixed seed for reproducibility.
    kmer_counts = kmers.count(assembly=self.metagenome, size=5).sample(
        n=num_records, random_state=42
    )
    # Normalize with the default method (am_clr).
    normalized = kmers.normalize(df=kmer_counts, method="am_clr")
    # Move the contig index into a regular column before serializing.
    kmer_counts.reset_index(inplace=True)
    normalized.reset_index(inplace=True)
    self.data["kmers"] = {
        "counts": kmer_counts.to_json(),
        "am_clr_normalized_counts": normalized.to_json(),
    }
def test_normalize_wrong_method(counts, tmp_path):
    """An unrecognized normalization method name raises ValueError."""
    out_fpath = tmp_path / "kmers.norm.tsv"
    with pytest.raises(ValueError):
        # 'am_ilr' is not a supported method, so normalize should reject it.
        kmers.normalize(df=counts, method="am_ilr", out=out_fpath, force=False)
def test_normalize_out_exists(counts, norm_df, force, tmp_path):
    """normalize still returns a valid table when the output file already exists."""
    out_fpath = tmp_path / "kmers.norm.tsv"
    # Pre-seed the output path with a previously normalized table.
    norm_df.to_csv(out_fpath, sep="\t", index=True, header=True)
    normalized = kmers.normalize(df=counts, method="am_clr", out=out_fpath, force=force)
    # am_clr preserves the input shape and the 'contig' index name.
    assert normalized.shape == counts.shape
    assert normalized.index.name == "contig"