def count_kmers_to_df(self, path_kmers):
    """
    Count the k-mers of every genome split and pickle them as a DataFrame.

    Each item yielded by ``self.yield_genome_split()`` becomes one row: the
    split's metadata (taxon, category, start, end, name, description and
    ``self.path_fna``) followed by its k-mer counts. Counts are produced with
    combining enabled (``combine=True`` on the compiled counter, or
    ``combine_counts_forward_w_rc`` on the pure-Python path).

    :param path_kmers: destination handed to ``DataFrame.to_pickle``.
    """
    rows = []
    for segment, taxon, cat, start, end in self.yield_genome_split():
        sequence = str(segment.seq)
        if cython_is_there:
            # Compiled extension: counting and combining happen in one call.
            kmer_count = cyt_ext.kmer_counter(
                sequence, k=self.k, dictionary=True, combine=True)
        else:
            # Pure-Python fallback: count into a fresh copy of the zeroed
            # template so the template itself is never mutated.
            kmer_count = combine_counts_forward_w_rc(
                seq_count_kmer(sequence, deepcopy(self.kmer_count_zeros), k=self.k),
                k=self.k)
        # Count values are appended in dict order.
        # NOTE(review): that order must line up with the k-mer columns of
        # main.cols_types for both counting paths - confirm.
        rows.append(
            (taxon, cat, start, end, segment.name, segment.description,
             self.path_fna, *kmer_count.values()))

    df = pd.DataFrame(rows, columns=main.cols_types)

    # Bracket indexing instead of attribute assignment: attribute access is
    # ambiguous for labels such as "name" and silently creates a plain Python
    # attribute when the column is missing.
    for col in ("taxon", "category", "name", "fna_path"):
        df[col] = df[col].astype("category")

    # TODO: not using float32 ??
    # NOTE(review): uint16 tops out at 65535 - confirm no single split can
    # produce a higher count for one k-mer.
    for col in self.col_kmers:
        df[col] = df[col].astype("uint16")

    df.to_pickle(path_kmers)
    logger.debug("saved kmer count to %s", path_kmers)
def kmer_count(self, ignore_N=True):
    """
    Return the k-mer counts of ``self.seq``, computing them on first use.

    The result is stored in ``self._kmer_count`` and handed back as-is on
    every later call, whatever value of ``ignore_N`` is passed then.

    :param ignore_N: forwarded to ``seq_count_kmer``; only the pure-Python
        path receives it, the compiled counter is called without it.
    :return: the value stored in ``self._kmer_count``.
    """
    # Already computed: nothing to do.
    if self._kmer_count is not None:
        return self._kmer_count

    if cython_is_there:
        counts = cyt_ext.kmer_counter(
            str(self.seq), k=K, dictionary=True, combine=True, length=len(self.seq))
    else:
        # NOTE(review): this path passes self.seq without str(), unlike the
        # compiled path - confirm seq_count_kmer accepts it.
        raw_counts = seq_count_kmer(self.seq, self.KMER.copy(), K, ignore_N=ignore_N)
        counts = combine_counts_forward_w_rc(raw_counts, k=K)

    self._kmer_count = counts
    return self._kmer_count
def count_kmers_to_df(self, path_kmers):
    """
    Count the k-mers of every genome split and pickle them as a DataFrame.

    Each item yielded by ``self.yield_genome_split()`` becomes one row: the
    split's metadata (taxon, category, start, end, name, description and
    ``self.path_fna``) followed by the k-mer counts returned by
    ``seq_count_kmer``.

    :param path_kmers: destination handed to ``DataFrame.to_pickle``.
    """
    # TODO: consider combining forward and backward kmers as well as
    # complements. Single counter for AAAT, TAAA, TTTA and ATTT
    rows = []
    for segment, taxon, cat, start, end in self.yield_genome_split():
        # Count into a fresh copy of the zeroed template so the template
        # itself is never mutated.
        kmer_count = seq_count_kmer(
            str(segment.seq), deepcopy(self.kmer_count_zeros), k=self.k)
        # Count values are appended in dict order.
        # NOTE(review): that order must line up with the k-mer columns of
        # main.cols_types - confirm.
        rows.append(
            (taxon, cat, start, end, segment.name, segment.description,
             self.path_fna, *kmer_count.values()))

    df = pd.DataFrame(rows, columns=main.cols_types)

    # Bracket indexing instead of attribute assignment: attribute access is
    # ambiguous for labels such as "name" and silently creates a plain Python
    # attribute when the column is missing.
    for col in ("taxon", "category", "name", "fna_path"):
        df[col] = df[col].astype("category")

    # NOTE(review): uint16 tops out at 65535 - confirm no single split can
    # produce a higher count for one k-mer.
    for col in self.col_kmers:
        df[col] = df[col].astype("uint16")

    df.to_pickle(path_kmers)
    logger.debug("saved kmer count to %s", path_kmers)
def test_bio_count_and_combine_seq(k, seq, counts):
    """Counting the k-mers of ``seq`` and then combining them must equal ``counts``."""
    raw_counts = bio.seq_count_kmer(seq, k=k)
    combined = bio.combine_counts_forward_w_rc(raw_counts, k)
    assert combined == counts
def test_bio_seq_count_kmer(k, seq, counts, np_counts):
    """``bio.seq_count_kmer(seq, k=k)`` must equal ``counts``."""
    # np_counts is part of the signature but is not used by this check.
    observed = bio.seq_count_kmer(seq, k=k)
    assert observed == counts