Esempio n. 1
0
 def count_kmers_to_df(self, path_kmers):
     """Count k-mers for every genome split and pickle them as a DataFrame.

     Each item yielded by ``self.yield_genome_split()`` becomes one row made
     of the split metadata (taxon, category, start, end, segment name,
     segment description, ``self.path_fna``) followed by the k-mer counts.
     Counting uses ``cyt_ext.kmer_counter`` when ``cython_is_there`` is
     truthy; otherwise ``seq_count_kmer`` is run on a deep copy of
     ``self.kmer_count_zeros`` and the result is passed through
     ``combine_counts_forward_w_rc``.

     Args:
         path_kmers: Destination handed to ``DataFrame.to_pickle``.
     """
     rows = []
     for segment, taxon, cat, start, end in self.yield_genome_split():
         sequence = str(segment.seq)
         if not cython_is_there:
             # Pure-Python fallback: count first, then combine the counts.
             raw_counts = seq_count_kmer(sequence,
                                         deepcopy(self.kmer_count_zeros),
                                         k=self.k)
             kmer_count = combine_counts_forward_w_rc(raw_counts, k=self.k)
         else:
             kmer_count = cyt_ext.kmer_counter(sequence,
                                               k=self.k,
                                               dictionary=True,
                                               combine=True)
         metadata = (taxon, cat, start, end, segment.name,
                     segment.description, self.path_fna)
         rows.append(metadata + tuple(kmer_count.values()))

     df = pd.DataFrame(rows, columns=main.cols_types)

     # Metadata columns are stored as pandas categoricals.
     for label in ("taxon", "category", "name", "fna_path"):
         df[label] = df[label].astype("category")

     # TODO: decide whether uint16 is the right dtype here (float32 instead?).
     for kmer_col in self.col_kmers:
         df[kmer_col] = df[kmer_col].astype("uint16")

     df.to_pickle(path_kmers)
     logger.debug(f"saved kmer count to {path_kmers}")
Esempio n. 2
0
 def kmer_count(self, ignore_N=True):
     """Return the k-mer counts of ``self.seq``, computing them only once.

     The result is stored in ``self._kmer_count`` and returned as-is on every
     later call, whatever ``ignore_N`` is passed then.

     Args:
         ignore_N: Forwarded to ``seq_count_kmer``; only used on the
             non-Cython path.

     Returns:
         The value stored in ``self._kmer_count``.
     """
     if self._kmer_count is not None:
         return self._kmer_count

     if cython_is_there:
         counts = cyt_ext.kmer_counter(str(self.seq),
                                       k=K,
                                       dictionary=True,
                                       combine=True,
                                       length=len(self.seq))
     else:
         # Count on a copy of the KMER template, then combine the counts.
         raw_counts = seq_count_kmer(self.seq, self.KMER.copy(), K,
                                     ignore_N=ignore_N)
         counts = combine_counts_forward_w_rc(raw_counts, k=K)

     self._kmer_count = counts
     return self._kmer_count
Esempio n. 3
0
    def count_kmers_to_df(self, path_kmers):
        """Count k-mers for every genome split and pickle them as a DataFrame.

        Each item yielded by ``self.yield_genome_split()`` becomes one row
        made of the split metadata (taxon, category, start, end, segment
        name, segment description, ``self.path_fna``) followed by the counts
        returned by ``seq_count_kmer`` on a deep copy of
        ``self.kmer_count_zeros``.

        Args:
            path_kmers: Destination handed to ``DataFrame.to_pickle``.
        """
        # TODO: consider combining forward and backward kmers as well as
        #  complements. Single counter for AAAT, TAAA, TTTA and ATTT.
        rows = []
        for segment, taxon, cat, start, end in self.yield_genome_split():
            sequence = str(segment.seq)
            empty_counts = deepcopy(self.kmer_count_zeros)
            kmer_count = seq_count_kmer(sequence, empty_counts, k=self.k)
            metadata = (taxon, cat, start, end, segment.name,
                        segment.description, self.path_fna)
            rows.append(metadata + tuple(kmer_count.values()))

        df = pd.DataFrame(rows, columns=main.cols_types)

        # Metadata columns are stored as pandas categoricals.
        for label in ("taxon", "category", "name", "fna_path"):
            df[label] = df[label].astype("category")

        for kmer_col in self.col_kmers:
            df[kmer_col] = df[kmer_col].astype("uint16")

        df.to_pickle(path_kmers)
        logger.debug(f"saved kmer count to {path_kmers}")
Esempio n. 4
0
def test_bio_count_and_combine_seq(k, seq, counts):
    """Check that counting then combining the k-mers of ``seq`` gives ``counts``."""
    raw_counts = bio.seq_count_kmer(seq, k=k)
    combined = bio.combine_counts_forward_w_rc(raw_counts, k)
    assert combined == counts
Esempio n. 5
0
def test_bio_seq_count_kmer(k, seq, counts, np_counts):
    """Check that ``bio.seq_count_kmer`` returns ``counts`` for ``seq``.

    ``np_counts`` is accepted but not used by this test.
    """
    observed = bio.seq_count_kmer(seq, k=k)
    assert observed == counts