def get_kmc_data(k, genome_file, out_file, out_dir, verbose=False):
    """Count k-mers with KMC and return the normalized count histogram.

    Runs the ``kmc`` command-line tool on ``genome_file`` and then lists the
    resulting database, tallying how many distinct k-mers share each count
    value. The tool is invoked with ``-fm`` (multi-FASTA input), ``-ci2``
    (drop k-mers seen fewer than 2 times), ``-cs1000`` (counter cap), ``-b``
    and ``-v``; see the KMC documentation for the exact flag semantics.

    Must be run from the CmashKmerAbundance directory: the database prefix is
    resolved against the current working directory and ``.`` is handed to KMC
    as its working directory.

    Parameters
    ----------
    k : int
        k-mer length passed to KMC as ``-k<k>``.
    genome_file : str
        Input file handed to KMC.
    out_file : str
        Database name (prefix) inside ``out_dir``.
    out_dir : str
        Directory, relative to the current working directory, that receives
        the database.
    verbose : bool, default=False
        When true, print the number of distinct count values, the raw
        histogram and the normalized histogram.

    Returns
    -------
    dict
        Maps each observed count value to the fraction of distinct k-mers
        having that count. Empty when the database lists no k-mers.

    Raises
    ------
    subprocess.CalledProcessError
        If ``kmc`` exits with a non-zero status.
    OSError
        If the KMC database cannot be opened for listing.
    """
    import subprocess
    from collections import Counter

    out_path = os.path.join(os.getcwd(), out_dir, out_file)

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True stops us from silently reading a
    # stale or missing database after a failed run.
    subprocess.run(
        ['kmc', '-k%d' % k, '-cs1000', '-b', '-v', '-fm', '-ci2',
         genome_file, out_path, '.'],
        check=True)

    kmc_database = kmc.KMCFile()
    if not kmc_database.OpenForListing(out_path):
        raise OSError('cannot open KMC database: %s' % out_path)
    kmer_obj = kmc.KmerAPI(kmc_database.Info().kmer_length)
    counter = kmc.Count()

    # counter_dict[c] = number of distinct k-mers that occur exactly c times.
    counter_dict = Counter()
    while kmc_database.ReadNextKmer(kmer_obj, counter):
        counter_dict[int(counter.value)] += 1

    total_count = sum(counter_dict.values())
    # The comprehension is a no-op for an empty histogram, so total_count == 0
    # never reaches the division.
    normed_dict = {
        count: n_kmers / total_count
        for count, n_kmers in counter_dict.items()
    }

    if verbose:
        print(len(counter_dict))
        print(dict(counter_dict))
        print(normed_dict)
    return normed_dict
Esempio n. 2
0
def test_kmc_file_next_kmer(create_kmc_db):
    ''' Listing the database with ReadNextKmer must yield exactly the counted k-mers. '''
    expected = create_kmc_db['kmers']
    database = _open_for_listing()
    count = pka.Count()
    current = pka.KmerAPI(create_kmc_db['kmer_len'])

    def listed():
        # `current` and `count` are reused across calls; each successful
        # ReadNextKmer call fills them with the next record.
        while database.ReadNextKmer(current, count):
            yield str(current), count.value

    observed = dict(listed())
    assert observed == expected
Esempio n. 3
0
def k_mer_global_histogram_KMC(
        k, genome, runKMC=False,
        kmc_bin='/storage/home/xbz5174/work/tools/KMC-3.1.1/bin/kmc'):
    """Return the k-mer abundance histogram of ``genome`` computed with KMC.

    Results are cached as a pickle in the ``kmc_global_count`` directory next
    to this source file; the KMC database itself is stored there as well.

    Parameters
    ----------
    k : int
        k-mer size.
    genome : str
        Path to a FASTA/FASTQ file (optionally compressed). The input format
        is guessed from the name: ``.fastq``/``.fq`` selects ``-fq``,
        ``.fasta``/``.fa``/``.fna`` selects ``-fm``.
    runKMC : bool, default=False
        Force re-running KMC and recomputing the histogram even when a cached
        database or pickle exists.
    kmc_bin : str
        KMC executable to run. Defaults to the path that used to be
        hard-coded here.

    Returns
    -------
    dist : np.ndarray
        ``dist[i]`` is the number of distinct k-mers occurring ``i + 1`` times.
    dist_norm : np.ndarray
        ``dist`` divided by its sum.

    Raises
    ------
    SystemExit
        With code 2 when the input format cannot be guessed from ``genome``.
    subprocess.CalledProcessError
        If KMC exits with a non-zero status.
    OSError
        If the KMC database cannot be opened for listing.
    ValueError
        If the KMC database lists no k-mers.
    """
    import subprocess
    from collections import Counter

    kmc_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(
        os.path.realpath(__file__)) + '/kmc_global_count/'
    db_prefix = outpath + kmc_outname
    pickle_path = db_prefix + '.global.pickle'

    # Fast path: reuse the cached histogram unless a recount was requested.
    if os.path.isfile(pickle_path) and not runKMC:
        # NOTE: unpickling executes arbitrary code; this is acceptable only
        # because the cache file is written by this very function.
        with open(pickle_path, 'rb') as config_global_file:
            dist, dist_norm = pickle.load(config_global_file)
        return dist, dist_norm

    # Build the KMC database when forced or when it is missing.
    if runKMC or not os.path.isfile(db_prefix + '.kmc_pre'):
        if '.fastq' in genome or '.fq' in genome:
            input_flag = '-fq'
        elif '.fasta' in genome or '.fa' in genome or '.fna' in genome:
            input_flag = '-fm'
        else:
            print("Is file fa/fq? Check file and its name!",
                  file=sys.stderr)
            sys.exit(2)
        # KMC needs its output/working directory to exist already.
        os.makedirs(outpath, exist_ok=True)
        # -ci0: no minimum-count filtering is requested.
        # An argument list (no shell) keeps unusual paths intact, and
        # check=True surfaces a failed KMC run immediately.
        subprocess.run(
            [kmc_bin, input_flag, '-v', '-ci0', '-b', '-cs3000',
             '-k%d' % k, genome, db_prefix, outpath],
            check=True)

    # Read the KMC database to tally how many k-mers share each count value.
    kmer_data_base = kmc.KMCFile()
    if not kmer_data_base.OpenForListing(db_prefix):
        raise OSError('cannot open KMC database: %s' % db_prefix)
    kmer_object = kmc.KmerAPI(kmer_data_base.Info().kmer_length)
    counter = kmc.Count()
    counter_dict = Counter()
    while kmer_data_base.ReadNextKmer(kmer_object, counter):
        counter_dict[int(counter.value)] += 1
    if not counter_dict:
        raise ValueError('KMC database %s contains no k-mers' % db_prefix)

    # Dense histogram: index i holds the tally for count value i + 1.
    dist = np.zeros(max(counter_dict))
    for count, n_kmers in counter_dict.items():
        dist[count - 1] = n_kmers
    dist_norm = dist / np.sum(dist)

    with open(pickle_path, 'wb') as config_global_file:
        pickle.dump([dist, dist_norm], config_global_file)
    return dist, dist_norm
Esempio n. 4
0
def test_check_kmer(create_kmc_db):
    '''
    Exercise the CheckKmer lookup method.

    Every k-mer in the fixture's 'kmers' mapping must be found with its
    expected count, and every k-mer listed under 'absent_kmers' must not
    be found.
    '''
    expected_counts = create_kmc_db['kmers']
    query = pka.KmerAPI(create_kmc_db['kmer_len'])
    found_count = pka.Count()
    database = _open_for_ra()

    # Present k-mers: lookup succeeds and reports the expected count.
    for sequence in expected_counts:
        query.from_string(sequence)
        is_present = database.CheckKmer(query, found_count)
        assert is_present
        assert found_count.value == expected_counts[sequence]

    # Absent k-mers: lookup must fail.
    for sequence in create_kmc_db['absent_kmers']:
        query.from_string(sequence)
        is_present = database.CheckKmer(query, found_count)
        assert not is_present
Esempio n. 5
0
def _py_kmer_api_from_string(kmer_str):
    ''' Return a pka.KmerAPI of length len(kmer_str) loaded from kmer_str. '''
    api_kmer = pka.KmerAPI(len(kmer_str))
    api_kmer.from_string(kmer_str)
    return api_kmer
Esempio n. 6
0
                    default=0)
# Upper count cutoff option; its default of 0 leaves the maximum unset
# (the cutoff is applied further down only when the value is > 0).
group2.add_argument("-cx",
                    "--cutoff_max",
                    type=int,
                    help="exclude k-mers occurring more of than CX times",
                    default=0)

args = parser.parse_args()

# Open the KMC database for listing; exit with status 1 if that fails.
kmer_data_base = pka.KMCFile()
if not kmer_data_base.OpenForListing(args.kmc_database):
    print("Error: cannot open kmc database")
    sys.exit(1)

# Size the k-mer object to the k-mer length recorded in the database.
info = kmer_data_base.Info()
kmer_object = pka.KmerAPI(info.kmer_length)

# Cutoffs are applied only when strictly positive; a cutoff rejected by the
# database is treated as a fatal error.
if args.cutoff_min > 0:
    if not kmer_data_base.SetMinCount(args.cutoff_min):
        print("Error: cannot set cutoff min")
        sys.exit(1)

if args.cutoff_max > 0:
    if not kmer_data_base.SetMaxCount(args.cutoff_max):
        print("Error: cannot set cutoff max")
        sys.exit(1)

# NOTE(review): opened without a context manager; the matching close() is not
# visible in this excerpt -- confirm the file is closed after the listing.
output_file = open(args.output_file, 'w')

# Count object handed to ReadNextKmer together with kmer_object.
counter = pka.Count()
while kmer_data_base.ReadNextKmer(kmer_object, counter):
Esempio n. 7
0
def make_kmc_genome_counter(path, lag, reverse=True, no_end=False):
    """ Build a function mapping a batch of kmers to transition counts.

    For every kmer in the batch the returned function looks up, in the loaded
    kmc files, the count of the kmer extended by each letter of the alphabet.
    One extra trailing column is reserved for the end symbol: with ``no_end``
    it is never written and stays 0; otherwise it is filled only for kmers of
    length ``lag``.

    Parameters
    ----------
    path : str
        Path prefix of the kmc count files. The files read are
        ``<path>_kmc_inter_0_pre_<l>.res`` and
        ``<path>_kmc_inter_0_suf_<l>.res`` for l = 1..lag, plus
        ``<path>_kmc_inter_0_full_<lag+1>.res``.
    lag : int
        Maximum kmer length handled; the full file holds (lag+1)-mers.
    reverse : bool, default=True
        Whether to add the counts of the reverse complement of each looked-up
        sequence as well.
    no_end : bool, default=False
        Only load the full file and skip the start/end (pre/suf) files.
        In this case ``path`` may be the exact res file: the
        ``_kmc_inter_0_full_<lag+1>.res`` suffix is appended only when
        ``'.res'`` does not occur in ``path``.

    Returns
    -------
    counter : function
        Takes an array of kmer strings and returns an array of counts of
        shape ``np.shape(kmers) + (alphabet_size + 1,)``.
    """
    global kmc
    import py_kmc_api as kmc

    alphabet = core.alphabets_en['dna'][:-1]
    alphabet_size = len(alphabet)
    n_cols = alphabet_size + 1

    # Reusable kmc objects shared by every lookup made by the closure.
    kmer_token = kmc.KmerAPI(lag+1)
    c = kmc.Count()

    def lookup(seq, db):
        return get_kmc_count(seq, db, kmer_token, c)

    print("loading", path)

    if no_end:
        # Single database: only the full (lag+1)-mer counts.
        if '.res' not in path:
            path = path + '_kmc_inter_0_full_{}.res'.format(lag+1)
        full_db = load_kmc(path)

        def counter(kmers):
            out_shape = np.r_[np.shape(kmers), [n_cols]]
            counts = np.zeros([np.size(kmers), n_cols])
            for row, kmer in enumerate(kmers.flatten()):
                for col, base in enumerate(alphabet):
                    extended = kmer + base
                    # Forward-strand count of the extended kmer.
                    counts[row, col] = lookup(extended, full_db)
                    # Reverse count (assemblies only look at one strand).
                    if reverse:
                        counts[row, col] += lookup(
                            Seq.reverse_complement(extended), full_db)
            return counts.reshape(out_shape)

        return counter

    # forward_dbs[n] counts sequences of length n+1: pre files for n < lag,
    # the full file at index lag. suffix_dbs[n] is the suf file of length n+1.
    forward_dbs = []
    suffix_dbs = []
    for length in np.arange(lag) + 1:
        forward_dbs.append(
            load_kmc(path + '_kmc_inter_0_pre_{}.res'.format(length)))
        suffix_dbs.append(
            load_kmc(path + '_kmc_inter_0_suf_{}.res'.format(length)))
    forward_dbs.append(
        load_kmc(path + '_kmc_inter_0_full_{}.res'.format(lag+1)))

    def counter(kmers):
        out_shape = np.r_[np.shape(kmers), [n_cols]]
        counts = np.zeros([np.size(kmers), n_cols])
        for row, kmer in enumerate(kmers.flatten()):
            # '[' characters are dropped before lookup
            # (presumably start-of-sequence padding -- TODO confirm).
            kmer = kmer.replace('[', '')
            klen = len(kmer)
            for col, base in enumerate(alphabet):
                extended = kmer + base
                # Forward-strand count of the extended kmer.
                counts[row, col] = lookup(extended, forward_dbs[klen])
                if not reverse:
                    continue
                # Reverse count (assemblies only look at one strand).
                if klen == lag:
                    counts[row, col] += lookup(
                        Seq.reverse_complement(extended), forward_dbs[klen])
                elif klen < lag:
                    counts[row, col] += lookup(
                        Seq.reverse_complement(extended), suffix_dbs[klen])
            # End-symbol column: only full-length kmers get a count.
            if klen == lag:
                counts[row, -1] = lookup(kmer, suffix_dbs[klen - 1])
                if reverse:
                    counts[row, -1] += lookup(
                        Seq.reverse_complement(kmer), forward_dbs[klen - 1])
        return counts.reshape(out_shape)

    return counter