import os
import pickle

import khmer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from CMash import MinHash as MH  # CMash's MinHash module, used as MH throughout


def get_MH_data(n, k, genome_file, rev_comp=False):
    """
    Estimate the k-mer abundance spectrum of a genome from a MinHash sketch.

    :param n: sketch size (number of hash functions)
    :param k: k-mer size
    :param genome_file: path to a genome in fasta format
    :param rev_comp: whether to canonicalize k-mers with their reverse complements
    :return: dict mapping each observed k-mer count to its fraction of the sketch
    """
    estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n',
                                  input_file_name=genome_file, rev_comp=rev_comp)
    counts = estimator._counts
    # Tally how many sketch slots saw each count, skipping empty (zero) slots.
    count_dict = dict()
    for count in counts:
        if count > 0:
            count_dict[count] = count_dict.get(count, 0) + 1
    # Normalize the tallies into a distribution. (The loop variable is named
    # `count` rather than `k` to avoid shadowing the k-mer size parameter.)
    total_count = sum(count_dict.values())
    normed_dict = {count: freq / total_count for count, freq in count_dict.items()}
    return normed_dict
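# Usage sketch (not from the original source): build the abundance spectrum with a
# 1000-hash sketch and k = 21. The fasta path is a hypothetical placeholder.
def _example_get_MH_data():
    spectrum = get_MH_data(n=1000, k=21, genome_file='ecoli.fasta')
    for count, fraction in sorted(spectrum.items()):
        print(count, fraction)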
def make_minhash(genome, max_h, prime, ksize):
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y',
                            input_file_name=genome,
                            rev_comp=False)  # the query automatically takes care of rev_comp's for me
    # Just use HLL to estimate the number of k-mers; no need to get an exact count.
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(genome)
    MHS._true_num_kmers = hll.estimate_cardinality()
    MHS.input_file_name = genome
    return MHS
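# Usage sketch (not from the original source; the fasta path and parameter values
# are hypothetical): sketch a genome and report the HyperLogLog estimate of its
# distinct k-mer count that make_minhash attaches to the estimator.
def _example_make_minhash():
    sketch = make_minhash('ecoli.fasta', max_h=500, prime=9999999999971, ksize=21)
    print(sketch.input_file_name, sketch._true_num_kmers)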
def quick_dump(k_list, n, input_file):
    # Sketch input_file once per k-mer size and pickle the raw count vectors.
    for k in k_list:
        pickle_file = 'k' + str(k) + 'n' + str(n) + input_file + '.pickle'
        print(pickle_file)
        estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n',
                                      input_file_name=input_file, rev_comp=False)
        counts = estimator._counts
        with open(pickle_file, 'wb') as pf:
            pickle.dump(counts, pf)
def quicker_dump(input_file):
    # Convenience wrapper around quick_dump with the commonly used parameters;
    # it produces the same 'k<k>n10000<input_file>.pickle' files as before.
    quick_dump(k_list=[25, 50, 75], n=10000, input_file=input_file)
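# Usage sketch (not from the original source): dump sketches for a hypothetical
# fasta file, then load one count vector back from its pickle.
def _example_quick_dump():
    quicker_dump('ecoli.fasta')
    with open('k25n10000ecoli.fasta.pickle', 'rb') as pf:
        counts = pickle.load(pf)
    print(len(counts))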
def kmc_cmash_compare(k, n, input_file):
    # Exact k-mer spectrum from KMC output (get_kmc_data is defined elsewhere).
    kmc_normed_dict = get_kmc_data(k, input_file, input_file + '_out', 'out')
    # MinHash estimate of the same spectrum.
    estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n',
                                  input_file_name=input_file)
    real_dist = pd.DataFrame(list(kmc_normed_dict.items()),
                             columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=real_dist)
    plt.savefig('quicklook_real.png')
    plt.clf()  # clear the figure so the next barplot doesn't draw over this one
    counts = estimator._counts
    estimated_normed_dict = get_count_dict(counts)
    # Quick look at the estimated distribution.
    df = pd.DataFrame(list(estimated_normed_dict.items()),
                      columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=df)
    plt.savefig('quicklook.png')
    print(sum(estimated_normed_dict.values()))
    print(counts)
    print(get_distance(kmc_normed_dict, estimated_normed_dict, 'wasserstein'))
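# A minimal sketch of the kind of comparison get_distance performs for the
# 'wasserstein' metric (an assumption, not the repo's implementation), using
# scipy's wasserstein_distance on two count->fraction dicts.
def _example_wasserstein(dict_a, dict_b):
    from scipy.stats import wasserstein_distance
    return wasserstein_distance(list(dict_a.keys()), list(dict_b.keys()),
                                u_weights=list(dict_a.values()),
                                v_weights=list(dict_b.values()))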
# FIXME: could probably do all the data creation, module initialization, and
# FIXME: method calling up front, and then have the tests just test the data.

# Create some test data.
# First, the TST.
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
seqs = [seq1, seq2, seq3, seq4]
query_seq = seq3
num_hashes = 5
CE1 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE2 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE3 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE4 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)  # added so every estimator is populated, matching seqs
CE4.add_sequence(seq4)
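# A small inspection sketch (not from the original source): print each test
# estimator's count vector once the sequences have been added.
def _example_inspect_test_sketches():
    for name, ce in [('CE1', CE1), ('CE2', CE2), ('CE3', CE3), ('CE4', CE4)]:
        print(name, list(ce._counts))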
def k_mer_sketch_histogram(n, k, genome, rev_comp=False):
    # Input: n - sketch size (number of hash functions), k - k-mer size,
    # genome - fasta(.gz) file.
    # Returns np.arrays of the abundance distribution and its normalized form.
    n = int(n)
    k = int(k)
    KMC_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(os.path.realpath(__file__)) + '/kmc_global_count/'
    # If the result is not cached, compute it; otherwise load it.
    if not os.path.isfile(outpath + KMC_outname + '.sketch' + str(n) + '.pickle'):
        # If a MinHash estimator with a larger sketch size already exists, reuse
        # its counts (a sketch of size n is a prefix of any larger sketch);
        # otherwise compute a fresh sketch of size n.
        MHS_filenames = os.listdir(outpath + 'MH_counts/')
        if MHS_filenames:
            try:
                # Get the smallest sketch size >= n among the existing estimators
                # for this genome and k.
                sketch_size_existing = [
                    int(_.split('.sketch')[-1].split('.MHScounts.pickle')[0])
                    for _ in MHS_filenames
                    if (_.endswith('.MHScounts.pickle')
                        and '.ksize' + str(k) + '.' in _
                        and KMC_outname in _)
                ]
                sketch_size_existing_greater_than_n = min(
                    [_ for _ in sketch_size_existing if _ >= n])
                MHS_count_name = (outpath + 'MH_counts/' + KMC_outname + '.sketch'
                                  + str(sketch_size_existing_greater_than_n)
                                  + '.MHScounts.pickle')
                with open(MHS_count_name, 'rb') as MHS_sketch_count_file:
                    MHS_count = pickle.load(MHS_sketch_count_file)
                counts = MHS_count[:n]
            # No existing sketch size is >= n (min() of an empty list raises
            # ValueError), or the pickle file is missing.
            except (ValueError, FileNotFoundError):
                MHS = MH.CountEstimator(n=n, ksize=k, save_kmers='n',
                                        input_file_name=genome, rev_comp=rev_comp)
                counts = MHS._counts
        else:
            MHS = MH.CountEstimator(n=n, ksize=k, save_kmers='n',
                                    input_file_name=genome, rev_comp=rev_comp)
            counts = MHS._counts
        # Cache the MHS counts for this k and n if not already saved.
        MHS_count_name = (outpath + 'MH_counts/' + KMC_outname + '.sketch' + str(n)
                          + '.MHScounts.pickle')
        if not os.path.isfile(MHS_count_name):
            with open(MHS_count_name, 'wb') as MHS_sketch_count_file:
                pickle.dump(counts, MHS_sketch_count_file)
        # Turn the per-slot k-mer counts into a histogram: dist[i] is the number
        # of sketched k-mers occurring i + 1 times. Zero entries are unfilled
        # sketch slots and are skipped so they don't land in the last bin.
        dist = np.zeros(max(counts))
        for _c in counts:
            if _c > 0:
                dist[_c - 1] += 1
        dist_norm = dist / np.sum(dist)
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle',
                  'wb') as config_sketch_file:
            pickle.dump([dist, dist_norm], config_sketch_file)
    else:
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle',
                  'rb') as config_sketch_file:
            dist, dist_norm = pickle.load(config_sketch_file)
    return dist, dist_norm
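# Usage sketch (not from the original source): compute the cached sketch histogram
# for a hypothetical fasta file and plot the normalized abundance spectrum.
def _example_k_mer_sketch_histogram():
    dist, dist_norm = k_mer_sketch_histogram(n=1000, k=21, genome='ecoli.fasta')
    plt.bar(range(1, len(dist_norm) + 1), dist_norm)
    plt.xlabel('k-mer count')
    plt.ylabel('fraction of sketched k-mers')
    plt.savefig('sketch_histogram.png')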