def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format). An additional file '
                             '(ending in .tst) will also be created in the same directory with the same base name.')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # check for and make filename for tst file
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()
    file_names = sorted(file_names, key=os.path.basename)  # sort based off of base name

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # Save the ternary search tree
    to_insert = set()
    for i in range(len(genome_sketches)):
        for kmer_index in range(len(genome_sketches[i]._kmers)):
            kmer = genome_sketches[i]._kmers[kmer_index]
            # format here is kmer+x+hash_index+kmer_index
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
    tree = mt.Trie(to_insert)
    tree.save(streaming_database_file)
def find_kmers_in_filtered_results(self, training_database_file: str) -> None:
    """
    For each of the genomes that showed up in self.filtered_results, collects all their k-mers and counts
    and puts them in self.all_kmers_with_counts.
    :param training_database_file: file pointing to the HDF5 training database created with MakeStreamingDNADatabase.py
    :type training_database_file: string
    """
    to_select_names = self.to_select_names
    k_range = self.k_range
    #is_unique_kmer_per_ksize = self.is_unique_kmer_per_ksize

    # get the count estimators of just the organisms of interest
    # TODO: could make it a LOT more memory efficient by sub-selecting the 'sketches'
    self.CEs = MH.import_multiple_from_single_hdf5(training_database_file, import_list=to_select_names)

    # get all the kmers (for each kmer size) and form their counts in the subset of predicted sketches to be in the sample
    self.all_kmers_with_counts = dict()
    for k_size in k_range:
        #self.is_unique_kmer_per_ksize[k_size] = set()
        for i in range(len(self.CEs)):
            for big_kmer in self.CEs[i]._kmers:
                kmer = big_kmer[:k_size]
                if kmer in self.all_kmers_with_counts:
                    self.all_kmers_with_counts[kmer] += 1
                else:
                    self.all_kmers_with_counts[kmer] = 1
def get_MH_data(n, k, genome_file, rev_comp=False):
    '''
    Estimates the k-mer abundance distribution of a genome from a MinHash sketch.
    :param n: sketch size (number of hash functions)
    :param k: k-mer size
    :param genome_file: genome in FASTA format
    :param rev_comp: whether to canonicalize k-mers by reverse complement
    :return: dict mapping k-mer count -> fraction of sketched k-mers with that count
    '''
    estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=genome_file, rev_comp=rev_comp)
    counts = estimator._counts
    count_dict = dict()
    for count in counts:
        if count > 0:
            if count in count_dict:
                count_dict[count] += 1
            else:
                count_dict[count] = 1
    normed_dict = dict()
    total_count = sum(count_dict.values())
    for count_value, occurrence in count_dict.items():
        normed_dict[count_value] = occurrence / total_count
    #print("minhash results:")
    #print(normed_dict)
    #print(len(normed_dict.keys()))
    #print("checking if MH estimate is correct:")
    #print(sum(count_dict.values()))
    #print(count_dict)
    return normed_dict
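
# Hedged usage sketch for get_MH_data (not part of the original code): summarize the estimated
# k-mer abundance spectrum of a genome. The wrapper and its default n/k values are arbitrary
# example choices; pass any FASTA path.
def print_abundance_spectrum(genome_file, n=1000, k=21):
    # normed_dict maps k-mer count -> fraction of sketched k-mers with that count
    normed_dict = get_MH_data(n, k, genome_file)
    for count_value in sorted(normed_dict):
        print("k-mers seen %d time(s): %.4f of sketch" % (count_value, normed_dict[count_value]))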
def make_minhash(genome, max_h, prime, ksize):
    MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y',
                            input_file_name=genome, rev_comp=False)  # the query automatically takes care of rev_comp's for me
    # Just use HLL to estimate the number of kmers, no need to get exact count
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(genome)
    MHS._true_num_kmers = hll.estimate_cardinality()
    MHS.input_file_name = genome
    return MHS
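
# Note: make_minhash_star is referenced by the pool.map/imap calls elsewhere in this section but is
# not shown here. A minimal sketch of how such a wrapper is typically written, assuming it simply
# unpacks the (genome, max_h, prime, ksize) tuples produced by zip(...):
def make_minhash_star(arg):
    # arg is a tuple (genome, max_h, prime, ksize); unpack and delegate to make_minhash
    return make_minhash(*arg)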
def __import_database(self) -> list:
    """
    Private function that imports the HDF5 training file.
    :return: a list of CountEstimators
    :rtype: list[MinHash.CountEstimator]
    """
    CEs = MH.import_multiple_from_single_hdf5(self.training_database_file)
    return CEs
def quick_dump(k_list, n, input_file):
    for k in k_list:
        pickle_file = 'k' + str(k) + 'n' + str(n) + input_file + '.pickle'
        print(pickle_file)
        estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=input_file, rev_comp=False)
        counts = estimator._counts
        with open(pickle_file, 'wb') as pf:
            pickle.dump(counts, pf)
def quicker_dump(input_file):
    n = 10000
    for k in [25, 50, 75]:
        pickle_file = 'k' + str(k) + 'n10000' + input_file + '.pickle'
        print(pickle_file)
        estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=input_file, rev_comp=False)
        counts = estimator._counts
        with open(pickle_file, 'wb') as pf:
            pickle.dump(counts, pf)
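
# Hedged companion sketch (not part of the original code): reload counts written by quick_dump /
# quicker_dump, reconstructing the pickle file name with the same naming scheme used above.
def load_dumped_counts(k, n, input_file):
    pickle_file = 'k' + str(k) + 'n' + str(n) + input_file + '.pickle'
    with open(pickle_file, 'rb') as pf:
        counts = pickle.load(pf)
    return counts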
def kmc_cmash_compare(k, n, input_file):
    kmc_normed_dict = get_kmc_data(k, input_file, input_file + '_out', 'out')
    # minhash estimate
    estimator = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=input_file)
    real_dist = pd.DataFrame(list(kmc_normed_dict.items()), columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=real_dist)
    plt.savefig('quicklook_real.png')
    counts = estimator._counts
    estimated_normed_dict = get_count_dict(counts)
    # quick look at distribution
    df = pd.DataFrame(list(estimated_normed_dict.items()), columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=df)
    plt.savefig('quicklook.png')
    #####
    print(sum(estimated_normed_dict.values()))
    print(counts)
    print(get_distance(kmc_normed_dict, estimated_normed_dict, 'wasserstein'))
def test_yield_overlaps_3():
    x1 = [1, 3, 6]
    x2 = [1, 2, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 2
    assert len(list(MH._yield_overlaps(x2, x1))) == 2
def test_yield_overlaps():
    x1 = [1, 3, 5]
    x2 = [2, 4, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 0
def test_yield_overlaps_2():
    x1 = [1, 3, 5]
    x2 = [1, 2, 4, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 1
    assert len(list(MH._yield_overlaps(x2, x1))) == 1
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('-v', '--verbose', action="store_true", help="Print out progress report/timing information")
    parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format). An additional file '
                             '(ending in .tst) will also be created in the same directory with the same base name.')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    verbose = args.verbose
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # check for and make filename for tst file
    if verbose:
        print("Checking file names")
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(os.path.abspath(line))
    fid.close()
    file_names = sorted(file_names, key=os.path.basename)  # sort based off of base name

    # Open the pool and make the sketches
    if verbose:
        print("Creating Min Hash Sketches")
    pool = Pool(processes=num_threads)
    #genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    # use imap so we get an iterable instead, that way we can immediately start writing to file and don't need to keep
    # the entire genome sketches in memory
    genome_sketches = pool.imap(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    #pool.close()

    # Export all the sketches
    if verbose:
        print("Exporting sketches")
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)
    pool.close()

    # Initialize the creation of the TST
    M = MakeTSTNew(out_file, streaming_database_file)
    if verbose:
        print("Creating and saving the ternary search tree")
    # make the actual TST
    M.make_TST()
    if verbose:
        print("Finished.")
if args.plot_file:
    plot_file = os.path.abspath(os.path.splitext(results_file)[0] + ".png")

# Import data and error checking
# Query file
if not os.path.exists(query_file):
    raise Exception("Query file %s does not exist." % query_file)
if not os.path.exists(training_database_file):
    raise Exception("Training/reference file %s does not exist." % training_database_file)

# Training data
if verbose:
    print("Reading in sketches")
    t0 = timeit.default_timer()
sketches = MH.import_multiple_from_single_hdf5(training_database_file)
if sketches[0]._kmers is None:
    raise Exception(
        "For some reason, the k-mers were not saved when the database was created. Try running MakeStreamingDNADatabase.py again.")
num_hashes = len(sketches[0]._kmers)  # note: this is relying on the fact that the sketches were properly constructed
max_ksize = sketches[0].ksize
sketches = sorted(sketches, key=lambda x: os.path.basename(x.input_file_name))

# adjust the k-range if necessary
k_range = [val for val in k_range if val <= max_ksize]
import khmer

# FIXME: could probably do all the data creation, module initialization, and method calling, and then have the tests
# FIXME: just test the data

# create some test data
# First, the TST
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
seqs = [seq1, seq2, seq3, seq4]
query_seq = seq3
num_hashes = 5

CE1 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE2 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE3 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE4 = MH.CountEstimator(n=num_hashes, max_prime=9999999999971, ksize=5, save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
                             " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py")
    # adding new parser arguments
    parser.add_argument('-d', '--temp_dir', type=str,
                        help="temporary storage directory (define for continue flag)", default="./temp")
    parser.add_argument('-s', '--data_stream', action="store_true",
                        help="Optional flag to define whether the input_files are urls to stream data instead of"
                             " absolute paths to files.", default=False)
    parser.add_argument('-z', '--unzip_data', action="store_true",
                        help="Optional flag to define whether the input_files are gzipped. If True, will unzip in "
                             "chunks and delete unzipped fastas after use", default=False)
    parser.add_argument('-c', '--continue', action="store_true",
                        help="Optional flag to define whether to continue sketching files defined in input file. "
                             "Functionally, checks against the existing sketches in the temporary directory.",
                        default=False)
    parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file', help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer constraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    # create temporary directory if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise InputError("unzip_data and data_stream flags cannot both be specified.")

    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = fid.readlines()
            lines = [l.strip() for l in lines]
        # just do everything in one chunk
        chunks = [lines]
        # chunk_size = 75
        # with open(input_file_names, 'r') as fid:
        #     lines = fid.readlines()
        #     chunks = []
        #     for i in range(int(math.ceil(len(lines) / chunk_size))):
        #         if (i+1)*chunk_size > len(lines)-1:
        #             chunks[i*chunk_size:len(lines)]
        #         else:
        #             chunks[i*chunk_size:(i+1)*chunk_size]

    genome_sketches = []
    temp_path = args.temp_dir

    if args.unzip_data:
        print("Beginning unzipping data")
        print(chunks)
        if not os.path.isdir(os.path.join(temp_path, "fastas")):
            os.mkdir(os.path.join(temp_path, "fastas"))
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                f = unzip_file(line, os.path.join(temp_path, "fastas"))
                file_names.append(f)
                # if not check_if_pickled(line):
                #     f = unzip_file(line, os.path.join(temp_path, "fastas"))
                #     file_names.append(f)
            if len(file_names) > 0:
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(make_minhash_star,
                                                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
                genome_sketches += curr_genome_sketches
                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
            else:
                print("pickled files found, continuing...")
    # adding new
    elif args.data_stream:
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                file = stream_file(line.strip())
                file_names.append(file)
            print("starting sketches")
            pool = Pool(processes=num_threads)
            curr_genome_sketches = pool.map(make_minhash_star,
                                            zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
            genome_sketches += curr_genome_sketches
            print("removing fasta files")
            for file_name in file_names:
                os.remove(file_name)
    else:
        file_names = list()
        fid = open(input_file_names, 'r')
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
        fid.close()
        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(make_minhash_star,
                                   zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, need to pass through the data again since we
    # a-priori don't know how big of a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
if args.plot_file:
    plot_file = os.path.abspath(os.path.splitext(results_file)[0] + ".png")

# Import data and error checking
# Query file
if not os.path.exists(query_file):
    raise Exception("Query file %s does not exist." % query_file)
if not os.path.exists(training_data):
    raise Exception("Training/reference file %s does not exist." % training_data)

# Training data
if verbose:
    print("Reading in sketches")
    t0 = timeit.default_timer()
sketches = MH.import_multiple_from_single_hdf5(training_data)
if sketches[0]._kmers is None:
    raise Exception(
        "For some reason, the k-mers were not saved when the database was created. Try running MakeStreamingDNADatabase.py again.")
num_hashes = len(sketches[0]._kmers)  # note: this is relying on the fact that the sketches were properly constructed
max_ksize = sketches[0].ksize


def keyfunction(item):
    return os.path.basename(item.input_file_name)


sketches = sorted(sketches, key=keyfunction)  # sort the sketches by the basename of input file
coverage_threshold = 0.0062
sort_key = 'k=60'
location_of_thresh = -1

# read in the file and sort as needed
df = pd.read_csv(cmash_out_file, index_col=0)
#df = df[df['k=60'] > 0.01].sort_values('k=60', ascending=False)  # for the ones that had -c 0, add a threshold for sanity sake
names_passed_thresh = list(df.index)
names_passed_thresh_with_path = []
for name in names_passed_thresh:
    names_passed_thresh_with_path.append(training_base_name + name)
CEs = MH.import_multiple_from_single_hdf5(training_hdf_file, import_list=names_passed_thresh_with_path)
training_file_names = [c.input_file_name for c in CEs]

# import the hit matrices
hit_matrices_dict = loadmat(hit_matrices_file)

# now, for each one of the sketches, look for unique k-mers in it, set non-unique to zero
k_range = sorted([int(i.split('=')[1]) for i in df.keys()])

# Make the hit matrices dense
hit_matrices_dense_dict = dict()
for k_size in k_range:
    hit_matrices_dense_dict['k=%d' % k_size] = hit_matrices_dict['k=%d' % k_size].todense()
hit_matrices_dict = hit_matrices_dense_dict
    all_file_names.append(line.strip())

# form the training database on a few of them
subset_file_names_file = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/TrainingData/NathanRefSeq/TestFileNameOrder/FileNames.txt"
with open(subset_file_names_file, "w") as fid:
    for i in range(num_train):
        fid.write("%s\n" % all_file_names[i])
out_hdf5_file = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/TrainingData/NathanRefSeq/TestFileNameOrder/TrainingData.h5"
python = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/CMashVE/bin/python "
script = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/CMash/scripts/MakeStreamingDNADatabase.py "
script_args = subset_file_names_file + " " + out_hdf5_file + " -n 1000 -k 60"
os.system(python + script + script_args)

# Import the HDF5 file
sketches = MH.import_multiple_from_single_hdf5(out_hdf5_file)
tree = mt.Trie()
tree.load(out_hdf5_file.split('.')[0] + ".tst")
for sketch_index in range(num_train):
    for kmer in sketches[sketch_index]._kmers:
        is_correct = False
        for hit in tree.keys(kmer):
            hit_split = hit.split('x')
            tree_sketch_index = int(hit_split[1])
            if tree_sketch_index == sketch_index:
                is_correct = True
                break
        if not is_correct:
            raise Exception("Mismatch: sketch index was %d while in the tree it's %d: %s" %
                            (sketch_index, tree_sketch_index, kmer))
def k_mer_sketch_histogram(n, k, genome, rev_comp=False):
    # input: n - sketch size (# Hash functions), k - k-mer size, genome - fasta(.gz)
    # return np.array of abundance and normalized abundance distribution
    n = int(n)
    k = int(k)
    KMC_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(os.path.realpath(__file__)) + '/kmc_global_count/'
    # if the value is not stored, compute it, else load it
    if not os.path.isfile(outpath + KMC_outname + '.sketch' + str(n) + '.pickle'):
        # if a MinHash Estimator with a larger sketch size doesn't exist, compute it with the current sketch size
        MHS_filenames = os.listdir(outpath + 'MH_counts/')
        if MHS_filenames:
            try:
                # get min sketch size of existing MinHash Estimators which is greater than n
                sketch_size_existing = [
                    int(_.split('.sketch')[-1].split('.MHScounts.pickle')[0])
                    for _ in MHS_filenames
                    if (_.endswith('.MHScounts.pickle') and '.ksize' + str(k) + '.' in _ and KMC_outname in _)
                ]
                sketch_size_existing_greater_than_n = min([_ for _ in sketch_size_existing if _ >= n])
                MHS_count_name = outpath + 'MH_counts/' + KMC_outname + '.sketch' + str(
                    sketch_size_existing_greater_than_n) + '.MHScounts.pickle'
                with open(MHS_count_name, 'rb') as MHS_sketch_count_file:
                    MHS_count = pickle.load(MHS_sketch_count_file)
                counts = MHS_count[:n]
            # sketch_size_existing_greater_than_n is empty
            except (ValueError, FileNotFoundError):
                MHS = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=genome, rev_comp=rev_comp)
                counts = MHS._counts
        else:
            MHS = MH.CountEstimator(n=n, ksize=k, save_kmers='n', input_file_name=genome, rev_comp=rev_comp)
            counts = MHS._counts
        # check if MHS counts with k & n are saved or not
        MHS_count_name = outpath + 'MH_counts/' + KMC_outname + '.sketch' + str(n) + '.MHScounts.pickle'
        if not os.path.isfile(MHS_count_name):
            with open(MHS_count_name, 'wb') as MHS_sketch_count_file:
                pickle.dump(counts, MHS_sketch_count_file)
        # turn array of counts of k-mers into occurrence of k-mers with the counts
        dist = np.zeros(max(counts))
        for _c in counts:
            dist[_c - 1] = dist[_c - 1] + 1
        dist_norm = dist / np.sum(dist)
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle', 'wb') as config_sketch_file:
            pickle.dump([dist, dist_norm], config_sketch_file)
    else:
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle', 'rb') as config_sketch_file:
            dist, dist_norm = pickle.load(config_sketch_file)
    return dist, dist_norm  # np.array(list(dist))
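
# Hedged usage sketch for k_mer_sketch_histogram (not part of the original code). dist[i] is the
# number of sketched k-mers that occur (i + 1) times; dist_norm is the same histogram normalized
# to sum to 1. The default n/k values below are arbitrary example choices.
def fraction_unique_kmers_in_sketch(genome, n=2000, k=21):
    dist, dist_norm = k_mer_sketch_histogram(n, k, genome)
    return dist_norm[0]  # fraction of sketched k-mers seen exactly once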
def main():
    parser = argparse.ArgumentParser(
        description="This script creates a CSV file of similarity indices between the"
                    " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f', '--force', action="store_true", help="Force creation of new NodeGraph.")
    parser.add_argument('-fp', '--fp_rate', type=restricted_float, help="False positive rate.", default=0.0001)
    parser.add_argument('-ct', '--containment_threshold', type=restricted_float,
                        help="Only return results with containment index above this value", default=0.02)
    parser.add_argument('-c', '--confidence', type=restricted_float,
                        help="Desired probability that all results were returned with containment index above threshold [-ct]",
                        default=0.95)
    parser.add_argument('-ng', '--node_graph',
                        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
                             "will be created and put in the same directory as the specified "
                             "output CSV file.", default=None)
    parser.add_argument('-b', '--base_name', action="store_true",
                        help="Flag to indicate that only the base names (not the full path) should be saved in the output CSV file")
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Option to only insert query k-mers in bloom filter if they appear anywhere in the training"
                             " database. Note that the Jaccard estimates will now be "
                             "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
                             "but will use significantly less space.")
    parser.add_argument('in_file', help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('training_data',
                        help="Training/reference data (HDF5 file created by MakeTrainingDatabase.py)")
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." % training_data)

    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (can also check if all the kmers make sense (i.e. no '' or non-ACTG characters))
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again.")
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again.")
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" % sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s." % (sketch.input_file_name))

    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(os.path.dirname(os.path.abspath(args.out_csv)),
                                      os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(node_graph_out):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" % node_graph_out)
    elif os.path.exists(args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." % args.node_graph)

    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" % intersect_nodegraph_file)

    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(target=sample_kmers.consume_seqfile_with_reads_parser, args=(rparser,))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:
            # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
            # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query \intersect hash_list, training)
            # instead of J(query, training))
            # (TODO: fix this after khmer is updated)
            #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (but makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). Try --force or change -k."
                % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(unwrap_compute_indicies, zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {'intersection': intersection_cardinalities,
         'containment index': containment_indexes,
         'jaccard index': jaccard_indexes}
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate, confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values('containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
                             " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py")
    parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file', help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer constraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, need to pass through the data again since we
    # a-priori don't know how big of a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
print("reading file list") file_names = [] with open(training_file_names, 'r') as fid: for line in fid.readlines(): line = line.strip() file_names.append(os.path.basename(line)) print("importing kmers") chunk_size = 5000 iter = 0 with open(training_out_file, 'w') as fid: for file_iter in xrange(0, len(file_names), chunk_size): print("on file: %d" % file_iter) file_names_subset = file_names[file_iter:file_iter + chunk_size] sketchs = MH.import_multiple_from_single_hdf5( training_data, import_list=file_names_subset) all_kmers = itertools.chain.from_iterable(sketch._kmers for sketch in sketchs) print("forming the set") all_kmers_set = set(all_kmers) to_write = "" for kmer in all_kmers_set: to_write += ">seq%d\n" % iter to_write += "%s\n" % kmer iter += 1 fid.write(to_write) ########################################################################################### # Next, run kmc on this thing # with a bash script, use the following: # /usr/bin/time /home/pi/koslickd/KMC/./kmc -v -k60 -m200 -sm -fm -ci0 -cs3 -t48 -jlog_train NathanRefSeqTraining60mers.fa NathanRefSeq60mers /scratch/kmc_temp/