def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format). An additional file '
                             '(ending in .tst) will also be created in the same directory with the same base name.')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # check for and make the file name for the TST file
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    with open(input_file_names, 'r') as fid:
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
    file_names = sorted(file_names, key=os.path.basename)  # sort based on base name

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(make_minhash_star,
                               zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # Save the ternary search tree
    to_insert = set()
    for i in range(len(genome_sketches)):
        for kmer_index in range(len(genome_sketches[i]._kmers)):
            kmer = genome_sketches[i]._kmers[kmer_index]
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer+x+hash_index+kmer_index
    tree = mt.Trie(to_insert)
    tree.save(streaming_database_file)
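# A minimal sketch of how the .tst file saved above can be queried later, assuming
# (as `mt.Trie(...).save(...)` above suggests) that `mt` is the marisa_trie module.
# The helper name `lookup_kmer` is hypothetical, for illustration only: it
# prefix-searches on "kmer + 'x'" and decodes the "kmer x sketch_index x kmer_index"
# entries inserted by main(). Since k-mers contain only ACGT, rsplit on 'x' is safe.
import marisa_trie

def lookup_kmer(tst_file, kmer):
    tree = marisa_trie.Trie()
    tree.load(tst_file)  # load the TST written by tree.save(streaming_database_file)
    hits = []
    for entry in tree.keys(kmer + 'x'):  # all entries whose k-mer matches exactly
        _, sketch_index, kmer_index = entry.rsplit('x', 2)
        hits.append((int(sketch_index), int(kmer_index)))
    return hits  # (which sketch, which hash position) pairs for this k-mer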
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export a Nodegraph file (Bloom filter) containing all k-mers in the"
                             " training database. Saved in the same location as out_file."
                             " This is to be used with QueryDNADatabase.py")
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception("Unfortunately, ksize must be 31 or smaller (due to khmer constraints)."
                        " Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    with open(input_file_names, 'r') as fid:
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(make_minhash_star,
                               zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into one big Nodegraph. Unfortunately, this needs a
    # second pass through the data, since we don't know a priori how big of a table to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
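# A minimal sketch of how the .intersect.Nodegraph written above might be reused for
# membership queries, assuming khmer's load_nodegraph/get API; the file path and the
# example k-mer are illustrative, not from the source.
import khmer

nodegraph = khmer.load_nodegraph("TrainingDatabase.intersect.Nodegraph")
example_kmer = "A" * 21  # must have the same ksize the Nodegraph was built with
if nodegraph.get(example_kmer) > 0:
    # Bloom filters give false positives (fp_rate=0.001 above) but never false negatives
    print("k-mer appears in the training database")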
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('-v', '--verbose', action="store_true",
                        help="Print out progress report/timing information")
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format). An additional file '
                             '(ending in .tst) will also be created in the same directory with the same base name.')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    verbose = args.verbose
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # check for and make the file name for the TST file
    if verbose:
        print("Checking file names")
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    with open(input_file_names, 'r') as fid:
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(os.path.abspath(line))
    file_names = sorted(file_names, key=os.path.basename)  # sort based on base name

    # Open the pool and make the sketches
    if verbose:
        print("Creating MinHash sketches")
    pool = Pool(processes=num_threads)
    # use imap so we get an iterable back instead of a list: that way we can immediately start
    # writing to file and don't need to keep all the genome sketches in memory at once
    genome_sketches = pool.imap(make_minhash_star,
                                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    if verbose:
        print("Exporting sketches")
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)
    pool.close()

    # Initialize the creation of the TST
    M = MakeTSTNew(out_file, streaming_database_file)
    if verbose:
        print("Creating and saving the ternary search tree")
    # make the actual TST
    M.make_TST()
    if verbose:
        print("Finished.")
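# Why imap instead of map, as the comment above explains: a small self-contained
# illustration (the worker `square` and the output file name are made up).
# pool.map materializes the entire result list before returning, while pool.imap
# yields results one at a time, so a consumer that writes each result straight to
# disk never holds them all in memory.
from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == "__main__":
    with Pool(processes=2) as pool:
        with open("squares.txt", "w") as out:
            for result in pool.imap(square, range(10)):  # lazy, order-preserving
                out.write("%d\n" % result)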
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export a Nodegraph file (Bloom filter) containing all k-mers in the"
                             " training database. Saved in the same location as out_file."
                             " This is to be used with QueryDNADatabase.py")
    # new parser arguments for streamed/zipped inputs and resumable sketching
    parser.add_argument('-d', '--temp_dir', type=str,
                        help="temporary storage directory (define for continue flag)", default="./temp")
    parser.add_argument('-s', '--data_stream', action="store_true",
                        help="Optional flag to define whether the input_files are URLs to stream data from instead of"
                             " absolute paths to files.", default=False)
    parser.add_argument('-z', '--unzip_data', action="store_true",
                        help="Optional flag to define whether the input_files are gzipped; if True, will unzip in"
                             " chunks and delete the unzipped FASTAs after use", default=False)
    parser.add_argument('-c', '--continue', action="store_true",
                        help="Optional flag to define whether to continue sketching the files defined in the input"
                             " file. Functionally, checks against the existing sketches in the temporary directory.",
                        default=False)
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception("Unfortunately, ksize must be 31 or smaller (due to khmer constraints)."
                        " Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    # create the temporary directory if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise ValueError("unzip_data and data_stream flags cannot both be specified.")

    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = [l.strip() for l in fid.readlines()]
        # just do everything in one chunk
        chunks = [lines]
        genome_sketches = []
        temp_path = args.temp_dir
        if args.unzip_data:
            print("Beginning unzipping data")
            if not os.path.isdir(os.path.join(temp_path, "fastas")):
                os.mkdir(os.path.join(temp_path, "fastas"))
            for idx, chunk in enumerate(chunks):
                print("Beginning download of chunk %i of %i" % (idx + 1, len(chunks)))
                file_names = []
                for line in chunk:
                    f = unzip_file(line, os.path.join(temp_path, "fastas"))
                    file_names.append(f)
                if len(file_names) > 0:
                    print("starting sketches")
                    pool = Pool(processes=num_threads)
                    curr_genome_sketches = pool.map(
                        make_minhash_star,
                        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
                    genome_sketches += curr_genome_sketches
                    print("removing fasta files")
                    for file_name in file_names:
                        os.remove(file_name)
                else:
                    print("pickled files found, continuing...")
        elif args.data_stream:
            for idx, chunk in enumerate(chunks):
                print("Beginning download of chunk %i of %i" % (idx + 1, len(chunks)))
                file_names = []
                for line in chunk:
                    file_name = stream_file(line.strip())
                    file_names.append(file_name)
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(
                    make_minhash_star,
                    zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
                genome_sketches += curr_genome_sketches
                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
    else:
        file_names = list()
        with open(input_file_names, 'r') as fid:
            for line in fid.readlines():
                line = line.strip()
                if not os.path.exists(line):
                    raise Exception("Training genome %s does not exist." % line)
                file_names.append(line)

        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(make_minhash_star,
                                   zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into one big Nodegraph. Unfortunately, this needs a
    # second pass through the data, since we don't know a priori how big of a table to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
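# The helpers unzip_file and stream_file called above are not shown in this excerpt.
# A minimal sketch of what they might look like (assumed implementations, not the
# originals): unzip_file decompresses a local .gz file into a target directory and
# returns the new path; stream_file downloads a URL to a local temporary file and
# returns its path.
import gzip
import os
import shutil
import tempfile
import urllib.request

def unzip_file(gz_path, out_dir):
    out_path = os.path.join(out_dir, os.path.basename(gz_path)[:-len(".gz")])
    with gzip.open(gz_path, "rb") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)  # stream the decompressed bytes to disk
    return out_path

def stream_file(url):
    fd, local_path = tempfile.mkstemp(suffix="_" + os.path.basename(url))
    with urllib.request.urlopen(url) as response, os.fdopen(fd, "wb") as dst:
        shutil.copyfileobj(response, dst)  # download without loading it all into memory
    return local_path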
                         max_prime=9999999999971, ksize=5, save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)
CE4.add_sequence(seq4)
# CEs must have input file names
CE1.input_file_name = "seq1"
CE2.input_file_name = "seq2"
CE3.input_file_name = "seq3"
CE4.input_file_name = "seq4"
training_file_names = ["seq1", "seq2", "seq3", "seq4"]
CEs = [CE1, CE2, CE3, CE4]
temp_database_file = tempfile.mktemp()
MH.export_multiple_to_single_hdf5(CEs, temp_database_file)
# And create the TST: add both the original k-mer and its reverse complement,
# as the MinHashes were created without reverse complements
to_insert = set()
for i in range(len(CEs)):
    for kmer_index in range(len(CEs[i]._kmers)):
        # normal kmer
        kmer = CEs[i]._kmers[kmer_index]
        if kmer:
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer+x+hash_index+kmer_index
            # rev-comp kmer
            kmer = khmer.reverse_complement(CEs[i]._kmers[kmer_index])
            to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))
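# A quick self-contained check of why both strands are inserted above (the k-mer
# value is illustrative): khmer.reverse_complement lets a streamed read match a
# training k-mer regardless of which strand it was sequenced from.
import khmer

kmer = "ACGTG"
rc = khmer.reverse_complement(kmer)  # "CACGT"
assert khmer.reverse_complement(rc) == kmer  # reverse complement is an involution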