Code Example #1
def main():
	parser = argparse.ArgumentParser(description="This script creates training/reference sketches for each FASTA/Q file"
									" listed in the input file.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('-p', '--prime', type=int, help='Prime (for modding hashes)', default=9999999999971)
	parser.add_argument('-t', '--threads', type=int, help="Number of threads to use", default=multiprocessing.cpu_count())
	parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
	parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
	parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
	parser.add_argument('out_file', help='Output training database/reference file (in HDF5 format). An additional file '
										 '(ending in .tst) will also be created in the same directory with the same base name.')
	args = parser.parse_args()
	num_threads = args.threads
	prime = args.prime  # taking hashes mod this prime
	ksize = args.k_size
	max_h = args.num_hashes
	input_file_names = os.path.abspath(args.in_file)
	if not os.path.exists(input_file_names):
		raise Exception("Input file %s does not exist." % input_file_names)
	out_file = os.path.abspath(args.out_file)

	# construct the file name for the .tst (ternary search tree) file
	streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
	streaming_database_file = os.path.abspath(streaming_database_file)

	file_names = list()
	fid = open(input_file_names, 'r')
	for line in fid.readlines():
		line = line.strip()
		if not os.path.exists(line):
			raise Exception("Training genome %s does not exist." % line)
		file_names.append(line)
	fid.close()
	file_names = sorted(file_names, key=os.path.basename)  # sort based off of base name

	# Open the pool and make the sketches
	pool = Pool(processes=num_threads)
	genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

	# Export all the sketches
	MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

	# Save the ternary search tree
	to_insert = set()
	for i in range(len(genome_sketches)):
		for kmer_index in range(len(genome_sketches[i]._kmers)):
			kmer = genome_sketches[i]._kmers[kmer_index]
			to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer + 'x' + sketch_index + 'x' + kmer_index
	tree = mt.Trie(to_insert)
	tree.save(streaming_database_file)
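
All of these examples hand make_minhash_star to pool.map/pool.imap, but the helper itself is not part of the excerpts. The sketch below is a hypothetical reconstruction: the tuple-unpacking wrapper is implied by the zip(...) call pattern, the MH.CountEstimator keyword arguments and the add_sequence/input_file_name usage are inferred from code example #5 (the n= keyword is an assumption), and the screed parsing step is an additional assumption rather than CMash's actual implementation.

# Hypothetical sketch of the helper assumed by pool.map/pool.imap above.
import screed                      # FASTA/Q parser distributed alongside khmer (assumption)
from CMash import MinHash as MH    # as used elsewhere in these examples

def make_minhash(genome, max_h, prime, ksize):
    # Build one MinHash sketch (CountEstimator) for a single genome file,
    # mirroring the usage shown in code example #5.
    CE = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
    for record in screed.open(genome):
        CE.add_sequence(record.sequence)
    CE.input_file_name = genome  # sketches must carry their input file name
    return CE

def make_minhash_star(arg):
    # Pool.map passes each worker a single argument, so unpack the tuple here.
    return make_minhash(*arg)
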
Code Example #2
File: MakeDNADatabase.py  Project: mealser/CMash
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        type=int,
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
        " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py"
    )
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, this requires a second pass through
    # the data, since we don't know a priori how big a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
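
The exported .intersect.Nodegraph file is intended to be consumed by QueryDNADatabase.py. As a rough illustration only (the actual query code is not shown here), a saved Nodegraph can be reloaded and probed for individual k-mers roughly as below; the load_nodegraph/get calls follow khmer's API as I understand it, and the file name is hypothetical. Recall that a Bloom filter can return false positives but never false negatives.

# Illustrative only: load the saved Nodegraph and test a k-mer for membership.
import khmer

nodegraph = khmer.load_nodegraph("TrainingDatabase.intersect.Nodegraph")  # hypothetical path
kmer = "ACGTACGTACGTACGTACGTA"  # a 21-mer, matching the default k_size
if nodegraph.get(kmer) > 0:
    print("k-mer is probably in the training database (Bloom filters allow false positives)")
else:
    print("k-mer is definitely not in the training database")
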
Code Example #3
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        type=int,
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="k-mer size",
                        default=21)
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        help="Print out progress report/timing information")
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help=
        'Output training database/reference file (in HDF5 format). An additional file '
        '(ending in .tst) will also be created in the same directory with the same base name.'
    )
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    verbose = args.verbose
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # construct the file name for the .tst (ternary search tree) file
    if verbose:
        print("Checking file names")
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(os.path.abspath(line))
    fid.close()
    file_names = sorted(file_names,
                        key=os.path.basename)  # sort based off of base name

    # Open the pool and make the sketches
    if verbose:
        print("Creating Min Hash Sketches")
    pool = Pool(processes=num_threads)
    #genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    # use imap so we get a lazy iterator instead; that way we can start writing to file
    # immediately and don't need to keep all of the genome sketches in memory
    genome_sketches = pool.imap(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    #pool.close()
    # Export all the sketches
    if verbose:
        print("Exporting sketches")
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)
    pool.close()
    # Initialize the creation of the TST
    M = MakeTSTNew(out_file, streaming_database_file)
    if verbose:
        print("Creating and saving the ternary search tree")
    # make the actual TST
    M.make_TST()

    if verbose:
        print("Finished.")
Code Example #4
File: MakeDNADatabase.py  Project: slebras/CMash
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        type=int,
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true", \
                        help="Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the" \
                             " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py")
    # additional parser arguments for temporary storage, data streaming, gzipped input, and resuming
    parser.add_argument(
        '-d',
        '--temp_dir',
        type=str,
        help="temporary storage directory (define for continue flag)",
        default="./temp")
    parser.add_argument('-s', '--data_stream', action="store_true", \
                        help="Optional flag to define whether the input_files are urls to stream data instead of" \
                             " absolute paths to files.", default=False)
    parser.add_argument('-z', '--unzip_data', action="store_true", \
                        help="Optional flag to define whether the input_files are gzipped. if True, will unzip in " \
                             "chunks and delete unzipped fastas after use", default=False)
    parser.add_argument('-c', '--continue', action="store_true", \
                        help="Optional flag to define whether to continue sketching files defined in input file. " \
                             "Functionally, checks against the existing sketches in the temporary directory.", default=False)
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    # create the temporary directory if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise ValueError(
            "unzip_data and data_stream flags cannot both be specified.")

    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = fid.readlines()
        lines = [l.strip() for l in lines]
        # just do everything in one chunk
        chunks = [lines]
        # chunk_size = 75
        # with open(input_file_names, 'r') as fid:
        #     lines = fid.readlines()
        # chunks = []
        # for i in range(int(math.ceil(len(lines) / chunk_size))):
        #     if (i+1)*chunk_size > len(lines)-1:
        #         chunks[i*chunk_size:len(lines)]
        #     else:
        #         chunks[i*chunk_size:(i+1)*chunk_size]

    genome_sketches = []

    temp_path = args.temp_dir
    if args.unzip_data:
        print("Beginning unzipping data")
        print(chunks)
        if not os.path.isdir(os.path.join(temp_path, "fastas")):
            os.mkdir(os.path.join(temp_path, "fastas"))
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                f = unzip_file(line, os.path.join(temp_path, "fastas"))
                file_names.append(f)
                # if not check_if_pickled(line):
                #     f = unzip_file(line, os.path.join(temp_path, "fastas"))
                #     file_names.append(f)

            if len(file_names) > 0:
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(
                    make_minhash_star,
                    zip(file_names, repeat(max_h), repeat(prime),
                        repeat(ksize)))
                genome_sketches += curr_genome_sketches

                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
            else:
                print("pickled files found, continuing...")

    # adding new
    elif args.data_stream:

        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                file = stream_file(line.strip())
                file_names.append(file)
            print("starting sketches")

            pool = Pool(processes=num_threads)
            curr_genome_sketches = pool.map(
                make_minhash_star,
                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
            genome_sketches += curr_genome_sketches

            print("removing fasta files")
            for file_name in file_names:
                os.remove(file_name)

    else:
        file_names = list()
        fid = open(input_file_names, 'r')
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
        fid.close()

        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(
            make_minhash_star,
            zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, this requires a second pass through
    # the data, since we don't know a priori how big a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
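
This fork adds unzip_file and stream_file helpers that are not included in the excerpt. Below is a hypothetical sketch of what unzip_file might look like, assuming each input line is a path to a gzipped FASTA and the decompressed copy should be written under the temporary directory; stream_file would presumably do the analogous download-to-a-temporary-file step for URLs.

# Hypothetical helper: decompress one gzipped FASTA into dest_dir and return the new path.
import gzip
import os
import shutil

def unzip_file(gzipped_path, dest_dir):
    base = os.path.basename(gzipped_path)
    if base.endswith(".gz"):
        base = base[:-3]              # strip the .gz suffix
    out_path = os.path.join(dest_dir, base)
    with gzip.open(gzipped_path, "rb") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)  # stream-decompress without loading the whole file into memory
    return out_path
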
Code Example #5
                        max_prime=9999999999971,
                        ksize=5,
                        save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)
CE4.add_sequence(seq4)
# CEs must have input file names
CE1.input_file_name = "seq1"
CE2.input_file_name = "seq2"
CE3.input_file_name = "seq3"
CE4.input_file_name = "seq4"
training_file_names = ["seq1", "seq2", "seq3", "seq4"]
CEs = [CE1, CE2, CE3, CE4]
temp_database_file = tempfile.mktemp()
MH.export_multiple_to_single_hdf5(CEs, temp_database_file)

# And create the TST
to_insert = set()
# add both the original k-mer and the reverse complement, as the MinHashes were created without reverse complement
for i in range(len(CEs)):
    for kmer_index in range(len(CEs[i]._kmers)):
        # normal kmer
        kmer = CEs[i]._kmers[kmer_index]
        if kmer:
            to_insert.add(
                kmer + 'x' + str(i) + 'x' +
                str(kmer_index))  # format here is kmer + 'x' + sketch_index + 'x' + kmer_index
            # rev-comp kmer
            kmer = khmer.reverse_complement(CEs[i]._kmers[kmer_index])
            to_insert.add(