def main():
	parser = argparse.ArgumentParser(description="This script creates training/reference sketches for each FASTA/Q file"
									" listed in the input file.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('-p', '--prime', help='Prime (for modding hashes)', default=9999999999971)
	parser.add_argument('-t', '--threads', type=int, help="Number of threads to use", default=multiprocessing.cpu_count())
	parser.add_argument('-n', '--num_hashes', type=int, help="Number of hashes to use.", default=500)
	parser.add_argument('-k', '--k_size', type=int, help="k-mer size", default=21)
	parser.add_argument('in_file', help="Input file: file containing (absolute) file names of training genomes.")
	parser.add_argument('out_file', help='Output training database/reference file (in HDF5 format). An additional file '
										 '(ending in .tst) will also be created in the same directory with the same base name.')
	args = parser.parse_args()
	num_threads = args.threads
	prime = args.prime  # taking hashes mod this prime
	ksize = args.k_size
	max_h = args.num_hashes
	input_file_names = os.path.abspath(args.in_file)
	if not os.path.exists(input_file_names):
		raise Exception("Input file %s does not exist." % input_file_names)
	out_file = os.path.abspath(args.out_file)

	# check for and make filename for tst file
	streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
	streaming_database_file = os.path.abspath(streaming_database_file)

	file_names = list()
	fid = open(input_file_names, 'r')
	for line in fid.readlines():
		line = line.strip()
		if not os.path.exists(line):
			raise Exception("Training genome %s does not exist." % line)
		file_names.append(line)
	fid.close()
	file_names = sorted(file_names, key=os.path.basename)  # sort based off of base name

	# Open the pool and make the sketches
	pool = Pool(processes=num_threads)
	genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

	# Export all the sketches
	MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

	# Save the ternary search tree
	to_insert = set()
	for i in range(len(genome_sketches)):
		for kmer_index in range(len(genome_sketches[i]._kmers)):
			kmer = genome_sketches[i]._kmers[kmer_index]
		to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer + x + sketch_index + x + kmer_index
	tree = mt.Trie(to_insert)
	tree.save(streaming_database_file)
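For reference, the entries written to the .tst file can be looked up by k-mer prefix and decoded by splitting on 'x', the same convention used in the verification loop later in this collection. A minimal sketch, assuming mt is marisa_trie and using placeholder paths and k-mers:

import marisa_trie as mt

tree = mt.Trie()
tree.load("TrainingData.tst")  # hypothetical .tst file written by the script above
for hit in tree.keys("ATCGTATGAGTATCGTCGATG"):  # prefix lookup on a (placeholder) k-mer
    kmer, sketch_index, kmer_index = hit.split('x')  # 'x' never occurs in an ACGT k-mer
    print(kmer, int(sketch_index), int(kmer_index))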
Example #2
    def find_kmers_in_filtered_results(self,
                                       training_database_file: str) -> None:
        """
		For each of the genomes that showed up in self.filtered_results, collects all their k-mers and counts
		and puts it in self.all_kmers_with_counts.
		:param training_database_file: file pointing to the HDF5 training database created with MakeStreamingDNADatabase.py
		:type training_database_file: string
		"""
        to_select_names = self.to_select_names
        k_range = self.k_range
        #is_unique_kmer_per_ksize = self.is_unique_kmer_per_ksize

        # get the count estimators of just the organisms of interest
        # TODO: could make it a LOT more memory efficient by sub-selecting the 'sketches'
        self.CEs = MH.import_multiple_from_single_hdf5(
            training_database_file, import_list=to_select_names)

        # get all the k-mers (for each k-mer size) and count their occurrences across the subset of sketches predicted to be in the sample
        self.all_kmers_with_counts = dict()
        for k_size in k_range:
            #self.is_unique_kmer_per_ksize[k_size] = set()
            for i in range(len(self.CEs)):
                for big_kmer in self.CEs[i]._kmers:
                    kmer = big_kmer[:k_size]
                    if kmer in self.all_kmers_with_counts:
                        self.all_kmers_with_counts[kmer] += 1
                    else:
                        self.all_kmers_with_counts[kmer] = 1
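The nested loops above amount to counting, for every k in k_range, how many stored sketch k-mers share each length-k prefix. An equivalent standalone sketch using collections.Counter (the function and argument names here are illustrative, not part of the class):

from collections import Counter

def count_kmer_prefixes(count_estimators, k_range):
    # count_estimators stands in for self.CEs, k_range for self.k_range
    all_kmers_with_counts = Counter()
    for k_size in k_range:
        for ce in count_estimators:
            all_kmers_with_counts.update(big_kmer[:k_size] for big_kmer in ce._kmers)
    return dict(all_kmers_with_counts)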
def get_MH_data(n, k, genome_file, rev_comp=False):
    '''
    Build a MinHash sketch of a genome and return the normalized distribution of k-mer counts.
    :param n: sketch size (number of hash functions)
    :param k: k-mer size
    :param genome_file: genome in FASTA format
    :param rev_comp: whether to also include reverse complements when sketching
    :return: dict mapping k-mer count -> fraction of sketched k-mers with that count
    '''
    estimator = MH.CountEstimator(n=n,
                                  ksize=k,
                                  save_kmers='n',
                                  input_file_name=genome_file,
                                  rev_comp=rev_comp)
    counts = estimator._counts
    count_dict = dict()
    for count in counts:
        if count > 0:
            if count in count_dict:
                count_dict[count] += 1
            else:
                count_dict[count] = 1
    normed_dict = dict()
    total_count = sum(count_dict.values())
    for count_value, occurrences in count_dict.items():  # normalize the histogram to fractions
        normed_dict[count_value] = occurrences / total_count
    #print("minhash results:")
    #print(normed_dict)
    #print(len(normed_dict.keys()))
    #print("checking if MH estimate is correct:")
    #print(sum(count_dict.values()))
    #print(count_dict)
    return normed_dict
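A possible call, with a placeholder FASTA path; the returned dict maps each observed k-mer count to the fraction of sketched k-mers having that count:

# Hypothetical usage: a 1000-hash sketch of 21-mers from a placeholder genome file.
normed_dict = get_MH_data(n=1000, k=21, genome_file="example_genome.fna")
print(sorted(normed_dict.items()))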
def make_minhash(genome, max_h, prime, ksize):
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y', input_file_name=genome, rev_comp=False)  # the query automatically takes care of rev_comp's for me
	# Just use HLL to estimate the number of kmers, no need to get exact count
	hll = khmer.HLLCounter(0.01, ksize)
	hll.consume_seqfile(genome)
	MHS._true_num_kmers = hll.estimate_cardinality()
	MHS.input_file_name = genome
	return MHS
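make_minhash_star, which several of these examples hand to pool.map/pool.imap, is not shown. A minimal sketch consistent with how it is called (it simply unpacks the zipped argument tuple into make_minhash):

def make_minhash_star(arg):
    # arg is one (genome, max_h, prime, ksize) tuple from
    # zip(file_names, repeat(max_h), repeat(prime), repeat(ksize))
    return make_minhash(*arg)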
Example #5
    def __import_database(self) -> list:
        """
		Private function that imports the HDF5 training file.
		:return: a list of CountEstimators
		:rtype: MinHash.CountEstimator
		"""
        CEs = MH.import_multiple_from_single_hdf5(self.training_database_file)
        return CEs
def quick_dump(k_list, n, input_file):
    for k in k_list:
        pickle_file = 'k' + str(k) + 'n' + str(n) + input_file + '.pickle'
        print(pickle_file)
        estimator = MH.CountEstimator(n=n,
                                      ksize=k,
                                      save_kmers='n',
                                      input_file_name=input_file,
                                      rev_comp=False)
        counts = estimator._counts
        with open(pickle_file, 'wb') as pf:
            pickle.dump(counts, pf)
def quicker_dump(input_file):
    n = 10000
    for k in [25, 50, 75]:
        pickle_file = 'k' + str(k) + 'n10000' + input_file + '.pickle'
        print(pickle_file)
        estimator = MH.CountEstimator(n=n,
                                      ksize=k,
                                      save_kmers='n',
                                      input_file_name=input_file,
                                      rev_comp=False)
        counts = estimator._counts
        with open(pickle_file, 'wb') as pf:
            pickle.dump(counts, pf)
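The pickles written by quick_dump and quicker_dump contain the raw _counts array. A minimal sketch of loading one back (the file name is a placeholder built with the same 'k<k>n<n><input_file>.pickle' pattern):

import pickle

with open('k25n10000example_genome.fna.pickle', 'rb') as pf:  # hypothetical file name
    counts = pickle.load(pf)
print(len(counts), max(counts))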
def kmc_cmash_compare(k, n, input_file):
    kmc_normed_dict = get_kmc_data(k, input_file, input_file + '_out', 'out')
    #minhash estimate
    estimator = MH.CountEstimator(n=n,
                                  ksize=k,
                                  save_kmers='n',
                                  input_file_name=input_file)
    real_dist = pd.DataFrame(list(kmc_normed_dict.items()),
                             columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=real_dist)
    plt.savefig('quicklook_real.png')
    counts = estimator._counts
    estimated_normed_dict = get_count_dict(counts)
    #quick look at distribution
    df = pd.DataFrame(list(estimated_normed_dict.items()),
                      columns=['kmer_count', 'percentage'])
    sns.barplot(x='kmer_count', y='percentage', data=df)
    plt.savefig('quicklook.png')
    #####
    print(sum(estimated_normed_dict.values()))
    print(counts)
    print(get_distance(kmc_normed_dict, estimated_normed_dict, 'wasserstein'))
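kmc_cmash_compare depends on get_kmc_data, get_count_dict, and get_distance, which are defined elsewhere. get_count_dict presumably normalizes the sketch counts the same way get_MH_data does inline above; a hedged sketch of that helper:

def get_count_dict(counts):
    # Assumed behavior, mirroring get_MH_data: histogram the nonzero counts,
    # then normalize the histogram to fractions.
    count_dict = {}
    for count in counts:
        if count > 0:
            count_dict[count] = count_dict.get(count, 0) + 1
    total = sum(count_dict.values())
    if total == 0:
        return {}
    return {c: v / total for c, v in count_dict.items()}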
Example #9
def test_yield_overlaps_3():
    x1 = [1, 3, 6]
    x2 = [1, 2, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 2
    assert len(list(MH._yield_overlaps(x2, x1))) == 2
Example #10
def test_yield_overlaps():
    x1 = [1, 3, 5]
    x2 = [2, 4, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 0
Example #11
def test_yield_overlaps_2():
    x1 = [1, 3, 5]
    x2 = [1, 2, 4, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == 1
    assert len(list(MH._yield_overlaps(x2, x1))) == 1
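Taken together, these three tests are consistent with MH._yield_overlaps yielding the values that the two sorted hash lists have in common, in either argument order. A set-based restatement of the same expectation on the data above:

def test_yield_overlaps_matches_set_intersection():
    x1, x2 = [1, 3, 6], [1, 2, 6]
    assert len(list(MH._yield_overlaps(x1, x2))) == len(set(x1) & set(x2)) == 2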
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="k-mer size",
                        default=21)
    parser.add_argument('-v',
                        '--verbose',
                        action="store_true",
                        help="Print out progress report/timing information")
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help=
        'Output training database/reference file (in HDF5 format). An additional file '
        '(ending in .tst) will also be created in the same directory with the same base name.'
    )
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    max_h = args.num_hashes
    verbose = args.verbose
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)

    # check for and make filename for tst file
    if verbose:
        print("Checking file names")
    streaming_database_file = os.path.splitext(out_file)[0] + ".tst"
    streaming_database_file = os.path.abspath(streaming_database_file)

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(os.path.abspath(line))
    fid.close()
    file_names = sorted(file_names,
                        key=os.path.basename)  # sort based off of base name

    # Open the pool and make the sketches
    if verbose:
        print("Creating Min Hash Sketches")
    pool = Pool(processes=num_threads)
    #genome_sketches = pool.map(make_minhash_star, zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    # use imap so we get an iterator instead; that way we can start writing to file immediately and don't need to keep
    # all of the genome sketches in memory
    genome_sketches = pool.imap(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    #pool.close()
    # Export all the sketches
    if verbose:
        print("Exporting sketches")
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)
    pool.close()
    # Initialize the creation of the TST
    M = MakeTSTNew(out_file, streaming_database_file)
    if verbose:
        print("Creating and saving the ternary search tree")
    # make the actual TST
    M.make_TST()

    if verbose:
        print("Finished.")
Example #13
    if args.plot_file:
        plot_file = os.path.abspath(os.path.splitext(results_file)[0] + ".png")

    # Import data and error checking
    # Query file
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    if not os.path.exists(training_database_file):
        raise Exception("Training/reference file %s does not exist." %
                        training_database_file)

    # Training data
    if verbose:
        print("Reading in sketches")
        t0 = timeit.default_timer()
    sketches = MH.import_multiple_from_single_hdf5(training_database_file)
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeStreamingDNADatabase.py again."
        )
    num_hashes = len(
        sketches[0]._kmers
    )  # note: this is relying on the fact that the sketches were properly constructed
    max_ksize = sketches[0].ksize

    sketches = sorted(sketches,
                      key=lambda x: os.path.basename(x.input_file_name))

    # adjust the k-range if necessary
    k_range = [val for val in k_range if val <= max_ksize]
Example #14
import khmer

# FIXME: could probably do all the data creation, module initialization, and method calling, and then have the tests
# FIXME: just test the data

# create some test data
# First, the TST
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
seqs = [seq1, seq2, seq3, seq4]
query_seq = seq3
num_hashes = 5
CE1 = MH.CountEstimator(n=num_hashes,
                        max_prime=9999999999971,
                        ksize=5,
                        save_kmers='y')
CE2 = MH.CountEstimator(n=num_hashes,
                        max_prime=9999999999971,
                        ksize=5,
                        save_kmers='y')
CE3 = MH.CountEstimator(n=num_hashes,
                        max_prime=9999999999971,
                        ksize=5,
                        save_kmers='y')
CE4 = MH.CountEstimator(n=num_hashes,
                        max_prime=9999999999971,
                        ksize=5,
                        save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
Example #15
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true", \
                        help="Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the" \
                             " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py")
    # additional parser arguments for temporary storage, streaming, unzipping, and resuming
    parser.add_argument(
        '-d',
        '--temp_dir',
        type=str,
        help="temporary storage directory (define for continue flag)",
        default="./temp")
    parser.add_argument('-s', '--data_stream', action="store_true", \
                        help="Optional flag to define whether the input_files are urls to stream data instead of" \
                             " absolute paths to files.", default=False)
    parser.add_argument('-z', '--unzip_data', action="store_true", \
                        help="Optional flag to define whether the input_files are gzipped. if True, will unzip in " \
                             "chunks and delete unzipped fastas after use", default=False)
    parser.add_argument('-c', '--continue', action="store_true", \
                        help="Optional flag to define whether to continue sketching files defined in input file. " \
                             "Functionally, checks against the existing sketches in the temporary directory.", default=False)
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    # create temporary dict if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise ValueError(
            "unzip_data and data_stream flags cannot both be specified.")

    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = fid.readlines()
        lines = [l.strip() for l in lines]
        # just do everything in one chunk
        chunks = [lines]
        # chunk_size = 75
        # with open(input_file_names, 'r') as fid:
        #     lines = fid.readlines()
        # chunks = []
        # for i in range(int(math.ceil(len(lines) / chunk_size))):
        #     if (i+1)*chunk_size > len(lines)-1:
        #         chunks[i*chunk_size:len(lines)]
        #     else:
        #         chunks[i*chunk_size:(i+1)*chunk_size]

    genome_sketches = []

    temp_path = args.temp_dir
    if args.unzip_data:
        print("Beginning unzipping data")
        print(chunks)
        if not os.path.isdir(os.path.join(temp_path, "fastas")):
            os.mkdir(os.path.join(temp_path, "fastas"))
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                f = unzip_file(line, os.path.join(temp_path, "fastas"))
                file_names.append(f)
                # if not check_if_pickled(line):
                #     f = unzip_file(line, os.path.join(temp_path, "fastas"))
                #     file_names.append(f)

            if len(file_names) > 0:
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(
                    make_minhash_star,
                    zip(file_names, repeat(max_h), repeat(prime),
                        repeat(ksize)))
                genome_sketches += curr_genome_sketches

                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
            else:
                print("pickled files found, continuing...")

    # streaming path: download each file, sketch it, then remove the local copy
    elif args.data_stream:

        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                file = stream_file(line.strip())
                file_names.append(file)
            print("starting sketches")

            pool = Pool(processes=num_threads)
            curr_genome_sketches = pool.map(
                make_minhash_star,
                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
            genome_sketches += curr_genome_sketches

            print("removing fasta files")
            for file_name in file_names:
                os.remove(file_name)

    else:
        file_names = list()
        fid = open(input_file_names, 'r')
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
        fid.close()

        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(
            make_minhash_star,
            zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, we need to pass through the data again since we
    # don't know a priori how big of a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
    if args.plot_file:
        plot_file = os.path.abspath(os.path.splitext(results_file)[0] + ".png")

    # Import data and error checking
    # Query file
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." %
                        training_data)

    # Training data
    if verbose:
        print("Reading in sketches")
        t0 = timeit.default_timer()
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeStreamingDNADatabase.py again."
        )
    num_hashes = len(
        sketches[0]._kmers
    )  # note: this is relying on the fact that the sketches were properly constructed
    max_ksize = sketches[0].ksize

    def keyfunction(item):
        return os.path.basename(item.input_file_name)

    sketches = sorted(
        sketches,
        key=keyfunction)  # sort the sketches by the basename of input file
Example #17
coverage_threshold = 0.0062
sort_key = 'k=60'
location_of_thresh = -1

# read in the file and sort as needed
df = pd.read_csv(cmash_out_file, index_col=0)
#df = df[df['k=60'] > 0.01].sort_values('k=60', ascending=False)  # for the ones that had -c 0, add a threshold for sanity sake


names_passed_thresh = list(df.index)
names_passed_thresh_with_path = []
for name in names_passed_thresh:
	names_passed_thresh_with_path.append(training_base_name + name)

CEs = MH.import_multiple_from_single_hdf5(training_hdf_file, import_list=names_passed_thresh_with_path)
training_file_names = [c.input_file_name for c in CEs]

# import the hit matrices
hit_matrices_dict = loadmat(hit_matrices_file)

# now, for each one of the sketches, look for unique k-mer in it, set non-unique to zero
k_range = sorted([int(i.split('=')[1]) for i in df.keys()])

# Make the hit matrices dense
hit_matrices_dense_dict = dict()
for k_size in k_range:
	hit_matrices_dense_dict['k=%d' % k_size] = hit_matrices_dict['k=%d' % k_size].todense()

hit_matrices_dict = hit_matrices_dense_dict
Example #18
        all_file_names.append(line.strip())

# form the training database on a few of them
subset_file_names_file = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/TrainingData/NathanRefSeq/TestFileNameOrder/FileNames.txt"
with open(subset_file_names_file, "w") as fid:
    for i in range(num_train):
        fid.write("%s\n" % all_file_names[i])

out_hdf5_file = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/TrainingData/NathanRefSeq/TestFileNameOrder/TrainingData.h5"
python = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/CMashVE/bin/python "
script = "/nfs1/Koslicki_Lab/koslickd/MiCOPCMash/CMash/scripts/MakeStreamingDNADatabase.py "
script_args = subset_file_names_file + " " + out_hdf5_file + " -n 1000 -k 60"
os.system(python + script + script_args)

# Import the HDF5 file
sketches = MH.import_multiple_from_single_hdf5(out_hdf5_file)
tree = mt.Trie()
tree.load(out_hdf5_file.split('.')[0] + ".tst")

for sketch_index in range(num_train):
    for kmer in sketches[sketch_index]._kmers:
        is_correct = False
        for hit in tree.keys(kmer):
            hit_split = hit.split('x')
            tree_sketch_index = int(hit_split[1])
            if tree_sketch_index == sketch_index:
                is_correct = True
                break
        if not is_correct:
            raise Exception(
                "Mismatch: sketch index was %d while in the tree it's %d: %s" %
                (sketch_index, tree_sketch_index, kmer))
Example #19
def k_mer_sketch_histogram(n, k, genome, rev_comp=False):
    n = int(n)
    k = int(k)
    # input: n - sketch size (# Hash function), k - k-mer size, genome - fasta(.gz)
    # return np.array of abundance and normalized abundance distribution
    KMC_outname = genome.split('/')[-1] + '.ksize' + str(k) + '.res'
    outpath = os.path.dirname(
        os.path.realpath(__file__)) + '/kmc_global_count/'
    # if the value not stored, compute it, else load it
    if not os.path.isfile(outpath + KMC_outname + '.sketch' + str(n) +
                          '.pickle'):
        # if a MinHash estimator with sketch size >= n already exists, reuse its counts; otherwise compute one at the current sketch size
        MHS_filenames = os.listdir(outpath + 'MH_counts/')
        if MHS_filenames:
            try:
                # find the smallest existing sketch size that is at least n
                sketch_size_existing = [
                    int(_.split('.sketch')[-1].split('.MHScounts.pickle')[0])
                    for _ in MHS_filenames
                    if (_.endswith('.MHScounts.pickle') and '.ksize' + str(k) +
                        '.' in _ and KMC_outname in _)
                ]
                sketch_size_existing_greater_than_n = min(
                    [_ for _ in sketch_size_existing if _ >= n])
                MHS_count_name = outpath + 'MH_counts/' + KMC_outname + '.sketch' + str(
                    sketch_size_existing_greater_than_n) + '.MHScounts.pickle'
                with open(MHS_count_name, 'rb') as MHS_sketch_count_file:
                    MHS_count = pickle.load(MHS_sketch_count_file)
                    counts = MHS_count[:n]
            # no existing sketch of size >= n (or its pickle could not be loaded)
            except (ValueError, FileNotFoundError):
                MHS = MH.CountEstimator(n=n,
                                        ksize=k,
                                        save_kmers='n',
                                        input_file_name=genome,
                                        rev_comp=rev_comp)
                counts = MHS._counts
        else:
            MHS = MH.CountEstimator(n=n,
                                    ksize=k,
                                    save_kmers='n',
                                    input_file_name=genome,
                                    rev_comp=rev_comp)
            counts = MHS._counts
        # check whether the MHS counts for this k and n have been saved or not
        MHS_count_name = outpath + 'MH_counts/' + KMC_outname + '.sketch' + str(
            n) + '.MHScounts.pickle'
        if not os.path.isfile(MHS_count_name):
            with open(MHS_count_name, 'wb') as MHS_sketch_count_file:
                pickle.dump(counts, MHS_sketch_count_file)
        # turn array of counts of k-mers into occurrence of k-mers with the counts
        dist = np.zeros(max(counts))
        for _c in counts:
            dist[_c - 1] = dist[_c - 1] + 1
        dist_norm = dist / np.sum(dist)
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle',
                  'wb') as config_sketch_file:
            pickle.dump([dist, dist_norm], config_sketch_file)
    else:
        with open(outpath + KMC_outname + '.sketch' + str(n) + '.pickle',
                  'rb') as config_sketch_file:
            dist, dist_norm = pickle.load(config_sketch_file)
    return dist, dist_norm  # np.array(list(dist))
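A possible call with a placeholder genome path; note that the function caches its results under kmc_global_count/ (and kmc_global_count/MH_counts/) next to the script, so those directories need to exist:

# Hypothetical usage: distribution of 21-mer counts from a 1000-hash sketch.
dist, dist_norm = k_mer_sketch_histogram(1000, 21, "example_genome.fna.gz")
print(dist_norm)  # fraction of sketched k-mers seen once, twice, three times, ...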
Example #20
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates a CSV file of similarity indicies between the"
        " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f',
                        '--force',
                        action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-ct',
        '--containment_threshold',
        type=restricted_float,
        help="Only return results with containment index above this value",
        default=0.02)
    parser.add_argument(
        '-c',
        '--confidence',
        type=restricted_float,
        help=
        "Desired probability that all results were returned with containment index above threshold [-ct]",
        default=0.95)
    parser.add_argument(
        '-ng',
        '--node_graph',
        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
        "will be created and put in the same directory as the specified "
        "output CSV file.",
        default=None)
    parser.add_argument(
        '-b',
        '--base_name',
        action="store_true",
        help=
        "Flag to indicate that only the base names (not the full path) should be saved in the output CSV file"
    )
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Option to only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument(
        'training_data',
        help=
        "Training/reference data (HDF5 file created by MakeTrainingDatabase.py)"
    )
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." %
                        training_data)
    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (can also check if all the kmers make sense (i.e. no '' or non-ACTG characters))
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
        )
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
            )
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" %
                            sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s."
                % (sketch.input_file_name))
    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(
                node_graph_out
        ):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" %
                  node_graph_out)
    elif os.path.exists(
            args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." %
                        args.node_graph)
    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser, ))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
            # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query ∩ hash_list, training)
            #  instead of J(query, training))
            # (TODO: fix this after khmer is updated)
            #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied(
            )  # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count,
                                   fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). Try --force or change -k."
                % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {
        'intersection': intersection_cardinalities,
        'containment index': containment_indexes,
        'jaccard index': jaccard_indexes
    }
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate,
                                       confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
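restricted_float, used as the argparse type for -fp, -ct, and -c above, is not shown. A common implementation, given here as an assumption, parses a float and requires it to lie in [0, 1]:

import argparse

def restricted_float(x):
    # Assumed helper: the flags it validates (false-positive rate, containment
    # threshold, confidence) are all probabilities, so restrict to [0.0, 1.0].
    x = float(x)
    if x < 0.0 or x > 1.0:
        raise argparse.ArgumentTypeError("%r is not in the range [0.0, 1.0]" % x)
    return x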
Example #21
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
        " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py"
    )
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph (unfortunately, we need to pass through the data again since we
    # don't know a priori how big of a table we need to make)
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
Example #22
print("reading file list")
file_names = []
with open(training_file_names, 'r') as fid:
    for line in fid.readlines():
        line = line.strip()
        file_names.append(os.path.basename(line))

print("importing kmers")
chunk_size = 5000
iter = 0
with open(training_out_file, 'w') as fid:
    for file_iter in range(0, len(file_names), chunk_size):
        print("on file: %d" % file_iter)
        file_names_subset = file_names[file_iter:file_iter + chunk_size]
        sketches = MH.import_multiple_from_single_hdf5(
            training_data, import_list=file_names_subset)
        all_kmers = itertools.chain.from_iterable(sketch._kmers
                                                  for sketch in sketches)
        print("forming the set")
        all_kmers_set = set(all_kmers)
        to_write = ""
        for kmer in all_kmers_set:
            to_write += ">seq%d\n" % iter
            to_write += "%s\n" % kmer
            iter += 1
        fid.write(to_write)

###########################################################################################
# Next, run kmc on this thing
# with a bash script, use the following:
# /usr/bin/time /home/pi/koslickd/KMC/./kmc -v -k60 -m200 -sm -fm -ci0 -cs3 -t48 -jlog_train NathanRefSeqTraining60mers.fa NathanRefSeq60mers /scratch/kmc_temp/