Esempio n. 1
0
    def create_BF_prefilter(self, result_file=None) -> None:
        """
        Imports or creates the pre-filter Bloom filter.

        When ``self.bloom_filter_file`` is set, the filter is read from that
        file. Otherwise a new filter is built from the keys of ``self.tree``:
        for every key and every size in ``self.k_range`` the prefix of that
        length and its reverse complement are added. The result is stored in
        ``self.all_kmers_bf``. On an ``IOError`` a message is printed and the
        process exits with status 1.

        :param result_file: (optional) if you'd like to export the bloom filter, populate that here
        :type result_file: str
        :raises RuntimeError: if a filter has to be built but ``self.tree`` is None
        """
        if self.bloom_filter_file:  # an existing filter was supplied: read it in
            try:
                self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
            except IOError:
                print("No such file or directory/error opening file: %s" %
                      self.bloom_filter_file)
                sys.exit(1)
            return

        tree = self.tree
        if tree is None:
            # Fail with an explicit message instead of an AttributeError on None.keys()
            raise RuntimeError(
                "self.tree is None: load the ternary search tree before "
                "building the Bloom filter")
        k_range = self.k_range

        # Only pass a filename when the caller asked for the filter to be saved;
        # without it the filter is kept in memory.
        bf_options = {"ignore_case": True}
        if result_file:
            bf_options["filename"] = result_file

        try:
            # Fetch the keys once: they are needed both for sizing and for filling.
            kmer_infos = tree.keys()
            # fudge factor of 5 will make the BF larger, but also slightly faster
            self.all_kmers_bf = WritingBloomFilter(
                len(kmer_infos) * len(k_range) * 5, 0.01, **bf_options)
            for kmer_info in kmer_infos:
                # remove the location information and just get the kmer
                kmer = kmer_info.split('x')[0]
                for ksize in k_range:
                    prefix = kmer[0:ksize]
                    self.all_kmers_bf.add(prefix)
                    self.all_kmers_bf.add(khmer.reverse_complement(prefix))
        except IOError:
            # In this branch self.bloom_filter_file is empty; the only file
            # involved is the export target, so report that one.
            print("No such file or directory/error opening file: %s" %
                  result_file)
            sys.exit(1)
Esempio n. 2
0
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # put all the k-mers and the appropriate suffixes in
                        all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize])
                        )  # also add the reverse complement
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            all_kmers_bf = ReadingBloomFilter(hydra_file)
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    if verbose:
        print("Finished reading in/creating ternary search tree")
        t1 = timeit.default_timer()
        print("Time: %f" % (t1 - t0))
    # Seen k-mers (set of k-mers that already hit the trie, so don't need to check again)
    seen_kmers = set()

    # shared object that will update the intersection counts
    class Counters(object):
        # This class is basically an array of counters (on the same basis as the sketches)
        # it's used to keep track (in a parallel friendly way) of which streamed k-mers went into the training file sketches
Esempio n. 3
0
class Create:
    """
    This class has functionality to:
    1. Import the ternary search tree created in the training step
    2. Create or import the bloom filter pre-filter file
    """
    def __init__(self, training_database_file: str, bloom_filter_file: str,
                 TST_file: str, k_range: list):
        """
        Initializes the class. Nothing is loaded here; the paths and k-mer
        sizes are only stored for later use by the other methods.

        :param training_database_file: file pointing to the HDF5 training database created with MakeStreamingDNADatabase.py
        :param bloom_filter_file: (optional) file pointing to file created with MakeStreamingPrePfilter.py. If empty string, one will be created
        :param TST_file: file pointing to the TST file (ternary search tree) that was created with MakeStreamingDNADatabase.py
        :param k_range: range of k-mer sizes. eg [10, 20, 30]
        """
        self.bloom_filter_file = bloom_filter_file
        self.TST_file = TST_file
        self.k_range = k_range
        self.training_database = training_database_file
        self.tree = None  # populated by import_TST
        self.all_kmers_bf = None  # populated by create_BF_prefilter

    def import_TST(self) -> None:
        """
        Imports the ternary search tree from ``self.TST_file`` into ``self.tree``.
        """
        # no more safety net for those that didn't create a TST properly with the CreateStreamingQueryDNADatabase.py
        self.tree = mt.Trie()
        self.tree.load(self.TST_file)

    def create_BF_prefilter(self, result_file=None) -> None:
        """
        Imports or creates the pre-filter Bloom filter.

        When ``self.bloom_filter_file`` is set, the filter is read from that
        file. Otherwise a new filter is built from the keys of ``self.tree``:
        for every key and every size in ``self.k_range`` the prefix of that
        length and its reverse complement are added. The result is stored in
        ``self.all_kmers_bf``. On an ``IOError`` a message is printed and the
        process exits with status 1.

        :param result_file: (optional) if you'd like to export the bloom filter, populate that here
        :type result_file: str
        :raises RuntimeError: if a filter has to be built but ``import_TST`` has not populated ``self.tree``
        """
        if self.bloom_filter_file:  # an existing filter was supplied: read it in
            try:
                self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
            except IOError:
                print("No such file or directory/error opening file: %s" %
                      self.bloom_filter_file)
                sys.exit(1)
            return

        tree = self.tree
        if tree is None:
            # Fail with an explicit message instead of an AttributeError on None.keys()
            raise RuntimeError(
                "The ternary search tree has not been loaded; call "
                "import_TST() before create_BF_prefilter()")
        k_range = self.k_range

        # Only pass a filename when the caller asked for the filter to be saved;
        # without it the filter is kept in memory.
        bf_options = {"ignore_case": True}
        if result_file:
            bf_options["filename"] = result_file

        try:
            # Fetch the keys once: they are needed both for sizing and for filling.
            kmer_infos = tree.keys()
            # fudge factor of 5 will make the BF larger, but also slightly faster
            self.all_kmers_bf = WritingBloomFilter(
                len(kmer_infos) * len(k_range) * 5, 0.01, **bf_options)
            for kmer_info in kmer_infos:
                # remove the location information and just get the kmer
                kmer = kmer_info.split('x')[0]
                for ksize in k_range:
                    prefix = kmer[0:ksize]
                    self.all_kmers_bf.add(prefix)
                    self.all_kmers_bf.add(khmer.reverse_complement(prefix))
        except IOError:
            # In this branch self.bloom_filter_file is empty; the only file
            # involved is the export target, so report that one.
            print("No such file or directory/error opening file: %s" %
                  result_file)
            sys.exit(1)