def create_BF_prefilter(self, result_file=None) -> None:
    """
    Imports or creates the pre-filter Bloom filter and stores it in ``self.all_kmers_bf``.

    If ``self.bloom_filter_file`` is non-empty, the filter is read from that file.
    Otherwise a new filter is built from the keys of ``self.tree``: for every key and
    every k-mer size in ``self.k_range``, the length-k prefix of the k-mer and its
    reverse complement are added.

    On an ``IOError`` the offending file name is printed and the process exits with
    status 1.

    :param result_file: (optional) if you'd like to export the bloom filter, populate that here
    :type result_file: str
    """
    if self.bloom_filter_file:
        # a pre-built filter was supplied: just read it in
        try:
            self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
        except IOError:
            print("No such file or directory/error opening file: %s" % self.bloom_filter_file)
            sys.exit(1)
        return

    k_range = self.k_range
    # only pass a filename when the caller wants the filter saved; otherwise keep it in memory
    bf_kwargs = {"ignore_case": True}
    if result_file:
        bf_kwargs["filename"] = result_file

    try:
        # fetch the keys once; they are needed both for sizing and for population
        tree_keys = self.tree.keys()
        # fudge factor of 5 will make the BF larger, but also slightly faster
        # (0.01 is presumably the target false-positive rate -- confirm against the BF library)
        self.all_kmers_bf = WritingBloomFilter(
            len(tree_keys) * len(k_range) * 5, 0.01, **bf_kwargs
        )
        for kmer_info in tree_keys:
            # remove the location information (everything after the first 'x') and just get the kmer
            kmer = kmer_info.split('x')[0]
            for ksize in k_range:
                prefix = kmer[0:ksize]
                self.all_kmers_bf.add(prefix)
                self.all_kmers_bf.add(khmer.reverse_complement(prefix))
    except IOError:
        # in this branch self.bloom_filter_file is empty, so the only file that can be at
        # fault is the export target
        print("No such file or directory/error opening file: %s" % result_file)
        sys.exit(1)
for sketch in sketches: for kmer in sketch._kmers: for ksize in k_range: all_kmers_bf.add( kmer[0:ksize] ) # put all the k-mers and the appropriate suffixes in all_kmers_bf.add( khmer.reverse_complement(kmer[0:ksize]) ) # also add the reverse complement except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1) else: # otherwise read it in try: all_kmers_bf = ReadingBloomFilter(hydra_file) except IOError: print("No such file or directory/error opening file: %s" % hydra_file) sys.exit(1) if verbose: print("Finished reading in/creating ternary search tree") t1 = timeit.default_timer() print("Time: %f" % (t1 - t0)) # Seen k-mers (set of k-mers that already hit the trie, so don't need to check again) seen_kmers = set() # shared object that will update the intersection counts class Counters(object): # This class is basically an array of counters (on the same basis as the sketches) # it's used to keep track (in a parallel friendly way) of which streamed k-mers went into the training file sketches
class Create:
    """
    This class has functionality to:
    1. Import the ternary search tree created in the training step
    2. Create or import the bloom filter pre-filter file
    """

    def __init__(self, training_database_file: str, bloom_filter_file: str, TST_file: str, k_range: list):
        """
        Initializes the class

        :param training_database_file: file pointing to the HDF5 training database created with
            MakeStreamingDNADatabase.py (stored on the instance; not read by the methods of this class)
        :param bloom_filter_file: (optional) file pointing to file created with MakeStreamingPrefilter.py.
            If empty string, one will be created
        :param TST_file: file pointing to the TST file (ternary search tree) that was created with
            MakeStreamingDNADatabase.py
        :param k_range: range of k-mer sizes. eg [10, 20, 30]
        """
        self.bloom_filter_file = bloom_filter_file
        self.TST_file = TST_file
        self.k_range = k_range
        self.training_database = training_database_file
        self.tree = None  # populated by import_TST
        self.all_kmers_bf = None  # populated by create_BF_prefilter

    def import_TST(self) -> None:
        """
        Imports the ternary search tree from ``self.TST_file`` into ``self.tree``
        """
        # no safety net here: the TST file is loaded as-is, so it must have been created properly
        self.tree = mt.Trie()
        self.tree.load(self.TST_file)

    def create_BF_prefilter(self, result_file=None) -> None:
        """
        Imports or creates the pre-filter Bloom filter and stores it in ``self.all_kmers_bf``.

        If ``self.bloom_filter_file`` is non-empty, the filter is read from that file.
        Otherwise a new filter is built from the keys of ``self.tree`` (so ``import_TST``
        must have been called first): for every key and every k-mer size in
        ``self.k_range``, the length-k prefix of the k-mer and its reverse complement
        are added.

        On an ``IOError`` the offending file name is printed and the process exits with
        status 1.

        :param result_file: (optional) if you'd like to export the bloom filter, populate that here
        :type result_file: str
        """
        if self.bloom_filter_file:
            # a pre-built filter was supplied: just read it in
            try:
                self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
            except IOError:
                print("No such file or directory/error opening file: %s" % self.bloom_filter_file)
                sys.exit(1)
            return

        k_range = self.k_range
        # only pass a filename when the caller wants the filter saved; otherwise keep it in memory
        bf_kwargs = {"ignore_case": True}
        if result_file:
            bf_kwargs["filename"] = result_file

        try:
            # fetch the keys once; they are needed both for sizing and for population
            tree_keys = self.tree.keys()
            # fudge factor of 5 will make the BF larger, but also slightly faster
            # (0.01 is presumably the target false-positive rate -- confirm against the BF library)
            self.all_kmers_bf = WritingBloomFilter(
                len(tree_keys) * len(k_range) * 5, 0.01, **bf_kwargs
            )
            for kmer_info in tree_keys:
                # remove the location information (everything after the first 'x') and just get the kmer
                kmer = kmer_info.split('x')[0]
                for ksize in k_range:
                    prefix = kmer[0:ksize]
                    self.all_kmers_bf.add(prefix)
                    self.all_kmers_bf.add(khmer.reverse_complement(prefix))
        except IOError:
            # in this branch self.bloom_filter_file is empty, so the only file that can be at
            # fault is the export target
            print("No such file or directory/error opening file: %s" % result_file)
            sys.exit(1)