def stream_cmash_for_ji(self, other):
	"""
	Estimate the Jaccard index between self and other, CMash-style: stream
	every k-mer of other's input file against self's sketch k-mers to get a
	containment estimate, then convert it to a Jaccard estimate.
	"""
	if self.ksize != other.ksize:
		raise Exception("different k-mer sizes - cannot compare")
	if self.p != other.p:
		raise Exception("different primes - cannot compare")

	# defensive truncation of the sketch k-mers to the current ksize
	sketch_kmers = {x[0:self.ksize] for x in self._kmers}
	hit_table = dict()
	for item in sketch_kmers:
		key = min(item, khmer.reverse_complement(item)) if self.rev_comp else item
		hit_table[key] = 0  # 0/1 flag: was this k-mer seen in other's stream?

	# stream every k-mer of the other input file (containment, not JI, directly)
	for record in screed.open(other.input_file_name):
		sequence = record.sequence.upper()
		for fragment in re.compile('[^ACTG]').split(sequence):
			for start in range(len(fragment) - self.ksize + 1):
				candidate = fragment[start:start + self.ksize]
				if self.rev_comp:
					candidate = min(candidate, khmer.reverse_complement(candidate))
				if candidate in hit_table:
					hit_table[candidate] = 1

	# containment estimate -> Jaccard estimate
	C_est = np.sum(list(hit_table.values())) / len(sketch_kmers)
	J_est = containment_to_jaccard(C_est, self, other)
	print(C_est)
	print(J_est)
	return J_est
Example #2
0
    def add(self, kmer, weight, rev_comp):
        """
        Insert a (possibly canonicalized) k-mer into the bottom sketch,
        accumulating *weight* into the count when its hash is already present.
        """
        mins = self._mins
        counts = self._counts
        kmer_store = self._kmers

        # canonicalize via the smaller of the two strand hashes when requested
        if rev_comp:
            fwd_hash = khmer.hash_no_rc_murmur3(kmer)
            rev_hash = khmer.hash_no_rc_murmur3(khmer.reverse_complement(kmer))
            h = min(fwd_hash, rev_hash)
            if h == rev_hash:
                kmer = khmer.reverse_complement(kmer)
        else:
            h = khmer.hash_no_rc_murmur3(kmer)

        # fold the hash into [0, p) using the sketch's prime
        h = h % self.p
        # every sketched hash is already smaller: nothing to insert
        if h >= mins[-1]:
            return

        # locate insertion point in the sorted sketch
        idx = bisect.bisect_left(mins, h)
        if mins[idx] == h:
            # hash already sketched: just accumulate the weight
            counts[idx] += weight
        else:
            # new hash: insert it and evict the current largest entry
            mins.insert(idx, h)
            counts.insert(idx, weight)
            kmer_store.insert(idx, kmer)
            mins.pop()
            counts.pop()
            kmer_store.pop()
            return
def canonical_kmer(kmer):
	"""
	Return the canonical form of *kmer*: whichever of the k-mer and its
	reverse complement has the smaller murmur3 hash.
	:param kmer: input k-mer string
	:return: canonical k-mer string
	"""
	forward_hash = khmer.hash_no_rc_murmur3(kmer)
	reverse_hash = khmer.hash_no_rc_murmur3(khmer.reverse_complement(kmer))
	return khmer.reverse_complement(kmer) if forward_hash > reverse_hash else kmer
Example #4
0
def test_reverse_complement():
    # multi-base sequence is reversed and complemented
    assert khmer.reverse_complement('AATTCCGG') == 'CCGGAATT'

    # each single base maps to its complement
    for base, complement in [('A', 'T'), ('T', 'A'), ('C', 'G'), ('G', 'C')]:
        assert khmer.reverse_complement(base) == complement
def test_reverse_complement():
    # expected reverse complements, including all four single bases
    cases = {
        'AATTCCGG': 'CCGGAATT',
        'A': 'T',
        'T': 'A',
        'C': 'G',
        'G': 'C',
    }
    for sequence, expected in cases.items():
        assert khmer.reverse_complement(sequence) == expected
	def add(self, kmer, update_full=False):
		"""
		Insert a k-mer into the MinHash sketch; when update_full is True,
		also count it in the full k-mer dictionary.
		"""
		mins = self._mins
		kmer_list = self._kmers
		# canonicalize lexicographically when reverse complements are tracked
		if self.rev_comp:
			kmer = min(kmer, khmer.reverse_complement(kmer))
		h = khmer.hash_no_rc_murmur3(kmer)
		# optionally tally this k-mer in the full k-mer counter
		if update_full:
			counter = self._all_kmer
			counter[kmer] = counter.get(kmer, 0) + 1
		# fold the hash into [0, p) with the sketch prime
		h = h % self.p
		# sketch already holds n smaller hashes: stop early
		if h >= mins[-1]:
			return
		# locate insert position; exact duplicates are skipped
		pos = bisect.bisect_left(mins, h)
		if mins[pos] == h:
			return
		# new hash: insert it and drop the largest sketched value
		mins.insert(pos, h)
		kmer_list.insert(pos, kmer)
		mins.pop()
		kmer_list.pop()
		return
	def brute_force_truncation(self, new_ksize):
		"""
		Truncate every sketched k-mer to new_ksize and rebuild the MinHash
		sketch from the truncated (deduplicated) k-mers.
		:param new_ksize: the new, smaller k-mer size (must be an int <= self.ksize)
		:raises Exception: if new_ksize is not an int or exceeds self.ksize
		"""
		if not isinstance(new_ksize, int):
			raise Exception("Input number is not an integer")
		if new_ksize > self.ksize:
			raise Exception("New size must be smaller than %d." % self.ksize)
		elif new_ksize == self.ksize:
			# nothing to do: already at the requested size
			return
		elif new_ksize < self.ksize:
			# data to be updated after the truncation:
			self.ksize = new_ksize
			self.cardinality = estimate_genome_size(self.input_file_name, new_ksize)
			while self._mins[-1] == self.p:  # rm unused cells, otherwise empty cell (though very rare) has hash value 0
				self._mins.pop()
				self._kmers.pop()
			# deduplicated new_ksize-prefixes of the old sketch k-mers
			new_kmers = list(set([x[0:new_ksize] for x in self._kmers]))
			sketch_size = len(new_kmers)
			# reset the sketch to "empty" sentinels (hash self.p, empty k-mer)
			self._mins = [self.p] * sketch_size
			self._kmers = [''] * sketch_size
			# update
			for i in range(sketch_size):
				self.add(new_kmers[i])  # for MH sketch only
			# clean trailing empty cells in sketches
			while self._mins[-1] == self.p:
				self._mins.pop()
				self._kmers.pop()
			# conditional: truncate the full kmer to current ksize
			if self.full_kmer:
				old_kmers = [x[0:new_ksize] for x in self._all_kmer]
				if self.rev_comp:
					old_kmers = [min(x, khmer.reverse_complement(x)) for x in old_kmers]
				self._truncated_all_kmer = list(set(old_kmers))
			return
Example #8
0
def define_canonical_kmers(cg, nkmers):
    """
    Define canonical k-mers, i.e. exclude palindromic and rev. compl. k-mers

    Parameters
    ----------
    cg : khmer.Countgraph
        a k-mer countgraph
    nkmers : int
        number of all possible k-mers

    Returns
    -------
    set
        a set of canonical k-mer hashes (indices into the countgraph)
    """
    canonical_kmers = set(
    )  # TODO Consider a sorting step to guarantee order of canonical kmers/kmer-hashes
    for i in range(nkmers):
        kmer = cg.reverse_hash(i)
        # Keep the lexicographically smaller k-mer; palindromes compare equal
        # to their reverse complement and are kept as well ('<= ' replaces the
        # redundant '< or ==' test; the dead 'else: continue' is dropped).
        if kmer <= khmer.reverse_complement(kmer):
            canonical_kmers.add(i)
    return canonical_kmers
def make_minhash(genome, max_h, prime, ksize):
	"""
	Build a MinHash CountEstimator sketch for *genome* over canonical k-mers
	(lexicographic min of k-mer and reverse complement) and export the full
	canonical k-mer set to ../data/Genomes/<basename>.kmers.bz2.
	:param genome: path to a genome file readable by screed
	:param max_h: number of hashes in the sketch
	:param prime: max prime passed to the CountEstimator
	:param ksize: k-mer length
	:return: the populated MH.CountEstimator
	"""
	kmers = set()
	name = os.path.basename(genome)
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(genome):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
			if kmer < kmer_rev:
				kmers.add(kmer)
				MHS.add(kmer)
			else:
				kmers.add(kmer_rev)
				MHS.add(kmer_rev)
	MHS._true_num_kmers = len(kmers)
	MHS.input_file_name = os.path.basename(genome)
	#genome_sketches.append(MHS)
	# export the kmers
	fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'w')
	for kmer in kmers:
		# bug fix: BZ2File in binary 'w' mode requires bytes under Python 3;
		# writing str here raised TypeError (encode() is a no-op change on py2)
		fid.write(("%s\n" % kmer).encode('utf-8'))
	fid.close()
	return MHS
Example #10
0
    def return_matches(self, input_kmer: str, k_size_loc: int) -> tuple:
        """
        Get all the matches in the TST with the kmer prefix.
        :param input_kmer: an input k-mer
        :type input_kmer: str
        :param k_size_loc: where in self.k_range this k-mer (via its length) belongs
        :type k_size_loc: int
        :return: a tuple: first a list of (hash_loc, k_size_loc, kmer_loc) tuples
            (all the matches in the TST), second a Boolean indicating if a match was seen
        :rtype: tuple
        """
        match_info = set()
        to_return = []
        saw_match = False
        tree = self.tree

        # look for matches to both the kmer and its reverse complement in the TST as we can't assume
        # directionality of reads (and training database is constructed without reverse complements)
        for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]:
            prefix_matches = tree.keys(
                kmer)  # get all the k-mers whose prefix matches
            # get the location of the found kmers in the counters
            for item in prefix_matches:
                # TST entries are formatted kmer + 'x' + hash_index + 'x' + kmer_index
                split_string = item.split(
                    'x')  # first is the hash location, second is which k-mer
                hash_loc = int(split_string[1])
                kmer_loc = int(split_string[2])
                match_info.add((hash_loc, k_size_loc, kmer_loc))
            saw_match = False
            if match_info:
                saw_match = True
                for tup in match_info:
                    to_return.append(tup)
            if saw_match:  # Only need to see a match to the original kmer or the reverse complement, don't return both otherwise you over-count
                break
        return to_return, saw_match
Example #11
0
def make_minhash(genome, max_h, prime, ksize):
    """
    Sketch *genome* with canonical k-mers (lexicographic min of k-mer and
    reverse complement) and export the sketched k-mers to
    ../data/Viruses/<basename>.Hash21mers.fa.
    """
    canonical_kmers = set()
    name = os.path.basename(genome)
    MHS = MH.CountEstimator(n=max_h,
                            max_prime=prime,
                            ksize=ksize,
                            save_kmers='y')
    for record in screed.open(genome):
        seq = record.sequence
        for start in range(len(seq) - ksize + 1):
            forward = seq[start:start + ksize]
            backward = khmer.reverse_complement(forward)
            # keep the lexicographically smaller strand
            canonical = forward if forward < backward else backward
            canonical_kmers.add(canonical)
            MHS.add(canonical)
    MHS._true_num_kmers = len(canonical_kmers)
    MHS.input_file_name = name
    # Export the hash k-mers (FASTA records with empty headers)
    out_path = os.path.abspath(
        os.path.join('../data/Viruses/', name + ".Hash21mers.fa"))
    fid = open(out_path, 'w')
    for sketched in MHS._kmers:
        fid.write(">\n%s\n" % sketched)
    fid.close()
    return MHS
Example #12
0
    def add(self, kmer, rev_comp=False):
        """
        Add kmer into sketch, keeping sketch sorted, update counts accordingly
        """
        mins = self._mins
        counts = self._counts
        kmer_store = self._kmers

        # canonicalize by the smaller murmur3 hash of the two strands
        if rev_comp:
            fwd_hash = khmer.hash_murmur3(kmer)
            rev_hash = khmer.hash_murmur3(khmer.reverse_complement(kmer))
            h = min(fwd_hash, rev_hash)
            if h == rev_hash:
                kmer = khmer.reverse_complement(kmer)
        else:
            h = khmer.hash_murmur3(kmer)

        h = h % self.p
        # optional whitelist: only keep hashes present in hash_list
        if self.hash_list and h not in self.hash_list:
            return

        # every sketched hash is already smaller: nothing to insert
        if h >= mins[-1]:
            return

        pos = bisect.bisect_left(mins, h)
        if mins[pos] == h:
            # hash already sketched: bump its count
            counts[pos] += 1
            return
        # otherwise insert, initialize the count to 1, store the kmer if tracking them
        mins.insert(pos, h)
        mins.pop()
        counts.insert(pos, 1)
        counts.pop()
        if kmer_store:
            kmer_store.insert(pos, np.string_(kmer))
            kmer_store.pop()
    def add(self, kmer, rev_comp=False):
        """
        Add kmer into sketch, keeping sketch sorted, update counts accordingly
        """
        _mins = self._mins
        _counts = self._counts
        _kmers = self._kmers

        if not rev_comp:
            h = khmer.hash_murmur3(kmer)
        else:
            # use whichever strand has the smaller murmur3 hash
            h1 = khmer.hash_murmur3(kmer)
            h2 = khmer.hash_murmur3(khmer.reverse_complement(kmer))
            if h2 <= h1:
                h = h2
                kmer = khmer.reverse_complement(kmer)
            else:
                h = h1

        h = h % self.p
        if self.hash_list:
            # restrict insertions to hashes that occur in hash_list
            if h not in self.hash_list:
                return

        if h >= _mins[-1]:
            # hash too large for the bottom sketch
            return

        idx = bisect.bisect_left(_mins, h)
        if _mins[idx] == h:
            # hash already present: increment its count
            _counts[idx] += 1
        else:
            # insert the new hash, evict the current largest
            _mins.insert(idx, h)
            _mins.pop()
            _counts.insert(idx, 1)
            _counts.pop()
            if _kmers:
                _kmers.insert(idx, np.string_(kmer))
                _kmers.pop()
Example #14
0
def test_Counters_return_matches():
    C = Create(training_database_file=temp_database_file,
               bloom_filter_file="",
               TST_file=temp_TST_file,
               k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()
    counters = Counters(tree=C.tree,
                        k_range=k_range,
                        all_kmers_bf=C.all_kmers_bf)

    # test the return matches on known k-mers
    # each sketch kmer (or its reverse complement) should match to the TST
    # TODO: big note here: proper way to check this: take the reverse complement, THEN truncate
    #  (which effectively takes the suffix, as the suffix of a rev-comp is the prefix of the original)
    #  but this calls into question how create_BF_prefilter is working since it truncates, THEN takes the revcomp
    #  but this is the only way I could get all these tests to pass successfully
    for CE in CEs:
        for k_size in k_range:
            for full_kmer in CE._kmers:
                trunc = full_kmer[0:k_size]
                if not trunc:
                    continue  # skip empty sketch slots
                k_size_loc = k_range.index(len(trunc))
                to_return, saw_match = counters.return_matches(
                    input_kmer=trunc, k_size_loc=k_size_loc)
                assert saw_match
                for hit in to_return:
                    sketch = CEs[hit[0]]._kmers
                    truncated = [x[0:k_size] for x in sketch]
                    # also allow rev-comp matches, since return_matches queries both strands
                    truncated_rc = [
                        khmer.reverse_complement(x)[0:k_size] for x in sketch
                    ]
                    # the kmer really is in the sketch indicated by the hit,
                    # either truncated or truncated-rev-comp
                    assert trunc in truncated or trunc in truncated_rc
                    # the k_size_loc is correct
                    assert hit[1] == k_size_loc
                    # the returned sketch location is one of the places the
                    # k-mer occurs (smaller k can match multiple positions)
                    fwd_positions = [
                        i for i, x in enumerate(truncated) if x == trunc
                    ]
                    rc_positions = [
                        i for i, x in enumerate(truncated_rc) if x == trunc
                    ]
                    assert hit[2] in fwd_positions or hit[2] in rc_positions
def get_all_kmers(input_file, temp_k, use_rev_comp=True):
	"""
	Count every k-mer of length temp_k in input_file, optionally collapsing
	each k-mer with its reverse complement (lexicographic minimum).
	:return: dict mapping k-mer -> occurrence count
	"""
	counts = dict()
	for record in screed.open(input_file):
		for item in kmers(record.sequence, temp_k):
			if use_rev_comp:
				item = min(item, khmer.reverse_complement(item))
			counts[item] = counts.get(item, 0) + 1
	return counts
Example #16
0
def test_Create_BF_prefilter():
    C = Create(training_database_file=temp_database_file,
               bloom_filter_file="",
               TST_file=temp_TST_file,
               k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()

    # every TST k-mer must have been inserted into the bloom filter
    for entry in C.tree.keys():
        assert entry.split('x')[0] in C.all_kmers_bf

    # ... and so must every reverse complement
    for entry in C.tree.keys():
        assert khmer.reverse_complement(entry.split('x')[0]) in C.all_kmers_bf

    # each sketch k-mer prefix and its rev-comp must be in the BF
    for CE in CEs:
        for sketch_kmer in CE._kmers:
            if sketch_kmer:
                for k_size in k_range:
                    prefix = sketch_kmer[0:k_size]
                    assert prefix in C.all_kmers_bf
                    assert khmer.reverse_complement(prefix) in C.all_kmers_bf

    # the BF lookup should be case insensitive
    for CE in CEs:
        for sketch_kmer in CE._kmers:
            if sketch_kmer:
                for k_size in k_range:
                    lowered = sketch_kmer[0:k_size].lower()
                    assert lowered in C.all_kmers_bf
                    # khmer doesn't properly handle rev-comps of lower-case characters
                    # see https://github.com/dib-lab/khmer/issues/1904
                    assert khmer.reverse_complement(
                        lowered.upper()).lower() in C.all_kmers_bf
Example #17
0
    def process_seq(self, seq: str) -> list:
        """
        Takes an input sequence, breaks it into its k-mers (for every size self.k_range), and after some filtering and
        checking, sends it to return_matches to query the TST.
        :param seq: an input DNA sequence
        :type seq: string
        :return: a list of keys indicating all the TST hits for all the k-mers in seq
        :rtype: list
        """
        k_range = self.k_range
        seen_kmers = self.seen_kmers
        all_kmers_bf = self.all_kmers_bf
        #  start with small kmer size, if see match, then continue looking for longer k-mer sizes, otherwise move on
        small_k_size = k_range[0]  # start with the small k-size
        to_return = []
        seq = seq.upper()  # normalize case before BF/TST lookups
        # TODO: could, for efficiency, also remove non-ACTG, but those won't match anyways since they aren't in the TST
        #  might not actually be more efficient to search for non-ACTG too
        for i in range(len(seq) - small_k_size + 1):  # look at all k-mers
            kmer = seq[i:i + small_k_size]
            possible_match = False
            if kmer not in seen_kmers:  # if we should process it
                if kmer in all_kmers_bf:  # if we should process it
                    match_list, saw_match = self.return_matches(kmer, 0)
                    if saw_match:
                        # remember both strands so this k-mer is skipped next time
                        seen_kmers.add(kmer)
                        seen_kmers.add(khmer.reverse_complement(kmer))
                        to_return.extend(match_list)
                    possible_match = True
            # TODO: note: I could (since it'd only be for a single kmer size, keep a set of *all* small_kmers I've tried and use this as another pre-filter
            else:
                possible_match = True  # FIXME: bug introduced here in cf64b7aace5eadf738b920109d6419c9d930a1dc, make sure it didn't happen again

            # start looking at the other k_sizes, don't overhang len(seq)
            if possible_match:
                for other_k_size in [
                        x for x in k_range[1:] if i + x <= len(seq)
                ]:
                    kmer = seq[i:i + other_k_size]
                    if kmer in all_kmers_bf:
                        # if True:
                        k_size_loc = k_range.index(other_k_size)
                        match_list, saw_match = self.return_matches(
                            kmer, k_size_loc)
                        if saw_match:
                            to_return.extend(match_list)
                    else:
                        pass  # if you didn't see a match at a smaller k-length, you won't at a larger one
        return to_return
Example #18
0
    def create_BF_prefilter(self, result_file=None) -> None:
        """
        Imports or creates the pre-filter Bloom filter.
        :param result_file: (optional) if you'd like to export the bloom filter, populate that here
        :type result_file: str
        """
        tree = self.tree
        k_range = self.k_range
        if not self.bloom_filter_file:  # create one
            try:
                # Get all the k-mers in the TST, put them in a bloom filter.
                # Sized with a fudge factor of 5: a larger BF, but slightly faster.
                bf_kwargs = dict(ignore_case=True)
                if result_file:
                    # persist the BF to disk instead of keeping it in memory only
                    bf_kwargs['filename'] = result_file
                self.all_kmers_bf = WritingBloomFilter(
                    len(tree.keys()) * len(k_range) * 5, 0.01, **bf_kwargs)
                for kmer_info in tree.keys():
                    # strip the 'x'-delimited location info, keep just the k-mer
                    kmer = kmer_info.split('x')[0]
                    for ksize in k_range:
                        # insert every prefix and its reverse complement
                        self.all_kmers_bf.add(kmer[0:ksize])
                        self.all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize]))
            except IOError:
                # bug fix: previously printed self.bloom_filter_file, which is
                # empty in this branch; the file being written is result_file
                print("No such file or directory/error opening file: %s" %
                      result_file)
                sys.exit(1)
        else:  # otherwise read it in
            try:
                self.all_kmers_bf = ReadingBloomFilter(self.bloom_filter_file)
            except IOError:
                print("No such file or directory/error opening file: %s" %
                      self.bloom_filter_file)
                sys.exit(1)
	def calculate_bias_factor(self, other):
		"""
		Calculate the bias factor from 2 JI_CE object, need to be truncated first
		Will use: 2 truncated full kmer, 2 full kmer, maxk, current ksize
		:param other: another object with matching ksize, prime, and maxk
		:return: the bias factor (float)
		:raises Exception: on mismatched ksize / prime / maxk, or if either
			object was built without full k-mer tracking
		"""
		if self.ksize != other.ksize:
			raise Exception("different k-mer sizes - cannot compare")
		if self.p != other.p:
			raise Exception("different primes - cannot compare")
		if self.maxk != other.maxk:
			raise Exception("different maxk - cannot compare")
		if not self.full_kmer or not other.full_kmer:
			raise Exception("full kmer not enabled for the CE object")
		
		# use dict to count prefix
		# keys: truncated k-mers common to both objects
		ksmall_intersect = dict()
		for kmer in list(set(self._truncated_all_kmer).intersection(other._truncated_all_kmer)):
			ksmall_intersect[kmer] = 0  # for counting purpose
		# keys: truncated k-mers present in either object
		ksmall_union = dict()
		for kmer in list(set(self._truncated_all_kmer).union(other._truncated_all_kmer)):
			ksmall_union[kmer] = 0
		
		# count prefix match: for each full-length k-mer in either object,
		# count how many have their ksize-prefix in the intersection / union
		for kmer in list(set(self._all_kmer.keys()).union(other._all_kmer.keys())):
			kmer = kmer[0:self.ksize]  # prefix
			if self.rev_comp:
				kmer = min(kmer, khmer.reverse_complement(kmer))
			if kmer in ksmall_intersect:
				ksmall_intersect[kmer] += 1
				ksmall_union[kmer] += 1
			elif kmer in ksmall_union:
				ksmall_union[kmer] += 1
		
		# bias factor
		# NOTE(review): only the intersection is guarded against emptiness;
		# an empty ksmall_union (or an all-zero union count) would raise
		# ZeroDivisionError below — confirm callers never hit that case
		if len(ksmall_intersect) == 0:
			numerator = 0
		else:
			numerator = sum(ksmall_intersect.values()) * 1.0 / len(ksmall_intersect)
		denominator = sum(ksmall_union.values()) * 1.0 / len(ksmall_union)
		bias_factor = numerator / denominator
		print(numerator)  # debug output
		print(denominator)  # debug output
		return bias_factor
Example #20
0
    def yield_trie_items_to_insert_no_import(file_name):
        """
        Yield 'kmer x hash_index x kmer_index' strings (and their reverse
        complements) for every sketch k-mer stored in the given HDF5 file.
        :param file_name: path to an HDF5 file holding multiple CountEstimators
        :raises Exception: if the file holds a single sketch, a key is missing,
            or k-mers were not saved when the estimators were created
        """
        fid = h5py.File(file_name, 'r')
        if "CountEstimators" not in fid:
            fid.close()
            raise Exception(
                "This function imports a single HDF5 file containing multiple sketches."
                " It appears you've used it on a file containing a single sketch."
                "Try using import_single_hdf5 instead")

        grp = fid["CountEstimators"]
        # sort so that we know the order of the input
        iterator = sorted(grp.keys(), key=os.path.basename)

        try:
            for (i, key) in enumerate(iterator):
                if key not in grp:
                    raise Exception("The key " + key + " is not in " + file_name)

                subgrp = grp[key]
                if "kmers" not in subgrp:
                    raise Exception(
                        "Kmers were not saved when creating the count estimators. Please make sure save_kmers='y' "
                        "when creating the count estimators.")
                temp_kmers = subgrp["kmers"][...]
                kmers = [kmer.decode('utf-8') for kmer in temp_kmers]
                for (kmer_index, kmer) in enumerate(kmers):
                    # add both the original k-mer and the reverse complement, as the MinHashes were created without reverse complement
                    if kmer:
                        # format here is kmer+x+hash_index+x+kmer_index
                        yield kmer + 'x' + str(i) + 'x' + str(kmer_index)
                        # rev-comp kmer
                        kmer_rc = khmer.reverse_complement(kmer)
                        yield kmer_rc + 'x' + str(i) + 'x' + str(kmer_index)
        finally:
            # bug fix: the HDF5 handle previously leaked on the "kmers not saved"
            # raise and on normal completion; close it whenever the generator ends
            fid.close()
def make_minhash(genome, max_h, prime, ksize):
	"""
	Sketch a genome with canonical k-mers (lexicographic min of k-mer and
	reverse complement) and export the sketched k-mers to
	../data/Viruses/<basename>.Hash21mers.fa.
	"""
	canonical = set()
	name = os.path.basename(genome)
	MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(genome):
		seq = record.sequence
		for start in range(len(seq) - ksize + 1):
			fwd = seq[start:start + ksize]
			rev = khmer.reverse_complement(fwd)
			# keep the lexicographically smaller strand
			chosen = fwd if fwd < rev else rev
			canonical.add(chosen)
			MHS.add(chosen)
	MHS._true_num_kmers = len(canonical)
	MHS.input_file_name = name
	# Export the hash k-mers (FASTA records with empty headers)
	fid = open(os.path.abspath(os.path.join('../data/Viruses/', name + ".Hash21mers.fa")), 'w')
	for sketched in MHS._kmers:
		fid.write(">\n%s\n" % sketched)
	fid.close()
	return MHS
Example #22
0
    def make_TST(self):
        """Build the ternary search tree from every genome sketch and save it."""
        sketches = self.genome_sketches
        entries = set()
        # add both the original k-mer and the reverse complement, as the
        # MinHashes were created without reverse complement
        for sketch_idx, sketch in enumerate(sketches):
            for kmer_idx, sketch_kmer in enumerate(sketch._kmers):
                if not sketch_kmer:
                    continue  # only insert the kmer if it's actually non-empty
                # format here is kmer + x + hash_index + x + kmer_index
                suffix = 'x' + str(sketch_idx) + 'x' + str(kmer_idx)
                entries.add(sketch_kmer + suffix)
                entries.add(khmer.reverse_complement(sketch_kmer) + suffix)

        # export the TST
        tree = mt.Trie(entries)
        tree.save(self.TST_export_file_name)
 def return_matches(self, input_kmer, k_size_loc):
     """
     Get all the matches in the trie with the kmer prefix.
     NOTE(review): 'tree' is not a parameter and not defined locally —
     presumably a module-level global trie; confirm against the enclosing script.
     """
     match_info = set()
     to_return = []
     # query both strands, since read directionality is unknown
     for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]:
         prefix_matches = tree.keys(
             kmer)  # get all the k-mers whose prefix matches
         #match_info = set()
         # get the location of the found kmers in the counters
         for item in prefix_matches:
             # entries are formatted kmer + 'x' + hash_index + 'x' + kmer_index
             split_string = item.split(
                 'x'
             )  # first is the hash location, second is which k-mer
             hash_loc = int(split_string[1])
             kmer_loc = int(split_string[2])
             match_info.add((hash_loc, k_size_loc, kmer_loc))
         #to_return = []
         saw_match = False
         if match_info:
             saw_match = True
             for tup in match_info:
                 to_return.append(tup)
     # NOTE(review): unlike the method variant elsewhere, there is no early
     # break after a match, so matches for BOTH strands can be appended
     # (possible over-counting); confirm this is intended
     return to_return, saw_match
			to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index))  # format here is kmer+x+hash_index+kmer_index
	tree = mt.Trie(to_insert)
	tree.save(streaming_database_file)
else:
	tree = mt.Trie()
	tree.load(streaming_database_file)

# all the k-mers of interest in a set (as a pre-filter)
if not hydra_file:  # create one
	try:
		all_kmers_bf = WritingBloomFilter(len(sketches)*len(k_range)*num_hashes*2, 0.01)
		for sketch in sketches:
			for kmer in sketch._kmers:
				for ksize in k_range:
					all_kmers_bf.add(kmer[0:ksize])  # put all the k-mers and the appropriate suffixes in
					all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize]))  # also add the reverse complement
	except IOError:
		print("No such file or directory/error opening file: %s" % hydra_file)
		sys.exit(1)
else:  # otherwise read it in
	try:
		all_kmers_bf = ReadingBloomFilter(hydra_file)
	except IOError:
		print("No such file or directory/error opening file: %s" % hydra_file)
		sys.exit(1)
if verbose:
	print("Finished reading in/creating ternary search tree")
	t1 = timeit.default_timer()
	print("Time: %f" % (t1 - t0))
# Seen k-mers (set of k-mers that already hit the trie, so don't need to check again)
seen_kmers = set()
Example #25
0
def test_reverse_complement_exception():
    # deal with DNA, ignore rest: non-DNA characters pass through unchanged
    result = khmer.reverse_complement('FGF')
    assert result == 'FCF'
def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc, prime, p, ksize, hash_range):
	"""Compare classic MinHash vs containment MinHash Jaccard estimates on one simulation.

	Simulates a metagenome, computes the true Jaccard index of each selected
	genome against the simulated k-mer set, then estimates it with (a) the
	classic MinHash sketch and (b) the containment MinHash (bloom-filter)
	approach, for every sketch size h in ``hash_range``, recording the mean
	relative error of each estimator.

	:param num_genomes: number of genomes to simulate (forwarded to make_simulation)
	:param num_reads: number of reads to simulate (forwarded to make_simulation)
	:param python_loc: forwarded to make_simulation (presumably an interpreter path — TODO confirm)
	:param gen_sim_loc: forwarded to make_simulation (presumably the simulator location — TODO confirm)
	:param prime: max prime for the MinHash hash space
	:param p: bloom filter false-positive rate; also used to de-bias the containment count
	:param ksize: k-mer size
	:param hash_range: iterable of sketch sizes h to evaluate (assumed non-increasing
		so repeated down_sample(h) calls are valid — TODO confirm)
	:return: (MH_relative_errors, CMH_relative_errors, simulation_kmers_length, mean genome length)
	"""
	# Make a simulation
	simulation_file, abundances_file, selected_genomes = make_simulation(num_genomes, num_reads, python_loc, gen_sim_loc)

	# Get simulation k-mers, use canonical k-mers
	# Simultaneously, make the min hash sketch of the simulation
	# NOTE(review): n=max_h relies on a module-level global, not a parameter
	simulation_kmers = set()
	simulation_MHS = MH.CountEstimator(n=max_h, max_prime=prime, ksize=ksize, save_kmers='y')
	for record in screed.open(simulation_file):
		seq = record.sequence
		for i in range(len(seq) - ksize + 1):
			kmer = seq[i:i+ksize]
			kmer_rev = khmer.reverse_complement(kmer)
			# keep the lexicographically smaller of k-mer / reverse complement (canonical form)
			if kmer < kmer_rev:
				simulation_kmers.add(kmer)
				simulation_MHS.add(kmer)
			else:
				simulation_kmers.add(kmer_rev)
				simulation_MHS.add(kmer_rev)

	# Use them to populate a bloom filter
	simulation_bloom = BloomFilter(capacity=1.1*len(simulation_kmers), error_rate=p)
	simulation_kmers_length = len(simulation_kmers)  # in practice, this would be computed when the bloom filter is created
	# or can use an estimate based on the bloom filter entries
	for kmer in simulation_kmers:
		simulation_bloom.add(kmer)

	# Use pre-computed data to load the kmers and the sketches
	base_names = [os.path.basename(item) for item in selected_genomes]
	# Load the sketches
	genome_sketches = MH.import_multiple_from_single_hdf5(os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
	# Get the true number of kmers
	genome_lengths = list()
	for i in range(len(genome_sketches)):
		genome_lengths.append(genome_sketches[i]._true_num_kmers)

	# Get *all* the kmers for computation of ground truth
	genome_kmers = list()
	for i in range(len(base_names)):
		name = base_names[i]
		kmers = set()
		fid = bz2.BZ2File(os.path.abspath(os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
		for line in fid.readlines():
			kmers.add(line.strip())
		fid.close()
		genome_kmers.append(kmers)

	# Calculate the true Jaccard index
	true_jaccards = list()
	for kmers in genome_kmers:
		true_jaccard = len(kmers.intersection(simulation_kmers)) / float(len(kmers.union(simulation_kmers)))
		true_jaccards.append(true_jaccard)

	# Calculate the min hash estimate of jaccard index
	MH_relative_errors = list()
	CMH_relative_errors = list()
	for h in hash_range:
		MH_jaccards = list()
		for MHS in genome_sketches:
			# Down sample each sketch to h
			MHS.down_sample(h)
			simulation_MHS.down_sample(h)
			MH_jaccard = MHS.jaccard(simulation_MHS)
			MH_jaccards.append(MH_jaccard)

		# NOTE(review): MH_jaccards_corrected is computed but never used below — dead code or missing comparison?
		MH_jaccards_corrected = list()
		for MHS in genome_sketches:
			MHS_set = set(MHS._mins)
			sample_set = set(simulation_MHS._mins)
			MH_jaccard = len(set(list(MHS_set.union(sample_set))[0:h]).intersection(MHS_set.intersection(sample_set))) / float(h)
			MH_jaccards_corrected.append(MH_jaccard)

		# Calculate the containment min hash estimate of the jaccard index
		CMH_jaccards = list()
		for i in range(len(genome_sketches)):
			genome_kmers_len = genome_lengths[i]  # pre-computed when creating the "training" data
			MHS = genome_sketches[i]
			# down sample each sketch to h
			MHS.down_sample(h)
			kmers = MHS._kmers  # use only the k-mers in the min hash sketch
			int_est = 0
			for kmer in kmers:
				if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
					int_est += 1
			int_est -= p*h  # adjust for false positive rate
			containment_est = int_est / float(h)
			# convert the containment estimate C into a Jaccard estimate via |A|C / (|A| + |B| - |A|C)
			containment_est_jaccard = genome_kmers_len * containment_est / \
				(genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
			CMH_jaccards.append(containment_est_jaccard)

		# compute the average deviation from the truth (relative error)
		true_jaccards = np.array(true_jaccards)
		MH_jaccards = np.array(MH_jaccards)
		CMH_jaccards = np.array(CMH_jaccards)
		MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards)/true_jaccards)
		CMH_mean = np.mean(np.abs(true_jaccards - CMH_jaccards)/true_jaccards)
		#print("Classic min hash mean relative error: %f" % MH_mean)
		#print("Containment min hash mean relative error: %f" % CMH_mean)
		MH_relative_errors.append(MH_mean)
		CMH_relative_errors.append(CMH_mean)

	# remove temp files
	os.remove(simulation_file)
	os.remove(abundances_file)
	# return the relative errors
	return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(genome_lengths)
Exemple #27
0
def test_kmer_revcom_hash(kmer):
    """A k-mer and its reverse complement must hash to the same value."""
    table = khmer.Counttable(21, 1e4, 3)
    rc = khmer.reverse_complement(kmer)
    assert table.hash(kmer) == table.hash(rc)
        # a saved TST already exists; load it from disk instead of rebuilding
        tree = mt.Trie()
        tree.load(streaming_database_file)

    # all the k-mers of interest in a set (as a pre-filter)
    if not hydra_file:  # create one
        try:
            # capacity covers every sketch k-mer at every k in k_range, forward + reverse complement
            all_kmers_bf = WritingBloomFilter(
                len(sketches) * len(k_range) * num_hashes * 2, 0.01)
            for sketch in sketches:
                for kmer in sketch._kmers:
                    for ksize in k_range:
                        all_kmers_bf.add(
                            kmer[0:ksize]
                        )  # put all the k-mers and the appropriate suffixes in
                        all_kmers_bf.add(
                            khmer.reverse_complement(kmer[0:ksize])
                        )  # also add the reverse complement
        except IOError:
            # NOTE(review): hydra_file is falsy in this branch, so the reported
            # filename is empty/None — confirm the intended error message
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    else:  # otherwise read it in
        try:
            all_kmers_bf = ReadingBloomFilter(hydra_file)
        except IOError:
            print("No such file or directory/error opening file: %s" %
                  hydra_file)
            sys.exit(1)
    if verbose:
        print("Finished reading in/creating ternary search tree")
        t1 = timeit.default_timer()
Exemple #29
0
def test_reverse_complement_exception():
    """Letters outside ACGT survive unchanged; recognized bases are complemented."""
    result = khmer.reverse_complement('FGF')
    assert result == 'FCF'
Exemple #30
0
def test_kmer_revcom_hash(kmer):
    """Hashing is canonical: forward and reverse-complement strands collide."""
    ct = khmer.Counttable(21, 1e4, 3)
    forward_hash = ct.hash(kmer)
    reverse_hash = ct.hash(khmer.reverse_complement(kmer))
    assert forward_hash == reverse_hash
Exemple #31
0
def test_reverse_complement_exception():
    """Non-DNA input must be rejected with a RuntimeError."""
    bad_sequence = 'FGF'
    with pytest.raises(RuntimeError):
        khmer.reverse_complement(bad_sequence)
Exemple #32
0
# Write all sketches out to a throwaway HDF5 database.
temp_database_file = tempfile.mktemp()
MH.export_multiple_to_single_hdf5(CEs, temp_database_file)

# Build the ternary search tree entries. The MinHashes were created without
# reverse complementation, so each k-mer is inserted both forward and
# reverse-complemented, encoded as "<kmer>x<hash_index>x<kmer_index>".
to_insert = set()
for i, estimator in enumerate(CEs):
    for kmer_index, fwd_kmer in enumerate(estimator._kmers):
        if not fwd_kmer:
            continue  # skip empty sketch slots
        suffix = 'x' + str(i) + 'x' + str(kmer_index)
        to_insert.add(fwd_kmer + suffix)
        to_insert.add(khmer.reverse_complement(fwd_kmer) + suffix)

# Persist the populated TST to a throwaway file.
tree = mt.Trie(to_insert)
temp_TST_file = tempfile.mktemp()
tree.save(temp_TST_file)

# TODO: marisa_trie has an issue with single character prefix lookups
# TODO: see https://github.com/pytries/marisa-trie/issues/55
# TODO: so set k-range above that
k_range = [2, 3, 5]

def create_relative_errors(num_genomes, num_reads, python_loc, gen_sim_loc,
                           prime, p, ksize, hash_range):
    """Measure the relative error of MinHash vs containment-MinHash Jaccard estimates.

    Runs one metagenome simulation, computes each selected genome's true
    Jaccard index against the simulated k-mer set, then estimates it with
    both a classic MinHash sketch and the containment (bloom-filter based)
    MinHash for every sketch size h in ``hash_range``.

    :param num_genomes: number of genomes to simulate (forwarded to make_simulation)
    :param num_reads: number of reads to simulate (forwarded to make_simulation)
    :param python_loc: forwarded to make_simulation (presumably an interpreter path — TODO confirm)
    :param gen_sim_loc: forwarded to make_simulation (presumably the simulator location — TODO confirm)
    :param prime: max prime for the MinHash hash space
    :param p: bloom filter false-positive rate; also used to de-bias the containment count
    :param ksize: k-mer size
    :param hash_range: iterable of sketch sizes h (assumed non-increasing so
        repeated down_sample(h) calls are valid — TODO confirm)
    :return: (MH_relative_errors, CMH_relative_errors, simulation_kmers_length, mean genome length)
    """
    # Make a simulation
    simulation_file, abundances_file, selected_genomes = make_simulation(
        num_genomes, num_reads, python_loc, gen_sim_loc)

    # Get simulation k-mers, use canonical k-mers
    # Simultaneously, make the min hash sketch of the simulation
    # NOTE(review): n=max_h relies on a module-level global, not a parameter
    simulation_kmers = set()
    simulation_MHS = MH.CountEstimator(n=max_h,
                                       max_prime=prime,
                                       ksize=ksize,
                                       save_kmers='y')
    for record in screed.open(simulation_file):
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            kmer_rev = khmer.reverse_complement(kmer)
            # keep the lexicographically smaller strand (canonical k-mer)
            if kmer < kmer_rev:
                simulation_kmers.add(kmer)
                simulation_MHS.add(kmer)
            else:
                simulation_kmers.add(kmer_rev)
                simulation_MHS.add(kmer_rev)

    # Use them to populate a bloom filter
    simulation_bloom = BloomFilter(capacity=1.1 * len(simulation_kmers),
                                   error_rate=p)
    simulation_kmers_length = len(
        simulation_kmers
    )  # in practice, this would be computed when the bloom filter is created
    # or can use an estimate based on the bloom filter entries
    for kmer in simulation_kmers:
        simulation_bloom.add(kmer)

    # Use pre-computed data to load the kmers and the sketches
    base_names = [os.path.basename(item) for item in selected_genomes]
    # Load the sketches
    genome_sketches = MH.import_multiple_from_single_hdf5(
        os.path.abspath('../data/Genomes/AllSketches.h5'), base_names)
    # Get the true number of kmers
    genome_lengths = list()
    for i in range(len(genome_sketches)):
        genome_lengths.append(genome_sketches[i]._true_num_kmers)

    # Get *all* the kmers for computation of ground truth
    genome_kmers = list()
    for i in range(len(base_names)):
        name = base_names[i]
        kmers = set()
        fid = bz2.BZ2File(
            os.path.abspath(
                os.path.join('../data/Genomes/', name + ".kmers.bz2")), 'r')
        for line in fid.readlines():
            kmers.add(line.strip())
        fid.close()
        genome_kmers.append(kmers)

    # Calculate the true Jaccard index
    true_jaccards = list()
    for kmers in genome_kmers:
        true_jaccard = len(kmers.intersection(simulation_kmers)) / float(
            len(kmers.union(simulation_kmers)))
        true_jaccards.append(true_jaccard)

    # Calculate the min hash estimate of jaccard index
    MH_relative_errors = list()
    CMH_relative_errors = list()
    for h in hash_range:
        MH_jaccards = list()
        for MHS in genome_sketches:
            # Down sample each sketch to h
            MHS.down_sample(h)
            simulation_MHS.down_sample(h)
            MH_jaccard = MHS.jaccard(simulation_MHS)
            MH_jaccards.append(MH_jaccard)

        # NOTE(review): MH_jaccards_corrected is computed but never used below
        MH_jaccards_corrected = list()
        for MHS in genome_sketches:
            MHS_set = set(MHS._mins)
            sample_set = set(simulation_MHS._mins)
            MH_jaccard = len(
                set(list(MHS_set.union(sample_set))[0:h]).intersection(
                    MHS_set.intersection(sample_set))) / float(h)
            MH_jaccards_corrected.append(MH_jaccard)

        # Calculate the containment min hash estimate of the jaccard index
        CMH_jaccards = list()
        for i in range(len(genome_sketches)):
            genome_kmers_len = genome_lengths[
                i]  # pre-computed when creating the "training" data
            MHS = genome_sketches[i]
            # down sample each sketch to h
            MHS.down_sample(h)
            kmers = MHS._kmers  # use only the k-mers in the min hash sketch
            int_est = 0
            for kmer in kmers:
                if kmer in simulation_bloom:  # test if the k-mers are in the simulation bloom filter
                    int_est += 1
            int_est -= p * h  # adjust for false positive rate
            containment_est = int_est / float(h)
            # containment C -> Jaccard via |A|C / (|A| + |B| - |A|C)
            containment_est_jaccard = genome_kmers_len * containment_est / \
             (genome_kmers_len + simulation_kmers_length - genome_kmers_len * containment_est)
            CMH_jaccards.append(containment_est_jaccard)

        # compute the average deviation from the truth (relative error)
        true_jaccards = np.array(true_jaccards)
        MH_jaccards = np.array(MH_jaccards)
        CMH_jaccards = np.array(CMH_jaccards)
        MH_mean = np.mean(np.abs(true_jaccards - MH_jaccards) / true_jaccards)
        CMH_mean = np.mean(
            np.abs(true_jaccards - CMH_jaccards) / true_jaccards)
        #print("Classic min hash mean relative error: %f" % MH_mean)
        #print("Containment min hash mean relative error: %f" % CMH_mean)
        MH_relative_errors.append(MH_mean)
        CMH_relative_errors.append(CMH_mean)

    # remove temp files
    os.remove(simulation_file)
    os.remove(abundances_file)
    # return the relative errors
    return MH_relative_errors, CMH_relative_errors, simulation_kmers_length, np.mean(
        genome_lengths)
Exemple #34
0
def test_reverse_complement_exception():
    """reverse_complement must raise RuntimeError on non-DNA characters."""
    not_dna = 'FGF'
    with pytest.raises(RuntimeError):
        khmer.reverse_complement(not_dna)