def annotate_guides_with_score(candidates_count_dictionary, jellyfish_filename, priors, posteriors, max_hd,
                               target_string, target_coverage):
    """
    Score every candidate guide and return the scored candidates sorted with ``sort_second``.

    For each candidate, the neighbouring mers produced by ``generate_adjacent_mers`` are
    looked up in the jellyfish database (taking the larger of the forward and
    reverse-complement counts).  Neighbours with a non-positive count are ignored; a
    neighbour whose count reaches ``max_k`` disqualifies the whole candidate.  Two sums
    weighted by ``get_score`` are accumulated: ``value1`` over the neighbour's occurrences in
    ``target_string`` and ``value2`` over a prior/posterior-weighted sum of copy numbers.
    The score is ``value2 / (value1 * target_coverage)``.

    Side effects: prints progress to stdout and writes one score per line to a file named
    ``scores`` in the current working directory.

    Relies on module-level names ``generate_adjacent_mers``, ``get_score``,
    ``reverse_complement``, ``get_probability``, ``max_k``, ``max_limit_count`` and
    ``sort_second``.

    :param candidates_count_dictionary: maps candidate sequence -> strand type (compared with '+')
    :param jellyfish_filename: path given to ``jellyfish.QueryMerFile``
    :param priors: indexed by copy number in ``range(1, max_limit_count)``
    :param posteriors: indexed by the observed k-mer count
    :param max_hd: forwarded to ``generate_adjacent_mers`` (presumably a max Hamming distance -- confirm)
    :param target_string: target sequence; neighbour occurrences are counted with ``str.count``
    :param target_coverage: divisor applied to every score
    :return: list of ``(candidate, score, k, trie, strand_type)`` tuples, sorted in place
             with ``key=sort_second``
    """
    # Open the database once per call; the reader does not depend on the
    # candidate or the neighbouring mer, so re-opening it per lookup is wasted work.
    qf = jellyfish.QueryMerFile(jellyfish_filename)

    list_candidates = []
    for candidate, strand_type in candidates_count_dictionary.items():
        trie = generate_adjacent_mers(candidate, max_hd)
        value1 = value2 = 0.0
        print('processing candidate ' + candidate)
        flag = True  # cleared when a neighbour is too abundant to score
        for mer in trie.keys():
            if strand_type == '+':
                cp = get_score(candidate, mer)
            else:
                cp = get_score(reverse_complement(candidate), reverse_complement(mer))
            k = _max_strand_count(qf, mer)
            if k <= 0:
                continue
            if k >= max_k:
                flag = False
                break
            p = float(target_string.count(mer))
            accum = 0.0
            for count in range(1, max_limit_count):
                probability = get_probability(count, k)
                accum += 1.0 * probability * count * priors[count] / posteriors[k]
            value1 += cp * p
            value2 += cp * accum
        if not flag or value1 <= 0.0:
            continue
        score = 1.0 * value2 / (value1 * target_coverage)
        k = _max_strand_count(qf, candidate)
        list_candidates.append((candidate, score, k, trie, strand_type))
        # The running number of accepted candidates is simply the list length.
        print('processed ' + str(len(list_candidates)) + 'th gRNA: ' + candidate + ' with score= ' + str(score))

    print('DONE! Sorting...')
    list_candidates.sort(key=sort_second)
    print('Final list:')
    # Context manager guarantees the file is closed even if a write or print fails.
    with open('scores', 'w') as f:
        for annotated_candidate in list_candidates:
            print(annotated_candidate)
            f.write(str(annotated_candidate[1]) + '\n')
    return list_candidates


def _max_strand_count(qf, sequence):
    """Return the larger of the counts stored for *sequence* and for its reverse complement."""
    forward = qf[jellyfish.MerDNA(sequence)]
    reverse = qf[jellyfish.MerDNA(reverse_complement(sequence))]
    return max(forward, reverse)
def query(self, seq):
    """Return the count stored in ``self.jf`` for *seq*.

    The sequence is wrapped in a ``jellyfish.MerDNA``; when ``self.canonical``
    is truthy the mer is canonicalized before the lookup.
    """
    mer = jellyfish.MerDNA(seq)
    if self.canonical:
        mer.canonicalize()
    count = self.jf[mer]
    return count
def generate_k_spectrum_of_target_and_count(target_string, jellyfish_count_file, max_k_limit=200, k=15):
    """
    k-spectrum of target, then count the k-mers found within the target, then generate the histogram

    Every distinct length-``k`` substring of the target is looked up once in the jellyfish
    database; the histogram records, for each count value from 1 to ``max_k_limit``, how many
    distinct substrings have exactly that count.

    :type max_k_limit: int
    :param target_string: the target string
    :param k: value of k
    :param jellyfish_count_file: jellyfish binary file name
    :param max_k_limit: max value up to which the histogram is to be generated
    :return: the histogram data in a dictionary with keys ``1..max_k_limit``
    """
    from collections import Counter

    # ``+ 1`` so the k-mer ending on the last base of the target is included;
    # a target of length n has n - k + 1 windows of length k.
    distinct_kmers = {target_string[i:i + k] for i in range(len(target_string) - k + 1)}

    qf = jellyfish.QueryMerFile(jellyfish_count_file)
    # One pass over the counts instead of one ``list.count`` scan per histogram bin.
    tally = Counter(qf[jellyfish.MerDNA(kmer)] for kmer in distinct_kmers)

    # Counter yields 0 for count values that never occurred.
    return {value: tally[value] for value in range(1, max_k_limit + 1)}
def get_kmer_presence(kmerF):
    """Return ``None`` if the canonical form of *kmerF* has a positive count in ``qjellies``, else ``[1]``.

    NOTE(review): despite the name, the non-``None`` result is produced when
    the k-mer is *absent* -- confirm this is what callers expect.
    """
    canonical = jellyfish.MerDNA(kmerF)
    canonical.canonicalize()
    if qjellies[canonical] > 0:
        return None
    return [1]
def test_canonical_mers(self):
    """Check ``jf.string_canonicals`` against mers built directly from ``self.str``.

    Each yielded mer must equal either the mer at the same offset or its
    reverse complement, and must not compare greater than either of them.
    The number of yielded mers must be ``len(self.str) - self.k + 1``.
    """
    good = True
    # Explicit tally: stays defined (0) even if the iterator yields nothing, so
    # an empty result surfaces as a failed assertion instead of a NameError.
    seen = 0
    for offset, m in enumerate(jf.string_canonicals(self.str)):
        m2 = jf.MerDNA(self.str[offset:offset + self.k])
        rm2 = m2.get_reverse_complement()
        good = good and (m == m2 or m == rm2)
        good = good and (not (m > m2)) and (not (m > rm2))
        seen = offset + 1
    self.assertTrue(good)
    self.assertEqual(len(self.str) - self.k + 1, seen)
def test_all_mers(self):
    """Check ``jf.string_mers`` against mers built directly from ``self.str``.

    Every yielded mer must equal the ``MerDNA`` built from the window at the
    same offset, that window upper-cased must equal the mer's string form, and
    the number of yielded mers must be ``len(self.str) - self.k + 1``.
    """
    mers_match = True
    text_match = True
    total = 0
    for offset, mer in enumerate(jf.string_mers(self.str)):
        window = self.str[offset:offset + self.k]
        expected = jf.MerDNA(window)
        # Only keep comparing while the respective check has not failed yet.
        if mers_match:
            mers_match = mer == expected
        if text_match:
            text_match = window.upper() == str(expected)
        total = offset + 1
    self.assertTrue(mers_match)
    self.assertTrue(text_match)
    self.assertEqual(len(self.str) - self.k + 1, total)
def test_add(self):
    """Exercise ``add`` / ``update_add`` / ``get`` / indexing on ``self.hash`` with random mers.

    For up to 1000 randomized mers: ``add`` a random value and require a truthy
    result; on two of every three iterations apply a second random value,
    requiring a second ``add`` to be falsy (``i % 3 == 1``) or ``update_add``
    to be truthy (``i % 3 == 2``); finally require both ``get`` and ``[]`` to
    return the running total.  The loop stops at the first failed check.
    """
    mer = jf.MerDNA()
    ok = True
    for step in range(1000):
        mer.randomize()
        expected = random.randrange(1000)
        ok = self.hash.add(mer, expected)
        if not ok:
            break

        phase = step % 3
        if phase != 0:
            extra = random.randrange(1000)
            expected += extra
            if phase == 1:
                ok = not self.hash.add(mer, extra)
            else:
                ok = self.hash.update_add(mer, extra)
            if not ok:
                break

        ok = (expected == self.hash.get(mer)) and (expected == self.hash[mer])
        if not ok:
            break
    self.assertTrue(ok)
def annotate_guides_with_score_parallel(candidates_count_dictionary, jellyfish_filename, priors, posteriors, max_hd,
                                        target_string, return_list):
    """
    Score candidates and store the scores in ``return_list``; returns ``None``.

    For each candidate, the neighbouring mers from ``generate_adjacent_mers`` are
    canonicalized and looked up.  Neighbours with a non-positive count are ignored; a
    neighbour whose count reaches ``max_k`` disqualifies the candidate.  The score
    ``value2 / value1`` is written to ``return_list`` at consecutive indices starting at 0.
    Candidates that are skipped do not consume an index, so positions in ``return_list``
    do not line up with positions in ``candidates_count_dictionary``.

    Relies on module-level names ``generate_adjacent_mers``, ``get_score``,
    ``reverse_complement``, ``get_probability``, ``max_k`` and ``max_limit_count``.

    :param candidates_count_dictionary: maps candidate sequence -> sequence whose element 0
           is the strand type (compared with '+')
    :param jellyfish_filename: despite the name, used directly as a lookup object indexed
           with ``jellyfish.MerDNA`` (presumably an open ``jellyfish.QueryMerFile`` -- confirm)
    :param priors: indexed by copy number in ``range(1, max_limit_count)``
    :param posteriors: indexed by the observed k-mer count
    :param max_hd: forwarded to ``generate_adjacent_mers``
    :param target_string: target sequence; neighbour occurrences are counted with ``str.count``
    :param return_list: output container supporting ``return_list[index] = score``
    """
    qf = jellyfish_filename  # loop-invariant alias, hoisted out of the inner loop
    index = 0
    for candidate, info in candidates_count_dictionary.items():
        strand_type = info[0]
        trie = generate_adjacent_mers(candidate, max_hd)
        value1 = value2 = 0.0
        flag = True  # cleared when a neighbour is too abundant to score
        for mer in trie.keys():
            if strand_type == '+':
                cp = get_score(candidate, mer)
            else:
                cp = get_score(reverse_complement(candidate), reverse_complement(mer))
            merDNA = jellyfish.MerDNA(mer)
            merDNA.canonicalize()
            k = qf[merDNA]
            if k <= 0:
                continue
            if k >= max_k:
                flag = False
                break
            p = float(target_string.count(mer))
            accum = 0.0
            for count in range(1, max_limit_count):
                probability = get_probability(count, k)
                accum += 1.0 * probability * count * priors[count] / posteriors[k]
            value1 += cp * p
            value2 += cp * accum
        if not flag or value1 <= 0.0:
            continue
        return_list[index] = 1.0 * value2 / value1
        index += 1
def generate_k_spectrum_of_target_and_count(target_string, jellyfish_count_file, max_k_limit):
    """
    Build the k-spectrum histogram of the target and the per-position k-mer counts.

    The window length is the module-level ``candidate_length``.  Each window is
    canonicalized before it is looked up.  Windows with a count of 0 are logged and left
    out of the histogram; a given substring contributes to the histogram only the first
    time it is seen.

    :type max_k_limit: int
    :param target_string: the target string
    :param jellyfish_count_file: lookup object indexed with ``jellyfish.MerDNA``
           (documented by the original author as a ``jellyfish.QueryMerFile``)
    :param max_k_limit: not used by this implementation
    :return: a pair ``(k_spectrum, counts_in_positions)`` where ``k_spectrum`` maps a count
             value to the number of distinct substrings with that count, and
             ``counts_in_positions`` maps each start position in the target to the count of
             the k-mer starting there
    """
    window = candidate_length
    lookup = jellyfish_count_file
    seen_substrings = set()
    counts_in_positions = {}
    k_spectrum = {}

    for start in range(len(target_string) - window + 1):
        subst = target_string[start:start + window]
        canonical = jellyfish.MerDNA(subst)
        canonical.canonicalize()
        occurrences = lookup[canonical]
        counts_in_positions[start] = occurrences

        if occurrences == 0:
            logging.info("Count = 0 for substring " + subst)
        elif subst not in seen_substrings:
            seen_substrings.add(subst)
            k_spectrum[occurrences] = k_spectrum.get(occurrences, 0) + 1

    return k_spectrum, counts_in_positions
def compute_hybrid(self, first_var, var_wgts):
    """Subtract the hybrid weight of the window starting at ``first_var`` from ``var_wgts``.

    Collects the variants that follow ``first_var`` on the same chromosome at a
    position below ``pos + r``, iterates the pseudocontigs that
    ``PseudocontigIterator`` produces for them, and for every length-``r``
    window of each pseudocontig looks up the canonical mer in ``self.h_ref``
    and ``self.h_added``.  The resulting ``hybrid_wgt`` is subtracted from
    ``var_wgts[v]`` for every selected variant ``v`` whose entry in the
    iterator's ``curr_vec`` is truthy.

    ``var_wgts`` is modified in place; nothing is returned.  The process is
    terminated with ``exit()`` (after printing diagnostics) if a window has no
    count in ``self.h_added`` or if the combined count is 0.
    """
    import dna_jellyfish
    r = self.r
    chrom = self.variants[first_var].chrom
    pos = self.variants[first_var].pos
    #if self.variants[first_var].pos < pos or self.variants[first_var].pos >= pos+r:
    #    return
    # Number of variants in window starting at this one
    k = 1
    while first_var + k < self.num_v and self.variants[first_var + k].chrom == chrom and self.variants[first_var + k].pos < pos + r:
        k += 1
    #if k > 14:
    #    sys.stdout.write('Processing variant %d with %d neighbors' % (first_var, k))
    if k > self.max_v_in_window:
        # Too many variants in the window: keep first_var plus the
        # max_v_in_window - 1 neighbours with the largest sum(probs).
        alt_freqs = [(sum(self.variants[first_var + j].probs), first_var + j) for j in range(1, k)]
        ids = [first_var] + [f[1] for f in sorted(alt_freqs, reverse=True)[:self.max_v_in_window - 1]]
        it = PseudocontigIterator(self.genome[chrom], [self.variants[v] for v in ids], self.r)
    else:
        ids = range(first_var, first_var + k)
        it = PseudocontigIterator(self.genome[chrom], self.variants[first_var:first_var + k], r)
    # NOTE(review): the loop relies on it.next() eventually returning a falsy
    # value to terminate -- confirm against PseudocontigIterator.
    pseudocontig = it.next()
    while pseudocontig:
        # vec is indexed in parallel with ids below (vec[j] <-> ids[j]).
        vec = it.curr_vec
        p = self.prob_read(self.variants, ids, vec)
        for i in range(len(pseudocontig) - self.r + 1):
            mer = dna_jellyfish.MerDNA(pseudocontig[i:i + r])
            mer.canonicalize()
            # Falsy lookups (e.g. a missing entry) are normalised to 0.
            c_linear = self.h_ref[mer]
            if not c_linear:
                c_linear = 0
            c_added = self.h_added[mer]
            if not c_added:
                c_added = 0
            if c_added == 0:
                print('Error! Read %s from added pseudocontigs could not be found (SNPs %d - %d)' % (pseudocontig[i:i + r], first_var, first_var + k))
                for j in range(first_var, first_var + k):
                    print('%s: %d, %s --> %s' % (self.variants[j].chrom, self.variants[j].pos, self.variants[j].orig, ','.join(self.variants[j].alts)))
                exit()
            c_total = c_linear + c_added
            # Guards the division below; c_added is already known to be
            # non-zero here, so this only triggers if the counts cancel out.
            if c_total == 0:
                print('Variants %d -%d / %d' % (first_var, first_var + k - 1, self.num_v))
                print('Vector: ' + str(vec))
                print('Pseudocontig: ' + str(pseudocontig))
                print('Read: ' + str(pseudocontig[i:i + r]))
                exit()
            # Average relative probability of this read's other mappings
            avg_wgt = c_linear * self.wgt_ref + (c_added - 1) * self.wgt_added
            hybrid_wgt = (p - avg_wgt) / (c_total)
            for j in range(len(ids)):
                if vec[j]:
                    var_wgts[ids[j]] -= hybrid_wgt
        pseudocontig = it.next()
def string_to_kmer(self, sequence):
    """Wrap *sequence* in a ``jf.MerDNA`` object and return it."""
    return jf.MerDNA(sequence)