Esempio n. 1
0
def annotate_guides_with_score(candidates_count_dictionary, jellyfish_filename,
                               priors, posteriors, max_hd, target_string,
                               target_coverage):
    """Score each candidate guide and return the scored candidates, sorted.

    For every candidate, the mers produced by ``generate_adjacent_mers`` are
    looked up in the Jellyfish database.  Each mer contributes
    ``cp * (occurrences in target_string)`` to one running total and
    ``cp * (expected copy number given its database count)`` to another,
    where ``cp`` comes from ``get_score``.  The candidate's score is the
    ratio of the second total to ``first_total * target_coverage``.

    A candidate is dropped when any of its mers has a database count of
    ``max_k`` or more, or when the first total is not positive.

    Side effects: progress is printed to stdout and the scores (one per
    line, in sorted order) are written to a file named ``scores`` in the
    current working directory.

    :param candidates_count_dictionary: maps candidate sequence -> strand
        marker; ``'+'`` scores the sequences as given, anything else scores
        their reverse complements
    :param jellyfish_filename: path of the Jellyfish database to query
    :param priors: indexable by copy number (1 .. max_limit_count - 1)
    :param posteriors: indexable by observed database count
    :param max_hd: passed through to ``generate_adjacent_mers``
    :param target_string: target sequence in which mer occurrences are counted
    :param target_coverage: divisor applied to the final score
    :return: list of ``(candidate, score, count, trie, strand_type)`` tuples
        ordered by ``sort_second`` (defined elsewhere -- presumably the score)
    """
    # Open the database once.  The original re-opened it for every mer of
    # every candidate, which is pure overhead for a read-only lookup table.
    qf = jellyfish.QueryMerFile(jellyfish_filename)

    iteration_count = 0
    list_candidates = []
    for candidate in list(candidates_count_dictionary.keys()):
        strand_type = candidates_count_dictionary[candidate]
        trie = generate_adjacent_mers(candidate, max_hd)
        value1 = value2 = 0.0
        print('processing candidate ' + candidate)
        flag = True
        for mer in trie.keys():
            if strand_type == '+':
                cp = get_score(candidate, mer)
            else:
                cp = get_score(reverse_complement(candidate),
                               reverse_complement(mer))
            # Use the larger of the forward and reverse-complement counts.
            k = max(qf[jellyfish.MerDNA(mer)],
                    qf[jellyfish.MerDNA(reverse_complement(mer))])
            if k <= 0:
                # Mer absent from the database: contributes nothing.
                continue
            if k >= max_k:
                # Count too high to be handled: reject the whole candidate.
                flag = False
                break
            p = float(target_string.count(mer))
            accum = 0.0
            for count in range(1, max_limit_count):
                probability = get_probability(count, k)
                accum += 1.0 * probability * count * priors[count] / posteriors[k]
            value1 += cp * p
            value2 += cp * accum
        if value1 <= 0.0 or flag is False:
            continue
        score = 1.0 * value2 / (value1 * target_coverage)
        k = max(qf[jellyfish.MerDNA(candidate)],
                qf[jellyfish.MerDNA(reverse_complement(candidate))])
        list_candidates.append((candidate, score, k, trie, strand_type))
        iteration_count += 1
        print('processed ' + str(iteration_count) + 'th gRNA: ' + candidate +
              ' with score= ' + str(score))
    print('DONE! Sorting...')
    list_candidates.sort(key=sort_second)
    print('Final list:')
    # The context manager guarantees the file is closed even if a write fails.
    with open('scores', 'w') as f:
        for annotated_candidate in list_candidates:
            print(annotated_candidate)
            f.write(str(annotated_candidate[1]) + '\n')
    return list_candidates
Esempio n. 2
0
    def query(self, seq):
        """Return the value stored in ``self.jf`` for the k-mer built from ``seq``.

        When ``self.canonical`` is truthy the k-mer is canonicalized before
        the lookup.
        """
        mer = jellyfish.MerDNA(seq)
        if self.canonical:
            mer.canonicalize()
        return self.jf[mer]
Esempio n. 3
0
def generate_k_spectrum_of_target_and_count(target_string,
                                            jellyfish_count_file,
                                            max_k_limit=200,
                                            k=15):
    """
    Build a histogram of database counts over the distinct k-mers of the target.

    Every distinct length-``k`` substring of ``target_string`` is looked up in
    the Jellyfish database; the result says, for each count value from 1 to
    ``max_k_limit``, how many distinct k-mers have exactly that count.
    Counts outside that range (including 0) are not tallied.

    :type max_k_limit: int
    :param target_string: the target string
    :param jellyfish_count_file: jellyfish binary file name
    :param max_k_limit: max count value up to which the histogram is generated
    :param k: value of k
    :return: dict mapping each count in 1..max_k_limit to the number of
        distinct k-mers with that count (0 when none)
    """
    # "+ 1" so the window ending at the last character of the target is
    # included; without it the final k-mer was silently skipped.
    distinct_kmers = {target_string[i:i + k]
                      for i in range(len(target_string) - k + 1)}

    qf = jellyfish.QueryMerFile(jellyfish_count_file)

    # Pre-fill every bucket so the result always has keys 1..max_k_limit,
    # then tally in one pass instead of rescanning a list per bucket.
    histogram = dict.fromkeys(range(1, max_k_limit + 1), 0)
    for substr in distinct_kmers:
        count = qf[jellyfish.MerDNA(substr)]
        if count in histogram:
            histogram[count] += 1
    return histogram
Esempio n. 4
0
def get_kmer_presence(kmerF):
    """Return ``[1]`` when the canonical k-mer has no positive count, else ``None``.

    The k-mer built from ``kmerF`` is canonicalized and looked up in the
    module-level ``qjellies`` table.
    """
    mer = jellyfish.MerDNA(kmerF)
    mer.canonicalize()
    if int(qjellies[mer] > 0):
        # Found with a positive count: nothing to report.
        return None
    return [1]
Esempio n. 5
0
 def test_canonical_mers(self):
     """Check the mers yielded by ``jf.string_canonicals`` for ``self.str``.

     Each yielded mer must equal either the mer of the window at the same
     offset or that mer's reverse complement, and must not compare greater
     than either of them.  The last offset seen must be ``len - k``.
     """
     all_ok = True
     canonicals = jf.string_canonicals(self.str)
     for count, canon in enumerate(canonicals):
         forward = jf.MerDNA(self.str[count:count + self.k])
         reverse = forward.get_reverse_complement()
         all_ok = (all_ok
                   and (canon == forward or canon == reverse)
                   and (not (canon > forward))
                   and (not (canon > reverse)))
     self.assertTrue(all_ok)
     # `count` is the last zero-based offset, hence no "+ 1" here.
     self.assertEqual(len(self.str) - self.k + 0, count)
Esempio n. 6
0
 def test_all_mers(self):
     """Check the mers yielded by ``jf.string_mers`` for ``self.str``.

     The mer at each offset must equal the ``MerDNA`` built from the window
     at that offset, and the upper-cased window text must equal that mer's
     string form.  The number of mers must be ``len - k + 1``.
     """
     mers_match = True
     text_match = True
     total = 0
     for offset, m in enumerate(jf.string_mers(self.str)):
         window = self.str[offset:offset + self.k]
         expected = jf.MerDNA(window)
         mers_match = mers_match and m == expected
         text_match = text_match and window.upper() == str(expected)
         total = offset + 1
     self.assertTrue(mers_match)
     self.assertTrue(text_match)
     self.assertEqual(len(self.str) - self.k + 1, total)
Esempio n. 7
0
 def test_add(self):
     """Exercise ``add`` / ``update_add`` / ``get`` / indexing on ``self.hash``.

     For 1000 random mers: the first ``add`` must return a truthy value; on
     iterations where ``i % 3 == 1`` a second ``add`` must return a falsy
     value; on iterations where ``i % 3 == 2`` an ``update_add`` must return a
     truthy value.  After each iteration both ``get`` and ``[]`` must return
     the sum of the values added.  The loop stops at the first failure.
     """
     mer = jf.MerDNA()
     ok = True
     for step in range(1000):
         mer.randomize()
         expected = random.randrange(1000)
         ok = self.hash.add(mer, expected)
         if not ok:
             break
         phase = step % 3
         if phase > 0:
             extra = random.randrange(1000)
             expected += extra
             if phase == 1:
                 ok = not self.hash.add(mer, extra)
             else:
                 ok = self.hash.update_add(mer, extra)
             if not ok:
                 break
         ok = (expected == self.hash.get(mer)) and (expected == self.hash[mer])
         if not ok:
             break
     self.assertTrue(ok)
Esempio n. 8
0
def annotate_guides_with_score_parallel(candidates_count_dictionary, jellyfish_filename, priors, posteriors,
                                        max_hd,
                                        target_string,
                                        return_list):
    """Score candidates and store the scores consecutively in ``return_list``.

    For every candidate, each mer from ``generate_adjacent_mers`` is
    canonicalized and looked up in ``jellyfish_filename`` (which is indexed
    directly, so despite its name it is used as an already-open lookup
    object).  A candidate is skipped when any mer's count reaches ``max_k``
    or when the occurrence-weighted total is not positive.  Scores of the
    remaining candidates are written to ``return_list[0]``,
    ``return_list[1]``, ... in iteration order; nothing is returned.

    :param candidates_count_dictionary: maps candidate -> sequence whose first
        element is the strand marker (``'+'`` or anything else)
    :param jellyfish_filename: object indexable by ``jellyfish.MerDNA``
    :param priors: indexable by copy number (1 .. max_limit_count - 1)
    :param posteriors: indexable by observed count
    :param max_hd: passed through to ``generate_adjacent_mers``
    :param target_string: sequence in which mer occurrences are counted
    :param return_list: index-assignable container receiving the scores
    """
    qf = jellyfish_filename
    next_slot = 0
    for candidate in list(candidates_count_dictionary.keys()):
        strand_type = candidates_count_dictionary[candidate][0]
        neighbours = generate_adjacent_mers(candidate, max_hd)
        occurrence_total = 0.0
        expectation_total = 0.0
        for mer in neighbours.keys():
            if strand_type == '+':
                cp = get_score(candidate, mer)
            else:
                cp = get_score(reverse_complement(candidate), reverse_complement(mer))
            mer_dna = jellyfish.MerDNA(mer)
            mer_dna.canonicalize()
            k = qf[mer_dna]
            if k <= 0:
                continue
            if k >= max_k:
                # Leaving via break skips the for-else below, which drops
                # this candidate.
                break
            occurrences = float(target_string.count(mer))
            expectation = 0.0
            for count in range(1, max_limit_count):
                probability = get_probability(count, k)
                expectation += 1.0 * probability * count * priors[count] / posteriors[k]
            occurrence_total += cp * occurrences
            expectation_total += cp * expectation
        else:
            # Reached only when no mer hit the max_k ceiling.
            if occurrence_total <= 0.0:
                continue
            return_list[next_slot] = 1.0 * expectation_total / occurrence_total
            next_slot += 1
Esempio n. 9
0
def generate_k_spectrum_of_target_and_count(target_string, jellyfish_count_file, max_k_limit):
    """
    Compute the count histogram and per-position counts of the target's k-mers.

    The k-mer length is the module-level ``candidate_length``.  Each window of
    the target is canonicalized and looked up in ``jellyfish_count_file``.

    :type max_k_limit: int
    :param target_string: the target string
    :param jellyfish_count_file: already-open lookup object (jellyfish.QueryMerFile)
    :param max_k_limit: accepted but not referenced by this implementation
    :return: a pair ``(k_spectrum, counts_in_positions)``:
        ``k_spectrum`` maps a non-zero count to the number of distinct window
        strings having that count; ``counts_in_positions`` maps every window
        start index to that window's count (zero counts included)
    """
    k = candidate_length
    qf = jellyfish_count_file
    n_windows = len(target_string) - k + 1

    already_tallied = set()
    counts_in_positions = {}
    k_spectrum = {}
    for start in range(n_windows):
        subst = target_string[start:start + k]
        mer = jellyfish.MerDNA(subst)
        mer.canonicalize()
        count = qf[mer]
        counts_in_positions[start] = count
        if count == 0:
            logging.info("Count = 0 for substring " + subst)
        elif subst not in already_tallied:
            # Only the first occurrence of a window string feeds the histogram.
            already_tallied.add(subst)
            k_spectrum[count] = k_spectrum.get(count, 0) + 1
    return k_spectrum, counts_in_positions
Esempio n. 10
0
    def compute_hybrid(self, first_var, var_wgts):
        """Adjust ``var_wgts`` in place for the variant window starting at ``first_var``.

        The window consists of ``first_var`` and the following variants on the
        same chromosome whose position is less than ``pos + self.r``.  If the
        window holds more than ``self.max_v_in_window`` variants, only
        ``first_var`` plus the others with the largest ``sum(probs)`` are kept.

        For every pseudocontig produced by ``PseudocontigIterator`` and every
        length-``self.r`` substring of it, the canonical k-mer is looked up in
        ``self.h_ref`` and ``self.h_added``; a weight derived from those counts
        and from ``self.prob_read`` is subtracted from ``var_wgts`` for each
        variant flagged in the iterator's current vector.

        :param first_var: index into ``self.variants`` of the first variant
        :param var_wgts: index-assignable container of per-variant weights,
            modified in place
        :return: None.  The process is terminated via ``exit()`` if a
            substring has a falsy count in ``self.h_added`` or a zero total
            count.
        """
        import dna_jellyfish

        # r serves both as the window width over variant positions and as the
        # length of each substring looked up below.
        r = self.r
        chrom = self.variants[first_var].chrom
        pos = self.variants[first_var].pos

        #if self.variants[first_var].pos < pos or self.variants[first_var].pos >= pos+r:
        #    return

        # Number of variants in window starting at this one
        k = 1
        while first_var + k < self.num_v and self.variants[
                first_var +
                k].chrom == chrom and self.variants[first_var +
                                                    k].pos < pos + r:
            k += 1

        #if k > 14:
        #    sys.stdout.write('Processing variant %d with %d neighbors' % (first_var, k))

        if k > self.max_v_in_window:
            # Too many variants: keep first_var plus the (max_v_in_window - 1)
            # others with the highest sum(probs).  Sorting the (sum, index)
            # tuples in reverse breaks ties in favour of the higher index.
            alt_freqs = [(sum(self.variants[first_var + j].probs),
                          first_var + j) for j in range(1, k)]
            ids = [first_var] + [
                f[1] for f in sorted(alt_freqs, reverse=True)
                [:self.max_v_in_window - 1]
            ]
            it = PseudocontigIterator(self.genome[chrom],
                                      [self.variants[v] for v in ids], self.r)
        else:
            ids = range(first_var, first_var + k)
            it = PseudocontigIterator(self.genome[chrom],
                                      self.variants[first_var:first_var + k],
                                      r)

        # The iterator is driven through its own next() method; a falsy return
        # value ends the loop.
        pseudocontig = it.next()
        while pseudocontig:
            # NOTE(review): vec appears to hold one flag per entry of ids
            # (it is indexed by the same j below) -- confirm against
            # PseudocontigIterator.
            vec = it.curr_vec

            p = self.prob_read(self.variants, ids, vec)
            for i in range(len(pseudocontig) - self.r + 1):
                mer = dna_jellyfish.MerDNA(pseudocontig[i:i + r])
                mer.canonicalize()
                # Falsy lookup results (e.g. None) are normalised to 0.
                c_linear = self.h_ref[mer]
                if not c_linear:
                    c_linear = 0
                c_added = self.h_added[mer]
                if not c_added:
                    c_added = 0
                    # Always true here since c_added was just set to 0: a
                    # substring missing from h_added is treated as fatal.
                    if c_added == 0:
                        print(
                            'Error! Read %s from added pseudocontigs could not be found (SNPs %d - %d)'
                            %
                            (pseudocontig[i:i + r], first_var, first_var + k))
                        for j in range(first_var, first_var + k):
                            print('%s: %d, %s --> %s' %
                                  (self.variants[j].chrom,
                                   self.variants[j].pos, self.variants[j].orig,
                                   ','.join(self.variants[j].alts)))
                        exit()
                c_total = c_linear + c_added

                # NOTE(review): c_added is truthy by this point, so this branch
                # looks unreachable when counts are non-negative; it also
                # guards the division below.
                if c_total == 0:
                    print('Variants %d -%d / %d' %
                          (first_var, first_var + k - 1, self.num_v))
                    print('Vector:       ' + str(vec))
                    print('Pseudocontig: ' + str(pseudocontig))
                    print('Read:         ' + str(pseudocontig[i:i + r]))
                    exit()

                # Average relative probability of this read's other mappings
                avg_wgt = c_linear * self.wgt_ref + (c_added -
                                                     1) * self.wgt_added
                hybrid_wgt = (p - avg_wgt) / (c_total)
                # Only variants flagged in vec have their weight reduced.
                for j in range(len(ids)):
                    if vec[j]:
                        var_wgts[ids[j]] -= hybrid_wgt

            pseudocontig = it.next()
Esempio n. 11
0
 def string_to_kmer(self, sequence):
     """Return a ``jf.MerDNA`` built from ``sequence``."""
     return jf.MerDNA(sequence)