예제 #1
0
파일: detect.py 프로젝트: jdidion/atropos
 def _filter_seq(self, seq):
     """Filter a candidate read sequence.

     Returns the sequence with any poly-A tail removed, or None when the
     sequence is low-complexity or, after trimming, shorter than self.k.
     """
     # Drop homopolymer-like (low-complexity) sequences outright.
     if sequence_complexity(seq) <= 1.0:
         return None
     # Cut the sequence at the start of a poly-A run, if one is present.
     poly_a = POLY_A.search(seq)
     if poly_a is not None:
         seq = seq[:poly_a.start()]
     # Discard sequences now too short to contain a k-mer.
     return None if len(seq) < self.k else seq
예제 #2
0
파일: detect.py 프로젝트: llllaaaa/atropos
 def _filter_seq(self, seq):
     """Return `seq` trimmed of its poly-A tail, or None if unusable.

     A sequence is unusable when its complexity is <= 1.0 (roughly a
     homopolymer) or when trimming leaves fewer than self.k characters.
     """
     if sequence_complexity(seq) > 1.0:
         hit = POLY_A.search(seq)
         trimmed = seq[:hit.start()] if hit else seq
         if len(trimmed) >= self.k:
             return trimmed
     return None
예제 #3
0
 def _filter_seq(self, seq):
     """Filter a read sequence for k-mer analysis.

     Returns None for low-complexity sequences and for sequences that,
     after cutting at the past-end regexp match, are shorter than
     self.kmer_size; otherwise returns the (possibly trimmed) sequence.
     """
     if sequence_complexity(seq) <= 1.0:
         return None
     regexp = self._past_end_regexp
     if regexp:
         hit = regexp.search(seq)
         if hit:
             # Keep only the part before the past-end match.
             seq = seq[:hit.start()]
     return seq if len(seq) >= self.kmer_size else None
예제 #4
0
파일: detect.py 프로젝트: jdidion/atropos
 def seq_complexity(self):
     """Complexity of this sequence (0 = homopolymer, 2 = random)."""
     value = sequence_complexity(self.seq)
     return value
예제 #5
0
파일: detect.py 프로젝트: jdidion/atropos
    def _get_contaminants(self):
        """Detect candidate contaminant sequences from over-represented k-mers.

        Starting from k = self.k, counts all k-mers across the observed read
        sequences, then repeatedly increases k over the reads that still
        contain an over-represented k-mer. A k-mer becomes a candidate once
        none of its source reads survive at the next size and it is not
        low-complexity. Overlapping candidates are then merged, low-frequency
        ones dropped, and (when ``self.known_contaminants`` is set) each
        remaining candidate is matched against the known contaminants.

        Returns:
            A list of Match objects (possibly empty).
        """
        def _min_count(k):
            # Minimum occurrence count for a k-mer of length `k` to be
            # considered over-represented, given the number of reads and the
            # expected frequency of a random k-mer (1 / 4**k per position).
            return math.ceil(
                self.n_reads * max(self.min_freq, (self._read_length - k + 1) * self.overrep_cutoff / float(4 ** k))
            )

        k = self.k
        kmers = defaultdict(lambda: [0, set()])

        # Count every k-mer and remember which reads contain it.
        for seq in self._read_sequences:
            for i in range(len(seq) - k + 1):
                kmer = seq[i : (i + k)]
                kmers[kmer][0] += 1
                kmers[kmer][1].add(seq)

        prev = None
        cur = {}
        results = {}
        result_seqs = defaultdict(set)
        min_count = _min_count(k)

        # Identify candidate kmers for increasing values of k
        while True:
            all_seqs = set()
            for kmer, (count, seqs) in kmers.items():
                if count > min_count:
                    cur[kmer] = (count, seqs)
                    all_seqs.update(seqs)

            if len(all_seqs) == 0:
                break

            if prev:
                for kmer, (count, seqs) in prev.items():
                    # Keep a (k-1)-mer whose reads all dropped out at size k,
                    # unless it is a low-complexity sequence.
                    if not any(seq in cur for seq in seqs) and sequence_complexity(kmer) > 1.0:
                        results[kmer] = count
                        result_seqs[kmer].update(seqs)

            # Re-count with the next larger k over the surviving reads only.
            k += 1
            kmers = defaultdict(lambda: [0, set()])
            for seq in all_seqs:
                for i in range(len(seq) - k + 1):
                    kmer = seq[i : (i + k)]
                    kmers[kmer][0] += 1
                    kmers[kmer][1].add(seq)

            min_count = _min_count(k)
            prev = cur
            cur = {}

        results = list(results.items())

        # Now merge overlapping sequences by length and frequency to eliminate
        # redundancy in the set of candidate kmers.
        results.sort(key=lambda i: len(i[0]) * math.log(i[1]), reverse=True)
        # NOTE: removed an unused `cur = results[0]` assignment that raised
        # IndexError when `results` was empty, before the empty-check below
        # could return [].
        merged = []
        unmerged = []
        while len(results) > 1:
            seq1, count1 = results[0]
            for j in range(1, len(results)):
                seq2, count2 = results[j]
                if len(seq1) >= len(seq2) and seq2 in seq1:
                    count1 += count2
                elif seq1 in seq2:
                    # if they are close in count, keep the longer sequence
                    if count1 < (2 * count2):
                        seq1 = seq2
                    count1 += count2
                else:
                    unmerged.append(results[j])
            merged.append([seq1, count1])
            results = unmerged
            unmerged = []
        results = merged + results

        if len(results) == 0:
            return []

        # TODO: For each retained match, pull out the longest sequence that
        # matches to have a better shot of identifying long adapters that
        # appear in full very infrequently

        # Re-sort by frequency
        results.sort(key=lambda i: i[1], reverse=True)
        # Keep anything that's within 50% of the top hit
        # TODO: make this user-configurable?
        min_count = int(results[0][1] * 0.5)
        results = (x for x in results if x[1] >= min_count)
        # Convert to matches
        matches = [Match(x[0], x[1], reads=result_seqs[x[0]]) for x in results]

        if self.known_contaminants:
            # Match each candidate to the known contaminant sequences.
            contaminants = create_contaminant_matchers(self.known_contaminants, self.k)
            known = {}
            unknown = []

            def find_best_match(seq, best_matches, best_match_frac):
                # Update (best_matches, best_match_frac) with any contaminant
                # matching `seq` (or its reverse complement) at least as well
                # as the current best.
                # NOTE(review): `match` below is the enclosing loop variable
                # (the candidate Match currently being processed), not a local.
                seqrc = reverse_complement(seq)
                for contam in contaminants:
                    match_frac1, match_frac2, compare_seq = contam.match(seq, seqrc)
                    if match_frac1 < best_match_frac[0]:
                        continue
                    if contam.seq in compare_seq or align(compare_seq, contam.seq, self.min_contaminant_match_frac):
                        if match_frac1 > best_match_frac[0] or (
                            match_frac1 == best_match_frac[0] and match_frac2 > best_match_frac[1]
                        ):
                            # Strictly better: discard the previous best set.
                            best_matches = {}
                            best_match_frac = (match_frac1, match_frac2)
                        best_matches[contam] = (match, (match_frac1, match_frac2))
                return (best_matches, best_match_frac)

            for match in matches:
                best_matches, best_match_frac = find_best_match(match.seq, {}, (self.min_contaminant_match_frac, 0))

                if match.longest_match:
                    best_matches, best_match_frac = find_best_match(
                        match.longest_match[0], best_matches, best_match_frac
                    )

                if best_matches:
                    for contam, match in best_matches.items():
                        if contam not in known or match[1] > known[contam][1]:
                            known[contam] = match
                else:
                    unknown.append(match)

            # Resolve many-many relationships between matches and contaminants.
            new_matches = defaultdict(list)
            for contam, (match, match_frac) in known.items():
                new_matches[match].append((contam, match_frac))

            known = []
            for match, contams in new_matches.items():
                if len(contams) == 1:
                    contam, match_frac = contams[0]
                    match.set_contaminant(contam, *match_frac)
                else:
                    # Several contaminants matched; keep the best, merging the
                    # names/seqs of contaminants that tie on match fraction.
                    contams.sort(key=lambda x: x[1], reverse=True)
                    contam, match_frac = contams[0]
                    equiv = [c for c in contams[1:] if c[1] == match_frac]
                    if len(equiv) == 0:
                        match.set_contaminant(contam, *match_frac)
                    else:
                        names = set(contam.names)
                        seqs = set((contam.seq,))
                        for e in equiv:
                            names.update(e[0].names)
                            seqs.add(e[0].seq)
                        match.set_known(list(names), list(seqs), *match_frac)
                known.append(match)

            matches = known + unknown

        return matches
예제 #6
0
파일: detect.py 프로젝트: llllaaaa/atropos
 def seq_complexity(self):
     """Return the complexity score of ``self.seq``.

     Ranges from 0 (a homopolymer) to 2 (a fully random sequence).
     """
     return sequence_complexity(self.seq)
예제 #7
0
파일: detect.py 프로젝트: llllaaaa/atropos
    def _get_contaminants(self):
        """Detect candidate contaminant sequences from over-represented k-mers.

        Starting from k = self.k, counts all k-mers across the observed read
        sequences, then repeatedly increases k over the reads that still
        contain an over-represented k-mer. A k-mer becomes a candidate once
        none of its source reads survive at the next size and it is not
        low-complexity. Overlapping candidates are then merged, low-frequency
        ones dropped, and (when ``self.known_contaminants`` is set) each
        remaining candidate is matched against the known contaminants.

        Returns:
            A list of Match objects (possibly empty).
        """
        def _min_count(k):
            # Minimum occurrence count for a k-mer of length `k` to be
            # considered over-represented, given the number of reads and the
            # expected frequency of a random k-mer (1 / 4**k per position).
            return math.ceil(
                self.n_reads * max(self.min_freq, (self._read_length - k + 1) * self.overrep_cutoff / float(4 ** k))
            )

        k = self.k
        kmers = defaultdict(lambda: [0, set()])

        # Count every k-mer and remember which reads contain it.
        for seq in self._read_sequences:
            for i in range(len(seq) - k + 1):
                kmer = seq[i:(i + k)]
                kmers[kmer][0] += 1
                kmers[kmer][1].add(seq)

        prev = None
        cur = {}
        results = {}
        result_seqs = defaultdict(set)
        min_count = _min_count(k)

        # Identify candidate kmers for increasing values of k
        while True:
            all_seqs = set()
            for kmer, (count, seqs) in kmers.items():
                if count > min_count:
                    cur[kmer] = (count, seqs)
                    all_seqs.update(seqs)

            if len(all_seqs) == 0:
                break

            if prev:
                for kmer, (count, seqs) in prev.items():
                    # Keep a (k-1)-mer whose reads all dropped out at size k,
                    # unless it is a low-complexity sequence.
                    if not any(seq in cur for seq in seqs) and sequence_complexity(kmer) > 1.0:
                        results[kmer] = count
                        result_seqs[kmer].update(seqs)

            # Re-count with the next larger k over the surviving reads only.
            k += 1
            kmers = defaultdict(lambda: [0, set()])
            for seq in all_seqs:
                for i in range(len(seq) - k + 1):
                    kmer = seq[i:(i + k)]
                    kmers[kmer][0] += 1
                    kmers[kmer][1].add(seq)

            min_count = _min_count(k)
            prev = cur
            cur = {}

        results = list(results.items())

        # Now merge overlapping sequences by length and frequency to eliminate
        # redundancy in the set of candidate kmers.
        results.sort(key=lambda i: len(i[0]) * math.log(i[1]), reverse=True)
        # NOTE: removed an unused `cur = results[0]` assignment that raised
        # IndexError when `results` was empty, before the empty-check below
        # could return [].
        merged = []
        unmerged = []
        while len(results) > 1:
            seq1, count1 = results[0]
            for j in range(1, len(results)):
                seq2, count2 = results[j]
                if len(seq1) >= len(seq2) and seq2 in seq1:
                    count1 += count2
                elif seq1 in seq2:
                    # if they are close in count, keep the longer sequence
                    if count1 < (2 * count2):
                        seq1 = seq2
                    count1 += count2
                else:
                    unmerged.append(results[j])
            merged.append([seq1, count1])
            results = unmerged
            unmerged = []
        results = merged + results

        if len(results) == 0:
            return []

        # TODO: For each retained match, pull out the longest sequence that
        # matches to have a better shot of identifying long adapters that
        # appear in full very infrequently

        # Re-sort by frequency
        results.sort(key=lambda i: i[1], reverse=True)
        # Keep anything that's within 50% of the top hit
        # TODO: make this user-configurable?
        min_count = int(results[0][1] * 0.5)
        results = (x for x in results if x[1] >= min_count)
        # Convert to matches
        matches = [Match(x[0], x[1], reads=result_seqs[x[0]]) for x in results]

        if self.known_contaminants:
            # Match each candidate to the known contaminant sequences.
            contaminants = create_contaminant_matchers(self.known_contaminants,
                                                       self.k)
            known = {}
            unknown = []

            def find_best_match(seq, best_matches, best_match_frac):
                # Update (best_matches, best_match_frac) with any contaminant
                # matching `seq` (or its reverse complement) at least as well
                # as the current best.
                # NOTE(review): `match` below is the enclosing loop variable
                # (the candidate Match currently being processed), not a local.
                seqrc = reverse_complement(seq)
                for contam in contaminants:
                    match_frac1, match_frac2, compare_seq = contam.match(seq, seqrc)
                    if match_frac1 < best_match_frac[0]:
                        continue
                    if contam.seq in compare_seq or align(compare_seq, contam.seq, self.min_contaminant_match_frac):
                        if match_frac1 > best_match_frac[0] or (
                            match_frac1 == best_match_frac[0] and match_frac2 > best_match_frac[1]
                        ):
                            # Strictly better: discard the previous best set.
                            best_matches = {}
                            best_match_frac = (match_frac1, match_frac2)
                        best_matches[contam] = (match, (match_frac1, match_frac2))
                return (best_matches, best_match_frac)

            for match in matches:
                best_matches, best_match_frac = find_best_match(
                    match.seq, {}, (self.min_contaminant_match_frac, 0))

                if match.longest_match:
                    best_matches, best_match_frac = find_best_match(
                        match.longest_match[0], best_matches, best_match_frac)

                if best_matches:
                    for contam, match in best_matches.items():
                        if contam not in known or match[1] > known[contam][1]:
                            known[contam] = match
                else:
                    unknown.append(match)

            # Resolve many-many relationships between matches and contaminants.
            new_matches = defaultdict(list)
            for contam, (match, match_frac) in known.items():
                new_matches[match].append((contam, match_frac))

            known = []
            for match, contams in new_matches.items():
                if len(contams) == 1:
                    contam, match_frac = contams[0]
                    match.set_contaminant(contam, *match_frac)
                else:
                    # Several contaminants matched; keep the best, merging the
                    # names/seqs of contaminants that tie on match fraction.
                    contams.sort(key=lambda x: x[1], reverse=True)
                    contam, match_frac = contams[0]
                    equiv = [c for c in contams[1:] if c[1] == match_frac]
                    if len(equiv) == 0:
                        match.set_contaminant(contam, *match_frac)
                    else:
                        names = set(contam.names)
                        seqs = set((contam.seq,))
                        for e in equiv:
                            names.update(e[0].names)
                            seqs.add(e[0].seq)
                        match.set_known(list(names), list(seqs), *match_frac)
                known.append(match)

            matches = known + unknown

        return matches
예제 #8
0
 def seq_complexity(self):
     """The complexity of the sequence: 0 for a homopolymer, up to 2 for
     a fully random sequence.
     """
     score = sequence_complexity(self.seq)
     return score