def _filter_seq(self, seq):
    """Screen a read before k-mer counting.

    Rejects low-complexity sequences, trims from the start of any
    poly-A run onward, and rejects sequences shorter than ``self.k``
    after trimming. Returns the (possibly trimmed) sequence, or None
    if the read should be discarded.
    """
    # Homopolymer-like reads carry no adapter signal.
    if sequence_complexity(seq) <= 1.0:
        return None
    # Trim everything from the first poly-A hit to the end.
    poly_a = POLY_A.search(seq)
    if poly_a is not None:
        seq = seq[:poly_a.start()]
    # Too short to contain even one k-mer after trimming.
    return seq if len(seq) >= self.k else None
def _filter_seq(self, seq):
    """Return *seq* with any poly-A tail trimmed, or None when the read
    is low-complexity or ends up shorter than ``self.k``.
    """
    if sequence_complexity(seq) > 1.0:
        hit = POLY_A.search(seq)
        # Cut at the start of the poly-A run when one is present.
        trimmed = seq if hit is None else seq[:hit.start()]
        if len(trimmed) >= self.k:
            return trimmed
    # Low complexity, or too short after trimming.
    return None
def _filter_seq(self, seq):
    """Filter and trim a read prior to k-mer counting.

    Returns None for low-complexity reads and for reads shorter than
    ``self.kmer_size`` after trimming at the first match of
    ``self._past_end_regexp`` (when that pattern is configured);
    otherwise returns the possibly-trimmed sequence.
    """
    if sequence_complexity(seq) <= 1.0:
        return None
    if self._past_end_regexp:
        # Remove the read-through portion past the fragment end.
        hit = self._past_end_regexp.search(seq)
        if hit is not None:
            seq = seq[:hit.start()]
    return None if len(seq) < self.kmer_size else seq
def seq_complexity(self):
    """Complexity score of this object's sequence, as computed by
    ``sequence_complexity`` (0 = homopolymer; higher = more random).
    """
    score = sequence_complexity(self.seq)
    return score
def _get_contaminants(self):
    """Detect over-represented sequences (candidate adapters/contaminants).

    Counts k-mers over ``self._read_sequences`` for increasing values of k,
    keeps k-mers whose count exceeds an expected-by-chance threshold, merges
    overlapping candidates to remove redundancy, and — when
    ``self.known_contaminants`` is set — matches the survivors against the
    known contaminant sequences.

    Returns:
        A list of ``Match`` objects (known matches first, then unknown),
        or an empty list when nothing is over-represented.
    """

    def _min_count(k):
        # Minimum count for a k-mer of length k to be considered
        # over-represented: the larger of the configured minimum frequency
        # and the count expected by chance in reads of this length.
        return math.ceil(
            self.n_reads
            * max(
                self.min_freq,
                (self._read_length - k + 1) * self.overrep_cutoff / float(4 ** k),
            )
        )

    def _count_kmers(k, seqs):
        # Map each k-mer to [count, set-of-source-sequences].
        counts = defaultdict(lambda: [0, set()])
        for seq in seqs:
            for i in range(len(seq) - k + 1):
                kmer = seq[i : (i + k)]
                counts[kmer][0] += 1
                counts[kmer][1].add(seq)
        return counts

    k = self.k
    kmers = _count_kmers(k, self._read_sequences)

    prev = None
    cur = {}
    results = {}
    result_seqs = defaultdict(set)
    min_count = _min_count(k)

    # Identify candidate kmers for increasing values of k. A (k-1)-mer is
    # retained as a result when none of its supporting sequences carries an
    # over-represented k-mer at the next length.
    while True:
        all_seqs = set()
        for kmer, (count, seqs) in kmers.items():
            if count > min_count:
                cur[kmer] = (count, seqs)
                all_seqs.update(seqs)

        if len(all_seqs) == 0:
            break

        if prev:
            for kmer, (count, seqs) in prev.items():
                if (
                    not any(seq in cur for seq in seqs)
                    and sequence_complexity(kmer) > 1.0
                ):
                    results[kmer] = count
                    result_seqs[kmer].update(seqs)

        k += 1
        kmers = _count_kmers(k, all_seqs)
        min_count = _min_count(k)
        prev = cur
        cur = {}

    results = list(results.items())
    if not results:
        # No over-represented kmers at all. (The previous version indexed
        # results[0] unconditionally here, raising IndexError in this case.)
        return []

    # Now merge overlapping sequences by length and frequency to eliminate
    # redundancy in the set of candidate kmers.
    results.sort(key=lambda i: len(i[0]) * math.log(i[1]), reverse=True)
    merged = []
    unmerged = []
    while len(results) > 1:
        seq1, count1 = results[0]
        for j in range(1, len(results)):
            seq2, count2 = results[j]
            if len(seq1) >= len(seq2) and seq2 in seq1:
                # seq2 is contained in seq1: absorb its count.
                count1 += count2
            elif seq1 in seq2:
                # if they are close in count, keep the longer sequence
                if count1 < (2 * count2):
                    seq1 = seq2
                count1 += count2
            else:
                unmerged.append(results[j])
        merged.append([seq1, count1])
        results = unmerged
        unmerged = []
    results = merged + results

    if len(results) == 0:
        return []

    # TODO: For each retained match, pull out the longest sequence that
    # matches to have a better shot of identifying long adapters that
    # appear in full very infrequently

    # Re-sort by frequency
    results.sort(key=lambda i: i[1], reverse=True)

    # Keep anything that's within 50% of the top hit
    # TODO: make this user-configurable?
    min_count = int(results[0][1] * 0.5)
    results = (x for x in results if x[1] >= min_count)

    # Convert to matches
    matches = [Match(x[0], x[1], reads=result_seqs[x[0]]) for x in results]

    if self.known_contaminants:
        # Match to known sequences
        contaminants = create_contaminant_matchers(self.known_contaminants, self.k)
        known = {}
        unknown = []

        def find_best_match(seq, best_matches, best_match_frac):
            # NOTE: closes over the loop variable ``match`` below; this is
            # only valid because it is called synchronously inside the loop.
            seqrc = reverse_complement(seq)
            for contam in contaminants:
                match_frac1, match_frac2, compare_seq = contam.match(seq, seqrc)
                if match_frac1 < best_match_frac[0]:
                    continue
                if contam.seq in compare_seq or align(
                    compare_seq, contam.seq, self.min_contaminant_match_frac
                ):
                    if match_frac1 > best_match_frac[0] or (
                        match_frac1 == best_match_frac[0]
                        and match_frac2 > best_match_frac[1]
                    ):
                        # Strictly better: discard previously tied matches.
                        best_matches = {}
                        best_match_frac = (match_frac1, match_frac2)
                    best_matches[contam] = (match, (match_frac1, match_frac2))
            return (best_matches, best_match_frac)

        for match in matches:
            best_matches, best_match_frac = find_best_match(
                match.seq, {}, (self.min_contaminant_match_frac, 0)
            )

            if match.longest_match:
                best_matches, best_match_frac = find_best_match(
                    match.longest_match[0], best_matches, best_match_frac
                )

            if best_matches:
                # Use a distinct name here: the previous version shadowed
                # the outer ``match`` variable in this loop.
                for contam, best in best_matches.items():
                    if contam not in known or best[1] > known[contam][1]:
                        known[contam] = best
            else:
                unknown.append(match)

        # resolve many-many relationships
        new_matches = defaultdict(list)
        for contam, (match, match_frac) in known.items():
            new_matches[match].append((contam, match_frac))

        known = []
        for match, contams in new_matches.items():
            if len(contams) == 1:
                contam, match_frac = contams[0]
                match.set_contaminant(contam, *match_frac)
            else:
                contams.sort(key=lambda x: x[1], reverse=True)
                contam, match_frac = contams[0]
                equiv = [c for c in contams[1:] if c[1] == match_frac]
                if len(equiv) == 0:
                    match.set_contaminant(contam, *match_frac)
                else:
                    # Several known contaminants tie for the best match:
                    # record the union of their names and sequences.
                    names = set(contam.names)
                    seqs = {contam.seq}
                    for e in equiv:
                        names.update(e[0].names)
                        seqs.add(e[0].seq)
                    match.set_known(list(names), list(seqs), *match_frac)
            known.append(match)

        matches = known + unknown

    return matches
def _get_contaminants(self):
    """Detect over-represented sequences (candidate adapters/contaminants).

    Counts k-mers over ``self._read_sequences`` for increasing values of k,
    keeps k-mers whose count exceeds an expected-by-chance threshold, merges
    overlapping candidates, and — when ``self.known_contaminants`` is set —
    matches the survivors against the known contaminant sequences.

    Returns a list of ``Match`` objects, or an empty list.
    """
    def _min_count(k):
        # Threshold for a k-mer of length k: the larger of the configured
        # minimum frequency and the count expected by chance alone.
        return math.ceil(self.n_reads * max(
            self.min_freq,
            (self._read_length - k + 1) * self.overrep_cutoff / float(4**k)))

    k = self.k
    # kmer -> [count, set of source read sequences]
    kmers = defaultdict(lambda: [0, set()])
    for seq in self._read_sequences:
        for i in range(len(seq) - k + 1):
            kmer = seq[i:(i + k)]
            kmers[kmer][0] += 1
            kmers[kmer][1].add(seq)

    prev = None
    cur = {}
    results = {}
    result_seqs = defaultdict(lambda: set())
    min_count = _min_count(k)

    # Identify candidate kmers for increasing values of k
    while True:
        all_seqs = set()
        for kmer, (count, seqs) in kmers.items():
            if count > min_count:
                cur[kmer] = (count, seqs)
                all_seqs.update(seqs)
        if len(all_seqs) == 0:
            # No k-mer is over-represented at this length; stop growing k.
            break
        if prev:
            # A (k-1)-mer is kept when none of its supporting reads carries
            # an over-represented k-mer at the current length.
            for kmer, (count, seqs) in prev.items():
                if not any(seq in cur for seq in seqs) and sequence_complexity(kmer) > 1.0:
                    results[kmer] = count
                    result_seqs[kmer].update(seqs)
        k += 1
        # Re-count k-mers at the new length, restricted to supporting reads.
        kmers = defaultdict(lambda: [0, set()])
        for seq in all_seqs:
            for i in range(len(seq) - k + 1):
                kmer = seq[i:(i + k)]
                kmers[kmer][0] += 1
                kmers[kmer][1].add(seq)
        min_count = _min_count(k)
        prev = cur
        cur = {}

    results = list(results.items())

    # Now merge overlapping sequences by length and frequency to eliminate
    # redundancy in the set of candidate kmers.
    results.sort(key=lambda i: len(i[0]) * math.log(i[1]), reverse=True)
    # NOTE(review): `cur` is never read again after this assignment, and
    # indexing results[0] raises IndexError when `results` is empty — the
    # `len(results) == 0` guard below comes too late to prevent that.
    cur = results[0]
    merged = []
    unmerged = []
    while len(results) > 1:
        seq1, count1 = results[0]
        for j in range(1, len(results)):
            seq2, count2 = results[j]
            if len(seq1) >= len(seq2) and seq2 in seq1:
                # seq2 is contained in seq1: absorb its count.
                count1 += count2
            elif seq1 in seq2:
                # if they are close in count, keep the longer sequence
                if count1 < (2 * count2):
                    seq1 = seq2
                count1 += count2
            else:
                unmerged.append(results[j])
        merged.append([seq1, count1])
        results = unmerged
        unmerged = []
    results = merged + results

    if len(results) == 0:
        return []

    # TODO: For each retained match, pull out the longest sequence that
    # matches to have a better shot of identifying long adapters that
    # appear in full very infrequently

    # Re-sort by frequency
    results.sort(key=lambda i: i[1], reverse=True)

    # Keep anything that's within 50% of the top hit
    # TODO: make this user-configurable?
    min_count = int(results[0][1] * 0.5)
    results = (x for x in results if x[1] >= min_count)

    # Convert to matches
    matches = [Match(x[0], x[1], reads=result_seqs[x[0]]) for x in results]

    if self.known_contaminants:
        # Match to known sequences
        contaminants = create_contaminant_matchers(self.known_contaminants, self.k)
        known = {}
        unknown = []

        def find_best_match(seq, best_matches, best_match_frac):
            # NOTE(review): closes over the loop variable `match` below;
            # correct only because it is called synchronously in the loop.
            seqrc = reverse_complement(seq)
            for contam in contaminants:
                match_frac1, match_frac2, compare_seq = contam.match(
                    seq, seqrc)
                if match_frac1 < best_match_frac[0]:
                    continue
                if (contam.seq in compare_seq or align(
                        compare_seq, contam.seq,
                        self.min_contaminant_match_frac)):
                    if (match_frac1 > best_match_frac[0] or (
                            match_frac1 == best_match_frac[0] and
                            match_frac2 > best_match_frac[1])):
                        # Strictly better: discard previously tied matches.
                        best_matches = {}
                        best_match_frac = (match_frac1, match_frac2)
                    best_matches[contam] = (match, (match_frac1, match_frac2))
            return (best_matches, best_match_frac)

        for match in matches:
            best_matches, best_match_frac = find_best_match(
                match.seq, {}, (self.min_contaminant_match_frac, 0))

            if match.longest_match:
                best_matches, best_match_frac = find_best_match(
                    match.longest_match[0], best_matches, best_match_frac)

            if best_matches:
                # NOTE(review): this inner loop shadows the outer `match`
                # variable; harmless here only because `match` is rebound
                # at the top of the next outer iteration.
                for contam, match in best_matches.items():
                    if contam not in known or match[1] > known[contam][1]:
                        known[contam] = match
            else:
                unknown.append(match)

        # resolve many-many relationships
        new_matches = defaultdict(lambda: [])
        for contam, (match, match_frac) in known.items():
            new_matches[match].append((contam, match_frac))

        known = []
        for match, contams in new_matches.items():
            if len(contams) == 1:
                contam, match_frac = contams[0]
                match.set_contaminant(contam, *match_frac)
            else:
                contams.sort(key=lambda x: x[1], reverse=True)
                contam, match_frac = contams[0]
                equiv = [c for c in contams[1:] if c[1] == match_frac]
                if len(equiv) == 0:
                    match.set_contaminant(contam, *match_frac)
                else:
                    # Several known contaminants tie for the best match:
                    # record the union of their names and sequences.
                    names = set(contam.names)
                    seqs = set((contam.seq, ))
                    for e in equiv:
                        names.update(e[0].names)
                        seqs.add(e[0].seq)
                    match.set_known(list(names), list(seqs), *match_frac)
            known.append(match)

        matches = known + unknown

    return matches
def seq_complexity(self):
    """Complexity of ``self.seq``: 0 for a homopolymer, up to 2 for a
    fully random sequence.
    """
    # Delegate to the module-level scoring function.
    score = sequence_complexity(self.seq)
    return score