def merge_reads(reads):
    """ Merge overlapping read pairs, yielding one item per pair.

    :param reads: iterable of read pairs from FastqReader
    :return: a generator of
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
        where merged_bases is None when the pair cannot be merged.
    """
    for pair_name, read1, read2 in reads:
        r1_name, seq1, qual1 = read1
        r2_name, seq2, qual2 = read2
        if seq1 and seq2:
            aligned1, aligned2, score = align_it(seq1,
                                                 reverse_and_complement(seq2),
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
        else:
            # An empty mate leaves nothing to align.
            score = -1
            aligned1 = aligned2 = None
        is_merged = (score >= MIN_PAIR_ALIGNMENT_SCORE and
                     aligned1[0] != '-')
        if is_merged:
            aligned_qual1 = align_quality(aligned1, qual1)
            aligned_qual2 = align_quality(aligned2, reversed(qual2))
            merged = merge_pairs(aligned1,
                                 aligned2,
                                 aligned_qual1,
                                 aligned_qual2,
                                 q_cutoff=Q_CUTOFF)
        else:
            merged = None
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
def map_sequence(source_seq, dest_seq):
    """ Find the portion of source_seq that dest_seq maps to.

    :param source_seq: the larger sequence to map onto
    :param dest_seq: the sequence whose positions are being mapped
    :return: a list of 1-based positions in source_seq that it mapped to.
        Entries are None where a dest_seq base aligned against a gap.
    """
    gap_open = 15
    gap_extend = 5
    use_terminal_gap_penalty = 1
    aligned_source, aligned_dest, _score = gotoh.align_it(
        source_seq,
        dest_seq,
        gap_open,
        gap_extend,
        use_terminal_gap_penalty)
    positions = []
    source_pos = 1  # 1-based position within source_seq
    for source_nuc, dest_nuc in zip(aligned_source, aligned_dest):
        if dest_nuc != '-':
            # Record the source position for this dest base, or None if
            # the dest base aligned opposite a gap in the source.
            positions.append(source_pos if source_nuc != '-' else None)
        if source_nuc != '-':
            source_pos += 1
    hit_count = sum(position is not None for position in positions)
    if hit_count < len(dest_seq) / 2:
        # Poor overall mapping: retry with the longest N-free piece of
        # dest_seq, padding with None so positions stay lined up with the
        # start of dest_seq.
        pieces = re.split('(N+)', dest_seq)
        sizes = [(len(p), i)
                 for i, p in enumerate(pieces)
                 if p and 'N' not in p]
        if sizes:
            sizes.sort(reverse=True)
            big_piece = sizes[0][1]
            offset = sum(len(p) for p in pieces[:big_piece])
            positions = offset * [None] + map_sequence(source_seq,
                                                       pieces[big_piece])
    return positions
def merge_reads(reads):
    """ Generator over merged reads.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge fails):
        (pair_name,
         (read1_name, bases, quality),
         (read2_name, bases, quality),
         merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        if not (seq1 and seq2):
            score = -1
            # Reset the alignments so a stale (or unbound) value from a
            # previous pair can never be read below; the sibling copy of
            # this function already does this.
            aligned1 = aligned2 = None
        else:
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1,
                                                 seq2_rev,
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
        if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
            aligned_qual1 = align_quality(aligned1, qual1)
            aligned_qual2 = align_quality(aligned2, reversed(qual2))
            merged = merge_pairs(aligned1,
                                 aligned2,
                                 aligned_qual1,
                                 aligned_qual2,
                                 q_cutoff=Q_CUTOFF)
        else:
            merged = None
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
def map_sequence(source_seq, dest_seq):
    """ Find the portion of source_seq that dest_seq maps to.

    :return: a list of 1-based positions in source_seq that it mapped to,
        with None entries where a dest base aligned against a gap.
    """
    gap_open = 15
    gap_extend = 5
    use_terminal_gap_penalty = 1
    aligned_source, aligned_dest, _score = gotoh.align_it(
        source_seq, dest_seq, gap_open, gap_extend, use_terminal_gap_penalty)
    positions = []
    next_pos = 1  # next 1-based source position to assign
    for src_char, dest_char in zip(aligned_source, aligned_dest):
        is_src_base = src_char != '-'
        if dest_char != '-':
            positions.append(next_pos if is_src_base else None)
        if is_src_base:
            next_pos += 1
    mapped_count = sum(1 for position in positions if position is not None)
    if mapped_count < len(dest_seq) / 2:
        # Weak mapping: fall back to the longest N-free chunk of dest_seq,
        # padded so positions stay aligned with the start of dest_seq.
        chunks = re.split('(N+)', dest_seq)
        candidates = [(len(chunk), index)
                      for index, chunk in enumerate(chunks)
                      if chunk and 'N' not in chunk]
        if candidates:
            _size, chunk_index = max(candidates)
            padding = sum(len(chunk) for chunk in chunks[:chunk_index])
            positions = ([None] * padding +
                         map_sequence(source_seq, chunks[chunk_index]))
    return positions
def align_untrimmed_reads(fastq):
    """ Tally alignment scores of raw reads against the V3LOOP reference.

    :param fastq: FASTQ input accepted by FastqReader.get_reads()
    :return: Counter mapping alignment score to number of reads
    """
    reference = extract_v3loop_ref()
    score_counts = Counter()
    for _, (_, nucs, _) in FastqReader.get_reads(fastq):
        alignment = align_it(reference,
                             nucs,
                             GAP_OPEN_COST,
                             GAP_EXTEND_COST,
                             USE_TERMINAL_COST)
        score_counts[alignment[2]] += 1
    return score_counts
def align(query_seq: str, target_seq: str):
    """ Align query_seq against every mixture expansion of target_seq.

    :param query_seq: the sequence to search in
    :param target_seq: the sequence to find (may contain mixture codes)
    :return: None if no alignment was produced at all, otherwise a dict
        describing the best-scoring alignment: score, aligned_query,
        aligned_target, start (offset of the match within the alignment),
        query_match (matched query bases with gaps removed), is_valid,
        dist and end_dist (Levenshtein distances to the target).
    """
    result = {
        'score': None,
        'aligned_query': None,
        'aligned_target': None,
        'start': None,
        'is_valid': False,
        'dist': None,
        'end_dist': None,
    }
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    best_acontig = best_atarget = best_target = best_score = None
    for target_nucs in PrimerFinder.expand_mixtures(target_seq):
        aligned_query, aligned_target, score = align_it(
            query_seq,
            target_nucs,
            gap_open_penalty,
            gap_extend_penalty,
            use_terminal_gap_penalty)
        if best_score is None or score > best_score:
            best_acontig = aligned_query
            best_atarget = aligned_target
            best_target = target_nucs
            best_score = score
    if not best_acontig:
        # Was result['valid'] = False, which silently created a new key
        # instead of updating the 'is_valid' flag declared above.
        result['is_valid'] = False
        return None
    aligned_query = best_acontig
    aligned_target = best_atarget
    target_nucs = best_target
    result['score'] = best_score
    result['is_valid'] = True  # flag was never set on the success path
    # First and last non-gap characters of the aligned target bound the match.
    match = re.match('-*([^-](.*[^-])?)', aligned_target)
    result['aligned_query'] = aligned_query
    result['aligned_target'] = aligned_target
    result['start'] = match.start(1)
    end = match.end(1)
    result['query_match'] = aligned_query[result['start']:end].replace('-', '')
    result['dist'] = Levenshtein.distance(target_nucs, result['query_match'])
    stripped_contig = aligned_query.lstrip('-')
    overhang = len(aligned_query) - len(stripped_contig)
    if overhang > 0:
        # Query starts inside the target: compare against the trimmed target.
        stripped_target = target_nucs[overhang:]
        result['end_dist'] = Levenshtein.distance(stripped_target,
                                                  result['query_match'])
    else:
        stripped_contig = aligned_query.rstrip('-')
        overhang = len(aligned_query) - len(stripped_contig)
        if overhang == 0:
            result['end_dist'] = result['dist']
        else:
            # Query ends inside the target: trim the target's tail instead.
            stripped_target = target_nucs[:-overhang]
            result['end_dist'] = Levenshtein.distance(
                stripped_target, result['query_match'])
    return result
def write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref):
    """ Write reads, aligned to the HIV seed sequence.

    Note: this is a generator; each read_count item is yielded back out
    before its row is written, so the caller must consume the generator
    for the CSV rows to be produced.

    :param counts: [((aligned_ref, aligned_seq), count)]
    :param aligned_csv: open CSV file to write the aligned reads to
    :param hiv_seed: seed reference to align the V3LOOP ref to
    :param v3loop_ref: reference the reads were all aligned to
    """
    writer = csv.DictWriter(
        aligned_csv,
        ['refname', 'qcut', 'rank', 'count', 'offset', 'seq'],
        lineterminator=os.linesep)
    writer.writeheader()
    # Align the V3LOOP reference to the seed so read positions can be
    # translated into seed coordinates.
    seed_vs_v3, v3_vs_seed, score = align_it(hiv_seed,
                                             v3loop_ref,
                                             GAP_OPEN_COST,
                                             GAP_EXTEND_COST,
                                             USE_TERMINAL_COST)
    # Count dashes at start of aligned_v3loop
    v3_offset = sum(1 for _ in takewhile(lambda c: c == '-', v3_vs_seed))
    seed_positions = list(zip(seed_vs_v3[v3_offset:], v3_vs_seed[v3_offset:]))
    for rank, read_count in enumerate(counts):
        yield read_count
        (v3_vs_read, read_vs_v3), count = read_count
        is_started = False  # set once the first read base is emitted
        seq_offset = 0  # leading read deletions, added to the seed offset
        seq = ''
        read_positions = iter(zip(v3_vs_read, read_vs_v3))
        for seed_char, v3_vs_seed_char in seed_positions:
            if v3_vs_seed_char == '-':
                # Seed insertion relative to V3LOOP: pad the read.
                seq += '-'
                continue
            try:
                # Advance to the next V3LOOP base in the read alignment,
                # skipping read insertions.
                while True:
                    v3_vs_read_char, read_char = next(read_positions)
                    if v3_vs_read_char != '-':
                        break
            except StopIteration:
                # Read ended before the reference did.
                break
            if seed_char != '-':
                if read_char == '-' and not is_started:
                    # Leading deletion: shift the offset instead of writing.
                    seq_offset += 1
                else:
                    is_started = True
                    seq += read_char
        seq = seq.rstrip('-')
        writer.writerow(dict(refname=G2P_SEED_NAME,
                             qcut=Q_CUTOFF,
                             rank=rank,
                             count=count,
                             offset=v3_offset + seq_offset,
                             seq=seq))
def align_untrimmed_reads(fastq):
    """ Count alignment scores of untrimmed reads against V3LOOP.

    :param fastq: FASTQ input accepted by FastqReader.get_reads()
    :return: Counter of {alignment_score: read_count}
    """
    v3loop_ref = extract_v3loop_ref()
    scores = (align_it(v3loop_ref,
                       bases,
                       GAP_OPEN_COST,
                       GAP_EXTEND_COST,
                       USE_TERMINAL_COST)[2]
              for _, (_, bases, _) in FastqReader.get_reads(fastq))
    return Counter(scores)
def write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref):
    """ Write reads, aligned to the HIV seed sequence.

    Note: this is a generator; each read_count item is yielded back out
    before its row is written, so the caller must consume the generator
    for the CSV rows to be produced.

    :param counts: [((aligned_ref, aligned_seq), count)]
    :param aligned_csv: open CSV file to write the aligned reads to
    :param hiv_seed: seed reference to align the V3LOOP ref to
    :param v3loop_ref: reference the reads were all aligned to
    """
    writer = csv.DictWriter(
        aligned_csv,
        ['refname', 'qcut', 'rank', 'count', 'offset', 'seq'],
        lineterminator=os.linesep)
    writer.writeheader()
    # Align the V3LOOP reference to the seed so read positions can be
    # translated into seed coordinates.
    seed_vs_v3, v3_vs_seed, score = align_it(hiv_seed,
                                             v3loop_ref,
                                             GAP_OPEN_COST,
                                             GAP_EXTEND_COST,
                                             USE_TERMINAL_COST)
    # Count dashes at start of aligned_v3loop
    v3_offset = sum(1 for _ in takewhile(lambda c: c == '-', v3_vs_seed))
    seed_positions = list(zip(seed_vs_v3[v3_offset:], v3_vs_seed[v3_offset:]))
    for rank, read_count in enumerate(counts):
        yield read_count
        (v3_vs_read, read_vs_v3), count = read_count
        is_started = False  # set once the first read base is emitted
        seq_offset = 0  # leading read deletions, added to the seed offset
        seq = ''
        read_positions = iter(zip(v3_vs_read, read_vs_v3))
        for seed_char, v3_vs_seed_char in seed_positions:
            if v3_vs_seed_char == '-':
                # Seed insertion relative to V3LOOP: pad the read.
                seq += '-'
                continue
            try:
                # Advance to the next V3LOOP base in the read alignment,
                # skipping read insertions.
                while True:
                    v3_vs_read_char, read_char = next(read_positions)
                    if v3_vs_read_char != '-':
                        break
            except StopIteration:
                # Read ended before the reference did.
                break
            if seed_char != '-':
                if read_char == '-' and not is_started:
                    # Leading deletion: shift the offset instead of writing.
                    seq_offset += 1
                else:
                    is_started = True
                    seq += read_char
        seq = seq.rstrip('-')
        writer.writerow(
            dict(refname=G2P_SEED_NAME,
                 qcut=Q_CUTOFF,
                 rank=rank,
                 count=count,
                 offset=v3_offset + seq_offset,
                 seq=seq))
def __init__(self, contig_seq: str, target_seq: str):
    """ Find the best alignment of target_seq within contig_seq.

    Tries every mixture expansion of target_seq in both orientations and
    keeps the highest-scoring alignment. Sets self.valid to False and
    stops if no alignment was produced at all.

    Attributes set on success: score, is_reversed, aligned_contig,
    aligned_target, start, end, contig_match (matched contig bases with
    gaps removed), dist and end_dist (Levenshtein distances to the
    target).
    """
    self.valid = True
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    best_acontig = best_atarget = best_target = best_score = None
    best_reversed = None
    for target_nucs, is_reversed in unpack_mixtures_and_reverse(target_seq):
        aligned_contig, aligned_target, score = align_it(
            contig_seq,
            target_nucs,
            gap_open_penalty,
            gap_extend_penalty,
            use_terminal_gap_penalty)
        if best_score is None or score > best_score:
            best_acontig = aligned_contig
            best_atarget = aligned_target
            best_target = target_nucs
            best_score = score
            best_reversed = is_reversed
    if not best_acontig:
        self.valid = False
        return None
    aligned_contig = best_acontig
    aligned_target = best_atarget
    target_nucs = best_target
    self.score = best_score
    self.is_reversed = best_reversed
    if self.is_reversed:
        # Report everything in the contig's forward orientation.
        aligned_contig = reverse_and_complement(aligned_contig)
        aligned_target = reverse_and_complement(aligned_target)
    # First and last non-gap characters of the aligned target bound the match.
    match = re.match('-*([^-](.*[^-])?)', aligned_target)
    self.aligned_contig = aligned_contig
    self.aligned_target = aligned_target
    self.start = match.start(1)
    self.end = match.end(1)
    self.contig_match = aligned_contig[self.start:self.end].replace('-', '')
    self.dist = Levenshtein.distance(target_nucs, self.contig_match)
    stripped_contig = aligned_contig.lstrip('-')
    overhang = len(aligned_contig) - len(stripped_contig)
    if overhang > 0:
        # Contig starts inside the target: compare against the trimmed target.
        stripped_target = target_nucs[overhang:]
        self.end_dist = Levenshtein.distance(stripped_target,
                                             self.contig_match)
    else:
        stripped_contig = aligned_contig.rstrip('-')
        overhang = len(aligned_contig) - len(stripped_contig)
        if overhang == 0:
            self.end_dist = self.dist
        else:
            # Contig ends inside the target: trim the target's tail instead.
            stripped_target = target_nucs[:-overhang]
            self.end_dist = Levenshtein.distance(stripped_target,
                                                 self.contig_match)
def align_nucs(seq1: str, seq2: str) -> typing.Tuple[str, str, int]:
    """ Align two sequences of nucleotides with default parameters.

    :param seq1: first nucleotide sequence
    :param seq2: second nucleotide sequence
    :return: (aligned1, aligned2, score) - the two aligned sequences,
        plus an alignment score
    """
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    return align_it(seq1,
                    seq2,
                    gap_open_penalty,
                    gap_extend_penalty,
                    use_terminal_gap_penalty)
def read_contigs(contigs_csv, excluded_seeds=None):
    """ Read contig sequences, merging matched contigs that share a group.

    :param contigs_csv: open CSV file with columns contig, match, ref and
        group_ref
    :param excluded_seeds: passed through to get_contig_name()
    :return: {contig_name: consensus_sequence}
    """
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    contig_groups = defaultdict(list)  # {group_ref_name: [seq, index, index...]}
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                # Unmerged (or unmatched) contigs are reported individually.
                contig_name = get_contig_name(i,
                                              row['ref'],
                                              is_match,
                                              is_reversed,
                                              excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                # Seed the group with its reference sequence.
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            agroup, acontig, score = align_it(group_seq,
                                              contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            # Splice the contig over the stretch of the group sequence it
            # aligned to, then trim any terminal gaps.
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            contig_group[0] = merged_seq[left_trim:-right_trim or None]
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        prefix = '_'.join(reversed(contig_nums))
        # Merged groups only ever contain forward matches, so pass the
        # flags explicitly instead of relying on loop variables leaking
        # out of the row loop, which held stale values whenever the last
        # row processed was not merged.
        contig_name = get_contig_name(prefix,
                                      group_ref_name,
                                      True,
                                      False,
                                      excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
def align_it(self, seqa, seqb, gap_ini, gap_ext, use_terminal_gap_penalty=False, emulate_rb=False):
    '''
    Returns aligned sequences (with gaps) from the Gotoh algorithm.
    Expects nucleotide sequences, see align_it_aa() for amino acid.

            Parameters:
                    seqa (string): Nucleotide sequence (standard)
                    seqb (string): Another nucleotide sequence
                    gap_ini (int): Gap initialization penalty
                    gap_ext (int): Gap extension penalty
                    use_terminal_gap_penalty (bool): penalize trailing gaps?
                    emulate_rb (bool): use original (Ruby) match/mismatch scores?

            Returns:
                    seqa (string): Aligned sequence a
                    seqb (string): Aligned sequence b
                    score (int): alignment score (gap penalties + match/mismatch)
                    exit_status (AlignItResult): ok, illegal_char, internal_error
    '''
    sa = ""
    sb = ""
    score = 0
    al_status = AlignItResult.internal_error
    try:
        if not bool(
                self.valid_nu.search(seqa) and self.valid_nu.search(seqb)):
            # NOTE(review): search() only requires the pattern to match
            # somewhere in each sequence; confirm valid_nu anchors a
            # whole-sequence match if full validation is intended.
            al_status = AlignItResult.illegal_char
        else:
            if emulate_rb:
                # The Ruby-compatible variant reports no score.
                [sa, sb] = gotoh.align_it_rb(seqa, seqb, gap_ini, gap_ext)
                score = 0
            else:
                [sa, sb, score] = gotoh.align_it(
                    seqa,
                    seqb,
                    gap_ini,
                    gap_ext,
                    int(use_terminal_gap_penalty))
            al_status = AlignItResult.ok
    except Exception:
        # Was a bare except, which also swallowed SystemExit and
        # KeyboardInterrupt; only genuine errors should be mapped to
        # internal_error.
        al_status = AlignItResult.internal_error
    return sa, sb, score, al_status
def calculate_keys(reference):
    """ Extract the stretch of reference matching each key region.

    Results are cached in key_sections, keyed by reference sequence.

    :param reference: the full source sequence to excerpt from
    :return: one gap-free excerpt per entry in key_references
    """
    cached = key_sections.get(reference)
    if cached is not None:
        return cached
    GAP_INIT_PENALTY = 10
    GAP_EXTEND_PENALTY = 10
    USE_TERMINAL_GAP_PENALTY = False
    keys = []
    for key_region in key_references:
        # align_it: s1 is the large sequence, s2 is the key region.
        aligned_source, aligned_key, _score = align_it(
            reference,
            key_region,
            GAP_INIT_PENALTY,
            GAP_EXTEND_PENALTY,
            USE_TERMINAL_GAP_PENALTY)
        # Trim terminal gaps off the aligned key, then take the matching
        # stretch of the source without its gaps.
        span = re.match('^-*(.*?)-*$', aligned_key)
        excerpt = aligned_source[span.start(1):span.end(1)].replace('-', '')
        keys.append(excerpt)
    key_sections[reference] = keys
    return keys
def calculate_keys(reference):
    """ Excerpt the portion of reference that aligns to each key region.

    Memoized per reference in the key_sections cache.

    :param reference: full source sequence
    :return: list of gap-free excerpts, one per key_references entry
    """
    if reference in key_sections:
        return key_sections[reference]
    gap_init_penalty = 10
    gap_extend_penalty = 10
    use_terminal_gap_penalty = False

    def extract(key):
        # s1 is large sequence, s2 is key region
        aligned_source, aligned_key, _score = align_it(
            reference,
            key,
            gap_init_penalty,
            gap_extend_penalty,
            use_terminal_gap_penalty)
        # Strip the terminal gaps from the aligned key, then take the
        # matching stretch of the source without its gaps.
        span = re.match('^-*(.*?)-*$', aligned_key)
        return aligned_source[span.start(1):span.end(1)].replace('-', '')

    keys = [extract(key) for key in key_references]
    key_sections[reference] = keys
    return keys
def trim_reads(reads, v3loop_ref, score_counts=None): """ Generator over reads that are aligned to the reference and trimmed. :param reads: generator from merge_reads() :param v3loop_ref: nucleotide sequence for V3LOOP :param score_counts: {score: count} to report on the alignment score distribution :return: Generator items (aligned_ref and aligned_seq may be None if merge or trim fails): (pair_name, (read1_name, bases, quality), (read2_name, bases, quality), (aligned_ref, aligned_seq)) """ # Measured as roughly halfway between HCV reads and V3LOOP reads min_v3_alignment_score = 2*len(v3loop_ref) for pair_name, read1, read2, seq in reads: trimmed_aligned_ref = trimmed_aligned_seq = None if seq is not None: aligned_ref, aligned_seq, score = align_it(v3loop_ref, seq, GAP_OPEN_COST, GAP_EXTEND_COST, USE_TERMINAL_COST) if score_counts is not None: score_counts[score] += 1 if score >= min_v3_alignment_score: left_padding = right_padding = 0 for left_padding, nuc in enumerate(aligned_ref): if nuc != '-': break for right_padding, nuc in enumerate(reversed(aligned_ref)): if nuc != '-': break start, end = left_padding, -right_padding or None trimmed_aligned_ref = aligned_ref[start:end] trimmed_aligned_seq = aligned_seq[start:end] yield pair_name, read1, read2, (trimmed_aligned_ref, trimmed_aligned_seq)
def trim_reads(reads, v3loop_ref, score_counts=None): """ Generator over reads that are aligned to the reference and trimmed. :param reads: generator from merge_reads() :param v3loop_ref: nucleotide sequence for V3LOOP :param score_counts: {score: count} to report on the alignment score distribution :return: Generator items (aligned_ref and aligned_seq may be None if merge or trim fails): (pair_name, (read1_name, bases, quality), (read2_name, bases, quality), (aligned_ref, aligned_seq)) """ # Measured as roughly halfway between HCV reads and V3LOOP reads min_v3_alignment_score = 2 * len(v3loop_ref) for pair_name, read1, read2, seq in reads: trimmed_aligned_ref = trimmed_aligned_seq = None if seq is not None: aligned_ref, aligned_seq, score = align_it(v3loop_ref, seq, GAP_OPEN_COST, GAP_EXTEND_COST, USE_TERMINAL_COST) if score_counts is not None: score_counts[score] += 1 if score >= min_v3_alignment_score: left_padding = right_padding = 0 for left_padding, nuc in enumerate(aligned_ref): if nuc != '-': break for right_padding, nuc in enumerate(reversed(aligned_ref)): if nuc != '-': break start, end = left_padding, -right_padding or None trimmed_aligned_ref = aligned_ref[start:end] trimmed_aligned_seq = aligned_seq[start:end] yield pair_name, read1, read2, (trimmed_aligned_ref, trimmed_aligned_seq)
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   worker_pool=None,
                   filter_coverage=1,
                   distance_report=None):
    """ Build consensus sequences for each reference from a SAM file.

    Note: written for Python 2 (iterkeys/iteritems/itertools.imap).

    @param samfile: an open file in the SAM format containing reads with
        their mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for
        all of the regions and positions that you want a report for. The
        value will be set to a string describing the counts and qualities
        at that position.
    @param seeds: {name: sequence} If this is set, any positions without
        coverage will be set to the base from the seed reference. If there
        are no reads mapped to a reference, it will not be included as a
        new consensus.
    @param is_filtered: if True, then any consensus that has migrated so
        far from its seed that it is closer to a different seed, will not
        be included as a new consensus.
    @param worker_pool: a pool to do some distributed processing
    @param filter_coverage: when filtering on consensus distance, only
        include portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will
        return: {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @return: {reference_name: consensus_sequence}
    """
    if debug_reports:
        for key in debug_reports.iterkeys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: {nuc: count}}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    if worker_pool is None:
        merged_reads = itertools.imap(partial(merge_reads, quality_cutoff),
                                      pairs)
    else:
        merged_reads = worker_pool.imap_unordered(
            partial(merge_reads, quality_cutoff),
            pairs,
            chunksize=100)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
            if seeds:
                # Preload zero counts so uncovered positions fall back to
                # the seed base.
                for i, nuc in enumerate(seeds[rname], 1):
                    pos_nucs[i][nuc] = 0
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        # Summarize base counts at each requested position, once for
        # every quality threshold observed there.
        for key, counts in debug_reports.iteritems():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.iterkeys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.iteritems():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.iteritems():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(mixture)))
            debug_reports[key] = ', '.join(mixtures)

    new_conseqs = counts_to_conseqs(refmap)
    if not (seeds and is_filtered) or len(new_conseqs) < 2:
        return new_conseqs

    # Filtering pass: drop any consensus that drifted closer to another
    # seed than to its own.
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    filtered_conseqs = {}
    for name in sorted(new_conseqs.iterkeys()):
        conseq = new_conseqs[name]
        counts = refmap[name]
        # Only compare positions with acceptable coverage.
        relevant_conseq = u''
        for pos, c in enumerate(conseq, 1):
            pos_counts = sum(counts[pos].itervalues())
            if pos_counts >= filter_coverage:
                relevant_conseq += c
        if not relevant_conseq:
            # None of the coverage was acceptable.
            continue
        other_seed = other_dist = None
        for seed_name in sorted(new_conseqs.iterkeys()):
            seed_ref = seeds[seed_name]
            aligned_seed, aligned_conseq, _score = align_it(
                seed_ref,
                relevant_conseq,
                gap_open_penalty,
                gap_extend_penalty,
                use_terminal_gap_penalty)
            relevant_seed = extract_relevant_seed(aligned_conseq,
                                                  aligned_seed)
            d = Levenshtein.distance(relevant_seed, relevant_conseq)
            if seed_name == name:
                seed_dist = d
            elif other_dist is None or d < other_dist:
                other_seed = seed_name
                other_dist = d
        if seed_dist <= other_dist:
            # Consensus is closer to starting seed than any other seed:
            # keep it.
            filtered_conseqs[name] = conseq
        if distance_report is not None:
            distance_report[name] = dict(seed_dist=seed_dist,
                                         other_dist=other_dist,
                                         other_seed=other_seed)
    if not filtered_conseqs:
        # No reference had acceptable coverage, choose one with most reads.
        best_ref = read_counts.most_common(1)[0][0]
        filtered_conseqs[best_ref] = new_conseqs[best_ref]
    return filtered_conseqs
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   filter_coverage=1,
                   distance_report=None,
                   original_seeds=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with
        their mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for
        all of the regions and positions that you want a report for. The
        value will be set to a string describing the counts and qualities
        at that position.
    @param seeds: {name: sequence} If this is set, any positions without
        coverage will be set to the base from the seed reference. If there
        are no reads mapped to a reference, it will not be included as a
        new consensus.
    @param is_filtered: if True, then any consensus that has migrated so
        far from its seed that it is closer to a different seed, will not
        be included as a new consensus.
    @param filter_coverage: when filtering on consensus distance, only
        include portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will
        return: {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @param original_seeds: {name: sequence} Original seed references used
        in the distance report.
    @return: {reference_name: consensus_sequence}
    """
    if debug_reports:
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: Counter({nuc: count})}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        # Summarize base counts at each requested position, once for
        # every quality threshold observed there.
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(sorted(mixture))))
            debug_reports[key] = ', '.join(sorted(mixtures))

    new_conseqs = counts_to_conseqs(refmap, seeds)
    relevant_conseqs = None
    is_filtering = original_seeds and is_filtered
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    # Iteratively drop the least-supported drifted consensus until no
    # more than one consensus has drifted closer to another seed.
    while is_filtering and len(new_conseqs) > 1:
        drifted_seeds = []  # [(count, name)]
        if relevant_conseqs is None:
            # First pass: build each consensus restricted to positions
            # with acceptable coverage.
            relevant_conseqs = {}
            for name in sorted(new_conseqs.keys()):
                conseq = new_conseqs[name]
                counts = refmap[name]
                relevant_conseq = u''
                for pos, c in enumerate(conseq, 1):
                    pos_counts = sum(counts[pos].values())
                    if pos_counts >= filter_coverage:
                        relevant_conseq += c
                relevant_conseqs[name] = relevant_conseq
        for name in sorted(new_conseqs.keys()):
            relevant_conseq = relevant_conseqs[name]
            if not relevant_conseq:
                # None of the coverage was acceptable.
                drifted_seeds.append((read_counts[name], name))
                continue
            other_seed = other_dist = seed_dist = None
            for seed_name in sorted(new_conseqs.keys()):
                seed_ref = original_seeds[seed_name]
                aligned_seed, aligned_conseq, _score = align_it(
                    seed_ref,
                    relevant_conseq,
                    gap_open_penalty,
                    gap_extend_penalty,
                    use_terminal_gap_penalty)
                relevant_seed = extract_relevant_seed(aligned_conseq,
                                                      aligned_seed)
                d = Levenshtein.distance(relevant_seed, relevant_conseq)
                if seed_name == name:
                    seed_dist = d
                elif other_dist is None or d < other_dist:
                    other_seed = seed_name
                    other_dist = d
            if seed_dist > other_dist:
                # Consensus is farther from starting seed than another
                # seed: drop it?
                drifted_seeds.append((read_counts[name], name))
            if distance_report is not None:
                distance_report[name] = dict(seed_dist=seed_dist,
                                             other_dist=other_dist,
                                             other_seed=other_seed)
        distance_report = None  # Only update during first iteration.
        if drifted_seeds:
            # Drop the drifted consensus with the fewest reads.
            drifted_seeds.sort()
            dropped_seed = drifted_seeds[0][1]
            del new_conseqs[dropped_seed]
        is_filtering = len(drifted_seeds) > 1
    return new_conseqs
def find_probes(contigs_csv, probes_csv):
    """ Locate each target sequence in every contig and write probe info.

    :param contigs_csv: open CSV of contigs; needs a sample column, one of
        genotype/ref/region, and one of contig/sequence
    :param probes_csv: open CSV to write one row per probed contig, with a
        column group per target sequence
    """
    reader = DictReader(contigs_csv)
    columns = ['sample', 'contig']
    for target_name in TARGET_SEQUENCES:
        for column_type in ['in_contig_start',
                            'in_contig_size',
                            'in_hxb2_start',
                            'in_hxb2_size',
                            'merged_hxb2_start',
                            'merged_hxb2_size',
                            'dist',
                            'end_dist',
                            'score',
                            'is_reversed',
                            'seq']:
            columns.append(target_name + '_' + column_type)
    writer = DictWriter(probes_csv, columns)
    writer.writeheader()
    # projects = ProjectConfig.loadDefault()
    # hxb2 = projects.getReference('HIV1-B-FR-K03455-seed')
    hxb2 = utils.hxb2
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
        contig_num = 0
        for row in sample_rows:
            seed_name = row.get('genotype') or row.get('ref') or row['region']
            conseq_cutoff = row.get('consensus-percent-cutoff')
            if conseq_cutoff and conseq_cutoff != 'MAX':
                # Only probe the MAX consensus when cutoffs are present.
                continue
            contig_num += 1
            contig_name = f'{contig_num}-{seed_name}'
            contig_seq: str = row.get('contig') or row['sequence']
            aligned_hxb2, aligned_contig_to_hxb2, _ = align_it(
                hxb2,
                contig_seq,
                gap_open_penalty,
                gap_extend_penalty,
                use_terminal_gap_penalty)
            new_row = dict(sample=sample_name, contig=contig_name)
            for target_name, target_seq in TARGET_SEQUENCES.items():
                finder = ProbeFinder(contig_seq, target_seq)
                if not finder.valid:
                    # NOTE(review): this abandons the whole run mid-file,
                    # leaving probes_csv incomplete; confirm whether
                    # `continue` was intended instead.
                    return None
                size = len(finder.contig_match)
                start_pos = finder.start + 1
                end_pos = finder.start + size
                # Walk the contig/HXB2 alignment to translate the probe's
                # span into HXB2 coordinates.
                hxb2_pos = contig_pos = 0
                merged_hxb2_start = merged_hxb2_size = None
                for hxb2_nuc, contig_nuc in zip(aligned_hxb2,
                                                aligned_contig_to_hxb2):
                    if hxb2_nuc != '-':
                        hxb2_pos += 1
                    if contig_nuc != '-':
                        contig_pos += 1
                        if contig_pos == start_pos:
                            merged_hxb2_start = hxb2_pos
                        if contig_pos == end_pos:
                            merged_hxb2_size = (hxb2_pos -
                                                merged_hxb2_start + 1)
                            break
                # Independently align just the matched probe to HXB2.
                aligned_ref, aligned_match, _ = align_it(
                    hxb2,
                    finder.contig_match,
                    gap_open_penalty,
                    gap_extend_penalty,
                    use_terminal_gap_penalty)
                lstripped_match = aligned_match.lstrip('-')
                in_hxb2_start = len(aligned_match) - len(lstripped_match)
                tail_len = len(lstripped_match) - len(
                    lstripped_match.rstrip('-'))
                ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
                in_hxb2_size = len(ref_match.replace('-', ''))
                prefix = target_name + '_'
                new_row[prefix + 'in_contig_start'] = start_pos
                new_row[prefix + 'in_contig_size'] = size
                new_row[prefix + 'in_hxb2_start'] = in_hxb2_start
                new_row[prefix + 'in_hxb2_size'] = in_hxb2_size
                new_row[prefix + 'merged_hxb2_start'] = merged_hxb2_start
                new_row[prefix + 'merged_hxb2_size'] = merged_hxb2_size
                new_row[prefix + 'dist'] = finder.dist
                new_row[prefix + 'end_dist'] = finder.end_dist
                new_row[prefix + 'score'] = finder.score
                new_row[prefix + 'is_reversed'] = ('Y'
                                                   if finder.is_reversed
                                                   else 'N')
                new_row[prefix + 'seq'] = finder.contig_match
            writer.writerow(new_row)
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   filter_coverage=1,
                   distance_report=None,
                   original_seeds=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with
        their mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set, any positions without
        coverage will be set to the base from the seed reference. If there
        are no reads mapped to a reference, it will not be included as a new
        consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param filter_coverage: when filtering on consensus distance, only
        include portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist,
                 'other_dist': other_dist,
                 'other_seed': other_seed}}
    @param original_seeds: {name: sequence} Original seed references used in
        the distance report.
    @return: {reference_name: consensus_sequence}
    """
    if debug_reports:
        # Reset each requested report slot to a fresh counter of
        # (nuc, quality) observations; turned into a string further down.
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: Counter({nuc: count})}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    read_counts = Counter()  # {rname: merged read count}, used as tiebreaker.
    for merged_read in merged_reads:
        if merged_read is None:
            # merge_reads signals an unusable pair with None; skip it.
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        # Format each position's counts at every observed quality threshold,
        # e.g. "30{A: 10, G: 2}, 35{A: 9}".
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                # Re-tally using only bases at or above this quality.
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(sorted(mixture))))
            debug_reports[key] = ', '.join(sorted(mixtures))

    new_conseqs = counts_to_conseqs(refmap, seeds)
    relevant_conseqs = None  # Lazily built cache of coverage-filtered conseqs.
    is_filtering = original_seeds and is_filtered
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    # Repeatedly drop the worst "drifted" consensus (one closer to another
    # seed than to its own) until none drift or one consensus remains.
    while is_filtering and len(new_conseqs) > 1:
        drifted_seeds = []  # [(count, name)]
        if relevant_conseqs is None:
            # Keep only positions with coverage of at least filter_coverage.
            relevant_conseqs = {}
            for name in sorted(new_conseqs.keys()):
                conseq = new_conseqs[name]
                counts = refmap[name]
                relevant_conseq = u''
                for pos, c in enumerate(conseq, 1):
                    pos_counts = sum(counts[pos].values())
                    if pos_counts >= filter_coverage:
                        relevant_conseq += c
                relevant_conseqs[name] = relevant_conseq
        for name in sorted(new_conseqs.keys()):
            relevant_conseq = relevant_conseqs[name]
            if not relevant_conseq:
                # None of the coverage was acceptable.
                drifted_seeds.append((read_counts[name], name))
                continue
            other_seed = other_dist = None
            for seed_name in sorted(new_conseqs.keys()):
                seed_ref = original_seeds[seed_name]
                aligned_seed, aligned_conseq, _score = align_it(
                    seed_ref,
                    relevant_conseq,
                    gap_open_penalty,
                    gap_extend_penalty,
                    use_terminal_gap_penalty)
                relevant_seed = extract_relevant_seed(aligned_conseq,
                                                      aligned_seed)
                d = Levenshtein.distance(relevant_seed, relevant_conseq)
                if seed_name == name:
                    seed_dist = d
                elif other_dist is None or d < other_dist:
                    # Track the closest competing seed.
                    other_seed = seed_name
                    other_dist = d
            if seed_dist > other_dist:
                # Consensus is farther from starting seed than another seed:
                # candidate for dropping.
                drifted_seeds.append((read_counts[name], name))
            if distance_report is not None:
                distance_report[name] = dict(seed_dist=seed_dist,
                                             other_dist=other_dist,
                                             other_seed=other_seed)
        distance_report = None  # Only update during first iteration.
        if drifted_seeds:
            # Drop the drifted consensus with the fewest reads.
            drifted_seeds.sort()
            dropped_seed = drifted_seeds[0][1]
            del new_conseqs[dropped_seed]
        # Loop only while more than one candidate drifted; otherwise the set
        # is stable and the while condition must go false to terminate.
        is_filtering = len(drifted_seeds) > 1
    return new_conseqs
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   worker_pool=None,
                   filter_coverage=1,
                   distance_report=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with
        their mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set, any positions without
        coverage will be set to the base from the seed reference. If there
        are no reads mapped to a reference, it will not be included as a new
        consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param worker_pool: a pool to do some distributed processing
    @param filter_coverage: when filtering on consensus distance, only
        include portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist,
                 'other_dist': other_dist,
                 'other_seed': other_seed}}
    @return: {reference_name: consensus_sequence}
    """
    if debug_reports:
        # Reset each requested report slot to a fresh counter of
        # (nuc, quality) observations. (Python 3: iterate the dict directly
        # instead of the removed iterkeys(); assigning to existing keys is
        # safe during iteration.)
        for key in debug_reports:
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: {nuc: count}}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    if worker_pool is None:
        # Python 3: built-in map is already lazy, replacing itertools.imap.
        merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    else:
        merged_reads = worker_pool.imap_unordered(
            partial(merge_reads, quality_cutoff),
            pairs,
            chunksize=100)
    read_counts = Counter()  # {rname: merged read count}, used as fallback.
    for merged_read in merged_reads:
        if merged_read is None:
            # merge_reads signals an unusable pair with None; skip it.
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
            if seeds:
                # Zero-count entries let seed bases fill any positions that
                # end up with no read coverage.
                for i, nuc in enumerate(seeds[rname], 1):
                    pos_nucs[i][nuc] = 0
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        # Format each position's counts at every observed quality threshold,
        # e.g. "30{A: 10, G: 2}, 35{A: 9}".
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts:
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                # Re-tally using only bases at or above this quality.
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(mixture)))
            debug_reports[key] = ', '.join(mixtures)

    new_conseqs = counts_to_conseqs(refmap)
    if not (seeds and is_filtered) or len(new_conseqs) < 2:
        # Nothing to filter against, or no seeds to compare with.
        return new_conseqs

    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    filtered_conseqs = {}
    for name in sorted(new_conseqs):
        conseq = new_conseqs[name]
        counts = refmap[name]
        # Keep only positions with coverage of at least filter_coverage.
        relevant_conseq = u''
        for pos, c in enumerate(conseq, 1):
            pos_counts = sum(counts[pos].values())
            if pos_counts >= filter_coverage:
                relevant_conseq += c
        if not relevant_conseq:
            # None of the coverage was acceptable.
            continue
        other_seed = other_dist = None
        for seed_name in sorted(new_conseqs):
            seed_ref = seeds[seed_name]
            aligned_seed, aligned_conseq, _score = align_it(
                seed_ref,
                relevant_conseq,
                gap_open_penalty,
                gap_extend_penalty,
                use_terminal_gap_penalty)
            relevant_seed = extract_relevant_seed(aligned_conseq,
                                                  aligned_seed)
            d = Levenshtein.distance(relevant_seed, relevant_conseq)
            if seed_name == name:
                seed_dist = d
            elif other_dist is None or d < other_dist:
                # Track the closest competing seed.
                other_seed = seed_name
                other_dist = d
        if seed_dist <= other_dist:
            # Consensus is closer to starting seed than any other seed:
            # keep it.
            filtered_conseqs[name] = conseq
        if distance_report is not None:
            distance_report[name] = dict(seed_dist=seed_dist,
                                         other_dist=other_dist,
                                         other_seed=other_seed)
    if not filtered_conseqs:
        # No reference had acceptable coverage, choose one with most reads.
        best_ref = read_counts.most_common(1)[0][0]
        filtered_conseqs[best_ref] = new_conseqs[best_ref]
    return filtered_conseqs