Example #1
def merge_reads(reads):
    """ Generator over merged reads.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        if not (seq1 and seq2):
            score = -1
            aligned1 = aligned2 = None
        else:
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1,
                                                 seq2_rev,
                                                 GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
        if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
            aligned_qual1 = align_quality(aligned1, qual1)
            aligned_qual2 = align_quality(aligned2, reversed(qual2))
            merged = merge_pairs(aligned1,
                                 aligned2,
                                 aligned_qual1,
                                 aligned_qual2,
                                 q_cutoff=Q_CUTOFF)
        else:
            merged = None
        yield (pair_name,
               (r1_name, seq1, qual1),
               (r2_name, seq2, qual2),
               merged)
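
A hedged sketch of draining this generator; read_pairs stands in for whatever FastqReader yields, since its exact API is not shown here:

# read_pairs: hypothetical iterable of (pair_name, read1, read2) tuples
# shaped as the docstring above describes.
for pair_name, read1, read2, merged in merge_reads(read_pairs):
    if merged is None:
        continue  # a read was empty or the alignment score was too low
    print(pair_name, merged)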
Example #2
def map_sequence(source_seq, dest_seq):
    """ Find the portion of source_seq that dest_seq maps to.

    :return: a list of 1-based positions in source_seq that each base of
        dest_seq mapped to, with None for bases that did not map.
    """
    gap_open = 15
    gap_extend = 5
    use_terminal_gap_penalty = 1
    aligned_source, aligned_dest, _score = gotoh.align_it(
        source_seq, dest_seq, gap_open, gap_extend, use_terminal_gap_penalty)
    positions = []
    source_pos = 1
    for source_nuc, dest_nuc in zip(aligned_source, aligned_dest):
        if dest_nuc != '-':
            positions.append(source_pos if source_nuc != '-' else None)
        if source_nuc != '-':
            source_pos += 1
    hit_count = sum(position is not None for position in positions)
    if hit_count < len(dest_seq) / 2:
        pieces = re.split('(N+)', dest_seq)
        sizes = [(len(p), i) for i, p in enumerate(pieces)
                 if p and 'N' not in p]
        if sizes:
            sizes.sort(reverse=True)
            big_piece = sizes[0][1]
            offset = sum(len(p) for p in pieces[:big_piece])
            positions = offset * [None] + map_sequence(source_seq,
                                                       pieces[big_piece])

    return positions
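
A quick sanity check of the return shape, assuming the gotoh extension is available; the exact positions depend on its scoring, but an exact internal match should map as shown:

# 'CGTA' sits at 1-based positions 4-7 of the source sequence, so the
# expected result is [4, 5, 6, 7]; unmapped bases would come back as None.
print(map_sequence('AAACGTACGTTT', 'CGTA'))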
Example #3
def merge_reads(reads):
    """ Generator over merged reads.

    :param reads: iterable of reads from FastqReader
    :return: a generator with items (merged_bases may be None if merge fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     merged_bases)
    """
    for pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2) in reads:
        if not (seq1 and seq2):
            score = -1
            aligned1 = aligned2 = None
        else:
            seq2_rev = reverse_and_complement(seq2)
            aligned1, aligned2, score = align_it(seq1, seq2_rev, GAP_OPEN_COST,
                                                 GAP_EXTEND_COST,
                                                 USE_TERMINAL_COST)
        if score >= MIN_PAIR_ALIGNMENT_SCORE and aligned1[0] != '-':
            aligned_qual1 = align_quality(aligned1, qual1)
            aligned_qual2 = align_quality(aligned2, reversed(qual2))
            merged = merge_pairs(aligned1,
                                 aligned2,
                                 aligned_qual1,
                                 aligned_qual2,
                                 q_cutoff=Q_CUTOFF)
        else:
            merged = None
        yield (pair_name, (r1_name, seq1, qual1), (r2_name, seq2, qual2),
               merged)
Example #4
def map_sequence(source_seq, dest_seq):
    """ Find the portion of source_seq that dest_seq maps to.

    :return: a list of 1-based positions in source_seq that each base of
        dest_seq mapped to, with None for bases that did not map.
    """
    gap_open = 15
    gap_extend = 5
    use_terminal_gap_penalty = 1
    aligned_source, aligned_dest, _score = gotoh.align_it(source_seq,
                                                          dest_seq,
                                                          gap_open,
                                                          gap_extend,
                                                          use_terminal_gap_penalty)
    positions = []
    source_pos = 1
    for source_nuc, dest_nuc in zip(aligned_source, aligned_dest):
        if dest_nuc != '-':
            positions.append(source_pos if source_nuc != '-' else None)
        if source_nuc != '-':
            source_pos += 1
    hit_count = sum(position is not None for position in positions)
    if hit_count < len(dest_seq) / 2:
        pieces = re.split('(N+)', dest_seq)
        sizes = [(len(p), i) for i, p in enumerate(pieces) if p and 'N' not in p]
        if sizes:
            sizes.sort(reverse=True)
            big_piece = sizes[0][1]
            offset = sum(len(p) for p in pieces[:big_piece])
            positions = offset*[None] + map_sequence(source_seq,
                                                     pieces[big_piece])

    return positions
Example #5
def align_untrimmed_reads(fastq):
    v3loop_ref = extract_v3loop_ref()
    score_counts = Counter()
    for _, (_, nucs, _) in FastqReader.get_reads(fastq):
        _, _, score = align_it(v3loop_ref, nucs, GAP_OPEN_COST,
                               GAP_EXTEND_COST, USE_TERMINAL_COST)
        score_counts[score] += 1
    return score_counts
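
A hedged sketch of inspecting the resulting score distribution; the FASTQ path is an assumption:

with open('reads_1.fastq') as fastq:  # hypothetical input file
    score_counts = align_untrimmed_reads(fastq)
for score, count in sorted(score_counts.items()):
    print(score, count)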
Example #6
def align(query_seq: str, target_seq: str):
    result = {
        'score': None,
        'aligned_query': None,
        'aligned_target': None,
        'start': None,
        'is_valid': False,
        'dist': None,
        'end_dist': None,
    }
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    best_acontig = best_atarget = best_target = best_score = None
    for target_nucs in PrimerFinder.expand_mixtures(target_seq):
        aligned_query, aligned_target, score = align_it(
            query_seq, target_nucs, gap_open_penalty, gap_extend_penalty,
            use_terminal_gap_penalty)
        if best_score is None or score > best_score:
            best_acontig = aligned_query
            best_atarget = aligned_target
            best_target = target_nucs
            best_score = score
    if not best_acontig:
        result['is_valid'] = False
        return result
    aligned_query = best_acontig
    aligned_target = best_atarget
    target_nucs = best_target
    result['score'] = best_score
    match = re.match('-*([^-](.*[^-])?)', aligned_target)
    result['aligned_query'] = aligned_query
    result['aligned_target'] = aligned_target
    result['start'] = match.start(1)
    end = match.end(1)
    result['query_match'] = aligned_query[result['start']:end].replace(
        '-', '')
    result['dist'] = Levenshtein.distance(target_nucs,
                                          result['query_match'])
    stripped_contig = aligned_query.lstrip('-')
    overhang = len(aligned_query) - len(stripped_contig)
    if overhang > 0:
        stripped_target = target_nucs[overhang:]
        result['end_dist'] = Levenshtein.distance(stripped_target,
                                                  result['query_match'])
    else:
        stripped_contig = aligned_query.rstrip('-')
        overhang = len(aligned_query) - len(stripped_contig)
        if overhang == 0:
            result['end_dist'] = result['dist']
        else:
            stripped_target = target_nucs[:-overhang]
            result['end_dist'] = Levenshtein.distance(
                stripped_target, result['query_match'])
    result['is_valid'] = True
    return result
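
A hedged call sketch; align_it, PrimerFinder, and Levenshtein all come from the surrounding module, and the sequences here are made up:

result = align('ACGTACGTACGT', 'ACGTACG')  # hypothetical query and target
if result['is_valid']:
    print(result['score'], result['start'], result['dist'])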
Example #7
def write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref):
    """ Write reads, aligned to the HIV seed sequence.

    :param counts: [((aligned_ref, aligned_seq), count)]
    :param aligned_csv: open CSV file to write the aligned reads to
    :param hiv_seed: seed reference to align the V3LOOP ref to
    :param v3loop_ref: reference the reads were all aligned to
    """
    writer = csv.DictWriter(
        aligned_csv,
        ['refname', 'qcut', 'rank', 'count', 'offset', 'seq'],
        lineterminator=os.linesep)
    writer.writeheader()

    seed_vs_v3, v3_vs_seed, _score = align_it(hiv_seed,
                                              v3loop_ref,
                                              GAP_OPEN_COST,
                                              GAP_EXTEND_COST,
                                              USE_TERMINAL_COST)

    # Count dashes at start of aligned_v3loop
    v3_offset = sum(1 for _ in takewhile(lambda c: c == '-', v3_vs_seed))
    seed_positions = list(zip(seed_vs_v3[v3_offset:], v3_vs_seed[v3_offset:]))

    for rank, read_count in enumerate(counts):
        yield read_count
        (v3_vs_read, read_vs_v3), count = read_count
        is_started = False
        seq_offset = 0
        seq = ''
        read_positions = iter(zip(v3_vs_read, read_vs_v3))
        for seed_char, v3_vs_seed_char in seed_positions:
            if v3_vs_seed_char == '-':
                seq += '-'
                continue
            try:
                while True:
                    v3_vs_read_char, read_char = next(read_positions)
                    if v3_vs_read_char != '-':
                        break
            except StopIteration:
                break
            if seed_char != '-':
                if read_char == '-' and not is_started:
                    seq_offset += 1
                else:
                    is_started = True
                    seq += read_char
        seq = seq.rstrip('-')
        writer.writerow(dict(refname=G2P_SEED_NAME,
                             qcut=Q_CUTOFF,
                             rank=rank,
                             count=count,
                             offset=v3_offset + seq_offset,
                             seq=seq))
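
Note that write_aligned_reads is itself a generator: it yields each read_count back out, so the CSV rows are only written while it is being iterated. A hedged sketch of driving it, with the output path as an assumption:

with open('aligned.csv', 'w', newline='') as aligned_csv:  # hypothetical path
    for _ in write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref):
        pass  # draining the generator is what triggers the writes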
Example #8
def align_untrimmed_reads(fastq):
    v3loop_ref = extract_v3loop_ref()
    score_counts = Counter()
    for _, (_, nucs, _) in FastqReader.get_reads(fastq):
        _, _, score = align_it(v3loop_ref,
                               nucs,
                               GAP_OPEN_COST,
                               GAP_EXTEND_COST,
                               USE_TERMINAL_COST)
        score_counts[score] += 1
    return score_counts
Example #9
def write_aligned_reads(counts, aligned_csv, hiv_seed, v3loop_ref):
    """ Write reads, aligned to the HIV seed sequence.

    :param counts: [((aligned_ref, aligned_seq), count)]
    :param aligned_csv: open CSV file to write the aligned reads to
    :param hiv_seed: seed reference to align the V3LOOP ref to
    :param v3loop_ref: reference the reads were all aligned to
    """
    writer = csv.DictWriter(
        aligned_csv, ['refname', 'qcut', 'rank', 'count', 'offset', 'seq'],
        lineterminator=os.linesep)
    writer.writeheader()

    seed_vs_v3, v3_vs_seed, _score = align_it(hiv_seed, v3loop_ref,
                                              GAP_OPEN_COST, GAP_EXTEND_COST,
                                              USE_TERMINAL_COST)

    # Count dashes at start of aligned_v3loop
    v3_offset = sum(1 for _ in takewhile(lambda c: c == '-', v3_vs_seed))
    seed_positions = list(zip(seed_vs_v3[v3_offset:], v3_vs_seed[v3_offset:]))

    for rank, read_count in enumerate(counts):
        yield read_count
        (v3_vs_read, read_vs_v3), count = read_count
        is_started = False
        seq_offset = 0
        seq = ''
        read_positions = iter(zip(v3_vs_read, read_vs_v3))
        for seed_char, v3_vs_seed_char in seed_positions:
            if v3_vs_seed_char == '-':
                seq += '-'
                continue
            try:
                while True:
                    v3_vs_read_char, read_char = next(read_positions)
                    if v3_vs_read_char != '-':
                        break
            except StopIteration:
                break
            if seed_char != '-':
                if read_char == '-' and not is_started:
                    seq_offset += 1
                else:
                    is_started = True
                    seq += read_char
        seq = seq.rstrip('-')
        writer.writerow(
            dict(refname=G2P_SEED_NAME,
                 qcut=Q_CUTOFF,
                 rank=rank,
                 count=count,
                 offset=v3_offset + seq_offset,
                 seq=seq))
Example #10
    def __init__(self, contig_seq: str, target_seq: str):
        self.valid = True
        gap_open_penalty = 15
        gap_extend_penalty = 3
        use_terminal_gap_penalty = 1
        best_acontig = best_atarget = best_target = best_score = None
        best_reversed = None
        for target_nucs, is_reversed in unpack_mixtures_and_reverse(
                target_seq):
            aligned_contig, aligned_target, score = align_it(
                contig_seq, target_nucs, gap_open_penalty, gap_extend_penalty,
                use_terminal_gap_penalty)
            if best_score is None or score > best_score:
                best_acontig = aligned_contig
                best_atarget = aligned_target
                best_target = target_nucs
                best_score = score
                best_reversed = is_reversed
        if not best_acontig:
            self.valid = False
            return
        aligned_contig = best_acontig
        aligned_target = best_atarget
        target_nucs = best_target
        self.score = best_score
        self.is_reversed = best_reversed
        if self.is_reversed:
            aligned_contig = reverse_and_complement(aligned_contig)
            aligned_target = reverse_and_complement(aligned_target)
        match = re.match('-*([^-](.*[^-])?)', aligned_target)
        self.aligned_contig = aligned_contig
        self.aligned_target = aligned_target
        self.start = match.start(1)
        self.end = match.end(1)
        self.contig_match = aligned_contig[self.start:self.end].replace(
            '-', '')
        self.dist = Levenshtein.distance(target_nucs, self.contig_match)
        stripped_contig = aligned_contig.lstrip('-')
        overhang = len(aligned_contig) - len(stripped_contig)
        if overhang > 0:
            stripped_target = target_nucs[overhang:]
            self.end_dist = Levenshtein.distance(stripped_target,
                                                 self.contig_match)
        else:
            stripped_contig = aligned_contig.rstrip('-')
            overhang = len(aligned_contig) - len(stripped_contig)
            if overhang == 0:
                self.end_dist = self.dist
            else:
                stripped_target = target_nucs[:-overhang]
                self.end_dist = Levenshtein.distance(stripped_target,
                                                     self.contig_match)
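
This constructor lines up with the ProbeFinder used in Example #20 below, which reads the same valid, score, dist, end_dist, is_reversed, start, and contig_match attributes. A hedged usage sketch on that assumption:

finder = ProbeFinder(contig_seq, target_seq)  # class name assumed from Example #20
if finder.valid:
    print(finder.score, finder.dist, finder.contig_match)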
Example #11
def align_nucs(seq1: str, seq2: str) -> typing.Tuple[str, str, int]:
    """ Align two sequences of nucleotides with default parameters.

    :return: the two aligned sequences, plus an alignment score
    """
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    aligned1, aligned2, score = align_it(seq1, seq2, gap_open_penalty,
                                         gap_extend_penalty,
                                         use_terminal_gap_penalty)

    return aligned1, aligned2, score
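
A minimal call, assuming the gotoh-backed align_it is importable; the exact gap placement depends on the scoring:

aligned1, aligned2, score = align_nucs('ACGTACGT', 'ACGACGT')
print(aligned1)  # ACGTACGT
print(aligned2)  # e.g. ACG-ACGT
print(score)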
Example #12
def read_contigs(contigs_csv, excluded_seeds=None):
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    # {group_ref_name: [seq, index, index, ...]}
    contig_groups = defaultdict(list)
    conseqs = {}
    projects = ProjectConfig.loadDefault()
    with contigs_csv:
        contigs_reader = DictReader(contigs_csv)
        for i, row in reversed(list(enumerate(contigs_reader, 1))):
            contig_seq = row['contig']
            match_fraction = float(row['match'])
            is_match = 0.25 <= match_fraction
            is_reversed = match_fraction < 0
            if not (ARE_CONTIGS_MERGED and is_match):
                contig_name = get_contig_name(i, row['ref'], is_match,
                                              is_reversed, excluded_seeds)
                conseqs[contig_name] = contig_seq
                continue
            group_ref_name = row['group_ref']
            contig_group = contig_groups[group_ref_name]
            if not contig_group:
                contig_group.append(projects.getReference(group_ref_name))
            contig_group.append(str(i))
            group_seq = contig_group[0]
            agroup, acontig, score = align_it(group_seq, contig_seq,
                                              gap_open_penalty,
                                              gap_extend_penalty,
                                              use_terminal_gap_penalty)
            match = re.match('-*([^-](.*[^-])?)', acontig)
            start = match.start(1)
            end = match.end(1)
            merged_seq = agroup[:start] + contig_seq + agroup[end:]
            left_trim = len(agroup) - len(agroup.lstrip('-'))
            right_trim = len(agroup) - len(agroup.rstrip('-'))
            contig_group[0] = merged_seq[left_trim:-right_trim or None]

    is_match = True
    is_reversed = False
    for group_ref_name, contig_group in contig_groups.items():
        (group_seq, *contig_nums) = contig_group
        prefix = '_'.join(reversed(contig_nums))
        contig_name = get_contig_name(prefix, group_ref_name, is_match,
                                      is_reversed, excluded_seeds)
        conseqs[contig_name] = group_seq
    return conseqs
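
A hedged sketch of calling it; the CSV is assumed to carry the ref, match, group_ref, and contig columns that the loop reads, and the path is made up:

with open('contigs.csv') as contigs_csv:  # hypothetical input path
    conseqs = read_contigs(contigs_csv)
for contig_name, seq in conseqs.items():
    print(contig_name, len(seq))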
Example #13
    def align_it(self,
                 seqa,
                 seqb,
                 gap_ini,
                 gap_ext,
                 use_terminal_gap_penalty=False,
                 emulate_rb=False):
        '''
        Returns aligned sequences (with gaps) from the Gotoh algorithm.
        Expects nucleotide sequences; see align_it_aa() for amino acids.

                Parameters:
                        seqa (string): Nucleotide sequence (standard)
                        seqb (string): Another nucleotide sequence
                        gap_ini (int): Gap initialization penalty
                        gap_ext (int): Gap extension penalty
                        use_terminal_gap_penalty (bool): penalize trailing gaps?
                        emulate_rb (bool): use original (Ruby) match/mismatch scores?

                Returns:
                        seqa (string): Aligned sequence a
                        seqb (string): Aligned sequence b
                        score (int): alignment score (gap penalties + match/mismatch)
                        exit_status (AlignItResult): ok, illegal_char, internal_error
        '''
        sa = ""
        sb = ""
        score = 0
        al_status = AlignItResult.internal_error

        try:
            if not (self.valid_nu.search(seqa) and self.valid_nu.search(seqb)):
                al_status = AlignItResult.illegal_char
            else:
                if emulate_rb:
                    sa, sb = gotoh.align_it_rb(seqa, seqb, gap_ini, gap_ext)
                    score = 0
                else:
                    sa, sb, score = gotoh.align_it(
                        seqa, seqb, gap_ini, gap_ext,
                        int(use_terminal_gap_penalty))
                al_status = AlignItResult.ok
        except Exception:
            al_status = AlignItResult.internal_error

        return sa, sb, score, al_status
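
A hedged usage sketch; the owning class is not shown in the excerpt, so Aligner, its valid_nu pattern, and AlignItResult are assumptions:

aligner = Aligner()  # hypothetical class that defines valid_nu
sa, sb, score, status = aligner.align_it('ACGTACGT', 'ACGCT', 10, 5)
if status == AlignItResult.ok:
    print(sa, sb, score)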
Example #14
def calculate_keys(reference):
    keys = key_sections.get(reference, None)
    if keys is not None:
        return keys

    GAP_INIT_PENALTY = 10
    GAP_EXTEND_PENALTY = 10
    USE_TERMINAL_GAP_PENALTY = False
    keys = []
    for key in key_references:
        # s1 is large sequence, s2 is key region
        aligned_source, aligned_key, _score = align_it(
            reference, key, GAP_INIT_PENALTY, GAP_EXTEND_PENALTY,
            USE_TERMINAL_GAP_PENALTY)
        match = re.match('^-*(.*?)-*$', aligned_key)
        excerpt = aligned_source[match.start(1):match.end(1)].replace('-', '')
        keys.append(excerpt)
    key_sections[reference] = keys
    return keys
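
A hedged sketch of calling it, with the module-level key_sections cache and key_references list (only implied by the excerpt) stubbed as assumptions:

key_sections = {}                # hypothetical cache {reference: keys}
key_references = ['ACGTACGTAC']  # hypothetical key regions

print(calculate_keys('TTTACGTACGTACTTT'))  # the excerpt matching each key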
Example #15
def calculate_keys(reference):
    keys = key_sections.get(reference, None)
    if keys is not None:
        return keys

    GAP_INIT_PENALTY = 10
    GAP_EXTEND_PENALTY = 10
    USE_TERMINAL_GAP_PENALTY = False
    keys = []
    for key in key_references:
        # s1 is large sequence, s2 is key region
        aligned_source, aligned_key, _score = align_it(reference,
                                                       key,
                                                       GAP_INIT_PENALTY,
                                                       GAP_EXTEND_PENALTY,
                                                       USE_TERMINAL_GAP_PENALTY)
        match = re.match('^-*(.*?)-*$', aligned_key)
        excerpt = aligned_source[match.start(1):match.end(1)].replace('-', '')
        keys.append(excerpt)
    key_sections[reference] = keys
    return keys
Example #16
def trim_reads(reads, v3loop_ref, score_counts=None):
    """ Generator over reads that are aligned to the reference and trimmed.

    :param reads: generator from merge_reads()
    :param v3loop_ref: nucleotide sequence for V3LOOP
    :param score_counts: {score: count} to report on the alignment score
        distribution
    :return: Generator items (aligned_ref and aligned_seq may be None if merge
    or trim fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     (aligned_ref, aligned_seq))
    """
    # Measured as roughly halfway between HCV reads and V3LOOP reads
    min_v3_alignment_score = 2 * len(v3loop_ref)

    for pair_name, read1, read2, seq in reads:
        trimmed_aligned_ref = trimmed_aligned_seq = None
        if seq is not None:
            aligned_ref, aligned_seq, score = align_it(v3loop_ref,
                                                       seq,
                                                       GAP_OPEN_COST,
                                                       GAP_EXTEND_COST,
                                                       USE_TERMINAL_COST)
            if score_counts is not None:
                score_counts[score] += 1
            if score >= min_v3_alignment_score:
                left_padding = right_padding = 0
                for left_padding, nuc in enumerate(aligned_ref):
                    if nuc != '-':
                        break
                for right_padding, nuc in enumerate(reversed(aligned_ref)):
                    if nuc != '-':
                        break
                start, end = left_padding, -right_padding or None
                trimmed_aligned_ref = aligned_ref[start:end]
                trimmed_aligned_seq = aligned_seq[start:end]
        yield pair_name, read1, read2, (trimmed_aligned_ref,
                                        trimmed_aligned_seq)
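
These items usually come straight from merge_reads() above; a hedged chaining sketch, with read_pairs as a hypothetical iterable of paired reads:

score_counts = Counter()
for pair_name, read1, read2, (aligned_ref, aligned_seq) in trim_reads(
        merge_reads(read_pairs), v3loop_ref, score_counts):
    if aligned_ref is None:
        continue  # the pair failed to merge or to align to V3LOOP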
Example #17
def trim_reads(reads, v3loop_ref, score_counts=None):
    """ Generator over reads that are aligned to the reference and trimmed.

    :param reads: generator from merge_reads()
    :param v3loop_ref: nucleotide sequence for V3LOOP
    :param score_counts: {score: count} to report on the alignment score
        distribution
    :return: Generator items (aligned_ref and aligned_seq may be None if merge
    or trim fails):
    (pair_name,
     (read1_name, bases, quality),
     (read2_name, bases, quality),
     (aligned_ref, aligned_seq))
    """
    # Measured as roughly halfway between HCV reads and V3LOOP reads
    min_v3_alignment_score = 2 * len(v3loop_ref)

    for pair_name, read1, read2, seq in reads:
        trimmed_aligned_ref = trimmed_aligned_seq = None
        if seq is not None:
            aligned_ref, aligned_seq, score = align_it(v3loop_ref, seq,
                                                       GAP_OPEN_COST,
                                                       GAP_EXTEND_COST,
                                                       USE_TERMINAL_COST)
            if score_counts is not None:
                score_counts[score] += 1
            if score >= min_v3_alignment_score:
                left_padding = right_padding = 0
                for left_padding, nuc in enumerate(aligned_ref):
                    if nuc != '-':
                        break
                for right_padding, nuc in enumerate(reversed(aligned_ref)):
                    if nuc != '-':
                        break
                start, end = left_padding, -right_padding or None
                trimmed_aligned_ref = aligned_ref[start:end]
                trimmed_aligned_seq = aligned_seq[start:end]
        yield pair_name, read1, read2, (trimmed_aligned_ref,
                                        trimmed_aligned_seq)
Example #18
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   worker_pool=None,
                   filter_coverage=1,
                   distance_report=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with their
        mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set,
        any positions without coverage will be set to the base from the seed
        reference. If there are no reads mapped to a reference, it will not
        be included as a new consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param worker_pool: a pool to do some distributed processing
    @param filter_coverage: when filtering on consensus distance, only include
        portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @return: {reference_name: consensus_sequence}
    """

    if debug_reports:
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: {nuc: count}}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    if worker_pool is None:
        merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    else:
        merged_reads = worker_pool.imap_unordered(
            partial(merge_reads, quality_cutoff), pairs, chunksize=100)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
            if seeds:
                for i, nuc in enumerate(seeds[rname], 1):
                    pos_nucs[i][nuc] = 0
        update_counts(rname, qual1, qual2, mseq, merged_inserts, pos_nucs,
                      debug_reports)

    if debug_reports:
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(mixture)))
            debug_reports[key] = ', '.join(mixtures)

    new_conseqs = counts_to_conseqs(refmap)
    if not (seeds and is_filtered) or len(new_conseqs) < 2:
        return new_conseqs

    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    filtered_conseqs = {}
    for name in sorted(new_conseqs.keys()):
        conseq = new_conseqs[name]
        counts = refmap[name]
        relevant_conseq = u''
        for pos, c in enumerate(conseq, 1):
            pos_counts = sum(counts[pos].values())
            if pos_counts >= filter_coverage:
                relevant_conseq += c
        if not relevant_conseq:
            # None of the coverage was acceptable.
            continue

        other_seed = other_dist = seed_dist = None
        for seed_name in sorted(new_conseqs.keys()):
            seed_ref = seeds[seed_name]
            aligned_seed, aligned_conseq, _score = align_it(
                seed_ref, relevant_conseq, gap_open_penalty,
                gap_extend_penalty, use_terminal_gap_penalty)
            relevant_seed = extract_relevant_seed(aligned_conseq, aligned_seed)
            d = Levenshtein.distance(relevant_seed, relevant_conseq)
            if seed_name == name:
                seed_dist = d
            elif other_dist is None or d < other_dist:
                other_seed = seed_name
                other_dist = d

        if seed_dist <= other_dist:
            # Consensus is closer to starting seed than any other seed: keep it.
            filtered_conseqs[name] = conseq
        if distance_report is not None:
            distance_report[name] = dict(seed_dist=seed_dist,
                                         other_dist=other_dist,
                                         other_seed=other_seed)
    if not filtered_conseqs:
        # No reference had acceptable coverage, choose one with most reads.
        best_ref = read_counts.most_common(1)[0][0]
        filtered_conseqs[best_ref] = new_conseqs[best_ref]
    return filtered_conseqs
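
A hedged sketch of the simplest call, with the SAM path as an assumption:

with open('mapped.sam') as samfile:  # hypothetical input file
    conseqs = sam_to_conseqs(samfile, quality_cutoff=15)
for rname, conseq in conseqs.items():
    print(rname, len(conseq))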
Example #19
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   filter_coverage=1,
                   distance_report=None,
                   original_seeds=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with their
        mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set,
        any positions without coverage will be set to the base from the seed
        reference. If there are no reads mapped to a reference, it will not
        be included as a new consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param filter_coverage: when filtering on consensus distance, only include
        portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @param original_seeds: {name: sequence} Original seed references used in
        the distance report.
    @return: {reference_name: consensus_sequence}
    """

    if debug_reports:
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: Counter({nuc: count})}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(sorted(mixture))))
            debug_reports[key] = ', '.join(sorted(mixtures))

    new_conseqs = counts_to_conseqs(refmap, seeds)
    relevant_conseqs = None
    is_filtering = original_seeds and is_filtered

    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    while is_filtering and len(new_conseqs) > 1:
        drifted_seeds = []  # [(count, name)]
        if relevant_conseqs is None:
            relevant_conseqs = {}
            for name in sorted(new_conseqs.keys()):
                conseq = new_conseqs[name]
                counts = refmap[name]
                relevant_conseq = u''
                for pos, c in enumerate(conseq, 1):
                    pos_counts = sum(counts[pos].values())
                    if pos_counts >= filter_coverage:
                        relevant_conseq += c
                relevant_conseqs[name] = relevant_conseq
        for name in sorted(new_conseqs.keys()):
            relevant_conseq = relevant_conseqs[name]
            if not relevant_conseq:
                # None of the coverage was acceptable.
                drifted_seeds.append((read_counts[name], name))
                continue

            other_seed = other_dist = seed_dist = None
            for seed_name in sorted(new_conseqs.keys()):
                seed_ref = original_seeds[seed_name]
                aligned_seed, aligned_conseq, _score = align_it(seed_ref,
                                                                relevant_conseq,
                                                                gap_open_penalty,
                                                                gap_extend_penalty,
                                                                use_terminal_gap_penalty)
                relevant_seed = extract_relevant_seed(aligned_conseq, aligned_seed)
                d = Levenshtein.distance(relevant_seed, relevant_conseq)
                if seed_name == name:
                    seed_dist = d
                elif other_dist is None or d < other_dist:
                    other_seed = seed_name
                    other_dist = d

            if seed_dist > other_dist:
                # Consensus is farther from starting seed than another seed: drop it?
                drifted_seeds.append((read_counts[name], name))
            if distance_report is not None:
                distance_report[name] = dict(seed_dist=seed_dist,
                                             other_dist=other_dist,
                                             other_seed=other_seed)
        distance_report = None  # Only update during first iteration.
        if drifted_seeds:
            drifted_seeds.sort()
            dropped_seed = drifted_seeds[0][1]
            del new_conseqs[dropped_seed]
        is_filtering = len(drifted_seeds) > 1
    return new_conseqs
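
This variant filters drifted consensuses against the original seed references; a hedged sketch of that call, with seeds as an assumed {name: sequence} dict:

distance_report = {}
with open('mapped.sam') as samfile:  # hypothetical input file
    conseqs = sam_to_conseqs(samfile,
                             quality_cutoff=15,
                             seeds=seeds,
                             is_filtered=True,
                             distance_report=distance_report,
                             original_seeds=seeds)  # unmodified seeds assumed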
Example #20
def find_probes(contigs_csv, probes_csv):
    reader = DictReader(contigs_csv)
    columns = ['sample', 'contig']
    for target_name in TARGET_SEQUENCES:
        for column_type in [
                'in_contig_start', 'in_contig_size', 'in_hxb2_start',
                'in_hxb2_size', 'merged_hxb2_start', 'merged_hxb2_size',
                'dist', 'end_dist', 'score', 'is_reversed', 'seq'
        ]:
            columns.append(target_name + '_' + column_type)
    writer = DictWriter(probes_csv, columns)
    writer.writeheader()
    # projects = ProjectConfig.loadDefault()
    # hxb2 = projects.getReference('HIV1-B-FR-K03455-seed')
    hxb2 = utils.hxb2
    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    for sample_name, sample_rows in groupby(reader, itemgetter('sample')):
        contig_num = 0
        for row in sample_rows:
            seed_name = row.get('genotype') or row.get('ref') or row['region']
            conseq_cutoff = row.get('consensus-percent-cutoff')
            if conseq_cutoff and conseq_cutoff != 'MAX':
                continue
            contig_num += 1
            contig_name = f'{contig_num}-{seed_name}'
            contig_seq: str = row.get('contig') or row['sequence']
            aligned_hxb2, aligned_contig_to_hxb2, _ = align_it(
                hxb2, contig_seq, gap_open_penalty, gap_extend_penalty,
                use_terminal_gap_penalty)
            new_row = dict(sample=sample_name, contig=contig_name)
            for target_name, target_seq in TARGET_SEQUENCES.items():
                finder = ProbeFinder(contig_seq, target_seq)
                if not finder.valid:
                    return None
                size = len(finder.contig_match)
                start_pos = finder.start + 1
                end_pos = finder.start + size
                hxb2_pos = contig_pos = 0
                merged_hxb2_start = merged_hxb2_size = None
                for hxb2_nuc, contig_nuc in zip(aligned_hxb2,
                                                aligned_contig_to_hxb2):
                    if hxb2_nuc != '-':
                        hxb2_pos += 1
                    if contig_nuc != '-':
                        contig_pos += 1
                        if contig_pos == start_pos:
                            merged_hxb2_start = hxb2_pos
                        if contig_pos == end_pos:
                            merged_hxb2_size = hxb2_pos - merged_hxb2_start + 1
                            break

                aligned_ref, aligned_match, _ = align_it(
                    hxb2, finder.contig_match, gap_open_penalty,
                    gap_extend_penalty, use_terminal_gap_penalty)
                lstripped_match = aligned_match.lstrip('-')
                in_hxb2_start = len(aligned_match) - len(lstripped_match)
                tail_len = len(lstripped_match) - len(
                    lstripped_match.rstrip('-'))
                ref_match = aligned_ref[in_hxb2_start:-tail_len or None]
                in_hxb2_size = len(ref_match.replace('-', ''))

                prefix = target_name + '_'
                new_row[prefix + 'in_contig_start'] = start_pos
                new_row[prefix + 'in_contig_size'] = size
                new_row[prefix + 'in_hxb2_start'] = in_hxb2_start
                new_row[prefix + 'in_hxb2_size'] = in_hxb2_size
                new_row[prefix + 'merged_hxb2_start'] = merged_hxb2_start
                new_row[prefix + 'merged_hxb2_size'] = merged_hxb2_size
                new_row[prefix + 'dist'] = finder.dist
                new_row[prefix + 'end_dist'] = finder.end_dist
                new_row[prefix + 'score'] = finder.score
                new_row[prefix + 'is_reversed'] = ('Y' if finder.is_reversed
                                                   else 'N')
                new_row[prefix + 'seq'] = finder.contig_match
            writer.writerow(new_row)
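
A hedged sketch of wiring it up; both file names are assumptions:

with open('contigs.csv') as contigs_csv, \
        open('probes.csv', 'w', newline='') as probes_csv:
    find_probes(contigs_csv, probes_csv)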
Example #21
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   filter_coverage=1,
                   distance_report=None,
                   original_seeds=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with their
        mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set,
        any positions without coverage will be set to the base from the seed
        reference. If there are no reads mapped to a reference, it will not
        be included as a new consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param filter_coverage: when filtering on consensus distance, only include
        portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @param original_seeds: {name: sequence} Original seed references used in
        the distance report.
    @return: {reference_name: consensus_sequence}
    """

    if debug_reports:
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: Counter({nuc: count})}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    merged_reads = map(partial(merge_reads, quality_cutoff), pairs)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
        update_counts(rname, qual1, qual2, mseq, merged_inserts, pos_nucs,
                      debug_reports)

    if debug_reports:
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(sorted(mixture))))
            debug_reports[key] = ', '.join(sorted(mixtures))

    new_conseqs = counts_to_conseqs(refmap, seeds)
    relevant_conseqs = None
    is_filtering = original_seeds and is_filtered

    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    while is_filtering and len(new_conseqs) > 1:
        drifted_seeds = []  # [(count, name)]
        if relevant_conseqs is None:
            relevant_conseqs = {}
            for name in sorted(new_conseqs.keys()):
                conseq = new_conseqs[name]
                counts = refmap[name]
                relevant_conseq = u''
                for pos, c in enumerate(conseq, 1):
                    pos_counts = sum(counts[pos].values())
                    if pos_counts >= filter_coverage:
                        relevant_conseq += c
                relevant_conseqs[name] = relevant_conseq
        for name in sorted(new_conseqs.keys()):
            relevant_conseq = relevant_conseqs[name]
            if not relevant_conseq:
                # None of the coverage was acceptable.
                drifted_seeds.append((read_counts[name], name))
                continue

            other_seed = other_dist = seed_dist = None
            for seed_name in sorted(new_conseqs.keys()):
                seed_ref = original_seeds[seed_name]
                aligned_seed, aligned_conseq, _score = align_it(
                    seed_ref, relevant_conseq, gap_open_penalty,
                    gap_extend_penalty, use_terminal_gap_penalty)
                relevant_seed = extract_relevant_seed(aligned_conseq,
                                                      aligned_seed)
                d = Levenshtein.distance(relevant_seed, relevant_conseq)
                if seed_name == name:
                    seed_dist = d
                elif other_dist is None or d < other_dist:
                    other_seed = seed_name
                    other_dist = d

            if seed_dist > other_dist:
                # Consensus is farther from starting seed than another seed: drop it?
                drifted_seeds.append((read_counts[name], name))
            if distance_report is not None:
                distance_report[name] = dict(seed_dist=seed_dist,
                                             other_dist=other_dist,
                                             other_seed=other_seed)
        distance_report = None  # Only update during first iteration.
        if drifted_seeds:
            drifted_seeds.sort()
            dropped_seed = drifted_seeds[0][1]
            del new_conseqs[dropped_seed]
        is_filtering = len(drifted_seeds) > 1
    return new_conseqs
Example #22
def sam_to_conseqs(samfile,
                   quality_cutoff=0,
                   debug_reports=None,
                   seeds=None,
                   is_filtered=False,
                   worker_pool=None,
                   filter_coverage=1,
                   distance_report=None):
    """ Build consensus sequences for each reference from a SAM file.

    @param samfile: an open file in the SAM format containing reads with their
        mapped position and quality scores
    @param quality_cutoff: minimum quality score for a base to be counted
    @param debug_reports: {(rname, pos): None} a dictionary with keys for all
        of the regions and positions that you want a report for. The value
        will be set to a string describing the counts and qualities at that
        position.
    @param seeds: {name: sequence} If this is set,
        any positions without coverage will be set to the base from the seed
        reference. If there are no reads mapped to a reference, it will not
        be included as a new consensus.
    @param is_filtered: if True, then any consensus that has migrated so far
        from its seed that it is closer to a different seed, will not be
        included as a new consensus.
    @param worker_pool: a pool to do some distributed processing
    @param filter_coverage: when filtering on consensus distance, only include
        portions with at least this depth of coverage
    @param distance_report: empty dictionary or None. Dictionary will return:
        {rname: {'seed_dist': seed_dist, 'other_dist': other_dist,
        'other_seed': other_seed}}
    @return: {reference_name: consensus_sequence}
    """

    if debug_reports:
        for key in debug_reports.keys():
            debug_reports[key] = Counter()

    # refmap structure: {refname: {pos: {nuc: count}}}
    refmap = {}

    pairs = matchmaker(samfile, include_singles=True)
    if worker_pool is None:
        merged_reads = map(
            partial(merge_reads, quality_cutoff),
            pairs)
    else:
        merged_reads = worker_pool.imap_unordered(
            partial(merge_reads, quality_cutoff),
            pairs,
            chunksize=100)
    read_counts = Counter()
    for merged_read in merged_reads:
        if merged_read is None:
            continue
        rname, mseq, merged_inserts, qual1, qual2 = merged_read
        read_counts[rname] += 1
        pos_nucs = refmap.get(rname)
        if pos_nucs is None:
            pos_nucs = refmap[rname] = defaultdict(Counter)
            if seeds:
                for i, nuc in enumerate(seeds[rname], 1):
                    pos_nucs[i][nuc] = 0
        update_counts(rname,
                      qual1,
                      qual2,
                      mseq,
                      merged_inserts,
                      pos_nucs,
                      debug_reports)

    if debug_reports:
        for key, counts in debug_reports.items():
            mixtures = []
            nucs = set()
            qualities = set()
            for nuc, quality in counts.keys():
                nucs.add(nuc)
                qualities.add(quality)
            qualities = sorted(qualities)
            for min_quality in qualities:
                filtered_counts = Counter()
                for (nuc, nuc_qual), count in counts.items():
                    if nuc_qual >= min_quality:
                        filtered_counts[nuc] += count
                mixture = []
                for nuc, count in filtered_counts.items():
                    mixture.append('{}: {}'.format(nuc, count))
                mixtures.append('{}{{{}}}'.format(min_quality,
                                                  ', '.join(mixture)))
            debug_reports[key] = ', '.join(mixtures)

    new_conseqs = counts_to_conseqs(refmap)
    if not (seeds and is_filtered) or len(new_conseqs) < 2:
        return new_conseqs

    gap_open_penalty = 15
    gap_extend_penalty = 3
    use_terminal_gap_penalty = 1
    filtered_conseqs = {}
    for name in sorted(new_conseqs.keys()):
        conseq = new_conseqs[name]
        counts = refmap[name]
        relevant_conseq = u''
        for pos, c in enumerate(conseq, 1):
            pos_counts = sum(counts[pos].values())
            if pos_counts >= filter_coverage:
                relevant_conseq += c
        if not relevant_conseq:
            # None of the coverage was acceptable.
            continue

        other_seed = other_dist = seed_dist = None
        for seed_name in sorted(new_conseqs.keys()):
            seed_ref = seeds[seed_name]
            aligned_seed, aligned_conseq, _score = align_it(seed_ref,
                                                            relevant_conseq,
                                                            gap_open_penalty,
                                                            gap_extend_penalty,
                                                            use_terminal_gap_penalty)
            relevant_seed = extract_relevant_seed(aligned_conseq, aligned_seed)
            d = Levenshtein.distance(relevant_seed, relevant_conseq)
            if seed_name == name:
                seed_dist = d
            elif other_dist is None or d < other_dist:
                other_seed = seed_name
                other_dist = d

        if seed_dist <= other_dist:
            # Consensus is closer to starting seed than any other seed: keep it.
            filtered_conseqs[name] = conseq
        if distance_report is not None:
            distance_report[name] = dict(seed_dist=seed_dist,
                                         other_dist=other_dist,
                                         other_seed=other_seed)
    if not filtered_conseqs:
        # No reference had acceptable coverage, choose one with most reads.
        best_ref = read_counts.most_common(1)[0][0]
        filtered_conseqs[best_ref] = new_conseqs[best_ref]
    return filtered_conseqs