Code example #1
    def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
        q_seq = q_seq[q_st:q_en]

        a = pysam.AlignedSegment()
        a.query_name = read_id
        a.query_sequence = q_seq if strand == 1 else mh.revcomp(q_seq)
        a.flag = 0 if strand == 1 else 16
        a.reference_id = map_fp.get_tid(chrm)
        a.reference_start = r_st
        a.cigartuples = [(op, op_l) for op_l, op in cigar]
        a.template_length = q_en - q_st
        map_fp.write(a)

        nalign, nmatch, ndel, nins = [
            0,
        ] * 4
        for op_len, op in cigar:
            if op not in (4, 5): nalign += op_len
            if op in (0, 7): nmatch += op_len
            elif op in (2, 3): ndel += op_len
            elif op == 1: nins += op_len
        # compute alignment stats
        summ_fp.write('{}\t{:.2f}\t{}\t{}\t{}\t{}\n'.format(
            read_id, 100 * nmatch / float(nalign), nalign, nmatch, ndel, nins))
        summ_fp.flush()

        return
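
The write_alignment closure above writes to two handles it does not create itself: map_fp (a pysam.AlignmentFile opened for writing, which also provides get_tid) and summ_fp (a plain tab-separated text file). A minimal sketch of how those handles could be opened is given below; the file names, contig names, and lengths are placeholders, not values from the project.

import pysam

# hypothetical setup for the two handles the closure writes to
ref_names = ['chr1', 'chr2']      # placeholder contig names
ref_lens = [1000000, 2000000]     # placeholder contig lengths
map_fp = pysam.AlignmentFile('mappings.bam', 'wb',
                             reference_names=ref_names,
                             reference_lengths=ref_lens)
summ_fp = open('mappings.summary.txt', 'w')
# column order matches the format string used in write_alignment
summ_fp.write('read_id\tpct_identity\tnum_align\tnum_match\tnum_del\tnum_ins\n')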
Code example #2
    def write_alignment(map_res):
        # convert tuple back to namedtuple
        map_res = MAP_RES(*map_res)
        nalign, nmatch, ndel, nins = [
            0,
        ] * 4
        for op_len, op in map_res.cigar:
            if op not in (4, 5):
                nalign += op_len
            if op in (0, 7):
                nmatch += op_len
            elif op in (2, 3):
                ndel += op_len
            elif op == 1:
                nins += op_len
        bc_len = len(map_res.q_seq)
        q_seq = map_res.q_seq[map_res.q_st:map_res.q_en]

        a = prepare_mapping(
            map_res.read_id,
            q_seq if map_res.strand == 1 else mh.revcomp(q_seq),
            flag=get_map_flag(map_res.strand, map_res.map_num),
            ref_id=map_fp.get_tid(map_res.ctg),
            ref_st=map_res.r_st,
            map_qual=map_res.mapq,
            cigartuples=[(op, op_l) for op_l, op in map_res.cigar],
            tags=[('NM', nalign - nmatch)])
        map_fp.write(a)

        # compute alignment stats
        r_map_summ = MAP_SUMM(
            read_id=map_res.read_id,
            pct_identity=100 * nmatch / float(nalign),
            num_align=nalign,
            num_match=nmatch,
            num_del=ndel,
            num_ins=nins,
            read_pct_coverage=((map_res.q_en - map_res.q_st) * 100 /
                               float(bc_len)),
            chrom=map_res.ctg,
            strand=mh.int_strand_to_str(map_res.strand),
            start=map_res.r_st,
            end=map_res.r_st + nalign - nins,
            query_start=map_res.q_st,
            query_end=map_res.q_en,
            map_sig_start=map_res.map_sig_start,
            map_sig_end=map_res.map_sig_end,
            sig_len=map_res.sig_len,
            map_num=map_res.map_num)
        summ_fp.write(MAP_SUMM_TMPLT.format(r_map_summ))

        if ref_out_info.do_output.pr_refs and read_passes_filters(
                ref_out_info.filt_params, len(map_res.q_seq), map_res.q_st,
                map_res.q_en, map_res.cigar) and map_res.map_num == 0:
            pr_ref_fp.write('>{}\n{}\n'.format(map_res.read_id,
                                               map_res.ref_seq))
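
This variant delegates flag construction to get_map_flag and record construction to prepare_mapping, both project helpers that are not shown here. As a rough guess at the flag logic only (a hypothetical stand-in, not the project's actual implementation): SAM flag 16 marks the reverse strand and 256 marks any alignment beyond the primary one, which map_num > 0 would indicate.

def get_map_flag_sketch(strand, map_num):
    # hypothetical stand-in for the get_map_flag helper
    flag = 0 if strand == 1 else 16   # 0x10: read mapped to reverse strand
    if map_num > 0:
        flag |= 256                   # 0x100: secondary (non-primary) alignment
    return flag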
Code example #3
File: mapping.py  Project: wanhsuan-Lee/megalodon
def align_read(q_seq, aligner, map_thr_buf, read_id=None):
    try:
        # enumerate all alignments to avoid memory leak from mappy
        r_algn = list(aligner.map(str(q_seq), buf=map_thr_buf))[0]
    except IndexError:
        # alignment not produced
        return None

    ref_seq = aligner.seq(r_algn.ctg, r_algn.r_st, r_algn.r_en)
    if r_algn.strand == -1:
        ref_seq = mh.revcomp(ref_seq)
    return MAP_RES(
        read_id=read_id, q_seq=q_seq, ref_seq=ref_seq, ctg=r_algn.ctg,
        strand=r_algn.strand, r_st=r_algn.r_st, r_en=r_algn.r_en,
        q_st=r_algn.q_st, q_en=r_algn.q_en, cigar=r_algn.cigar)
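
align_read expects a ready-made mappy aligner and a per-thread alignment buffer, and returns a MAP_RES namedtuple (or None when no alignment is produced). A rough usage sketch, with the reference path and read values as placeholders:

import mappy

aligner = mappy.Aligner('reference.fa', preset='map-ont')  # placeholder reference
map_thr_buf = mappy.ThreadBuffer()  # one reusable buffer per worker thread
map_res = align_read('ACGTACGT', aligner, map_thr_buf, read_id='read_0001')
if map_res is not None:
    print(map_res.ctg, map_res.strand, map_res.r_st, map_res.r_en)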
Code example #4
File: mapping.py  Project: lcerdeira/megalodon
def align_read(q_seq, aligner, map_thr_buf, read_id=None):
    try:
        # enumerate all alignments to avoid memory leak from mappy
        r_algn = list(aligner.map(str(q_seq), buf=map_thr_buf))[0]
    except IndexError:
        # alignment not produced
        return [None, None], None

    ref_seq = aligner.seq(r_algn.ctg, r_algn.r_st, r_algn.r_en)
    if r_algn.strand == -1:
        ref_seq = mh.revcomp(ref_seq)
    r_algn_data = [
        r_algn.ctg, r_algn.strand, r_algn.r_st, r_algn.r_en, r_algn.q_st,
        r_algn.q_en, r_algn.cigar
    ]
    return [ref_seq,
            r_algn_data], (read_id, q_seq, r_algn.ctg, r_algn.strand,
                           r_algn.r_st, r_algn.q_st, r_algn.q_en, r_algn.cigar)
Code example #5
    def parse_alignment(r_algn, map_num=0):
        ref_seq = aligner.seq(r_algn.ctg, r_algn.r_st, r_algn.r_en)
        if r_algn.strand == -1:
            ref_seq = mh.revcomp(ref_seq)
        r_map_res = MAP_RES(read_id=read_id,
                            q_seq=q_seq,
                            ref_seq=ref_seq,
                            ctg=r_algn.ctg,
                            strand=r_algn.strand,
                            r_st=r_algn.r_st,
                            r_en=r_algn.r_en,
                            q_st=r_algn.q_st,
                            q_en=r_algn.q_en,
                            cigar=r_algn.cigar,
                            map_num=map_num,
                            mapq=r_algn.mapq)
        if return_tuple:
            return tuple(r_map_res)
        return r_map_res
Code example #6
File: mapping.py  Project: lcerdeira/megalodon
    def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
        nalign, nmatch, ndel, nins = [
            0,
        ] * 4
        for op_len, op in cigar:
            if op not in (4, 5):
                nalign += op_len
            if op in (0, 7):
                nmatch += op_len
            elif op in (2, 3):
                ndel += op_len
            elif op == 1:
                nins += op_len
        bc_len = len(q_seq)
        q_seq = q_seq[q_st:q_en]

        a = pysam.AlignedSegment()
        a.query_name = read_id
        a.query_sequence = q_seq if strand == 1 else mh.revcomp(q_seq)
        a.flag = 0 if strand == 1 else 16
        a.reference_id = map_fp.get_tid(chrm)
        a.reference_start = r_st
        a.cigartuples = [(op, op_l) for op_l, op in cigar]
        a.template_length = q_en - q_st
        # add NM tag containing edit distance to the reference
        a.tags = (('NM', nalign - nmatch), )
        map_fp.write(a)

        # compute alignment stats
        r_map_summ = MAP_SUMM(read_id=read_id,
                              pct_identity=100 * nmatch / float(nalign),
                              num_align=nalign,
                              num_match=nmatch,
                              num_del=ndel,
                              num_ins=nins,
                              read_pct_coverage=(q_en - q_st) * 100 /
                              float(bc_len),
                              chrom=chrm,
                              strand=mh.int_strand_to_str(strand),
                              start=r_st,
                              end=r_st + nalign - nins)
        summ_fp.write(MAP_SUMM_TMPLT.format(r_map_summ))
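
The integer op codes tested in the counting loops above are the standard BAM CIGAR operations (M=0, I=1, D=2, N=3, S=4, H=5, P=6, '='=7, X=8 per the SAM specification). The same counting logic rewritten with named constants, purely as a reading aid:

# BAM CIGAR op codes from the SAM specification
CIG_MATCH, CIG_INS, CIG_DEL, CIG_REF_SKIP = 0, 1, 2, 3
CIG_SOFT_CLIP, CIG_HARD_CLIP, CIG_EQUAL = 4, 5, 7

nalign = nmatch = ndel = nins = 0
for op_len, op in cigar:
    if op not in (CIG_SOFT_CLIP, CIG_HARD_CLIP):
        nalign += op_len    # clipped bases do not count toward the alignment
    if op in (CIG_MATCH, CIG_EQUAL):
        nmatch += op_len    # aligned (M) or exactly matching (=) bases
    elif op in (CIG_DEL, CIG_REF_SKIP):
        ndel += op_len      # reference bases deleted or skipped in the read
    elif op == CIG_INS:
        nins += op_len      # read bases inserted relative to the reference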
Code example #7
    def write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm, strand,
                                 r_st, snp_cigar):
        a = pysam.AlignedSegment()
        a.query_name = read_id
        a.flag = 0 if strand == 1 else 16
        a.reference_id = whatshap_map_fp.get_tid(chrm)
        a.reference_start = r_st
        a.template_length = len(snp_seq)
        a.mapping_quality = WHATSHAP_MAX_QUAL
        a.set_tags([('RG', WHATSHAP_RG_ID)])

        # convert to reference based sequence
        if strand == -1:
            snp_seq = mh.revcomp(snp_seq)
            snp_quals = snp_quals[::-1]
            snp_cigar = snp_cigar[::-1]
        a.query_sequence = snp_seq
        a.query_qualities = array('B', snp_quals)
        a.cigartuples = snp_cigar
        whatshap_map_fp.write(a)

        return
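
write_whatshap_alignment tags every record with the WHATSHAP_RG_ID read group, so whatshap_map_fp must have been opened with a matching RG entry in its header. A rough sketch of such a setup; the header contents and both constant values are placeholders rather than values from the project:

import pysam

WHATSHAP_RG_ID = '1'      # placeholder read-group id
WHATSHAP_MAX_QUAL = 40    # placeholder mapping-quality value

header = {
    'HD': {'VN': '1.0'},
    'SQ': [{'SN': 'chr1', 'LN': 1000000}],  # placeholder contigs
    'RG': [{'ID': WHATSHAP_RG_ID}],
}
whatshap_map_fp = pysam.AlignmentFile('whatshap_mappings.bam', 'wb', header=header)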
Code example #8
def call_read_snps(snps_data, r_ref_pos, edge_buffer, r_ref_seq, rl_cumsum,
                   r_to_q_poss, r_post, post_mapped_start):
    # call all snps overlapping this read
    r_snp_calls = []
    for (snp_ref_seq, snp_alt_seqs, snp_id,
         snp_ref_pos) in snps_data.iter_overlapping_snps(
             r_ref_pos, edge_buffer):

        if r_ref_pos.strand == 1:
            read_pos = snp_ref_pos - r_ref_pos.start
            read_ref_seq = snp_ref_seq
            read_alt_seqs = snp_alt_seqs
        else:
            read_pos = r_ref_pos.end - snp_ref_pos - len(snp_ref_seq)
            read_ref_seq = mh.revcomp(snp_ref_seq)
            read_alt_seqs = [mh.revcomp(alt_seq) for alt_seq in snp_alt_seqs]

        # select single base SNP or indel context width
        snp_context_bases = snps_data.indel_context if all(
            len(snp_ref_seq) == len(snp_alt_seq)
            for snp_alt_seq in snp_alt_seqs) else snps_data.snp_context
        pos_bb = min(snp_context_bases, read_pos)
        pos_ab = min(snp_context_bases,
                     r_ref_seq.shape[0] - read_pos - len(read_ref_seq))
        pos_ref_seq = r_ref_seq[read_pos - pos_bb:read_pos + pos_ab +
                                len(read_ref_seq)]
        # TODO move this to an initial check of a small number of variants
        # against the reference
        if any(pos_ref_seq[pos_bb:pos_bb + len(snp_ref_seq)] != np.array(
            [mh.ALPHABET.find(b) for b in read_ref_seq])):
            # variant reference sequence does not match fasta reference
            logger = logging.get_logger()
            logger.debug(
                '*' * 10 +
                'Reference seq at {} expected {}[{}]{} got "{}"'.format(
                    snp_ref_pos,
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb - 3:pos_bb]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb:pos_bb +
                                                 len(snp_ref_seq)]),
                    ''.join(
                        mh.ALPHABET[b]
                        for b in pos_ref_seq[pos_bb + len(snp_ref_seq):pos_bb +
                                             len(snp_ref_seq) + 3]),
                    read_ref_seq,
                ) + '*' * 10)
            continue
        blk_start = rl_cumsum[r_to_q_poss[read_pos - pos_bb]]
        blk_end = rl_cumsum[r_to_q_poss[read_pos + pos_ab] + 1]
        if blk_end - blk_start < max(
                len(pos_ref_seq),
                max(len(read_alt_seq) for read_alt_seq in read_alt_seqs)):
            # no valid mapping over large inserted query bases
            # i.e. need as many "events/strides" as bases for valid mapping
            continue

        loc_ref_score = score_seq(r_post, pos_ref_seq,
                                  post_mapped_start + blk_start,
                                  post_mapped_start + blk_end,
                                  snps_data.all_paths)
        loc_alt_llrs = []
        for read_alt_seq in read_alt_seqs:
            pos_alt_seq = np.concatenate([
                pos_ref_seq[:pos_bb],
                np.array([mh.ALPHABET.find(b) for b in read_alt_seq],
                         dtype=np.uintp),
                pos_ref_seq[pos_bb + len(snp_ref_seq):]
            ])
            loc_alt_score = score_seq(r_post, pos_alt_seq,
                                      post_mapped_start + blk_start,
                                      post_mapped_start + blk_end,
                                      snps_data.all_paths)
            # calibrate log probs
            loc_alt_llrs.append(
                snps_data.calibrate_llr(loc_ref_score - loc_alt_score,
                                        read_ref_seq, read_alt_seq))

        # due to calibration multi-allelic log likelihoods could result in
        # inferred negative reference likelihood, so re-normalize here
        loc_alt_log_ps = calibration.compute_log_probs(np.array(loc_alt_llrs))

        r_snp_calls.append(
            (snp_ref_pos, loc_alt_log_ps, snp_ref_seq, snp_alt_seqs, snp_id))

    return r_snp_calls
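
The comment near the end notes that calibrated per-allele log likelihood ratios can imply a negative (unnormalized) reference likelihood, so the alternate log probabilities are re-normalized before being stored. A minimal sketch of one way such a normalization could work, assuming llr_i = log P(ref) - log P(alt_i) and fixing the reference term at 0; this is only an illustration, not necessarily what calibration.compute_log_probs does:

import numpy as np

def compute_log_probs_sketch(alt_llrs):
    # unnormalized log probs: reference fixed at 0, each alternate at -llr
    unnorm = np.concatenate([[0.0], -np.asarray(alt_llrs, dtype=float)])
    # numerically stable log-sum-exp over reference plus all alternates
    m = unnorm.max()
    log_norm = m + np.log(np.exp(unnorm - m).sum())
    # return normalized log probabilities for the alternate alleles only
    return unnorm[1:] - log_norm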