def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
    """Emit one read's mapping as a pysam record plus a summary-stats line.

    Writes the alignment to the module-level ``map_fp`` and a tab-separated
    stats line to ``summ_fp`` (flushed after every record).

    Args:
        read_id: read identifier used as the query name.
        q_seq: full basecalled sequence (clipped to ``q_st:q_en`` here).
        chrm: reference contig name.
        strand: 1 for forward, -1 for reverse mappings.
        r_st: 0-based reference start position.
        q_st, q_en: query coordinates of the aligned portion.
        cigar: mappy-style cigar, iterable of (op_length, op_code) pairs.
    """
    aligned_seq = q_seq[q_st:q_en]

    rec = pysam.AlignedSegment()
    rec.query_name = read_id
    # reverse-strand hits are stored reverse-complemented with FLAG 16
    rec.query_sequence = (
        aligned_seq if strand == 1 else mh.revcomp(aligned_seq))
    rec.flag = 0 if strand == 1 else 16
    rec.reference_id = map_fp.get_tid(chrm)
    rec.reference_start = r_st
    # mappy cigars are (length, op); pysam expects (op, length)
    rec.cigartuples = [(cig_op, cig_len) for cig_len, cig_op in cigar]
    rec.template_length = q_en - q_st
    map_fp.write(rec)

    # tally aligned/match/deletion/insertion base counts from the cigar
    n_align = n_match = n_del = n_ins = 0
    for cig_len, cig_op in cigar:
        if cig_op not in (4, 5):  # soft/hard clipped bases are not aligned
            n_align += cig_len
        if cig_op in (0, 7):  # M or = ops
            n_match += cig_len
        elif cig_op in (2, 3):  # D or N ops
            n_del += cig_len
        elif cig_op == 1:  # I op
            n_ins += cig_len

    # compute alignment stats
    summ_fp.write('{}\t{:.2f}\t{}\t{}\t{}\t{}\n'.format(
        read_id, 100 * n_match / float(n_align), n_align, n_match, n_del,
        n_ins))
    summ_fp.flush()

    return
def write_alignment(map_res):
    """Write a mapping result to the alignment, summary and (optionally)
    per-read reference outputs.

    ``map_res`` arrives from a worker as a plain tuple and is converted back
    to a ``MAP_RES`` namedtuple. Uses the module-level output handles
    ``map_fp``, ``summ_fp`` and ``pr_ref_fp`` plus ``ref_out_info``.
    """
    # convert tuple back to namedtuple
    map_res = MAP_RES(*map_res)

    # tally aligned/match/deletion/insertion base counts from the cigar
    num_align = num_match = num_del = num_ins = 0
    for op_len, op in map_res.cigar:
        if op not in (4, 5):  # soft/hard clipped bases are not aligned
            num_align += op_len
        if op in (0, 7):  # M or = ops
            num_match += op_len
        elif op in (2, 3):  # D or N ops
            num_del += op_len
        elif op == 1:  # I op
            num_ins += op_len

    bc_len = len(map_res.q_seq)
    q_seq = map_res.q_seq[map_res.q_st:map_res.q_en]
    mapping_rec = prepare_mapping(
        map_res.read_id,
        # reverse-strand hits are stored reverse-complemented
        q_seq if map_res.strand == 1 else mh.revcomp(q_seq),
        flag=get_map_flag(map_res.strand, map_res.map_num),
        ref_id=map_fp.get_tid(map_res.ctg),
        ref_st=map_res.r_st,
        map_qual=map_res.mapq,
        # mappy cigars are (length, op); pysam expects (op, length)
        cigartuples=[(op, op_l) for op_l, op in map_res.cigar],
        tags=[('NM', num_align - num_match)])
    map_fp.write(mapping_rec)

    # compute alignment stats
    read_summ = MAP_SUMM(
        read_id=map_res.read_id,
        pct_identity=100 * num_match / float(num_align),
        num_align=num_align,
        num_match=num_match,
        num_del=num_del,
        num_ins=num_ins,
        read_pct_coverage=(
            (map_res.q_en - map_res.q_st) * 100 / float(bc_len)),
        chrom=map_res.ctg,
        strand=mh.int_strand_to_str(map_res.strand),
        start=map_res.r_st,
        end=map_res.r_st + num_align - num_ins,
        query_start=map_res.q_st,
        query_end=map_res.q_en,
        map_sig_start=map_res.map_sig_start,
        map_sig_end=map_res.map_sig_end,
        sig_len=map_res.sig_len,
        map_num=map_res.map_num)
    summ_fp.write(MAP_SUMM_TMPLT.format(read_summ))

    # optionally emit the per-read reference sequence, primary mappings only
    if (ref_out_info.do_output.pr_refs and read_passes_filters(
            ref_out_info.filt_params, len(map_res.q_seq), map_res.q_st,
            map_res.q_en, map_res.cigar) and map_res.map_num == 0):
        pr_ref_fp.write('>{}\n{}\n'.format(map_res.read_id, map_res.ref_seq))
def align_read(q_seq, aligner, map_thr_buf, read_id=None):
    """Map ``q_seq`` with mappy and return a MAP_RES for the primary hit.

    Args:
        q_seq: query sequence (converted to str before mapping).
        aligner: mappy.Aligner instance.
        map_thr_buf: per-thread mappy alignment buffer.
        read_id: optional identifier carried through into the result.

    Returns:
        MAP_RES for the first reported alignment, or None when no
        alignment is produced.
    """
    # materialize all alignments to avoid memory leak from mappy
    alignments = list(aligner.map(str(q_seq), buf=map_thr_buf))
    if not alignments:
        # alignment not produced
        return None
    top_aln = alignments[0]

    ref_seq = aligner.seq(top_aln.ctg, top_aln.r_st, top_aln.r_en)
    if top_aln.strand == -1:
        # report reference sequence in read orientation
        ref_seq = mh.revcomp(ref_seq)
    return MAP_RES(
        read_id=read_id, q_seq=q_seq, ref_seq=ref_seq, ctg=top_aln.ctg,
        strand=top_aln.strand, r_st=top_aln.r_st, r_en=top_aln.r_en,
        q_st=top_aln.q_st, q_en=top_aln.q_en, cigar=top_aln.cigar)
def align_read(q_seq, aligner, map_thr_buf, read_id=None):
    """Map ``q_seq`` with mappy and return the primary alignment.

    Args:
        q_seq: query sequence (converted to str before mapping).
        aligner: mappy.Aligner instance.
        map_thr_buf: per-thread mappy alignment buffer.
        read_id: optional identifier included in the mapping output tuple.

    Returns:
        Two values: ``[ref_seq, r_algn_data]`` and a flat tuple of mapping
        attributes for downstream output. On failure returns
        ``[None, None], None``.
    """
    # materialize all alignments to avoid memory leak from mappy
    alignments = list(aligner.map(str(q_seq), buf=map_thr_buf))
    if not alignments:
        # alignment not produced
        return [None, None], None
    aln = alignments[0]

    ref_seq = aligner.seq(aln.ctg, aln.r_st, aln.r_en)
    if aln.strand == -1:
        # report reference sequence in read orientation
        ref_seq = mh.revcomp(ref_seq)
    aln_data = [
        aln.ctg, aln.strand, aln.r_st, aln.r_en,
        aln.q_st, aln.q_en, aln.cigar]
    return [ref_seq, aln_data], (
        read_id, q_seq, aln.ctg, aln.strand, aln.r_st,
        aln.q_st, aln.q_en, aln.cigar)
def parse_alignment(r_algn, map_num=0):
    """Convert one mappy hit into a MAP_RES.

    Closes over ``aligner``, ``read_id``, ``q_seq`` and ``return_tuple``
    from the enclosing scope.

    Args:
        r_algn: a mappy alignment object.
        map_num: 0 for the primary mapping, >0 for supplementary ones.

    Returns:
        MAP_RES, or a plain tuple of its fields when ``return_tuple``
        is set (e.g. for cross-process transfer).
    """
    ref_seq = aligner.seq(r_algn.ctg, r_algn.r_st, r_algn.r_en)
    if r_algn.strand == -1:
        # report reference sequence in read orientation
        ref_seq = mh.revcomp(ref_seq)
    mapping = MAP_RES(
        read_id=read_id, q_seq=q_seq, ref_seq=ref_seq, ctg=r_algn.ctg,
        strand=r_algn.strand, r_st=r_algn.r_st, r_en=r_algn.r_en,
        q_st=r_algn.q_st, q_en=r_algn.q_en, cigar=r_algn.cigar,
        map_num=map_num, mapq=r_algn.mapq)
    return tuple(mapping) if return_tuple else mapping
def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
    """Write one read's mapping to ``map_fp`` and its alignment statistics
    (as a MAP_SUMM record) to ``summ_fp``.

    Args:
        read_id: read identifier used as the query name.
        q_seq: full basecalled sequence (clipped to ``q_st:q_en`` here).
        chrm: reference contig name.
        strand: 1 for forward, -1 for reverse mappings.
        r_st: 0-based reference start position.
        q_st, q_en: query coordinates of the aligned portion.
        cigar: mappy-style cigar, iterable of (op_length, op_code) pairs.
    """
    # tally aligned/match/deletion/insertion base counts from the cigar
    num_align = num_match = num_del = num_ins = 0
    for op_len, op in cigar:
        if op not in (4, 5):  # soft/hard clipped bases are not aligned
            num_align += op_len
        if op in (0, 7):  # M or = ops
            num_match += op_len
        elif op in (2, 3):  # D or N ops
            num_del += op_len
        elif op == 1:  # I op
            num_ins += op_len

    bc_len = len(q_seq)
    q_seq = q_seq[q_st:q_en]

    rec = pysam.AlignedSegment()
    rec.query_name = read_id
    # reverse-strand hits are stored reverse-complemented with FLAG 16
    rec.query_sequence = q_seq if strand == 1 else mh.revcomp(q_seq)
    rec.flag = 0 if strand == 1 else 16
    rec.reference_id = map_fp.get_tid(chrm)
    rec.reference_start = r_st
    # mappy cigars are (length, op); pysam expects (op, length)
    rec.cigartuples = [(op, op_l) for op_l, op in cigar]
    rec.template_length = q_en - q_st
    # add NM tag containing edit distance to the reference
    rec.tags = (('NM', num_align - num_match), )
    map_fp.write(rec)

    # compute alignment stats
    read_summ = MAP_SUMM(
        read_id=read_id,
        pct_identity=100 * num_match / float(num_align),
        num_align=num_align,
        num_match=num_match,
        num_del=num_del,
        num_ins=num_ins,
        read_pct_coverage=(q_en - q_st) * 100 / float(bc_len),
        chrom=chrm,
        strand=mh.int_strand_to_str(strand),
        start=r_st,
        end=r_st + num_align - num_ins)
    summ_fp.write(MAP_SUMM_TMPLT.format(read_summ))
def write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm, strand, r_st,
                             snp_cigar):
    """Write a variant-annotated read alignment for whatshap phasing.

    Args:
        read_id: read identifier used as the query name.
        snp_seq: read sequence in read orientation.
        snp_quals: per-base qualities matching ``snp_seq``.
        chrm: reference contig name.
        strand: 1 for forward, -1 for reverse mappings.
        r_st: 0-based reference start position.
        snp_cigar: pysam-style cigartuples for this record.

    Writes to the module-level ``whatshap_map_fp``.
    """
    rec = pysam.AlignedSegment()
    rec.query_name = read_id
    rec.flag = 0 if strand == 1 else 16
    rec.reference_id = whatshap_map_fp.get_tid(chrm)
    rec.reference_start = r_st
    rec.template_length = len(snp_seq)
    rec.mapping_quality = WHATSHAP_MAX_QUAL
    rec.set_tags([('RG', WHATSHAP_RG_ID)])

    # convert to reference based sequence
    if strand == -1:
        snp_seq = mh.revcomp(snp_seq)
        snp_quals = snp_quals[::-1]
        snp_cigar = snp_cigar[::-1]
    rec.query_sequence = snp_seq
    rec.query_qualities = array('B', snp_quals)
    rec.cigartuples = snp_cigar

    whatshap_map_fp.write(rec)
    return
def call_read_snps(snps_data, r_ref_pos, edge_buffer, r_ref_seq, rl_cumsum,
                   r_to_q_poss, r_post, post_mapped_start):
    """Call all variants overlapping one mapped read.

    Args:
        snps_data: variant container providing ``iter_overlapping_snps``,
            context widths (``snp_context``/``indel_context``),
            ``calibrate_llr`` and ``all_paths``.
        r_ref_pos: mapped position record with ``strand``/``start``/``end``.
        edge_buffer: do not call variants within this many bases of the
            mapping ends.
        r_ref_seq: integer-encoded reference sequence for this read.
        rl_cumsum: cumulative sum of run lengths (query-to-block index).
        r_to_q_poss: read-reference position to query position mapping.
        r_post: basecall posteriors used for sequence scoring.
        post_mapped_start: offset of this read's mapped region in ``r_post``.

    Returns:
        List of (ref_pos, alt_log_probs, ref_seq, alt_seqs, snp_id) tuples.
    """
    # call all snps overlapping this read
    r_snp_calls = []
    for (snp_ref_seq, snp_alt_seqs, snp_id,
         snp_ref_pos) in snps_data.iter_overlapping_snps(
             r_ref_pos, edge_buffer):
        # convert the variant to read orientation
        if r_ref_pos.strand == 1:
            read_pos = snp_ref_pos - r_ref_pos.start
            read_ref_seq = snp_ref_seq
            read_alt_seqs = snp_alt_seqs
        else:
            read_pos = r_ref_pos.end - snp_ref_pos - len(snp_ref_seq)
            read_ref_seq = mh.revcomp(snp_ref_seq)
            read_alt_seqs = [mh.revcomp(alt_seq)
                             for alt_seq in snp_alt_seqs]

        # select single base SNP or indel context width
        # BUGFIX: the branches were previously swapped — equal-length
        # ref/alt alleles (substitutions) now use the SNP context width,
        # while length-changing alleles (indels) use the wider indel context
        snp_context_bases = snps_data.snp_context if all(
            len(snp_ref_seq) == len(snp_alt_seq)
            for snp_alt_seq in snp_alt_seqs) else snps_data.indel_context
        # clip the context at the edges of the mapped reference sequence
        pos_bb = min(snp_context_bases, read_pos)
        pos_ab = min(snp_context_bases,
                     r_ref_seq.shape[0] - read_pos - len(read_ref_seq))
        pos_ref_seq = r_ref_seq[read_pos - pos_bb:
                                read_pos + pos_ab + len(read_ref_seq)]

        # TODO move this to an initial check of a small number of variants
        # against the reference
        if any(pos_ref_seq[pos_bb:pos_bb + len(snp_ref_seq)] != np.array(
                [mh.ALPHABET.find(b) for b in read_ref_seq])):
            # variant reference sequence does not match fasta reference
            logger = logging.get_logger()
            logger.debug(
                '*' * 10 +
                'Reference seq at {} expected {}[{}]{} got "{}"'.format(
                    snp_ref_pos,
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[pos_bb - 3:pos_bb]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[
                                pos_bb:pos_bb + len(snp_ref_seq)]),
                    ''.join(mh.ALPHABET[b]
                            for b in pos_ref_seq[
                                pos_bb + len(snp_ref_seq):
                                pos_bb + len(snp_ref_seq) + 3]),
                    read_ref_seq,
                ) + '*' * 10)
            continue

        blk_start = rl_cumsum[r_to_q_poss[read_pos - pos_bb]]
        blk_end = rl_cumsum[r_to_q_poss[read_pos + pos_ab] + 1]
        if blk_end - blk_start < max(
                len(pos_ref_seq),
                max(len(read_alt_seq)
                    for read_alt_seq in read_alt_seqs)):
            # no valid mapping over large inserted query bases
            # i.e. need as many "events/strides" as bases for valid mapping
            continue

        # score the reference sequence over this block of posteriors
        loc_ref_score = score_seq(
            r_post, pos_ref_seq, post_mapped_start + blk_start,
            post_mapped_start + blk_end, snps_data.all_paths)
        loc_alt_llrs = []
        for read_alt_seq in read_alt_seqs:
            # splice the alt allele into the reference context
            pos_alt_seq = np.concatenate([
                pos_ref_seq[:pos_bb],
                np.array([mh.ALPHABET.find(b) for b in read_alt_seq],
                         dtype=np.uintp),
                pos_ref_seq[pos_bb + len(snp_ref_seq):]])
            loc_alt_score = score_seq(
                r_post, pos_alt_seq, post_mapped_start + blk_start,
                post_mapped_start + blk_end, snps_data.all_paths)
            # calibrate log probs
            loc_alt_llrs.append(
                snps_data.calibrate_llr(
                    loc_ref_score - loc_alt_score, read_ref_seq,
                    read_alt_seq))

        # due to calibration mutli-allelic log likelihoods could result in
        # inferred negative reference likelihood, so re-normalize here
        loc_alt_log_ps = calibration.compute_log_probs(
            np.array(loc_alt_llrs))

        r_snp_calls.append(
            (snp_ref_pos, loc_alt_log_ps, snp_ref_seq, snp_alt_seqs,
             snp_id))

    return r_snp_calls