def annotate_softclips(cv, read, ci_file):
    '''
    Annotate soft-clipped sequence on a read

    Every soft-clip in the read's CIGAR that is at least MIN_CLIP long is
    reported as an 'UN' variant: cv is updated in place, a row is written
    to ci_file and the VCF line is printed.
    '''
    soft_clip = constants.CIGAR['soft-clip']
    clip_idxs = [idx for idx, (op, size) in enumerate(read.cigar)
                 if op == soft_clip and size >= MIN_CLIP]

    for sc_idx in clip_idxs:
        clip_size = int(read.cigar[sc_idx][1])
        cv.cvtype, cv.vsize, cv.cvsize = 'UN', clip_size, clip_size

        # a clip at the first CIGAR position is treated as a left clip
        sc_left = sc_idx == 0
        if sc_left:
            block_idx = 0
        else:
            # last block whose CIGAR index precedes the clip
            block_cigar_idxs = np.array([b[0] for b in cv.blocks])
            block_idx = np.max(np.where(block_cigar_idxs < sc_idx)[0])
        block = cv.blocks[block_idx][1]
        if sc_left:
            cv.pos = int(block[0]) + 1
        else:
            cv.pos = int(block[1])

        # position of the clip on the contig; the CIGAR is reversed for
        # contigs on the '-' strand before summing
        if cv.strand == '-':
            rcigar = read.cigar[::-1]
        else:
            rcigar = read.cigar
        cv.cpos = sum(size for op, size in rcigar[:sc_idx]
                      if op in constants.AFFECT_CONTIG)

        varseq = read.query_sequence[cv.cpos:(cv.cpos + cv.cvsize)]
        refseq = read.get_reference_sequence()

        if sc_left:
            cv.ref = refseq[0]
            cv.alt = '%s]%s:%d]%s' % (varseq, cv.chrom, cv.pos-1, cv.ref)
        else:
            cv.ref = refseq[-1]
            cv.alt = '%s[%s:%d[%s' % (cv.ref, cv.chrom, cv.pos+1, varseq)
        cv.vid = get_next_id(read.query_name)

        CrypticVariant.write_contig_info(ci_file, cv)
        print(cv.vcf_output())
def main():
    '''
    Entry point: parse arguments, set up globals and logging, emit the
    VCF and contig info headers, then annotate all contigs.
    '''
    args = parse_args()
    set_globals(args)
    init_logging(args.log)

    # the VCF header goes to stdout, the contig info header to its own file
    vcf_header = VCF.get_header(args.sample)
    print(vcf_header)
    contig_info_path = args.contig_info_output
    CrypticVariant.write_contig_header(contig_info_path)

    annotate_contigs(args)
def annotate_gaps(cv, read, ci_file):
    '''
    Annotate deletions and insertions

    Every CIGAR gap (an operation in constants.GAPS) of at least MIN_GAP
    is reported as an 'INS' or 'DEL' variant: cv is updated in place, a
    row is written to ci_file and the VCF line is printed.
    '''
    insertion = constants.CIGAR['insertion']
    gap_idxs = [idx for idx, (op, size) in enumerate(read.cigar)
                if op in constants.GAPS and size >= MIN_GAP]

    for gap_idx in gap_idxs:
        gap_op, gap_size = read.cigar[gap_idx]
        is_insertion = gap_op == insertion
        cv.vsize = int(gap_size)

        if gap_idx == 0:
            block_idx = 0
        else:
            # last block whose CIGAR index precedes the gap
            block_cigar_idxs = np.array([b[0] for b in cv.blocks])
            block_idx = np.max(np.where(block_cigar_idxs < gap_idx)[0])
        block = cv.blocks[block_idx][1]
        cv.pos = int(block[1])

        # position of variant on contig
        upstream = read.cigar[:gap_idx]
        cv.cpos = sum(size for op, size in upstream
                      if op in constants.AFFECT_CONTIG)
        # only insertions affect contig pos
        cv.cvsize = cv.vsize if is_insertion else 0

        if is_insertion:
            cv.cvtype = 'INS'
            hard_clip = constants.CIGAR['hard-clip']
            # hard-clips are left out when indexing into the query sequence
            seq_start = sum(size for op, size in upstream
                            if op in constants.AFFECT_CONTIG
                            and op != hard_clip)
            seq_end = seq_start + cv.cvsize
            # slice starts one base before the insertion (the anchor base)
            seq = read.query_sequence[(seq_start-1):seq_end]
            cv.ref, cv.alt = seq[:1], seq
        else:
            cv.cvtype = 'DEL'
            seq_start = sum(size for op, size in upstream
                            if op in constants.AFFECT_REF)
            seq_end = seq_start + cv.vsize
            # slice starts one base before the deletion (the anchor base)
            seq = read.get_reference_sequence()[(seq_start-1):seq_end]
            cv.ref, cv.alt = seq, seq[:1]

        cv.vid = get_next_id(read.query_name)
        CrypticVariant.write_contig_info(ci_file, cv)
        print(cv.vcf_output())
def annotate_single_read(args, read, juncs, ex_ref, ref_trees, outbam=None, genes=''):
    '''
    Annotate insertions, deletions and soft-clips on a single read,
    as well as novel junctions and novel blocks.

    args:      parsed arguments; only args.contig_info_output is read here
    read:      aligned read (contig) to annotate
    juncs:     pair of junction references: juncs[0] is tested for
               membership of 'chrom:start-end' strings, juncs[1] is
               handed to annotate_juncs as its known locations
    ex_ref:    exon reference, passed to get_chr_ref
    ref_trees: reference passed to get_overlapping_genes
    outbam:    optional output BAM; the read is written to it only when
               something was annotated
    genes:     pre-computed overlapping genes; looked up when left as ''

    Returns None. Reads overlapping no gene are skipped unless they carry
    a hard-clip of at least MIN_CLIP.
    '''
    ci_file = args.contig_info_output
    if genes == '':
        genes = get_overlapping_genes(read, ref_trees)

    fusion = any(op == constants.CIGAR['hard-clip'] and val >= MIN_CLIP
                 for op, val in read.cigar)
    if genes == '' and not fusion:
        logging.info('No gene(s) intersecting read %s; skipping' % read.query_name)
        return

    # check for contig gaps or soft-clips
    has_gaps = any(op in constants.GAPS and val >= MIN_GAP
                   for op, val in read.cigar)
    has_scs = any(op == constants.CIGAR['soft-clip'] and val >= MIN_CLIP
                  for op, val in read.cigar)

    # check junctions; unknown_juncs is kept as a list because it is used
    # below as a boolean mask over tx_juncs
    tx_juncs = get_tx_juncs(read)
    unknown_juncs = ['%s:%s-%s' % (c, s, e) not in juncs[0] for c, s, e in tx_juncs]
    has_novel_juncs = any(unknown_juncs)

    # check for novel blocks
    chr_ref = get_chr_ref(read, ex_ref)
    has_novel_blocks = any(bh.is_novel_block(block, chr_ref, MIN_CLIP)
                           for block in read.get_blocks())

    if not (has_gaps or has_scs or has_novel_juncs or has_novel_blocks):
        logging.info('Nothing to annotate for read %s (read matches reference)' % read.query_name)
        return

    cv = CrypticVariant().from_read(read)
    cv.genes = genes
    if has_gaps:
        annotate_gaps(cv, read, ci_file)
    if has_scs:
        annotate_softclips(cv, read, ci_file)
    if has_novel_juncs:
        novel_juncs = [list(x) for x in np.array(tx_juncs)[unknown_juncs]]
        annotate_juncs(cv, read, juncs[1], novel_juncs, ci_file)
    if has_novel_blocks:
        annotate_blocks(cv, read, chr_ref, ci_file)
    if outbam:
        outbam.write(read)
def annotate_juncs(cv, read, locs, novel_juncs, ci_file):
    '''
    Annotate novel junctions on a read

    Each junction in novel_juncs (chrom, start, end) is reported as a
    pair of records: cv for the left end and a fresh partner variant for
    the right end. Junctions whose following CIGAR operation is a gap or
    a soft-clip are skipped. Both VCF lines are printed and the pair is
    written to ci_file.
    '''
    soft_clip = constants.CIGAR['soft-clip']
    # variant type keyed by how many junction ends are found in locs
    type_by_known_ends = ('NEJ', 'PNJ', 'AS')

    for junc in novel_juncs:
        pos1, pos2 = int(junc[1]), int(junc[2])
        # CIGAR index of the block that ends at the junction start
        junc_idx = [idx for idx, block in cv.blocks if block[1] == pos1][0]
        junc_type = read.cigar[junc_idx + 1][0]
        if junc_type in constants.GAPS or junc_type == soft_clip:
            continue

        cp = CrypticVariant().from_read(read)  # partner variant
        cp.genes = cv.genes
        varseq, refseq = '', read.get_reference_sequence()

        # contig and reference offsets up to and including the junction block
        upstream = read.cigar[:(junc_idx + 1)]
        cpos = sum(size for op, size in upstream
                   if op in constants.AFFECT_CONTIG)
        rpos = sum(size for op, size in upstream
                   if op in constants.AFFECT_REF)

        cv.cpos, cp.cpos = cpos, cpos
        cv.pos, cp.pos = pos1, pos2 + 1
        cv.ref, cp.ref = refseq[rpos - 1], refseq[rpos]
        # NOTE(review): each ALT embeds the record's own position, not the
        # partner's -- confirm this is intended
        cv.alt = '%s[%s:%d[%s' % (cv.ref, cv.chrom, cv.pos, varseq)
        cp.alt = '%s]%s:%d]%s' % (varseq, cp.chrom, cp.pos, cp.ref)
        cv.vid = get_next_id(read.query_name)
        cp.vid = get_next_id(read.query_name)
        cv.parid, cp.parid = cp.vid, cv.vid

        loc_left = '%s:%d' % (cv.chrom, pos1)
        loc_right = '%s:%d' % (cv.chrom, pos2)
        # 0 known ends: novel exon junction
        # 1 known end:  partial novel junction
        # 2 known ends: alternative splice site
        known_ends = (loc_left in locs) + (loc_right in locs)
        junc_cvtype = type_by_known_ends[known_ends]
        cv.cvtype, cp.cvtype = junc_cvtype, junc_cvtype

        print(cv.vcf_output())
        print(cp.vcf_output())
        CrypticVariant.write_contig_info(ci_file, cv, cp)
def _fusion_breakpoint(cv, read):
    '''
    Locate the breakpoint next to the first hard-clip in a read's CIGAR.

    Returns (hc_left, pos): hc_left is True when the hard-clip is the
    first CIGAR operation; pos is the start of the first block in that
    case, otherwise the end of the last block preceding the clip.
    Raises IndexError if the read has no hard-clip.
    '''
    hc_idx = [idx for idx, clip in enumerate(read.cigar)
              if clip[0] == constants.CIGAR['hard-clip']][0]
    hc_left = hc_idx == 0
    if hc_left:
        block_idx = 0
    else:
        block_cigar_idxs = np.array([b[0] for b in cv.blocks])
        block_idx = np.max(np.where(block_cigar_idxs < hc_idx)[0])
    block = cv.blocks[block_idx][1]
    pos = int(block[0]) if hc_left else int(block[1])
    return hc_left, pos


def _fusion_alt(cv, mate, varseq, hc_left, same_strand):
    '''
    Build the breakend ALT string for cv, pointing at its mate's position.
    '''
    if same_strand:
        if hc_left:
            return '%s]%s:%d]%s' % (varseq, mate.chrom, mate.pos-1, cv.ref)
        return '%s[%s:%d[%s' % (cv.ref, mate.chrom, mate.pos+1, varseq)
    # contigs align on opposite strands
    if hc_left:
        return '%s[%s:%d[%s' % (varseq, mate.chrom, mate.pos-1, cv.ref)
    return '%s]%s:%d]%s' % (cv.ref, mate.chrom, mate.pos+1, varseq)


def annotate_fusion(args, read, juncs, bam_idx, ex_ref, ref_trees, outbam):
    '''
    Annotate a candidate fusion from a hard-clipped read

    The two alignments sharing read.query_name are fetched from bam_idx
    and reported as a pair of records ('FUS', or 'IGR' when both overlap
    the same genes). Both VCF lines are printed, both reads are written
    to outbam, the pair is written to the contig info file, and each read
    is then passed to annotate_single_read for its other variants.

    Returns None; returns early when the read pair cannot be resolved or
    when neither alignment overlaps a gene.
    '''
    try:
        # unpacking raises ValueError unless exactly two reads are found
        r1, r2 = bam_idx.find(read.query_name)
    except ValueError:
        logging.info(
            'WARNING: found >2 reads matching hard-clipped read %s; cannot process'
            % read.query_name)
        return

    ci_file = args.contig_info_output
    cv1 = CrypticVariant().from_read(r1)
    cv2 = CrypticVariant().from_read(r2)
    cv1.cvtype, cv2.cvtype = 'FUS', 'FUS'
    cv1.genes = get_overlapping_genes(r1, ref_trees)
    cv2.genes = get_overlapping_genes(r2, ref_trees)
    if cv1.genes == cv2.genes:
        # intra-genic rearrangement
        cv1.cvtype, cv2.cvtype = 'IGR', 'IGR'
    if cv1.genes == '' and cv2.genes == '':
        # no intersecting gene, this is not an interesting fusion
        logging.info(
            'No gene(s) intersecting candidate fusion contig %s; skipping'
            % read.query_name)
        # NOTE(review): `record` is not defined in this function;
        # presumably a module-level dict -- confirm, otherwise this line
        # raises NameError
        record[read.query_name] = []
        return

    hc_left1, cv1.pos = _fusion_breakpoint(cv1, r1)
    hc_left2, cv2.pos = _fusion_breakpoint(cv2, r2)

    #TODO: handle inserted sequence btw fusion
    varseq1, varseq2 = '', ''
    refseq1 = r1.get_reference_sequence()
    refseq2 = r2.get_reference_sequence()

    cv1.ref = refseq1[0] if hc_left1 else refseq1[-1]
    # each record takes its reference base from its own read (this used
    # to fall back to refseq1 for the second read)
    cv2.ref = refseq2[0] if hc_left2 else refseq2[-1]

    same_strand = r1.is_reverse == r2.is_reverse
    cv1.alt = _fusion_alt(cv1, cv2, varseq1, hc_left1, same_strand)
    cv2.alt = _fusion_alt(cv2, cv1, varseq2, hc_left2, same_strand)

    cv1.vid = get_next_id(r1.query_name)
    cv2.vid = get_next_id(r2.query_name)
    cv1.parid, cv2.parid = cv2.vid, cv1.vid

    print(cv1.vcf_output())
    print(cv2.vcf_output())
    outbam.write(r1)
    outbam.write(r2)
    CrypticVariant.write_contig_info(ci_file, cv1, cv2)

    # outbam is not passed on: both reads were already written above
    annotate_single_read(args, r1, juncs, ex_ref, ref_trees, genes=cv1.genes)
    annotate_single_read(args, r2, juncs, ex_ref, ref_trees, genes=cv2.genes)
def annotate_blocks(cv, read, chr_ref, ci_file):
    '''
    Annotate any sequence that is outside of exonic regions

    Every block of cv flagged by bh.is_novel_block is classified by how
    it sits relative to the reference intervals in chr_ref, and reported
    as 'RI', 'EE' or 'NE'. Each record is printed as a VCF line and
    written to ci_file; a block spanning whole reference intervals
    yields two records (left and right side).
    '''
    cv.parid = '.'  # blocks don't have pairs
    novel_blocks = [(idx, block) for idx, block in cv.blocks
                    if bh.is_novel_block(block, chr_ref, MIN_CLIP)]

    for block_idx, block in novel_blocks:
        block_start, block_end = block[0], block[1]

        # contig offsets before the block and after it
        cpos1 = sum(size for op, size in read.cigar[:block_idx]
                    if op in constants.AFFECT_CONTIG)
        cpos2 = sum(size for op, size in read.cigar[:block_idx + 1]
                    if op in constants.AFFECT_CONTIG)

        # whether sequence block is overlapping, or on left or right side
        # of contig block
        olapping = chr_ref[np.logical_and(block_start < chr_ref.start,
                                          block_end > chr_ref.end)]
        left = chr_ref[np.logical_and(block_end > chr_ref.start,
                                      block_end <= chr_ref.end)]
        right = chr_ref[np.logical_and(block_start >= chr_ref.start,
                                       block_start < chr_ref.end)]
        has_left, has_right = len(left) > 0, len(right) > 0

        if has_left and has_right:
            # retained intron
            cv.cvtype = 'RI'
            qseq, rseq = bh.get_block_sequence(read, block_idx)
            seq_right_pos = block_end - min(left.start)
            seq_left_pos = max(right.end) - block_start

            cv.pos = block_start + seq_left_pos + 1
            cv.ref = rseq[seq_left_pos:(-seq_right_pos)]
            cv.alt = ']' + qseq[seq_left_pos:(-seq_right_pos)] + '['
            cv.cpos = cpos1 + seq_left_pos
            # the two bracket characters are not part of the sequence
            cv.vsize, cv.cvsize = abs(len(cv.alt) - 2 - len(cv.ref)), len(cv.alt) - 2
            cv.vid = get_next_id(read.query_name)
        elif len(olapping) > 0:
            # annotate left side; this record is emitted here because the
            # right side is emitted by the shared tail of the loop
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, cpos2, olapping, block, block_idx)
            cv.vid = get_next_id(read.query_name)
            print(cv.vcf_output())
            CrypticVariant.write_contig_info(ci_file, cv)

            # annotate right side
            cv = annotate_block_right(cv, read, cpos1, olapping, block, block_idx)
            cv.vid = get_next_id(read.query_name)
        elif has_left:
            # annotate left side
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, cpos2, left, block, block_idx)
            cv.vid = get_next_id(read.query_name)
        elif has_right:
            # annotate right side
            cv.cvtype = 'EE'
            cv = annotate_block_right(cv, read, cpos1, right, block, block_idx)
            cv.vid = get_next_id(read.query_name)
        else:
            # block does not cross any annotation
            qseq, rseq = bh.get_block_sequence(read, block_idx)
            cv.ref, cv.alt = rseq, '[' + qseq + ']'
            cv.pos, cv.cvtype = block_start + 1, 'NE'
            cv.cpos = cpos1
            cv.vid = get_next_id(read.query_name)
            cv.vsize, cv.cvsize = abs(len(cv.alt) - 2 - len(cv.ref)), len(cv.alt) - 2

        print(cv.vcf_output())
        CrypticVariant.write_contig_info(ci_file, cv)