Beispiel #1
0
def annotate_single_read(args, read, juncs, ex_ref, ref_trees, outbam=None, genes=''):
    '''
    Annotate gaps, soft-clips, novel junctions and novel blocks on a
    single read.

    args: object providing `contig_info_output`, which is forwarded to
        the annotate_* helpers.
    read: aligned read; its `cigar`, `query_name` and `get_blocks()`
        are used here.
    juncs: indexable; `juncs[0]` is searched for 'chrom:start-end'
        junction strings and `juncs[1]` is forwarded to annotate_juncs.
    ex_ref: forwarded to get_chr_ref to obtain the reference used for
        the novel-block check.
    ref_trees: forwarded to get_overlapping_genes when `genes` is ''.
    outbam: optional writer; when truthy, a read that received
        annotations is written to it.
    genes: genes for this read; '' (the default) triggers a lookup.

    Returns None. A read with no genes and no hard-clip of at least
    MIN_CLIP is skipped, as is a read with nothing to annotate.
    '''
    ci_file = args.contig_info_output
    if genes == '':
        genes = get_overlapping_genes(read, ref_trees)

    # read the CIGAR once rather than re-accessing it for every check
    cigar = read.cigar

    # a sufficiently long hard-clip keeps the read even without genes
    fusion = any(op == constants.CIGAR['hard-clip'] and val >= MIN_CLIP
                 for op, val in cigar)
    if genes == '' and not fusion:
        logging.info('No gene(s) intersecting read %s; skipping', read.query_name)
        return

    # check for contig gaps or soft-clips
    has_gaps = any(op in constants.GAPS and val >= MIN_GAP
                   for op, val in cigar)
    has_scs = any(op == constants.CIGAR['soft-clip'] and val >= MIN_CLIP
                  for op, val in cigar)

    # check junctions; this must stay a list because it is reused below
    # as a boolean mask over tx_juncs
    tx_juncs = get_tx_juncs(read)
    unknown_juncs = ['%s:%s-%s' % (c, s, e) not in juncs[0] for c, s, e in tx_juncs]
    has_novel_juncs = any(unknown_juncs)

    # check for novel blocks
    chr_ref = get_chr_ref(read, ex_ref)
    has_novel_blocks = any(bh.is_novel_block(block, chr_ref, MIN_CLIP)
                           for block in read.get_blocks())

    if not (has_gaps or has_scs or has_novel_juncs or has_novel_blocks):
        logging.info('Nothing to annotate for read %s (read matches reference)',
                     read.query_name)
        return

    cv = CrypticVariant().from_read(read)
    cv.genes = genes
    if has_gaps:
        annotate_gaps(cv, read, ci_file)
    if has_scs:
        annotate_softclips(cv, read, ci_file)
    if has_novel_juncs:
        # NOTE(review): np.array coerces the junction tuples to a common
        # dtype before masking; annotate_juncs presumably relies on the
        # resulting element types, so the selection is left as-is.
        novel_juncs = [list(x) for x in np.array(tx_juncs)[unknown_juncs]]
        annotate_juncs(cv, read, juncs[1], novel_juncs, ci_file)
    if has_novel_blocks:
        annotate_blocks(cv, read, chr_ref, ci_file)
    if outbam:
        outbam.write(read)
Beispiel #2
0
def annotate_blocks(cv, read, chr_ref, ci_file):
    '''
    Annotate read blocks whose sequence lies outside of exonic regions.

    Every block in `cv.blocks` that bh.is_novel_block flags is classified
    by how it sits relative to the reference intervals in `chr_ref`:
      * both block ends fall inside reference intervals -> 'RI'
        (retained intron)
      * the block strictly spans a reference interval -> 'EE', with the
        left and right sides emitted as two separate records
      * only one block end falls inside a reference interval -> 'EE'
      * no interval is touched -> 'NE'
    Each record is printed via `cv.vcf_output()` and passed to
    CrypticVariant.write_contig_info together with `ci_file`.

    cv: variant record that is filled in and reused for every block.
    read: aligned read supplying `cigar` and `query_name`.
    chr_ref: reference with `start` and `end` columns that accepts a
        boolean mask as index (presumably a pandas DataFrame -- confirm).
    ci_file: destination handed to CrypticVariant.write_contig_info.
    '''
    def contig_offset(n_ops):
        # total length of contig-affecting operations among the first
        # n_ops CIGAR entries
        return sum(size for op, size in read.cigar[:n_ops]
                   if op in constants.AFFECT_CONTIG)

    def emit(variant):
        # print the VCF line and record the contig info
        print(variant.vcf_output())
        CrypticVariant.write_contig_info(ci_file, variant)

    cv.parid = '.'  # blocks don't have pairs
    candidates = [(idx, blk) for idx, blk in cv.blocks
                  if bh.is_novel_block(blk, chr_ref, MIN_CLIP)]

    for idx, blk in candidates:
        blk_start, blk_end = blk[0], blk[1]
        start_cpos = contig_offset(idx)
        end_cpos = contig_offset(idx + 1)

        # reference intervals strictly inside the block
        spanned = chr_ref[np.logical_and(blk_start < chr_ref.start,
                                         blk_end > chr_ref.end)]
        # intervals containing the block end (novel sequence on the left)
        left_refs = chr_ref[np.logical_and(blk_end > chr_ref.start,
                                           blk_end <= chr_ref.end)]
        # intervals containing the block start (novel sequence on the right)
        right_refs = chr_ref[np.logical_and(blk_start >= chr_ref.start,
                                            blk_start < chr_ref.end)]

        if len(left_refs) > 0 and len(right_refs) > 0:
            # retained intron: trim the parts overlapping reference
            # intervals from both ends of the block sequence
            cv.cvtype = 'RI'
            qseq, rseq = bh.get_block_sequence(read, idx)
            trim_right = blk_end - min(left_refs.start)
            trim_left = max(right_refs.end) - blk_start

            cv.pos = blk_start + trim_left + 1
            cv.ref = rseq[trim_left:-trim_right]
            cv.alt = ']' + qseq[trim_left:-trim_right] + '['
            cv.cpos = start_cpos + trim_left

            alt_len = len(cv.alt) - 2  # exclude the two bracket characters
            cv.vsize = abs(alt_len - len(cv.ref))
            cv.cvsize = alt_len
        elif len(spanned) > 0:
            # left side is emitted immediately as its own record
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, end_cpos, spanned, blk, idx)
            cv.vid = get_next_id(read.query_name)
            emit(cv)

            # right side is emitted by the shared tail below
            cv = annotate_block_right(cv, read, start_cpos, spanned, blk, idx)
        elif len(left_refs) > 0:
            # annotate left side
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, end_cpos, left_refs, blk, idx)
        elif len(right_refs) > 0:
            # annotate right side
            cv.cvtype = 'EE'
            cv = annotate_block_right(cv, read, start_cpos, right_refs, blk, idx)
        else:
            # block does not cross any annotation
            qseq, rseq = bh.get_block_sequence(read, idx)
            cv.ref = rseq
            cv.alt = '[' + qseq + ']'
            cv.pos = blk_start + 1
            cv.cvtype = 'NE'
            cv.cpos = start_cpos

            alt_len = len(cv.alt) - 2  # exclude the two bracket characters
            cv.vsize = abs(alt_len - len(cv.ref))
            cv.cvsize = alt_len

        # every branch finishes by assigning a fresh id and emitting
        cv.vid = get_next_id(read.query_name)
        emit(cv)
Beispiel #3
0
def test_is_novel_block(coord, expected):
    '''
    Check that bh.is_novel_block returns `expected` for `coord`, using
    the chr_ref and MIN_CLIP defined outside this function.
    '''
    observed = bh.is_novel_block(coord, chr_ref, MIN_CLIP)
    assert observed == expected