コード例 #1
0
def annotate_softclips(cv, read, ci_file):
    '''
    Annotate soft-clipped sequence on a read

    Every soft-clip operation in read.cigar of at least MIN_CLIP bases is
    reported as a variant of type 'UN'. For each one, cv is updated in
    place, a row is written via CrypticVariant.write_contig_info and the
    VCF line from cv.vcf_output() is printed to stdout.

    cv      -- variant record to fill in; cv.blocks is read as a sequence of
               (cigar index, (start, end)) entries
    read    -- aligned read providing cigar, query_sequence, query_name and
               get_reference_sequence()
    ci_file -- destination handed to CrypticVariant.write_contig_info
    '''
    soft_clip = constants.CIGAR['soft-clip']
    for sc_idx, (op, length) in enumerate(read.cigar):
        if op != soft_clip or length < MIN_CLIP:
            continue

        clip_size = int(length)
        cv.cvtype = 'UN'
        cv.vsize = clip_size
        cv.cvsize = clip_size

        # a clip at cigar index 0 sits on the left of the alignment
        is_left = sc_idx == 0

        if is_left:
            block_idx = 0
        else:
            # last aligned block whose cigar index precedes the clip
            block_cigar_idxs = np.array([b[0] for b in cv.blocks])
            block_idx = np.max(np.where(block_cigar_idxs < sc_idx)[0])
        flank = cv.blocks[block_idx][1]
        if is_left:
            cv.pos = int(flank[0]) + 1
        else:
            cv.pos = int(flank[1])

        # contig offset: summed lengths of the contig-consuming operations
        # before the clip (counted on the reversed cigar for '-' contigs)
        ops = read.cigar
        if cv.strand == '-':
            ops = ops[::-1]
        cv.cpos = sum(v for c, v in ops[:sc_idx]
                      if c in constants.AFFECT_CONTIG)

        varseq = read.query_sequence[cv.cpos:cv.cpos + cv.cvsize]
        refseq = read.get_reference_sequence()
        if is_left:
            cv.ref = refseq[0]
            cv.alt = '%s]%s:%d]%s' % (varseq, cv.chrom, cv.pos-1, cv.ref)
        else:
            cv.ref = refseq[-1]
            cv.alt = '%s[%s:%d[%s' % (cv.ref, cv.chrom, cv.pos+1, varseq)

        cv.vid = get_next_id(read.query_name)
        CrypticVariant.write_contig_info(ci_file, cv)
        print(cv.vcf_output())
コード例 #2
0
def main():
    '''
    Script entry point

    Parses the command-line arguments, sets the module globals and logging
    from them, prints the VCF header for the sample to stdout, writes the
    contig-info header and then annotates the contigs.
    '''
    options = parse_args()
    set_globals(options)
    init_logging(options.log)

    # headers go out before any variant record is produced
    print(VCF.get_header(options.sample))
    CrypticVariant.write_contig_header(options.contig_info_output)

    annotate_contigs(options)
コード例 #3
0
def annotate_gaps(cv, read, ci_file):
    '''
    Annotate deletions and insertions

    Every cigar operation in constants.GAPS of at least MIN_GAP bases is
    reported as an 'INS' or 'DEL' variant. For each one, cv is updated in
    place, a row is written via CrypticVariant.write_contig_info and the
    VCF line from cv.vcf_output() is printed to stdout.

    cv      -- variant record to fill in; cv.blocks is read as a sequence of
               (cigar index, (start, end)) entries
    read    -- aligned read providing cigar, query_sequence, query_name and
               get_reference_sequence()
    ci_file -- destination handed to CrypticVariant.write_contig_info
    '''
    insertion = constants.CIGAR['insertion']
    for gap_idx, (op, length) in enumerate(read.cigar):
        if op not in constants.GAPS or length < MIN_GAP:
            continue

        is_insertion = op == insertion
        upstream = read.cigar[:gap_idx]  # operations preceding the gap
        cv.vsize = int(length)

        # genomic position: end of the last aligned block before the gap
        if gap_idx == 0:
            block_idx = 0
        else:
            block_cigar_idxs = np.array([b[0] for b in cv.blocks])
            block_idx = np.max(np.where(block_cigar_idxs < gap_idx)[0])
        flank = cv.blocks[block_idx][1]
        cv.pos = int(flank[1])

        # position of variant on contig
        cv.cpos = sum(v for c, v in upstream if c in constants.AFFECT_CONTIG)
        # only insertions take up space on the contig
        cv.cvsize = cv.vsize if is_insertion else 0

        if is_insertion:
            cv.cvtype = 'INS'
            hard_clip = constants.CIGAR['hard-clip']
            start = sum(v for c, v in upstream
                        if c in constants.AFFECT_CONTIG and c != hard_clip)
            end = start + cv.cvsize
            # slice starts one base early so the preceding base is included
            seq = read.query_sequence[(start-1):end]
            cv.ref, cv.alt = seq[:1], seq
        else:
            cv.cvtype = 'DEL'
            start = sum(v for c, v in upstream if c in constants.AFFECT_REF)
            end = start + cv.vsize
            # slice starts one base early so the preceding base is included
            seq = read.get_reference_sequence()[(start-1):end]
            cv.ref, cv.alt = seq, seq[:1]

        cv.vid = get_next_id(read.query_name)
        CrypticVariant.write_contig_info(ci_file, cv)
        print(cv.vcf_output())
コード例 #4
0
def annotate_single_read(args, read, juncs, ex_ref, ref_trees, outbam=None, genes=''):
    '''
    Annotate insertions, deletions, soft-clips, novel junctions and novel
    blocks on a single read

    args      -- parsed arguments; only contig_info_output is read here
    read      -- aligned read to inspect
    juncs     -- pair: juncs[0] is searched for 'chrom:start-end' junction
                 strings, juncs[1] is passed to annotate_juncs as locs
    ex_ref    -- reference passed to get_chr_ref
    ref_trees -- passed to get_overlapping_genes when genes is ''
    outbam    -- if given, the read is written to it when anything was
                 annotated
    genes     -- overlapping genes; looked up from the read when ''

    Returns None. Reads without overlapping genes (and without a hard-clip
    of at least MIN_CLIP bases) are skipped with a log message.
    '''
    ci_file = args.contig_info_output
    if genes == '':
        genes = get_overlapping_genes(read, ref_trees)

    hard_clip = constants.CIGAR['hard-clip']
    fusion = any(op == hard_clip and val >= MIN_CLIP
                 for op, val in read.cigar)
    if genes == '' and not fusion:
        logging.info('No gene(s) intersecting read %s; skipping' % read.query_name)
        return

    # check for contig gaps or soft-clips
    soft_clip = constants.CIGAR['soft-clip']
    has_gaps = any(op in constants.GAPS and val >= MIN_GAP
                   for op, val in read.cigar)
    has_scs = any(op == soft_clip and val >= MIN_CLIP
                  for op, val in read.cigar)
    # computed as in the original flow; not consulted further below
    is_spliced = any(op == constants.CIGAR['skipped'] for op, val in read.cigar)

    # check junctions: True marks a junction missing from juncs[0]
    tx_juncs = get_tx_juncs(read)
    unknown_juncs = [
        '%s:%s-%s' % (chrom, start, end) not in juncs[0]
        for chrom, start, end in tx_juncs
    ]
    has_novel_juncs = any(unknown_juncs)

    # check for novel blocks
    chr_ref = get_chr_ref(read, ex_ref)
    has_novel_blocks = any([
        bh.is_novel_block(block, chr_ref, MIN_CLIP)
        for block in read.get_blocks()
    ])

    if not (has_gaps or has_scs or has_novel_juncs or has_novel_blocks):
        logging.info('Nothing to annotate for read %s (read matches reference)' % read.query_name)
        return

    cv = CrypticVariant().from_read(read)
    cv.genes = genes
    if has_gaps:
        annotate_gaps(cv, read, ci_file)
    if has_scs:
        annotate_softclips(cv, read, ci_file)
    if has_novel_juncs:
        # boolean mask keeps only the junctions flagged as unknown
        novel_juncs = [list(j) for j in np.array(tx_juncs)[unknown_juncs]]
        annotate_juncs(cv, read, juncs[1], novel_juncs, ci_file)
    if has_novel_blocks:
        annotate_blocks(cv, read, chr_ref, ci_file)
    if outbam:
        outbam.write(read)
コード例 #5
0
def annotate_juncs(cv, read, locs, novel_juncs, ci_file):
    '''
    Annotate novel splice junctions as paired variant records

    For each junction, cv describes the left side and a fresh partner
    record the right side. The pair is typed by how many of the junction
    ends are found in locs:
      'NEJ' -- neither end present
      'PNJ' -- exactly one end present
      'AS'  -- both ends present
    Junctions whose following cigar operation is a gap or a soft-clip are
    skipped. Both VCF lines are printed to stdout and the pair is written
    via CrypticVariant.write_contig_info.

    cv          -- variant record reused for the left side of every junction
    read        -- aligned read providing cigar, query_name and
                   get_reference_sequence()
    locs        -- container of 'chrom:pos' strings
    novel_juncs -- junctions; items 1 and 2 of each are the two positions
    ci_file     -- destination handed to CrypticVariant.write_contig_info
    '''
    for junc in novel_juncs:
        pos1, pos2 = int(junc[1]), int(junc[2])

        # cigar index of the block that ends at the junction start
        ending_here = [idx for idx, block in cv.blocks if block[1] == pos1]
        junc_idx = ending_here[0]
        junc_type = read.cigar[junc_idx + 1][0]
        if junc_type in constants.GAPS or junc_type == constants.CIGAR[
                'soft-clip']:
            continue

        cp = CrypticVariant().from_read(read)  # partner variant
        cp.genes = cv.genes
        varseq = ''
        refseq = read.get_reference_sequence()

        # contig / reference offsets up to and including the junction block
        upstream = read.cigar[:(junc_idx + 1)]
        cpos = sum(v for c, v in upstream if c in constants.AFFECT_CONTIG)
        rpos = sum(v for c, v in upstream if c in constants.AFFECT_REF)

        cv.cpos, cp.cpos = cpos, cpos
        cv.pos, cp.pos = pos1, pos2 + 1

        left_base, right_base = refseq[rpos - 1], refseq[rpos]
        cv.ref, cp.ref = left_base, right_base
        cv.alt = '%s[%s:%d[%s' % (cv.ref, cv.chrom, cv.pos, varseq)
        cp.alt = '%s]%s:%d]%s' % (varseq, cp.chrom, cp.pos, cp.ref)

        cv.vid = get_next_id(read.query_name)
        cp.vid = get_next_id(read.query_name)
        cv.parid, cp.parid = cp.vid, cv.vid

        left_known = '%s:%d' % (cv.chrom, pos1) in locs
        right_known = '%s:%d' % (cv.chrom, pos2) in locs
        if left_known and right_known:
            # both ends annotated, alternative splice site
            junc_class = 'AS'
        elif left_known or right_known:
            # one end unannotated, partial novel junction
            junc_class = 'PNJ'
        else:
            # neither end annotated, novel exon junction
            junc_class = 'NEJ'
        cv.cvtype = cp.cvtype = junc_class

        print(cv.vcf_output())
        print(cp.vcf_output())
        CrypticVariant.write_contig_info(ci_file, cv, cp)
コード例 #6
0
def _hardclip_breakpoint(read, cv):
    '''
    Find which end of a read is hard-clipped and the breakpoint position

    Uses the first hard-clip operation in read.cigar (IndexError if the
    read has none).

    Returns (hc_left, pos): hc_left is True when the hard-clip is the first
    cigar operation; pos is the start of the first block when hc_left,
    otherwise the end of the last block whose cigar index precedes the clip.
    '''
    hc_idx = [
        idx for idx, clip in enumerate(read.cigar)
        if clip[0] == constants.CIGAR['hard-clip']
    ][0]
    hc_left = hc_idx == 0

    block_idx = 0 if hc_left else np.max(
        np.where(np.array([b[0] for b in cv.blocks]) < hc_idx)[0])
    block = cv.blocks[block_idx][1]
    pos = int(block[0]) if hc_left else int(block[1])
    return hc_left, pos


def annotate_fusion(args, read, juncs, bam_idx, ex_ref, ref_trees, outbam):
    '''
    Annotate a hard-clipped read and its partner alignment as a paired
    fusion ('FUS') or intra-genic rearrangement ('IGR')

    The two alignments sharing read.query_name are fetched from bam_idx.
    Each gets its own variant record whose ALT points at the other's
    breakpoint. Both VCF lines are printed to stdout, both reads are written
    to outbam, the pair is written via CrypticVariant.write_contig_info, and
    each read is then passed through annotate_single_read.

    args      -- parsed arguments; contig_info_output is read here
    read      -- hard-clipped read that triggered the lookup
    juncs     -- forwarded to annotate_single_read
    bam_idx   -- index whose find(name) must yield exactly two reads
    ex_ref    -- forwarded to annotate_single_read
    ref_trees -- passed to get_overlapping_genes
    outbam    -- output the two reads are written to

    Returns None. Nothing is annotated when the name lookup does not unpack
    into two reads, or when neither read overlaps a gene.
    '''
    try:
        r1, r2 = bam_idx.find(read.query_name)
    except ValueError:
        logging.info(
            'WARNING: found >2 reads matching hard-clipped read %s; cannot process'
            % read.query_name)
        return

    ci_file = args.contig_info_output
    cv1 = CrypticVariant().from_read(r1)
    cv2 = CrypticVariant().from_read(r2)
    cv1.cvtype, cv2.cvtype = 'FUS', 'FUS'

    cv1.genes = get_overlapping_genes(r1, ref_trees)
    cv2.genes = get_overlapping_genes(r2, ref_trees)
    if cv1.genes == cv2.genes:
        # intra-genic rearrangement
        cv1.cvtype, cv2.cvtype = 'IGR', 'IGR'
    if cv1.genes == '' and cv2.genes == '':
        # no intersecting gene, this is not an interesting fusion
        logging.info(
            'No gene(s) intersecting candidate fusion contig %s; skipping' %
            read.query_name)
        # NOTE(review): `record` is not defined in this function; presumably
        # a module-level dict -- confirm it exists, otherwise this raises
        # NameError
        record[read.query_name] = []
        return

    hc_left1, cv1.pos = _hardclip_breakpoint(r1, cv1)
    hc_left2, cv2.pos = _hardclip_breakpoint(r2, cv2)

    #TODO: handle inserted sequence btw fusion
    varseq1, varseq2 = '', ''
    refseq1 = r1.get_reference_sequence()
    refseq2 = r2.get_reference_sequence()
    # reference base at the clipped end; each record must take it from its
    # own read (cv2 previously fell back to refseq1)
    cv1.ref = refseq1[0] if hc_left1 else refseq1[-1]
    cv2.ref = refseq2[0] if hc_left2 else refseq2[-1]
    if r1.is_reverse == r2.is_reverse:
        cv1.alt = '%s]%s:%d]%s' % (varseq1, cv2.chrom, cv2.pos-1, cv1.ref) if hc_left1 else \
                  '%s[%s:%d[%s' % (cv1.ref, cv2.chrom, cv2.pos+1, varseq1)
        cv2.alt = '%s]%s:%d]%s' % (varseq2, cv1.chrom, cv1.pos-1, cv2.ref) if hc_left2 else \
                  '%s[%s:%d[%s' % (cv2.ref, cv1.chrom, cv1.pos+1, varseq2)
    else:
        # contigs align on opposite strands
        cv1.alt = '%s[%s:%d[%s' % (varseq1, cv2.chrom, cv2.pos-1, cv1.ref) if hc_left1 else \
                  '%s]%s:%d]%s' % (cv1.ref, cv2.chrom, cv2.pos+1, varseq1)
        cv2.alt = '%s[%s:%d[%s' % (varseq2, cv1.chrom, cv1.pos-1, cv2.ref) if hc_left2 else \
                  '%s]%s:%d]%s' % (cv2.ref, cv1.chrom, cv1.pos+1, varseq2)

    cv1.vid = get_next_id(r1.query_name)
    cv2.vid = get_next_id(r2.query_name)
    cv1.parid, cv2.parid = cv2.vid, cv1.vid

    print(cv1.vcf_output())
    print(cv2.vcf_output())
    outbam.write(r1)
    outbam.write(r2)
    CrypticVariant.write_contig_info(ci_file, cv1, cv2)

    # outbam is not forwarded: both reads were already written above
    annotate_single_read(args, r1, juncs, ex_ref, ref_trees, genes=cv1.genes)
    annotate_single_read(args, r2, juncs, ex_ref, ref_trees, genes=cv2.genes)
コード例 #7
0
def annotate_blocks(cv, read, chr_ref, ci_file):
    '''
    Annotate any sequence that is outside of exonic regions

    Each block in cv.blocks flagged by bh.is_novel_block is compared with
    the start/end intervals in chr_ref and reported as one of:
      'RI' -- block start and block end both fall inside annotated intervals
      'EE' -- block extends past an annotated interval on one side, or on
              both sides (two records are emitted in that case)
      'NE' -- block touches no annotated interval
    Every record is printed to stdout as a VCF line and written via
    CrypticVariant.write_contig_info.

    cv      -- variant record to fill in; cv.blocks is read as a sequence of
               (cigar index, (start, end)) entries
    read    -- aligned read providing cigar and query_name
    chr_ref -- annotation exposing .start and .end and supporting boolean
               mask indexing
    ci_file -- destination handed to CrypticVariant.write_contig_info
    '''
    cv.parid = '.'  # blocks don't have pairs
    candidates = [(idx, blk) for idx, blk in cv.blocks
                  if bh.is_novel_block(blk, chr_ref, MIN_CLIP)]

    for block_idx, block in candidates:
        cigar_ops = read.cigar
        # contig offsets just before (cpos1) and just after (cpos2) the block
        cpos1 = sum(v for c, v in cigar_ops[:block_idx]
                    if c in constants.AFFECT_CONTIG)
        cpos2 = sum(v for c, v in cigar_ops[:block_idx + 1]
                    if c in constants.AFFECT_CONTIG)

        # whether sequence block is overlapping, or on left or right side of contig block
        spans_feature = np.logical_and(block[0] < chr_ref.start,
                                       block[1] > chr_ref.end)
        end_in_feature = np.logical_and(block[1] > chr_ref.start,
                                        block[1] <= chr_ref.end)
        start_in_feature = np.logical_and(block[0] >= chr_ref.start,
                                          block[0] < chr_ref.end)
        olapping = chr_ref[spans_feature]
        left = chr_ref[end_in_feature]
        right = chr_ref[start_in_feature]

        if len(left) > 0 and len(right) > 0:
            # retained intron
            cv.cvtype = 'RI'
            qseq, rseq = bh.get_block_sequence(read, block_idx)
            trim_right = block[1] - min(left.start)
            trim_left = max(right.end) - block[0]
            # keep only the part of the block between the two intervals
            unannotated = slice(trim_left, -trim_right)

            cv.pos = block[0] + trim_left + 1
            cv.ref = rseq[unannotated]
            cv.alt = ']' + qseq[unannotated] + '['
            cv.cpos = cpos1 + trim_left

            alt_len = len(cv.alt) - 2  # alt without the two bracket chars
            cv.vsize, cv.cvsize = abs(alt_len - len(cv.ref)), alt_len
        elif len(olapping) > 0:
            # annotate left side, emitted as its own record
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, cpos2, olapping, block,
                                     block_idx)
            cv.vid = get_next_id(read.query_name)

            print(cv.vcf_output())
            CrypticVariant.write_contig_info(ci_file, cv)

            # annotate right side, emitted by the shared code below
            cv = annotate_block_right(cv, read, cpos1, olapping, block,
                                      block_idx)
        elif len(left) > 0:
            # annotate left side
            cv.cvtype = 'EE'
            cv = annotate_block_left(cv, read, cpos2, left, block, block_idx)
        elif len(right) > 0:
            # annotate right side
            cv.cvtype = 'EE'
            cv = annotate_block_right(cv, read, cpos1, right, block, block_idx)
        else:
            # block does not cross any annotation
            qseq, rseq = bh.get_block_sequence(read, block_idx)
            cv.ref, cv.alt = rseq, '[' + qseq + ']'
            cv.pos, cv.cvtype = block[0] + 1, 'NE'
            cv.cpos = cpos1
            alt_len = len(cv.alt) - 2  # alt without the two bracket chars
            cv.vsize, cv.cvsize = abs(alt_len - len(cv.ref)), alt_len

        # every branch finishes by taking a fresh ID and emitting the record
        cv.vid = get_next_id(read.query_name)
        print(cv.vcf_output())
        CrypticVariant.write_contig_info(ci_file, cv)