def classify_2_cnv(FF, RR, cnvs, min_frac=0.5):
    """
    Classify the cxSV class of a pair of inv bkpts and two associated CNVs.

    Matches each CNV to a 5' or 3' location, as constrained by the breakpoint
    coordinates. If only one of the two CNVs matches its candidate interval,
    classification is delegated to `classify_1_cnv` using that CNV alone and
    the same `min_frac` threshold.

    Parameters
    ----------
    FF : pysam.VariantRecord
    RR : pysam.VariantRecord
    cnvs : [pysam.VariantRecord, pysam.VariantRecord]
        Exactly two records; they are ordered by `pos` to assign 5' and 3'.
    min_frac : float, optional
        Minimum reciprocal overlap of each cnv with a candidate CNV interval
        defined by the breakpoint coordinates.

    Returns
    -------
    svtype : str
        '<cnv5>INV<cnv3>' (lowercased CNV types) when both CNVs match, or
        'CNV_2_FAIL' when neither does.
    cnvs : list of pysam.VariantRecord
        The input `cnvs`, returned unchanged.

    When exactly one CNV matches, the return value of `classify_1_cnv` is
    returned as-is instead.
    """

    # Assign CNVs to 5' or 3' based on ordering
    cnv5, cnv3 = sorted(cnvs, key=lambda r: r.pos)

    # Check if 5' CNV matches breakpoints
    if cnv5.info['SVTYPE'] == 'DEL':
        interval5 = (FF.pos, RR.pos)
    else:
        interval5 = (RR.pos, FF.pos)
    frac5 = svu.reciprocal_overlap(cnv5.pos, cnv5.stop, *interval5)

    # Check if 3' CNV matches breakpoints
    if cnv3.info['SVTYPE'] == 'DEL':
        interval3 = (FF.stop, RR.stop)
    else:
        interval3 = (RR.stop, FF.stop)
    frac3 = svu.reciprocal_overlap(cnv3.pos, cnv3.stop, *interval3)

    # Report cxSV class based on whether CNVs matched intervals
    if frac5 >= min_frac and frac3 >= min_frac:
        svtype = (cnv5.info['SVTYPE'].lower() +
                  'INV' +
                  cnv3.info['SVTYPE'].lower())
    elif frac5 >= min_frac and frac3 < min_frac:
        # Only the 5' CNV matched. Forward min_frac so the single-CNV
        # classifier applies the same threshold that was used here.
        return classify_1_cnv(FF, RR, cnv5, min_frac=min_frac)
    elif frac5 < min_frac and frac3 >= min_frac:
        # Only the 3' CNV matched; same threshold forwarding as above.
        return classify_1_cnv(FF, RR, cnv3, min_frac=min_frac)
    else:
        svtype = 'CNV_2_FAIL'

    return svtype, cnvs
def _test_overlap(cnv):
    """
    Label which candidate interval(s) a CNV reciprocally overlaps.

    Reads `del5`, `del3`, `dup5`, `dup3` and `min_frac` from the enclosing
    scope; they are not defined in this function.
    NOTE(review): these are presumably (start, end) candidate intervals
    derived from the inversion breakpoints -- confirm against the caller.

    Parameters
    ----------
    cnv : record exposing `pos`, `stop` and `info['SVTYPE']`

    Returns
    -------
    label : str
        SVTYPE + '_53' if both the 5' and 3' intervals reach `min_frac`,
        SVTYPE + '_5' or SVTYPE + '_3' if only one does, else 'no_hit'.
    """
    svtype = cnv.info['SVTYPE']

    # Deletions are tested against the del intervals; every other SVTYPE
    # is tested against the dup intervals.
    if svtype == 'DEL':
        candidate5, candidate3 = del5, del3
    else:
        candidate5, candidate3 = dup5, dup3

    frac5 = svu.reciprocal_overlap(cnv.pos, cnv.stop, *candidate5)
    frac3 = svu.reciprocal_overlap(cnv.pos, cnv.stop, *candidate3)

    hit5 = frac5 >= min_frac
    hit3 = frac3 >= min_frac

    if hit5 and hit3:
        return svtype + '_53'
    if hit5:
        return svtype + '_5'
    if hit3:
        return svtype + '_3'
    return 'no_hit'
def classify_1_cnv(FF, RR, cnv, min_frac=0.5,
                   min_bkpt_cnv_size=500, max_bkpt_cnv_size=4000):
    """
    Classify the cxSV class of a pair of inv bkpts and one associated CNV.

    The CNV is matched to the 5' or 3' candidate interval defined by the
    breakpoint coordinates. Once matched, the distance between the
    breakpoints at the opposite end is checked to decide whether a second,
    small flanking CNV should be called there.

    Parameters
    ----------
    FF : pysam.VariantRecord
    RR : pysam.VariantRecord
    cnv : pysam.VariantRecord
    min_frac : float, optional
        Minimum reciprocal overlap of the cnv with a candidate CNV interval
        defined by the breakpoint coordinates.
    min_bkpt_cnv_size : int, optional
        Minimum distance between breakpoints to call flanking CNV
        (inclusive).
    max_bkpt_cnv_size : int, optional
        Maximum distance between breakpoints to call flanking CNV
        (exclusive).

    Returns
    -------
    svtype : str
        Lowercase CNV type(s) concatenated around 'INV'.
    cnvs : list of pysam.VariantRecord
        Single-element list holding `cnv`.

    If the CNV matches neither end exclusively, the return value of
    `classify_0_cnv(FF, RR)` is returned as-is instead.
    """

    def _flank_label(gap):
        # Positive gap within the size window -> flanking deletion;
        # negative gap of that magnitude -> flanking duplication;
        # anything else -> no flanking CNV (empty label).
        if min_bkpt_cnv_size <= gap < max_bkpt_cnv_size:
            return 'del'
        if min_bkpt_cnv_size <= -gap < max_bkpt_cnv_size:
            return 'dup'
        return ''

    # Lowercase CNV class, for concatenation with INV
    kind = cnv.info['SVTYPE'].lower()

    # Eligible 5'/3' CNV intervals defined by the breakpoints; endpoint
    # order is swapped for anything that is not a deletion
    is_del = kind == 'del'
    window5 = (FF.pos, RR.pos) if is_del else (RR.pos, FF.pos)
    window3 = (FF.stop, RR.stop) if is_del else (RR.stop, FF.stop)

    # Overlap of the CNV against the full extent of the inversion
    span_start = min(FF.pos, RR.pos)
    span_end = max(FF.stop, RR.stop)
    span_frac = svu.reciprocal_overlap(cnv.pos, cnv.stop,
                                       span_start, span_end)

    cover5 = svu.overlap_frac(*window5, cnv.pos, cnv.stop)
    cover3 = svu.overlap_frac(*window3, cnv.pos, cnv.stop)

    # A single CNV spanning the entire event likely represents two CNV
    # merged during preprocessing or clustering
    if span_frac > 0.9 and cover5 > 0.95 and cover3 > 0.95:
        return kind + 'INV' + kind, [cnv]

    # Otherwise decide between 5' and 3' by reciprocal overlap
    recip5 = svu.reciprocal_overlap(cnv.pos, cnv.stop, *window5)
    recip3 = svu.reciprocal_overlap(cnv.pos, cnv.stop, *window3)

    # 5' CNV; inspect the 3' breakpoints for a small flanking CNV
    if recip5 >= min_frac and recip3 < min_frac:
        return kind + 'INV' + _flank_label(RR.stop - FF.stop), [cnv]

    # 3' CNV; inspect the 5' breakpoints for a small flanking CNV
    if recip5 < min_frac and recip3 >= min_frac:
        return _flank_label(RR.pos - FF.pos) + 'INV' + kind, [cnv]

    # Couldn't match the CNV to exactly one end
    return classify_0_cnv(FF, RR)