def resolve_complex_sv_v2(resolve_CPX, resolve_INV, resolve_CNV, cytobands,
                          disc_pairs, mei_bed, variant_prefix='CPX_',
                          min_rescan_support=4, pe_blacklist=None, quiet=False,
                          SR_only_cutoff=1000):
    """
    Second-pass resolution of complex SV from inversions and CNVs.

    Inversions are first filtered with remove_CPX_from_INV, grouped with
    cluster_INV, linked to the CNV records with link_cpx_V2 (cpx_dist=2000)
    and cleaned with cluster_cleanup. Every resulting cluster is then handed
    to ComplexSV and the outcome is collected.

    Parameters
    ----------
    resolve_CPX : iterable
        Presumably the records already resolved as SVTYPE == 'CPX' by the
        first pass -- TODO confirm against the caller.
    resolve_INV : iterable
        Presumably the records with SVTYPE == 'INV' from the first pass --
        TODO confirm against the caller.
    resolve_CNV : iterable
        CNV records passed to link_cpx_V2.
    cytobands
        Forwarded unchanged to ComplexSV.
    disc_pairs
        Forwarded to rescan_single_ender for single-ender inversions.
    mei_bed
        Forwarded unchanged to ComplexSV.
    variant_prefix : str
        Prefix of the IDs assigned to resolved variants.
    min_rescan_support : int
        Forwarded to rescan_single_ender.
    pe_blacklist : optional
        Forwarded to rescan_single_ender.
    quiet : boolean, optional
        Do not print status updates
    SR_only_cutoff : int
        Forwarded unchanged to ComplexSV.

    Returns
    -------
    cpx_records_v2 : collections.deque
        Resolved records plus, for unresolved clusters, their member records
        tagged with EVENT and UNRESOLVED.
    """

    id_alphabet = string.ascii_uppercase + string.digits

    def _random_tag():
        # Random 10-character suffix so IDs stay unique across shards
        return ''.join(random.choice(id_alphabet) for _ in range(10))

    def _status(message):
        # Timestamped progress line; callers check `quiet` before building
        # the message so nothing is formatted when output is suppressed
        now = datetime.datetime.now()
        print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' + message)

    independent_INV = remove_CPX_from_INV(resolve_CPX, resolve_INV)
    linked_INV = cluster_INV(independent_INV)
    clusters_v2 = link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000)
    clusters_v2 = cluster_cleanup(clusters_v2)

    # Print number of candidate clusters identified
    if not quiet:
        _status('identified ' + str(len(clusters_v2)) +
                ' candidate complex clusters ' + 'during second pass')

    cpx_records_v2 = deque()
    # Bookkeeping only: this function returns just the records
    cpx_record_ids_v2 = set()

    for cluster in clusters_v2:
        # Print status for each cluster
        if not quiet:
            _status('resolving candidate cluster containing the following '
                    'records: ' + ', '.join([e.id for e in cluster]))

        # Try finding opposite strand support for single ender inversions
        if len(cluster) == 1 and cluster[0].info['SVTYPE'] == 'INV':
            rec, opp = rescan_single_ender(cluster[0], disc_pairs,
                                           min_rescan_support,
                                           pe_blacklist=pe_blacklist)
            if opp is not None:
                cluster = deque([rec, opp])

        # if cxsv overlap pulled in unrelated insertions, keep them separate
        if all([r.info['SVTYPE'] == 'INS' for r in cluster]):
            for record in cluster:
                cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
                # Write to this function's own containers. The previous code
                # used `cpx_record_ids` / `cpx_records`, which do not exist
                # here, so this branch failed on first use.
                cpx_record_ids_v2.update(cpx.record_ids)

                # Assign random string as resolved ID to handle sharding
                # NOTE(review): '_' is always inserted, so a prefix that
                # already ends in '_' (such as the default) yields a double
                # underscore -- confirm whether that is intended.
                cpx.vcf_record.id = variant_prefix + '_' + _random_tag()
                cpx_records_v2.append(cpx.vcf_record)
            outcome = 'treated as separate unrelated insertions'

        else:
            cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
            cpx_record_ids_v2.update(cpx.record_ids)

            if cpx.svtype == 'UNR':
                # Assign random string as unresolved ID to handle sharding;
                # every member of the cluster shares the same EVENT value
                unresolved_vid = 'UNRESOLVED_' + _random_tag()
                for record in cpx.records:
                    record.info['EVENT'] = unresolved_vid
                    record.info['UNRESOLVED'] = True
                    cpx_records_v2.append(record)
                outcome = 'is unresolved'
            else:
                cpx.vcf_record.id = variant_prefix + '_' + _random_tag()
                cpx_records_v2.append(cpx.vcf_record)
                # Report the complex subtype when present, else the SVTYPE
                if 'CPX_TYPE' in cpx.vcf_record.info.keys():
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['CPX_TYPE'])
                else:
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['SVTYPE'])

        # Report outcome per cluster
        if not quiet:
            _status('candidate cluster ' + outcome)

    # Records whose SVTYPE is CPX must not keep the unresolved-only
    # annotations or STRANDS
    for record in cpx_records_v2:
        if record.info['SVTYPE'] == 'CPX':
            for info in 'UNRESOLVED EVENT UNRESOLVED_TYPE STRANDS'.split():
                if info in record.info.keys():
                    record.info.pop(info)

    return cpx_records_v2
def resolve_complex_sv(vcf, cytobands, disc_pairs, mei_bed,
                       variant_prefix='CPX_', min_rescan_support=4,
                       pe_blacklist=None, quiet=False, SR_only_cutoff=1000):
    """
    Resolve complex SV from CNV intervals and BCA breakpoints.

    Yields all resolved events, simple or complex, in sorted order.

    Parameters
    ----------
    vcf : pysam.VariantFile
    cytobands : pysam.TabixFile
    disc_pairs : pysam.TabixFile
    mei_bed : pybedtools.BedTool
    variant_prefix : str
        Prefix to assign to resolved variants. A trailing '_' is appended
        when missing.
    min_rescan_support : int
        Number of pairs required to count a sample as
        supported during PE rescan
    pe_blacklist : pysam.TabixFile, optional
        Blacklisted genomic regions. Anomalous pairs in these regions will be
        removed prior to clustering.
    quiet : boolean, optional
        Do not print status updates
    SR_only_cutoff : int
        Forwarded unchanged to ComplexSV.

    Yields
    ------
    sv : pysam.VariantRecord
    """

    id_alphabet = string.ascii_uppercase + string.digits

    def _random_tag():
        # Random 10-character suffix so IDs stay unique across shards
        return ''.join(random.choice(id_alphabet) for _ in range(10))

    def _status(message):
        # Timestamped progress line; callers check `quiet` first
        now = datetime.datetime.now()
        print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' + message)

    clusters = link_cpx(vcf)
    clusters = clusters_cleanup(clusters)

    # Print number of candidate clusters identified
    if not quiet:
        _status('identified ' + str(len(clusters)) +
                ' candidate complex clusters ' + 'during first pass')

    if not variant_prefix.endswith('_'):
        variant_prefix += '_'

    cpx_records = deque()
    cpx_record_ids = set()

    for cluster in clusters:
        # Print status for each cluster
        if not quiet:
            _status('resolving candidate cluster containing the following '
                    'records: ' + ', '.join([e.id for e in cluster]))

        # Try finding opposite strand support for single ender inversions
        if len(cluster) == 1 and cluster[0].info['SVTYPE'] == 'INV':
            rec, opp = rescan_single_ender(cluster[0], disc_pairs,
                                           min_rescan_support,
                                           pe_blacklist=pe_blacklist)
            if opp is not None:
                cluster = deque([rec, opp])

        # if cxsv overlap pulled in unrelated insertions, keep them separate
        if all([r.info['SVTYPE'] == 'INS' for r in cluster]):
            for record in cluster:
                cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
                # In-place update avoids rebuilding the whole set per cluster
                cpx_record_ids.update(cpx.record_ids)

                # Assign random string as resolved ID to handle sharding
                cpx.vcf_record.id = variant_prefix + _random_tag()
                cpx_records.append(cpx.vcf_record)
            outcome = 'treated as separate unrelated insertions'

        else:
            cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
            cpx_record_ids.update(cpx.record_ids)

            if cpx.svtype == 'UNR':
                # Assign random string as unresolved ID to handle sharding;
                # every member of the cluster shares the same EVENT value
                unresolved_vid = 'UNRESOLVED_' + _random_tag()
                for record in cpx.records:
                    record.info['EVENT'] = unresolved_vid
                    record.info['UNRESOLVED'] = True
                    cpx_records.append(record)
                outcome = 'is unresolved'

            elif cpx.svtype == 'SPLIT':
                # check all CNVs for depth support and report the first
                # insertion record and all CNVs with depth support. CNVs
                # without depth support will have their IDs added to the
                # MEMBERS field of the INS record
                cnv_ids_to_append = []
                for cnv in cpx.cnvs:
                    if 'RD' in cnv.info['EVIDENCE']:
                        cnv.info['MEMBERS'] = cnv.id
                        cpx_records.append(cnv)
                    else:
                        cnv_ids_to_append.append(cnv.id)
                ins_rec = cpx.insertions[0]
                ins_rec.info['MEMBERS'] = (
                    ins_rec.id, ) + tuple(cnv_ids_to_append)
                cpx_records.append(ins_rec)
                outcome = 'split into INS and CNV variants. ' + \
                    'The following records were merged into the INS record: ' + \
                    ', '.join(cnv_ids_to_append)

            else:
                cpx.vcf_record.id = variant_prefix + _random_tag()
                cpx_records.append(cpx.vcf_record)
                # Report the complex subtype when present, else the SVTYPE
                if 'CPX_TYPE' in cpx.vcf_record.info.keys():
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['CPX_TYPE'])
                else:
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['SVTYPE'])

        # Report outcome per cluster
        if not quiet:
            _status('candidate cluster ' + outcome)

    # Output all variants
    vcf.reset()

    for record in _merge_records(vcf, cpx_records, cpx_record_ids):
        # Clean all BNDs to ensure they are set to unresolved
        # Reason: some SR-only BNDs will escape being set as UNRESOLVED if
        # they are part of a multi-breakpoint complex cluster that remains
        # UNRESOLVED after excluding SR-only breakpoints
        if record.info['SVTYPE'] == 'BND':
            record.info['UNRESOLVED'] = True
            if 'UNRESOLVED_TYPE' not in record.info.keys():
                record.info['UNRESOLVED_TYPE'] = 'SINGLE_ENDER'

        if 'CPX_TYPE' in record.info.keys():
            if 'UNRESOLVED' in record.info.keys():
                # Unresolved records carry the subtype in UNRESOLVED_TYPE
                # instead of CPX_TYPE
                record.info['UNRESOLVED_TYPE'] = record.info['CPX_TYPE']
                record.info.pop('CPX_TYPE')
            elif 'STRANDS' in record.info.keys():
                # NOTE(review): STRANDS is dropped only for records that carry
                # CPX_TYPE and are not UNRESOLVED -- confirm this nesting.
                # Guarded like the pops below so a missing key cannot raise.
                record.info.pop('STRANDS')

        # Strip the remaining optional annotations when present
        for key in ('CIPOS', 'CIEND', 'RMSSTD'):
            if key in record.info.keys():
                record.info.pop(key)

        yield record