コード例 #1
0
def resolve_complex_sv_v2(resolve_CPX,
                          resolve_INV,
                          resolve_CNV,
                          cytobands,
                          disc_pairs,
                          mei_bed,
                          variant_prefix='CPX_',
                          min_rescan_support=4,
                          pe_blacklist=None,
                          quiet=False,
                          SR_only_cutoff=1000):
    """
    Second-pass resolution of complex SV from inversion and CNV records.

    Candidate clusters are built by passing the inputs through
    remove_CPX_from_INV, cluster_INV, link_cpx_V2 (with cpx_dist=2000) and
    cluster_cleanup. Each cluster is then handed to ComplexSV and either
    reported as a resolved record or its member records are flagged as
    unresolved.

    Parameters
    ----------
    resolve_CPX : iterable of records
        Presumably the first-pass records with SVTYPE == 'CPX' (TODO confirm
        against caller); passed to remove_CPX_from_INV.
    resolve_INV : iterable of records
        Presumably the first-pass records with SVTYPE == 'INV' (TODO confirm
        against caller); passed to remove_CPX_from_INV.
    resolve_CNV : iterable of records
        Records passed to link_cpx_V2 together with the clustered inversions.
    cytobands
        Passed through to ComplexSV.
    disc_pairs
        Passed through to rescan_single_ender.
    mei_bed
        Passed through to ComplexSV.
    variant_prefix : str
        Prefix of the ID assigned to resolved variants. An underscore and ten
        random characters are appended to it.
    min_rescan_support : int
        Passed through to rescan_single_ender.
    pe_blacklist : optional
        Passed through to rescan_single_ender.
    quiet : boolean, optional
        Do not print status updates
    SR_only_cutoff : int
        Passed through to ComplexSV.

    Returns
    -------
    cpx_records_v2 : collections.deque
        Resolved records (with freshly assigned IDs) and, for unresolved
        clusters, the member records tagged with EVENT and UNRESOLVED.
    """
    independent_INV = remove_CPX_from_INV(resolve_CPX, resolve_INV)
    linked_INV = cluster_INV(independent_INV)
    clusters_v2 = link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000)
    clusters_v2 = cluster_cleanup(clusters_v2)

    # Print number of candidate clusters identified
    if not quiet:
        now = datetime.datetime.now()
        print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
              'identified ' + str(len(clusters_v2)) +
              ' candidate complex clusters ' + 'during second pass')

    # Characters used for the random ID suffixes (loop-invariant)
    id_alphabet = string.ascii_uppercase + string.digits

    cpx_records_v2 = deque()
    # Collected for parity with the first pass; not part of the return value
    cpx_record_ids_v2 = set()
    for cluster in clusters_v2:
        # Print status for each cluster
        if not quiet:
            now = datetime.datetime.now()
            print(
                'svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
                'resolving candidate cluster containing the following records: '
                + ', '.join([e.id for e in cluster]))

        # Try finding opposite strand support for single ender inversions
        if len(cluster) == 1 and cluster[0].info['SVTYPE'] == 'INV':
            rec, opp = rescan_single_ender(cluster[0],
                                           disc_pairs,
                                           min_rescan_support,
                                           pe_blacklist=pe_blacklist)
            if opp is not None:
                cluster = deque([rec, opp])

        # if cxsv overlap pulled in unrelated insertions, keep them separate
        if all([r.info['SVTYPE'] == 'INS' for r in cluster]):
            for record in cluster:
                cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
                # Use the second-pass accumulators: the un-suffixed names
                # (cpx_record_ids / cpx_records) do not exist in this
                # function and previously made this branch fail at runtime.
                cpx_record_ids_v2 = cpx_record_ids_v2.union(cpx.record_ids)

                # Assign random string as resolved ID to handle sharding
                # NOTE(review): with the default prefix 'CPX_' this yields
                # a double underscore ('CPX__...') - confirm this is intended.
                cpx.vcf_record.id = variant_prefix + '_' + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                cpx_records_v2.append(cpx.vcf_record)
            outcome = 'treated as separate unrelated insertions'
        else:
            cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
            cpx_record_ids_v2 = cpx_record_ids_v2.union(cpx.record_ids)
            if cpx.svtype == 'UNR':
                # Assign random string as unresolved ID to handle sharding
                unresolved_vid = 'UNRESOLVED_' + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                # Every member record shares the same EVENT identifier
                for record in cpx.records:
                    record.info['EVENT'] = unresolved_vid
                    record.info['UNRESOLVED'] = True
                    cpx_records_v2.append(record)
                outcome = 'is unresolved'
            else:
                cpx.vcf_record.id = variant_prefix + '_' + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                cpx_records_v2.append(cpx.vcf_record)
                if 'CPX_TYPE' in cpx.vcf_record.info.keys():
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['CPX_TYPE'])
                else:
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['SVTYPE'])

        # Report outcome per cluster
        if not quiet:
            now = datetime.datetime.now()
            print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
                  'candidate cluster ' + outcome)

    # Records emitted as CPX must not carry the unresolved bookkeeping fields
    for resolved in cpx_records_v2:
        if resolved.info['SVTYPE'] == 'CPX':
            for info in 'UNRESOLVED EVENT UNRESOLVED_TYPE STRANDS'.split():
                if info in resolved.info.keys():
                    resolved.info.pop(info)
    return cpx_records_v2
コード例 #2
0
def resolve_complex_sv(vcf,
                       cytobands,
                       disc_pairs,
                       mei_bed,
                       variant_prefix='CPX_',
                       min_rescan_support=4,
                       pe_blacklist=None,
                       quiet=False,
                       SR_only_cutoff=1000):
    """
    Resolve complex SV from CNV intervals and BCA breakpoints.

    Yields all resolved events, simple or complex, in sorted order.

    Parameters
    ----------
    vcf : pysam.VariantFile
    cytobands : pysam.TabixFile
    disc_pairs : pysam.TabixFile
    mei_bed : pybedtools.BedTool
    variant_prefix : str
        Prefix to assign to resolved variants. A trailing underscore is
        appended if missing.
    min_rescan_support : int
        Number of pairs required to count a sample as
        supported during PE rescan
    pe_blacklist : pysam.TabixFile, optional
        Blacklisted genomic regions. Anomalous pairs in these regions will be
        removed prior to clustering.
    quiet : boolean, optional
        Do not print status updates
    SR_only_cutoff : int, optional
        Passed through to ComplexSV.

    Yields
    ------
    sv : pysam.VariantRecord
    """

    clusters = link_cpx(vcf)
    clusters = clusters_cleanup(clusters)

    # Print number of candidate clusters identified
    if not quiet:
        now = datetime.datetime.now()
        print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
              'identified ' + str(len(clusters)) +
              ' candidate complex clusters ' + 'during first pass')

    if not variant_prefix.endswith('_'):
        variant_prefix += '_'

    # Characters used for the random ID suffixes (loop-invariant)
    id_alphabet = string.ascii_uppercase + string.digits

    cpx_records = deque()
    cpx_record_ids = set()

    for cluster in clusters:
        # Print status for each cluster
        if not quiet:
            now = datetime.datetime.now()
            print(
                'svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
                'resolving candidate cluster containing the following records: '
                + ', '.join([e.id for e in cluster]))

        # Try finding opposite strand support for single ender inversions
        if len(cluster) == 1 and cluster[0].info['SVTYPE'] == 'INV':
            rec, opp = rescan_single_ender(cluster[0],
                                           disc_pairs,
                                           min_rescan_support,
                                           pe_blacklist=pe_blacklist)
            if opp is not None:
                cluster = deque([rec, opp])

        # if cxsv overlap pulled in unrelated insertions, keep them separate
        if all([r.info['SVTYPE'] == 'INS' for r in cluster]):
            for record in cluster:
                cpx = ComplexSV([record], cytobands, mei_bed, SR_only_cutoff)
                cpx_record_ids = cpx_record_ids.union(cpx.record_ids)

                # Assign random string as resolved ID to handle sharding
                cpx.vcf_record.id = variant_prefix + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                cpx_records.append(cpx.vcf_record)
            outcome = 'treated as separate unrelated insertions'
        else:
            cpx = ComplexSV(cluster, cytobands, mei_bed, SR_only_cutoff)
            cpx_record_ids = cpx_record_ids.union(cpx.record_ids)
            if cpx.svtype == 'UNR':
                # Assign random string as unresolved ID to handle sharding
                unresolved_vid = 'UNRESOLVED_' + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                # Every member record shares the same EVENT identifier
                for record in cpx.records:
                    record.info['EVENT'] = unresolved_vid
                    record.info['UNRESOLVED'] = True
                    cpx_records.append(record)
                outcome = 'is unresolved'
            elif cpx.svtype == 'SPLIT':
                # check all CNVs for depth support and report the first
                # insertion record and all CNVs with depth support. CNVs without
                # depth support will have their IDs added to the MEMBERS field of
                # the INS record
                cnv_ids_to_append = []
                for cnv in cpx.cnvs:
                    if 'RD' in cnv.info['EVIDENCE']:
                        cnv.info['MEMBERS'] = cnv.id
                        cpx_records.append(cnv)
                    else:
                        cnv_ids_to_append.append(cnv.id)
                ins_rec = cpx.insertions[0]
                ins_rec.info['MEMBERS'] = (
                    ins_rec.id, ) + tuple(cnv_ids_to_append)
                cpx_records.append(ins_rec)
                outcome = 'split into INS and CNV variants. ' + \
                          'The following records were merged into the INS record: ' + \
                          ', '.join(cnv_ids_to_append)
            else:
                cpx.vcf_record.id = variant_prefix + ''.join(
                    random.choice(id_alphabet) for _ in range(10))
                cpx_records.append(cpx.vcf_record)
                if 'CPX_TYPE' in cpx.vcf_record.info.keys():
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['CPX_TYPE'])
                else:
                    outcome = 'resolved as ' + str(
                        cpx.vcf_record.info['SVTYPE'])

        # Report outcome per cluster
        if not quiet:
            now = datetime.datetime.now()
            print('svtk resolve @ ' + now.strftime("%H:%M:%S") + ': ' +
                  'candidate cluster ' + outcome)

    # Output all variants
    vcf.reset()

    for record in _merge_records(vcf, cpx_records, cpx_record_ids):
        # Clean all BNDs to ensure they are set to unresolved
        # Reason: some SR-only BNDs will escape being set as UNRESOLVED if they
        # are part of a multi-breakpoint complex cluster that remains UNRESOLVED
        # after excluding SR-only breakpoints
        if record.info['SVTYPE'] == 'BND':
            record.info['UNRESOLVED'] = True
            if 'UNRESOLVED_TYPE' not in record.info.keys():
                record.info['UNRESOLVED_TYPE'] = 'SINGLE_ENDER'
        if 'CPX_TYPE' in record.info.keys():
            if 'UNRESOLVED' in record.info.keys():
                # Unresolved records report their type via UNRESOLVED_TYPE
                record.info['UNRESOLVED_TYPE'] = record.info['CPX_TYPE']
                record.info.pop('CPX_TYPE')
            elif 'STRANDS' in record.info.keys():
                # Guarded like every other INFO removal here, so a record
                # without STRANDS no longer raises KeyError
                record.info.pop('STRANDS')
        # Strip fields that are not carried over to the output records
        for field in ('CIPOS', 'CIEND', 'RMSSTD'):
            if field in record.info.keys():
                record.info.pop(field)
        yield record