Ejemplo n.º 1
0
def _write_genotyping_header(vcf, header, sample_list, vcf_out):
    """Load the input header into ``vcf`` and write the output header.

    Adds the custom svtyper header entries, registers every sample of
    ``sample_list`` whose name is not yet in ``vcf.sample_list``, and writes
    ``vcf.get_header()`` followed by a newline to ``vcf_out``.
    """
    vcf.add_header(header)
    # if detailed:
    vcf.add_custom_svtyper_headers()

    # add the samples in the BAM files to the VCF output
    for sample in sample_list:
        if sample.name not in vcf.sample_list:
            vcf.add_sample(sample.name)

    # write the output header
    vcf_out.write(vcf.get_header() + '\n')


def sv_genotype(bam_string,
                vcf_in,
                vcf_out,
                min_aligned,
                split_weight,
                disc_weight,
                num_samp,
                lib_info_path,
                debug,
                alignment_outpath,
                ref_fasta,
                sum_quals,
                max_reads,
                max_ci_dist):
    """Genotype the structural variants of a VCF against alignment files.

    For every BND, DEL, DUP or INV record read from ``vcf_in``, the reads
    around both breakpoints are gathered for each sample, split-read and
    paired-end support for the reference and alternate alleles is tallied,
    and the per-sample fields (GT, GQ, SQ, GL, DP, RO, AO, QR, QA, RS, AS,
    ASC, RP, AP, AB) are set before the record is written to ``vcf_out``.
    Records with a missing or unsupported SVTYPE are written back without
    genotyping and a warning is printed to stderr. A BND record is cached
    until the record whose MATEID names it is read; the pair is then
    genotyped once and both records are written with the same genotypes.
    BND records whose mate never appears are dropped with a logged warning.

    Args:
        bam_string: comma separated alignment file paths; each one must end
            in ``.bam`` or ``.cram``.
        vcf_in: iterable of VCF lines that also provides ``close()``, or
            None. With None only the library metrics (and the optional
            output BAM) are produced before returning.
        vcf_out: output object providing ``write()`` and ``close()``.
        min_aligned: forwarded to the sample depth setup and to the
            reference-read and pair-straddle checks.
        split_weight: multiplier applied to split-read evidence for QR/QA.
        disc_weight: multiplier applied to paired-end evidence for QR/QA.
        num_samp: forwarded to ``Sample.from_bam`` when library metrics are
            computed from the alignment files.
        lib_info_path: optional JSON path. If the file exists the library
            metrics are read from it; if it does not exist the computed
            metrics are written to it.
        debug: when true, print the evidence tallies and the genotype
            log-likelihood list to stdout.
        alignment_outpath: optional path of a BAM file that receives the
            reads counted as evidence.
        ref_fasta: reference passed as ``reference_filename`` when opening
            CRAM input.
        sum_quals: when false, each record's QUAL is reset to 0 before the
            per-sample qualities are added to it.
        max_reads: forwarded to ``gather_all_reads``; a sample for which
            that call sets its second return value is left as ``./.``.
        max_ci_dist: forwarded to ``confidence_interval``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 if an entry of ``bam_string`` is neither a
            ``.bam`` nor a ``.cram`` path.
    """
    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(pysam.AlignmentFile(
                b,
                mode='rc',
                reference_filename=ref_fasta,
                format_options=["required_fields=7167"]))
        else:
            sys.stderr.write('Error: %s is not a valid alignment file (*.bam or *.cram)\n' % b)
            sys.exit(1)

    min_lib_prevalence = 1e-3  # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON (the handle is closed as soon as it is read)
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        with open(lib_info_path, 'r') as lib_info_file:
            lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = list()
    for bam in bam_list:
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' % bam.filename)
            sample = Sample.from_bam(bam, num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam, lib_info, min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping, borrowing the
        # header of the first input alignment file
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        with open(lib_info_path, 'w') as lib_info_file:
            write_sample_json(sample_list, lib_info_file)
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        for bam in bam_list:
            bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3  # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {}  # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            # first record line: the header is complete, so emit it before
            # falling through to process this record
            in_header = False
            _write_genotyping_header(vcf, header, sample_list, vcf_out)

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # records without an SVTYPE cannot be genotyped
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write('Warning: SVTYPE missing at variant %s. Skipping.\n' % (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write('Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' % (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                # the mate was seen earlier: it becomes side A, this record side B
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95', max_ci_dist)

                # infer the strands from the alt allele: a trailing bracket
                # means forward, anything else reverse
                o1_is_reverse = var.alt[-1] not in ('[', ']')
                o2_is_reverse = var2.alt[-1] not in ('[', ']')

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                # hold this breakend until its mate shows up
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse:
            posA += 1
        if o2_is_reverse:
            posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA, chromB, posB, ciB, z, max_reads)
            sample_gt = var.genotype(sample.name)
            if many:
                sample_gt.set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # reference pairs are tested against the exact breakpoint
            # positions rather than the variant's confidence intervals
            ref_ciA = [0, 0]
            ref_ciB = [0, 0]

            for query_name in sorted(read_batch.keys()):
                fragment = read_batch[query_name]
                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(read, var, chromA, posA, ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(read, var, chromB, posB, ciB, min_aligned)
                    if is_ref_seq_A or is_ref_seq_B:
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference

                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:
                    split_lr = split.is_split_straddle(chromA, posA, ciA,
                                                       chromB, posB, ciB,
                                                       o1_is_reverse, o2_is_reverse,
                                                       svtype, split_slop)
                    # each side contributes half of the split's support
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] +
                             prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt

                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # deletions shorter than two library standard deviations get
                # no paired-end evidence at all
                skip_pairs = svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd

                # tally spanning alternate pairs
                if skip_pairs:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(chromA, posA, ciA,
                                                             chromB, posB, ciB,
                                                             o1_is_reverse, o2_is_reverse,
                                                             min_aligned,
                                                             fragment.lib)

                # check both sides if inversion (perhaps should do this for BND as well?)
                if svtype == 'INV':
                    alt_straddle_reciprocal = fragment.is_pair_straddle(chromA, posA, ciA,
                                                                        chromB, posB, ciB,
                                                                        not o1_is_reverse,
                                                                        not o2_is_reverse,
                                                                        min_aligned,
                                                                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = (1 - p_conc) * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            alt_span += p_alt

                            fragment.tag_span(p_alt)
                            write_fragment = True
                    else:
                        p_alt = prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                        alt_span += p_alt

                        fragment.tag_span(p_alt)
                        write_fragment = True

                # tally spanning reference pairs
                if skip_pairs:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(chromA, posA, ref_ciA,
                                                               chromA, posA, ref_ciA,
                                                               False, True,
                                                               min_aligned,
                                                               fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(chromB, posB, ref_ciB,
                                                               chromB, posB, ref_ciB,
                                                               False, True,
                                                               min_aligned,
                                                               fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant, except for
                    # length-changing SVs like deletions
                    if not (ref_straddle_A and ref_straddle_B) or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            # the booleans count how many breakpoints are straddled
                            ref_span += (ref_straddle_A + ref_straddle_B) * p_reference / 2

                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and write_fragment:
                    for read in fragment.primary_reads + [split.read for split in fragment.split_reads]:
                        out_bam_written_reads = write_alignment(read, out_bam, out_bam_written_reads)

            if debug:
                print('--------------------------')
                print('ref_span: %s' % (ref_span,))
                print('alt_span: %s' % (alt_span,))
                print('ref_seq: %s' % (ref_seq,))
                print('alt_seq: %s' % (alt_seq,))
                print('alt_clip: %s' % (alt_clip,))

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                is_dup = var.info['SVTYPE'] == "DUP"

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                # rank the (index, log-probability) pairs, most likely first
                best, second_best = sorted(enumerate(gt_lplist),
                                           key=lambda item: item[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print(gt_lplist)

                # set the overall variant QUAL score and sample specific fields
                sample_gt.set_format('GL', ','.join(['%.0f' % x for x in gt_lplist]))
                sample_gt.set_format('DP', int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                sample_gt.set_format('RO', int(ref_seq + ref_span))
                sample_gt.set_format('AO', int(alt_seq + alt_clip + alt_span))
                sample_gt.set_format('QR', QR)
                sample_gt.set_format('QA', QA)
                # if detailed:
                sample_gt.set_format('RS', int(ref_seq))
                sample_gt.set_format('AS', int(alt_seq))
                sample_gt.set_format('ASC', int(alt_clip))
                sample_gt.set_format('RP', int(ref_span))
                sample_gt.set_format('AP', int(alt_span))
                try:
                    sample_gt.set_format('AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    sample_gt.set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log))  # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    sample_gt.set_format('GQ', int(phred_gq))
                    sample_gt.set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        sample_gt.set_format('GT', '0/1')
                    elif gt_idx == 2:
                        sample_gt.set_format('GT', '1/1')
                    elif gt_idx == 0:
                        sample_gt.set_format('GT', '0/0')
                else:
                    sample_gt.set_format('GQ', '.')
                    sample_gt.set_format('SQ', '.')
                    sample_gt.set_format('GT', './.')
            else:
                # no usable evidence for this sample: emit an empty genotype
                sample_gt.set_format('GT', './.')
                # NOTE(review): this also discards any QUAL accumulated from
                # samples processed earlier for this record -- confirm that
                # is intended before changing it.
                var.qual = 0
                for key, value in (('GQ', '.'), ('SQ', '.'), ('GL', '.'),
                                   ('DP', 0), ('AO', 0), ('RO', 0),
                                   # if detailed:
                                   ('AS', 0), ('ASC', 0), ('RS', 0),
                                   ('AP', 0), ('RP', 0),
                                   ('QR', 0), ('QA', 0), ('AB', '.')):
                    sample_gt.set_format(key, value)

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            # the mate record shares the quality and genotypes of its partner
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # a VCF holding header lines but no records never reaches the header
    # branch inside the loop; emit the header here so the output is still a
    # valid (empty) VCF
    if in_header and header:
        _write_genotyping_header(vcf, header, sample_list, vcf_out)

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning('Unpaired breakends found in file. These will not be present in output.')

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()
    for bam in bam_list:
        bam.close()

    return
Ejemplo n.º 2
0
def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug, alignment_outpath,
                ref_fasta, sum_quals, max_reads, max_ci_dist):

    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b,
                                    mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n' %
                b)
            exit(1)

    min_lib_prevalence = 1e-3  # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        lib_info_file = open(lib_info_path, 'r')
        lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = list()
    for i in xrange(len(bam_list)):
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' %
                         bam_list[i].filename)
            sample = Sample.from_bam(bam_list[i], num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam_list[i], lib_info,
                                          min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        lib_info_file = open(lib_info_path, 'w')
        write_sample_json(sample_list, lib_info_file)
        lib_info_file.close()
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3  # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {
    }  # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_custom_svtyper_headers()

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # genotype generic breakends
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n' %
                (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' %
                (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95',
                                          max_ci_dist)

                # infer the strands from the alt allele
                if var.alt[-1] == '[' or var.alt[-1] == ']':
                    o1_is_reverse = False
                else:
                    o1_is_reverse = True
                if var2.alt[-1] == '[' or var2.alt[-1] == ']':
                    o2_is_reverse = False
                else:
                    o2_is_reverse = True

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse: posA += 1
        if o2_is_reverse: posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # ref_ciA = ciA
            # ref_ciB = ciB
            ref_ciA = [0, 0]
            ref_ciB = [0, 0]

            for query_name in sorted(read_batch.keys()):
                fragment = read_batch[query_name]
                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(read, var, chromA, posA,
                                                       ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(read, var, chromB, posB,
                                                       ciB, min_aligned)
                    if (is_ref_seq_A or is_ref_seq_B):
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference

                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:

                    split_lr = split.is_split_straddle(chromA, posA, ciA,
                                                       chromB, posB, ciB,
                                                       o1_is_reverse,
                                                       o2_is_reverse, svtype,
                                                       split_slop)
                    # p_alt = prob_mapq(split.query_left) * prob_mapq(split.query_right)
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] +
                             prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt

                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # tally spanning alternate pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB, o1_is_reverse,
                        o2_is_reverse, min_aligned, fragment.lib)

                # check both sides if inversion (perhaps should do this for BND as well?)
                if svtype in ('INV'):
                    alt_straddle_reciprocal = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        not o1_is_reverse, not o2_is_reverse, min_aligned,
                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = (1 - p_conc) * prob_mapq(
                                fragment.readA) * prob_mapq(fragment.readB)
                            alt_span += p_alt

                            # # since an alt straddler is by definition also a reference straddler,
                            # # we can bail out early here to save some time
                            # p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            # ref_span += p_reference
                            # continue

                            fragment.tag_span(p_alt)
                            write_fragment = True

                    else:
                        p_alt = prob_mapq(fragment.readA) * prob_mapq(
                            fragment.readB)
                        alt_span += p_alt

                        fragment.tag_span(p_alt)
                        write_fragment = True

                # # tally spanning reference pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(
                        chromA, posA, ref_ciA, chromA, posA, ref_ciA, False,
                        True, min_aligned, fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(
                        chromB, posB, ref_ciB, chromB, posB, ref_ciB, False,
                        True, min_aligned, fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant, except for
                    # length-changing SVs like deletions
                    if not (ref_straddle_A
                            and ref_straddle_B) or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = p_conc * prob_mapq(
                                fragment.readA) * prob_mapq(fragment.readB)
                            ref_span += (ref_straddle_A +
                                         ref_straddle_B) * p_reference / 2

                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and write_fragment:
                    for read in fragment.primary_reads + [
                            split.read for split in fragment.split_reads
                    ]:
                        out_bam_written_reads = write_alignment(
                            read, out_bam, out_bam_written_reads)

            if debug:
                print '--------------------------'
                print 'ref_span:', ref_span
                print 'alt_span:', alt_span
                print 'ref_seq:', ref_seq
                print 'alt_seq:', alt_seq
                print 'alt_clip:', alt_clip

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                if var.info['SVTYPE'] == "DUP": is_dup = True
                else: is_dup = False

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(
                    disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                best, second_best = sorted([(i, e)
                                            for i, e in enumerate(gt_lplist)],
                                           key=lambda (x): x[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print gt_lplist

                # set the overall variant QUAL score and sample specific fields
                var.genotype(sample.name).set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format('RO',
                                                     int(ref_seq + ref_span))
                var.genotype(sample.name).set_format(
                    'AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # if detailed:
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format(
                        'AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    var.genotype(sample.name).set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(
                        -10 * (gt_lplist[0] - gt_sum_log)
                    )  # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        var.genotype(sample.name).set_format('GT', '0/1')
                    elif gt_idx == 2:
                        var.genotype(sample.name).set_format('GT', '1/1')
                    elif gt_idx == 0:
                        var.genotype(sample.name).set_format('GT', '0/0')
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                var.genotype(sample.name).set_format('GT', './.')
                var.qual = 0
                var.genotype(sample.name).set_format('GQ', '.')
                var.genotype(sample.name).set_format('SQ', '.')
                var.genotype(sample.name).set_format('GL', '.')
                var.genotype(sample.name).set_format('DP', 0)
                var.genotype(sample.name).set_format('AO', 0)
                var.genotype(sample.name).set_format('RO', 0)
                # if detailed:
                var.genotype(sample.name).set_format('AS', 0)
                var.genotype(sample.name).set_format('ASC', 0)
                var.genotype(sample.name).set_format('RS', 0)
                var.genotype(sample.name).set_format('AP', 0)
                var.genotype(sample.name).set_format('RP', 0)
                var.genotype(sample.name).set_format('QR', 0)
                var.genotype(sample.name).set_format('QA', 0)
                var.genotype(sample.name).set_format('AB', '.')

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning(
            'Unpaired breakends found in file. These will not be present in output.'
        )

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()

    return
Ejemplo n.º 3
0
def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
    """Call a genotype for one breakpoint from its evidence counts.

    Args:
        breakpoint: mapping describing the variant; ``breakpoint['svtype']``
            is read to detect duplications and ``breakpoint['id']`` is used
            in the debug message.
        counts: mapping with the keys ``ref_seq``, ``alt_seq``, ``alt_clip``,
            ``ref_span`` and ``alt_span`` (numeric evidence tallies).
        split_weight: multiplier applied to the split-read / clip evidence.
        disc_weight: multiplier applied to the spanning (paired-end) evidence.
        debug: when true, the genotype log probabilities are sent to
            ``logit``.

    Returns:
        The object produced by ``blank_genotype_result()`` with its
        ``'formats'`` entries (GL, DP, RO, AO, QR, QA, RS, AS, ASC, RP, AP,
        AB, GQ, SQ, GT) filled in and the sample quality added to ``'qual'``.
    """
    is_dup = breakpoint['svtype'] == 'DUP'

    elems = ('ref_seq', 'alt_seq', 'alt_clip', 'ref_span', 'alt_span')
    (ref_seq, alt_seq, alt_clip, ref_span, alt_span) = \
        [counts[i] for i in elems]

    # pre-calculations: weighted reference (QR) and alternate (QA) evidence
    alt_splitters = alt_seq + alt_clip
    QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
    QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)

    # the actual bayesian calculation and decision
    gt_lplist = bayes_gt(QR, QA, is_dup)
    # Rank (index, log probability) pairs from most to least likely.
    # `lambda x:` (no parentheses around the parameter) is valid on both
    # Python 2 and Python 3; `lambda(x):` is a syntax error on Python 3.
    best, second_best = sorted(enumerate(gt_lplist),
                               key=lambda x: x[1],
                               reverse=True)[0:2]
    gt_idx = best[0]

    # print log probabilities of homref, het, homalt
    if debug:
        msg = ("{} -- "
               "log probabilities (homref, het, homalt) : "
               "{}").format(breakpoint['id'], gt_lplist)
        logit(msg)

    result = blank_genotype_result()
    result['formats']['GL'] = ','.join(['%.0f' % x for x in gt_lplist])
    result['formats']['DP'] = int(ref_seq + alt_seq + alt_clip + ref_span + alt_span)
    result['formats']['RO'] = int(ref_seq + ref_span)
    result['formats']['AO'] = int(alt_seq + alt_clip + alt_span)
    result['formats']['QR'] = QR
    result['formats']['QA'] = QA
    # if detailed:
    result['formats']['RS'] = int(ref_seq)
    result['formats']['AS'] = int(alt_seq)
    result['formats']['ASC'] = int(alt_clip)
    result['formats']['RP'] = int(ref_span)
    result['formats']['AP'] = int(alt_span)
    try:
        # allele balance: fraction of the weighted evidence supporting alt
        result['formats']['AB'] = '%.2g' % (QA / float(QR + QA))
    except ZeroDivisionError:
        result['formats']['AB'] = '.'

    # assign genotypes
    # Sum the likelihoods in linear space (values are treated as log10).
    gt_sum = 0
    for gt in gt_lplist:
        try:
            gt_sum += 10**gt
        except OverflowError:
            gt_sum += 0
    if gt_sum > 0:
        gt_sum_log = math.log(gt_sum, 10)
        sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
        # genotype quality: phred-scaled gap between the two best calls, capped at 200
        phred_gq = min(-10 * (second_best[1] - best[1]), 200)
        result['formats']['GQ'] = int(phred_gq)
        result['formats']['SQ'] = sample_qual
        result['qual'] += sample_qual
        if gt_idx == 1:
            result['formats']['GT'] = '0/1'
        elif gt_idx == 2:
            result['formats']['GT'] = '1/1'
        elif gt_idx == 0:
            result['formats']['GT'] = '0/0'
    else:
        # no usable likelihood mass: leave the call missing
        result['formats']['GQ'] = '.'
        result['formats']['SQ'] = '.'
        result['formats']['GT'] = './.'

    return result
Ejemplo n.º 4
0
def bayesian_genotype(breakpoint, counts, split_weight, disc_weight, debug):
    """Derive the genotype call and FORMAT values for a single breakpoint.

    ``counts`` supplies the evidence tallies (``ref_seq``, ``alt_seq``,
    ``alt_clip``, ``ref_span``, ``alt_span``).  Split-type evidence is scaled
    by ``split_weight`` and spanning evidence by ``disc_weight`` before being
    passed to ``bayes_gt``.  The populated result of
    ``blank_genotype_result()`` is returned; when ``debug`` is true the
    genotype log probabilities are reported through ``logit``.
    """
    is_dup = breakpoint['svtype'] == 'DUP'

    ref_seq = counts['ref_seq']
    alt_seq = counts['alt_seq']
    alt_clip = counts['alt_clip']
    ref_span = counts['ref_span']
    alt_span = counts['alt_span']

    # weighted evidence for the reference (QR) and alternate (QA) alleles
    alt_splitters = alt_seq + alt_clip
    QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
    QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)

    # genotype log probabilities, ranked from most to least likely
    gt_lplist = bayes_gt(QR, QA, is_dup)
    ranked = sorted(enumerate(gt_lplist),
                    key=lambda pair: pair[1],
                    reverse=True)
    best, second_best = ranked[0:2]
    gt_idx = best[0]

    # report log probabilities of homref, het, homalt
    if debug:
        template = "{} -- log probabilities (homref, het, homalt) : {}"
        logit(template.format(breakpoint['id'], gt_lplist))

    result = blank_genotype_result()
    formats = result['formats']
    formats['GL'] = ','.join('%.0f' % lp for lp in gt_lplist)
    formats['DP'] = int(ref_seq + alt_seq + alt_clip + ref_span + alt_span)
    formats['RO'] = int(ref_seq + ref_span)
    formats['AO'] = int(alt_seq + alt_clip + alt_span)
    formats['QR'] = QR
    formats['QA'] = QA
    # detailed per-evidence-type tallies
    formats['RS'] = int(ref_seq)
    formats['AS'] = int(alt_seq)
    formats['ASC'] = int(alt_clip)
    formats['RP'] = int(ref_span)
    formats['AP'] = int(alt_span)
    try:
        formats['AB'] = '%.2g' % (QA / float(QR + QA))
    except ZeroDivisionError:
        formats['AB'] = '.'

    # total likelihood in linear space; terms that overflow are left out
    gt_sum = 0
    for log_prob in gt_lplist:
        try:
            gt_sum += 10**log_prob
        except OverflowError:
            pass

    if not gt_sum > 0:
        # nothing to normalise against: emit a missing call
        formats['GQ'] = '.'
        formats['SQ'] = '.'
        formats['GT'] = './.'
        return result

    gt_sum_log = math.log(gt_sum, 10)
    # phred-scaled probability site is non-reference in this sample
    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log))
    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
    formats['GQ'] = int(phred_gq)
    formats['SQ'] = sample_qual
    result['qual'] += sample_qual

    gt_strings = {0: '0/0', 1: '0/1', 2: '1/1'}
    if gt_idx in gt_strings:
        formats['GT'] = gt_strings[gt_idx]

    return result
Ejemplo n.º 5
0
def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug, alignment_outpath,
                ref_fasta, sum_quals, max_reads, max_ci_dist):
    """Genotype the structural variants of a VCF against BAM/CRAM files.

    Args:
        bam_string: comma-separated paths of ``*.bam`` / ``*.cram`` files.
        vcf_in: iterable, closable input VCF handle, or ``None``.  With
            ``None`` only the library metrics are computed/written.
        vcf_out: writable, closable handle receiving the genotyped VCF.
        min_aligned: forwarded to the samples' expected-depth setters and to
            ``calculate_metrics``.
        split_weight: multiplier for split-read / clip evidence.
        disc_weight: multiplier for spanning (paired-end) evidence.
        num_samp: forwarded to ``Sample.from_bam`` when library metrics are
            calculated from the alignments.
        lib_info_path: JSON file of library metrics.  Read when it exists,
            written when it is given but does not exist yet; may be ``None``.
        debug: when true, evidence tallies and genotype log probabilities
            are printed.
        alignment_outpath: optional path of a BAM file that receives the
            reads used for genotyping; a companion ``.tsv`` file of
            supporting reads is written next to it.
        ref_fasta: reference FASTA used to open CRAM inputs.
        sum_quals: when false, each variant's QUAL is reset to 0 before the
            per-sample qualities are added.
        max_reads: forwarded to the read-gathering helpers.
        max_ci_dist: forwarded to ``confidence_interval``.

    Returns:
        None.  Exits the process with status 1 when an input path is neither
        ``*.bam`` nor ``*.cram``.
    """
    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b,
                                    mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n' %
                b)
            # sys.exit instead of the site-provided `exit` builtin, which is
            # not guaranteed to exist (e.g. python -S, frozen applications)
            sys.exit(1)

    min_lib_prevalence = 1e-3  # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        with open(lib_info_path, 'r') as lib_info_file:
            lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = []
    for bam in bam_list:
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' %
                         bam.filename)
            sample = Sample.from_bam(bam, num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam, lib_info, min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

        # Swap only the file extension.  A plain str.replace('bam', 'tsv')
        # would also rewrite "bam" inside directory names and would return
        # the BAM path itself when the path contains no "bam" at all.
        supporting_reads_file = os.path.splitext(alignment_outpath)[0] + '.tsv'
        supporting_reads_fh = open(supporting_reads_file, 'w')
        supporting_reads_fh.write(
            'chromA,chromB,posA,posB,svtype,read_id,read_chrom,read_reference_start,read_reference_end,start_ref_support,start_ref_pe_support,start_alt_sr_support,start_alt_pe_support,end_ref_support,end_ref_pe_support,end_alt_sr_support,end_alt_pe_support\n'
        )
    else:
        out_bam = None
        supporting_reads_fh = None
    out_bam_written_reads = set()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        with open(lib_info_path, 'w') as lib_info_file:
            write_sample_json(sample_list, lib_info_file)
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
            supporting_reads_fh.close()
        return

    # FORMAT tag -> (metrics section, counter name) for the per-breakpoint
    # read counts that are reported whether or not the sample has evidence
    breakpoint_count_formats = (
        ('SRC', 'start', 'ref_count'),
        ('SRPC', 'start', 'ref_pe_count'),
        ('SASC', 'start', 'alt_clip_count'),
        ('SAPC', 'start', 'alt_pe_count'),
        ('ERC', 'end', 'ref_count'),
        ('ERPC', 'end', 'ref_pe_count'),
        ('EASC', 'end', 'alt_clip_count'),
        ('EAPC', 'end', 'alt_pe_count'),
        ('BRC', 'both', 'ref_count'),
        ('BRPC', 'both', 'ref_pe_count'),
        ('BASC', 'both', 'alt_clip_count'),
        ('BAPC', 'both', 'alt_pe_count'),
    )

    # set variables for genotyping
    z = 3
    in_header = True
    header = []
    breakend_dict = {}  # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            else:
                # first record line: finalise and emit the header, then fall
                # through so this line is processed as a variant
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_custom_svtyper_headers()

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # genotype generic breakends
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n' %
                (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' %
                (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                # second breakend of a pair: genotype it together with its
                # cached mate, which becomes the primary record `var`
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95',
                                          max_ci_dist)

                # infer the strands from the alt allele
                o1_is_reverse = var.alt[-1] not in ('[', ']')
                o2_is_reverse = var2.alt[-1] not in ('[', ']')

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                # first breakend of a pair: hold it until the mate shows up
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse:
            posA += 1
        if o2_is_reverse:
            posB += 1

        for sample in sample_list:
            # grab reads for start and end of breakpoints
            start_read_batch, many = gather_reads(sample, chromA, posA, ciA, z,
                                                  {}, max_reads)
            end_read_batch, many = gather_reads(sample, chromB, posB, ciB, z,
                                                {}, max_reads)
            # grab reads from both sides of breakpoint
            # (only the `many` flag of this combined gather is acted upon;
            # the flags of the two per-breakpoint gathers are overwritten)
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            metrics = {'start': {}, 'end': {}}
            # run metrics for each breakpoint separately
            metrics['start'] = calculate_metrics(
                start_read_batch, 'start', var, chromA, chromB, posA, posB,
                ciA, ciB, min_aligned, o1_is_reverse, o2_is_reverse,
                var_length, svtype, out_bam_written_reads, supporting_reads_fh,
                out_bam)
            metrics['end'] = calculate_metrics(
                end_read_batch, 'end', var, chromA, chromB, posA, posB, ciA,
                ciB, min_aligned, o1_is_reverse, o2_is_reverse, var_length,
                svtype, out_bam_written_reads, supporting_reads_fh, out_bam)

            # run metrics for both sides of breakpoints
            # no bam file and supporting reads files is written here since they are already written in running the method for each breakpoint separately
            metrics['both'] = calculate_metrics(
                read_batch, 'both', var, chromA, chromB, posA, posB, ciA, ciB,
                min_aligned, o1_is_reverse, o2_is_reverse, var_length, svtype,
                out_bam_written_reads, None, None)

            # set these for metrics from both sides of breakpoints
            ref_span = metrics['both']['ref_span']
            alt_span = metrics['both']['alt_span']
            ref_seq = metrics['both']['ref_seq']
            alt_seq = metrics['both']['alt_seq']
            alt_clip = metrics['both']['alt_clip']

            if debug:
                # single-argument print(...) behaves the same on Python 2
                # (statement) and Python 3 (function)
                print('--------------------------')
                print('ref_span: %s' % ref_span)
                print('alt_span: %s' % alt_span)
                print('ref_seq: %s' % ref_seq)
                print('alt_seq: %s' % alt_seq)
                print('alt_clip: %s' % alt_clip)

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            # genotype record of this sample, reused for every FORMAT update
            gt_record = var.genotype(sample.name)

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                is_dup = var.info['SVTYPE'] == "DUP"

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(
                    disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                # rank (index, log probability) pairs, most likely first;
                # `lambda x:` is valid on Python 2 and 3, `lambda (x):` only on 2
                best, second_best = sorted(enumerate(gt_lplist),
                                           key=lambda x: x[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print(gt_lplist)

                # set the overall variant QUAL score and sample specific fields
                gt_record.set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                gt_record.set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                gt_record.set_format('RO', int(ref_seq + ref_span))
                gt_record.set_format('AO', int(alt_seq + alt_clip + alt_span))
                gt_record.set_format('QR', QR)
                gt_record.set_format('QA', QA)
                # if detailed:
                gt_record.set_format('RS', int(ref_seq))
                gt_record.set_format('AS', int(alt_seq))
                gt_record.set_format('ASC', int(alt_clip))
                gt_record.set_format('RP', int(ref_span))
                gt_record.set_format('AP', int(alt_span))
                try:
                    gt_record.set_format('AB',
                                         '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    gt_record.set_format('AB', '.')
                for tag, section, counter in breakpoint_count_formats:
                    gt_record.set_format(tag, int(metrics[section][counter]))
                gt_record.set_format('ISM', sample.get_mean_insert_size())
                gt_record.set_format('ISSD', sample.get_stddev_insert_size())

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(
                        -10 * (gt_lplist[0] - gt_sum_log)
                    )  # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    gt_record.set_format('GQ', int(phred_gq))
                    gt_record.set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        gt_record.set_format('GT', '0/1')
                    elif gt_idx == 2:
                        gt_record.set_format('GT', '1/1')
                    elif gt_idx == 0:
                        gt_record.set_format('GT', '0/0')
                else:
                    gt_record.set_format('GQ', '.')
                    gt_record.set_format('SQ', '.')
                    gt_record.set_format('GT', './.')
            else:
                # no evidence at all for this sample: emit a missing call
                gt_record.set_format('GT', './.')
                # NOTE(review): this also discards any QUAL accumulated from
                # samples processed earlier for this variant -- confirm that
                # is intended.
                var.qual = 0
                gt_record.set_format('GQ', '.')
                gt_record.set_format('SQ', '.')
                gt_record.set_format('GL', '.')
                gt_record.set_format('DP', 0)
                gt_record.set_format('AO', 0)
                gt_record.set_format('RO', 0)
                # if detailed:
                gt_record.set_format('AS', 0)
                gt_record.set_format('ASC', 0)
                gt_record.set_format('RS', 0)
                gt_record.set_format('AP', 0)
                gt_record.set_format('RP', 0)
                gt_record.set_format('QR', 0)
                gt_record.set_format('QA', 0)
                gt_record.set_format('AB', '.')
                for tag, section, counter in breakpoint_count_formats:
                    gt_record.set_format(tag, int(metrics[section][counter]))

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            # the mate record shares the quality and genotypes of its partner
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning(
            'Unpaired breakends found in file. These will not be present in output.'
        )

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()
        supporting_reads_fh.close()

    return