Python Variant.qual Examples

Programming Language: Python

Namespace/Package Name: svtyper.parsers

Class/Type: Variant

Method/Function: qual

Examples at hotexamples.com: 7

Python Variant.qual - 7 examples found. These are the top-rated real-world Python examples of svtyper.parsers.Variant.qual, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Variant(5)

qual(4)

has_svtype(3)

genotype(2)

get_info(2)

get_svtype(2)

get_var_string(2)

is_valid_svtype(2)

write(2)

Example #1

Show file

def apply_genotypes_to_vcf(src_vcf, out_vcf, genotypes, sample, sum_quals):
    """Copy the variants of ``src_vcf`` to ``out_vcf`` with genotypes applied.

    The header is written first, then every variant line yielded by
    ``vcf_variants(src_vcf.filename)`` is processed in order:

    * variants without an SVTYPE, or with an unsupported SVTYPE, are logged
      and written through unchanged;
    * a BND variant is held back in a cache until the record named by its
      ``MATEID`` shows up; the first-seen record of the pair is then
      genotyped and its qual / active formats / genotype are copied onto
      the second record before both are written;
    * every other variant is genotyped and written immediately.

    Args:
        src_vcf: source VCF object; must provide ``write_header`` and
            ``filename`` and is handed to ``Variant``.
        out_vcf: output handle passed to ``write_header`` / ``Variant.write``.
        genotypes: mapping from variant id to a genotype result.
        sample: passed through to ``assign_genotype_to_variant``.
        sum_quals: when false, each variant's ``qual`` is reset to 0 before
            a genotype is assigned.

    Raises:
        RuntimeError: if the genotype result stored for a variant is None.
    """
    # first-seen breakends waiting for their mate, keyed by variant id
    bnd_cache = {}
    src_vcf.write_header(out_vcf)

    # The input is iterated exactly once; the previous implementation also
    # materialised every line up front just to compute a count that was
    # never used.
    for vline in vcf_variants(src_vcf.filename):
        fields = vline.rstrip().split('\t')
        variant = Variant(fields, src_vcf)
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            variant.write(out_vcf)
            continue

        # special BND processing: pair the record up with its mate
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                # current record is the second of the pair; genotype the
                # cached (first-seen) record and release it from the cache
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                # NOTE(review): a breakend whose mate never appears stays
                # in the cache and is never written to out_vcf.
                bnd_cache[variant.var_id] = variant
                continue

        result = genotypes[variant.var_id]

        if result is None:
            msg = ("Found no genotype results for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            raise RuntimeError(msg)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing: the mate shares the genotype call
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)

Example #2

Show file

File: singlesample.py Project: hall-lab/svtyper

def apply_genotypes_to_vcf(src_vcf, out_vcf, genotypes, sample, sum_quals):
    """Copy the variants of ``src_vcf`` to ``out_vcf`` with genotypes applied.

    The header is written first, then every variant line yielded by
    ``vcf_variants(src_vcf.filename)`` is processed in order:

    * variants without an SVTYPE, or with an unsupported SVTYPE, are logged
      and written through unchanged;
    * a BND variant is held back in a cache until the record named by its
      ``MATEID`` shows up; the first-seen record of the pair is then
      genotyped and its qual / active formats / genotype are copied onto
      the second record before both are written;
    * every other variant is genotyped and written immediately.

    Args:
        src_vcf: source VCF object; must provide ``write_header`` and
            ``filename`` and is handed to ``Variant``.
        out_vcf: output handle passed to ``write_header`` / ``Variant.write``.
        genotypes: mapping from variant id to a genotype result.
        sample: passed through to ``assign_genotype_to_variant``.
        sum_quals: when false, each variant's ``qual`` is reset to 0 before
            a genotype is assigned.

    Raises:
        RuntimeError: if the genotype result stored for a variant is None.
    """
    # first-seen breakends waiting for their mate, keyed by variant id
    bnd_cache = {}
    src_vcf.write_header(out_vcf)

    # The input is iterated exactly once; the previous implementation also
    # materialised every line up front just to compute a count that was
    # never used.
    for vline in vcf_variants(src_vcf.filename):
        fields = vline.rstrip().split('\t')
        variant = Variant(fields, src_vcf)
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            variant.write(out_vcf)
            continue

        # special BND processing: pair the record up with its mate
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                # current record is the second of the pair; genotype the
                # cached (first-seen) record and release it from the cache
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                # NOTE(review): a breakend whose mate never appears stays
                # in the cache and is never written to out_vcf.
                bnd_cache[variant.var_id] = variant
                continue

        result = genotypes[variant.var_id]

        if result is None:
            msg = ("Found no genotype results for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            raise RuntimeError(msg)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing: the mate shares the genotype call
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)

Example #3

Show file

File: classic.py Project: hall-lab/svtyper

def sv_genotype(bam_string,
                vcf_in,
                vcf_out,
                min_aligned,
                split_weight,
                disc_weight,
                num_samp,
                lib_info_path,
                debug,
                alignment_outpath,
                ref_fasta,
                sum_quals,
                max_reads,
                max_ci_dist):

    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(pysam.AlignmentFile(b,
                mode='rc',reference_filename=ref_fasta,format_options=["required_fields=7167"]))
        else:
            sys.stderr.write('Error: %s is not a valid alignment file (*.bam or *.cram)\n' % b)
            exit(1)
            
    min_lib_prevalence = 1e-3 # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        lib_info_file = open(lib_info_path, 'r')
        lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = list()
    for i in xrange(len(bam_list)):
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' % bam_list[i].filename)
            sample = Sample.from_bam(bam_list[i], num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam_list[i], lib_info, min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        lib_info_file = open(lib_info_path, 'w')
        write_sample_json(sample_list, lib_info_file)
        lib_info_file.close()
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3 # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {} # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_custom_svtyper_headers()

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')


        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # genotype generic breakends
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write('Warning: SVTYPE missing at variant %s. Skipping.\n' % (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue
            
        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write('Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' % (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95', max_ci_dist)

                # infer the strands from the alt allele
                if var.alt[-1] == '[' or var.alt[-1] == ']':
                    o1_is_reverse = False
                else: o1_is_reverse = True
                if var2.alt[-1] == '[' or var2.alt[-1] == ']':
                    o2_is_reverse = False
                else: o2_is_reverse = True

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse =  False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse =  True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse: posA += 1
        if o2_is_reverse: posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA, chromB, posB, ciB, z, max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # ref_ciA = ciA
            # ref_ciB = ciB
            ref_ciA = [0,0]
            ref_ciB = [0,0]

            for query_name in sorted(read_batch.keys()):
                fragment = read_batch[query_name]
                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(read, var, chromA, posA, ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(read, var, chromB, posB, ciB, min_aligned)
                    if (is_ref_seq_A or is_ref_seq_B):
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference

                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:

                    split_lr = split.is_split_straddle(chromA, posA, ciA,
                                                       chromB, posB, ciB,
                                                       o1_is_reverse, o2_is_reverse,
                                                       svtype, split_slop)
                    # p_alt = prob_mapq(split.query_left) * prob_mapq(split.query_right)
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] + prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt

                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # tally spanning alternate pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(chromA, posA, ciA,
                                                             chromB, posB, ciB,
                                                             o1_is_reverse, o2_is_reverse,
                                                             min_aligned,
                                                             fragment.lib)

                # check both sides if inversion (perhaps should do this for BND as well?)
                if svtype in ('INV'):
                    alt_straddle_reciprocal = fragment.is_pair_straddle(chromA, posA, ciA,
                                                                        chromB, posB, ciB,
                                                                        not o1_is_reverse,
                                                                        not o2_is_reverse,
                                                                        min_aligned,
                                                                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = (1 - p_conc) * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            alt_span += p_alt

                            # # since an alt straddler is by definition also a reference straddler,
                            # # we can bail out early here to save some time
                            # p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            # ref_span += p_reference
                            # continue

                            fragment.tag_span(p_alt)
                            write_fragment = True

                    else:
                        p_alt = prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                        alt_span += p_alt

                        fragment.tag_span(p_alt)
                        write_fragment = True

                # # tally spanning reference pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(chromA, posA, ref_ciA,
                                                               chromA, posA, ref_ciA,
                                                               False, True,
                                                               min_aligned,
                                                               fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(chromB, posB, ref_ciB,
                                                               chromB, posB, ref_ciB,
                                                               False, True,
                                                               min_aligned,
                                                               fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant, except for
                    # length-changing SVs like deletions
                    if not (ref_straddle_A and ref_straddle_B) or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            ref_span += (ref_straddle_A + ref_straddle_B) * p_reference / 2

                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and  write_fragment:
                    for read in fragment.primary_reads + [split.read for split in fragment.split_reads]:
                        out_bam_written_reads = write_alignment(read, out_bam, out_bam_written_reads)

            if debug:
                print '--------------------------'
                print 'ref_span:', ref_span
                print 'alt_span:', alt_span
                print 'ref_seq:', ref_seq
                print 'alt_seq:', alt_seq
                print 'alt_clip:', alt_clip

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                if var.info['SVTYPE'] == "DUP": is_dup = True
                else: is_dup = False

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                best, second_best = sorted([ (i, e) for i, e in enumerate(gt_lplist) ], key=lambda(x): x[1], reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print gt_lplist

                # set the overall variant QUAL score and sample specific fields
                var.genotype(sample.name).set_format('GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format('DP', int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format('RO', int(ref_seq + ref_span))
                var.genotype(sample.name).set_format('AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # if detailed:
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format('AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    var.genotype(sample.name).set_format('AB', '.')


                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        var.genotype(sample.name).set_format('GT', '0/1')
                    elif gt_idx == 2:
                        var.genotype(sample.name).set_format('GT', '1/1')
                    elif gt_idx == 0:
                        var.genotype(sample.name).set_format('GT', '0/0')
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                var.genotype(sample.name).set_format('GT', './.')
                var.qual = 0
                var.genotype(sample.name).set_format('GQ', '.')
                var.genotype(sample.name).set_format('SQ', '.')
                var.genotype(sample.name).set_format('GL', '.')
                var.genotype(sample.name).set_format('DP', 0)
                var.genotype(sample.name).set_format('AO', 0)
                var.genotype(sample.name).set_format('RO', 0)
                # if detailed:
                var.genotype(sample.name).set_format('AS', 0)
                var.genotype(sample.name).set_format('ASC', 0)
                var.genotype(sample.name).set_format('RS', 0)
                var.genotype(sample.name).set_format('AP', 0)
                var.genotype(sample.name).set_format('RP', 0)
                var.genotype(sample.name).set_format('QR', 0)
                var.genotype(sample.name).set_format('QA', 0)
                var.genotype(sample.name).set_format('AB', '.')

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning('Unpaired breakends found in file. These will not be present in output.')

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()

    return

Example #4

Show file

File: classic.py Project: tianfuzeng/svtyper

def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug, alignment_outpath,
                ref_fasta, sum_quals, max_reads, max_ci_dist):

    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b,
                                    mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n' %
                b)
            exit(1)

    min_lib_prevalence = 1e-3  # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        lib_info_file = open(lib_info_path, 'r')
        lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = list()
    for i in xrange(len(bam_list)):
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' %
                         bam_list[i].filename)
            sample = Sample.from_bam(bam_list[i], num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam_list[i], lib_info,
                                          min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        lib_info_file = open(lib_info_path, 'w')
        write_sample_json(sample_list, lib_info_file)
        lib_info_file.close()
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3  # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {
    }  # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_custom_svtyper_headers()

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # genotype generic breakends
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n' %
                (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' %
                (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95',
                                          max_ci_dist)

                # infer the strands from the alt allele
                if var.alt[-1] == '[' or var.alt[-1] == ']':
                    o1_is_reverse = False
                else:
                    o1_is_reverse = True
                if var2.alt[-1] == '[' or var2.alt[-1] == ']':
                    o2_is_reverse = False
                else:
                    o2_is_reverse = True

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse: posA += 1
        if o2_is_reverse: posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # ref_ciA = ciA
            # ref_ciB = ciB
            ref_ciA = [0, 0]
            ref_ciB = [0, 0]

            for query_name in sorted(read_batch.keys()):
                fragment = read_batch[query_name]
                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(read, var, chromA, posA,
                                                       ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(read, var, chromB, posB,
                                                       ciB, min_aligned)
                    if (is_ref_seq_A or is_ref_seq_B):
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference

                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:

                    split_lr = split.is_split_straddle(chromA, posA, ciA,
                                                       chromB, posB, ciB,
                                                       o1_is_reverse,
                                                       o2_is_reverse, svtype,
                                                       split_slop)
                    # p_alt = prob_mapq(split.query_left) * prob_mapq(split.query_right)
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] +
                             prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt

                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # tally spanning alternate pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB, o1_is_reverse,
                        o2_is_reverse, min_aligned, fragment.lib)

                # check both sides if inversion (perhaps should do this for BND as well?)
                if svtype in ('INV'):
                    alt_straddle_reciprocal = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        not o1_is_reverse, not o2_is_reverse, min_aligned,
                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = (1 - p_conc) * prob_mapq(
                                fragment.readA) * prob_mapq(fragment.readB)
                            alt_span += p_alt

                            # # since an alt straddler is by definition also a reference straddler,
                            # # we can bail out early here to save some time
                            # p_reference = p_conc * prob_mapq(fragment.readA) * prob_mapq(fragment.readB)
                            # ref_span += p_reference
                            # continue

                            fragment.tag_span(p_alt)
                            write_fragment = True

                    else:
                        p_alt = prob_mapq(fragment.readA) * prob_mapq(
                            fragment.readB)
                        alt_span += p_alt

                        fragment.tag_span(p_alt)
                        write_fragment = True

                # # tally spanning reference pairs
                if svtype == 'DEL' and posB - posA < 2 * fragment.lib.sd:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(
                        chromA, posA, ref_ciA, chromA, posA, ref_ciA, False,
                        True, min_aligned, fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(
                        chromB, posB, ref_ciB, chromB, posB, ref_ciB, False,
                        True, min_aligned, fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant, except for
                    # length-changing SVs like deletions
                    if not (ref_straddle_A
                            and ref_straddle_B) or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = p_conc * prob_mapq(
                                fragment.readA) * prob_mapq(fragment.readB)
                            ref_span += (ref_straddle_A +
                                         ref_straddle_B) * p_reference / 2

                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and write_fragment:
                    for read in fragment.primary_reads + [
                            split.read for split in fragment.split_reads
                    ]:
                        out_bam_written_reads = write_alignment(
                            read, out_bam, out_bam_written_reads)

            if debug:
                print '--------------------------'
                print 'ref_span:', ref_span
                print 'alt_span:', alt_span
                print 'ref_seq:', ref_seq
                print 'alt_seq:', alt_seq
                print 'alt_clip:', alt_clip

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                if var.info['SVTYPE'] == "DUP": is_dup = True
                else: is_dup = False

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(
                    disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                best, second_best = sorted([(i, e)
                                            for i, e in enumerate(gt_lplist)],
                                           key=lambda (x): x[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print gt_lplist

                # set the overall variant QUAL score and sample specific fields
                var.genotype(sample.name).set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format('RO',
                                                     int(ref_seq + ref_span))
                var.genotype(sample.name).set_format(
                    'AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # if detailed:
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format(
                        'AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    var.genotype(sample.name).set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(
                        -10 * (gt_lplist[0] - gt_sum_log)
                    )  # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        var.genotype(sample.name).set_format('GT', '0/1')
                    elif gt_idx == 2:
                        var.genotype(sample.name).set_format('GT', '1/1')
                    elif gt_idx == 0:
                        var.genotype(sample.name).set_format('GT', '0/0')
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                var.genotype(sample.name).set_format('GT', './.')
                var.qual = 0
                var.genotype(sample.name).set_format('GQ', '.')
                var.genotype(sample.name).set_format('SQ', '.')
                var.genotype(sample.name).set_format('GL', '.')
                var.genotype(sample.name).set_format('DP', 0)
                var.genotype(sample.name).set_format('AO', 0)
                var.genotype(sample.name).set_format('RO', 0)
                # if detailed:
                var.genotype(sample.name).set_format('AS', 0)
                var.genotype(sample.name).set_format('ASC', 0)
                var.genotype(sample.name).set_format('RS', 0)
                var.genotype(sample.name).set_format('AP', 0)
                var.genotype(sample.name).set_format('RP', 0)
                var.genotype(sample.name).set_format('QR', 0)
                var.genotype(sample.name).set_format('QA', 0)
                var.genotype(sample.name).set_format('AB', '.')

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning(
            'Unpaired breakends found in file. These will not be present in output.'
        )

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()

    return

Example #5

Show file

def genotype_serial(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                    sum_quals, split_weight, disc_weight, max_reads,
                    max_ci_dist, debug):
    """Genotype each record of ``src_vcf`` for one sample, one at a time.

    The header is written to ``out_vcf`` first. Every record is then parsed
    into a ``Variant``, genotyped via ``serial_calculate_genotype`` and
    written to ``out_vcf``. Records without an SVTYPE, or with an
    unsupported SVTYPE, are logged and written through unchanged. Records
    for which no breakpoints are found are logged and NOT written.

    Args:
        src_vcf: source VCF object; must provide ``write_header``,
            ``filename`` and ``get_variant_breakpoints``.
        out_vcf: output handle passed to ``write_header`` / ``Variant.write``.
        sample: sample object; ``rg_to_lib``, ``bam``, ``active_libs`` and
            ``name`` are read from it.
        z: forwarded to ``get_breakpoint_regions``.
        split_slop, min_aligned, split_weight, disc_weight, max_reads, debug:
            forwarded unchanged to ``serial_calculate_genotype``.
        sum_quals: when falsy, each variant's ``qual`` is reset to 0 before
            genotyping.
        max_ci_dist: forwarded to ``src_vcf.get_variant_breakpoints``.

    Returns:
        None. Results are emitted through ``out_vcf``.
    """
    # initializations
    bnd_cache = {}  # first-seen BND mates, keyed by their var_id
    src_vcf.write_header(out_vcf)
    # Only the count is needed (for the progress log), so stream the records
    # instead of materializing every line of the VCF in a throwaway list.
    total_variants = sum(1 for _ in vcf_variants(src_vcf.filename))

    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    for i, vline in enumerate(vcf_variants(src_vcf.filename)):
        v = vline.rstrip().split('\t')
        variant = Variant(v, src_vcf)
        # progress report every 1000 records
        if i % 1000 == 0:
            logit("[ {} | {} ] Processing variant {}".format(
                i, total_variants, variant.var_id))
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            # pass the record through untouched
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            # pass the record through untouched
            variant.write(out_vcf)
            continue

        # NOTE(review): breakpoints are requested for the record as read,
        # i.e. before the BND swap below -- presumably
        # get_variant_breakpoints does its own mate pairing; confirm.
        breakpoints = src_vcf.get_variant_breakpoints(variant, max_ci_dist)

        # special BND processing: the first mate of a pair is parked in
        # bnd_cache; when its partner shows up, the cached mate becomes the
        # variant that is genotyped and the current record is kept as
        # variant2 so it can receive a copy of the result afterwards.
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                bnd_cache[variant.var_id] = variant
                continue

        if breakpoints is None:
            msg = ("Found no breakpoints for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            continue

        result = serial_calculate_genotype(
            sample.bam, get_breakpoint_regions(breakpoints, sample,
                                               z), sample.rg_to_lib,
            sample.active_libs, sample.name, split_slop, min_aligned,
            split_weight, disc_weight, breakpoints, max_reads, debug)

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing: mirror the genotyped mate's quality,
        # formats and genotype onto the second mate and emit it as well
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)

Example #6

Show file

File: singlesample.py Project: hall-lab/svtyper

def genotype_serial(src_vcf, out_vcf, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug):
    """Serially genotype all records of ``src_vcf`` for a single sample.

    Writes the VCF header to ``out_vcf``, then walks the records of
    ``src_vcf.filename`` in order. Each record becomes a ``Variant`` that is
    genotyped with ``serial_calculate_genotype`` and written to ``out_vcf``.
    Records lacking an SVTYPE or carrying an unsupported one are logged and
    written unchanged; records with no breakpoints are logged and dropped
    (nothing is written for them).

    Args:
        src_vcf: input VCF object (uses ``write_header``, ``filename``,
            ``get_variant_breakpoints``).
        out_vcf: destination passed to ``write_header`` and ``Variant.write``.
        sample: sample object (uses ``rg_to_lib``, ``bam``, ``active_libs``,
            ``name``).
        z: passed through to ``get_breakpoint_regions``.
        split_slop, min_aligned, split_weight, disc_weight, max_reads, debug:
            passed through to ``serial_calculate_genotype``.
        sum_quals: if falsy, ``variant.qual`` is zeroed before genotyping.
        max_ci_dist: passed through to ``src_vcf.get_variant_breakpoints``.

    Returns:
        None.
    """
    # initializations
    bnd_cache = {}  # unmatched BND mates waiting for their partner, by var_id
    src_vcf.write_header(out_vcf)
    # The total is used only in the progress message; count while streaming
    # rather than building a list that holds every record of the VCF at once.
    total_variants = sum(1 for _ in vcf_variants(src_vcf.filename))

    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    for i, vline in enumerate(vcf_variants(src_vcf.filename)):
        v = vline.rstrip().split('\t')
        variant = Variant(v, src_vcf)
        # log progress once per 1000 records
        if i % 1000 == 0:
            logit("[ {} | {} ] Processing variant {}".format(i, total_variants, variant.var_id))
        if not sum_quals:
            variant.qual = 0

        if not variant.has_svtype():
            msg = ('Warning: SVTYPE missing '
                   'at variant %s. '
                   'Skipping.\n') % (variant.var_id)
            logit(msg)
            # emit the record as-is
            variant.write(out_vcf)
            continue

        if not variant.is_valid_svtype():
            msg = ('Warning: Unsupported SVTYPE '
                   'at variant %s (%s). '
                   'Skipping.\n') % (variant.var_id, variant.get_svtype())
            logit(msg)
            # emit the record as-is
            variant.write(out_vcf)
            continue

        # NOTE(review): this lookup uses the record as read, before the BND
        # swap below -- presumably get_variant_breakpoints pairs mates
        # internally; verify against its implementation.
        breakpoints = src_vcf.get_variant_breakpoints(variant, max_ci_dist)

        # special BND processing: hold the first mate of each pair in
        # bnd_cache; once the second mate arrives, genotype the cached mate
        # and remember the current record as variant2 for the copy-back below.
        if variant.get_svtype() == 'BND':
            if variant.info['MATEID'] in bnd_cache:
                variant2 = variant
                variant = bnd_cache[variant.info['MATEID']]
                del bnd_cache[variant.var_id]
            else:
                bnd_cache[variant.var_id] = variant
                continue

        if breakpoints is None:
            msg = ("Found no breakpoints for variant "
                   "'{}' ({})").format(variant.var_id, variant.get_svtype())
            logit(msg)
            continue

        result = serial_calculate_genotype(
                sample.bam,
                get_breakpoint_regions(breakpoints, sample, z),
                sample.rg_to_lib,
                sample.active_libs,
                sample.name,
                split_slop,
                min_aligned,
                split_weight,
                disc_weight,
                breakpoints,
                max_reads,
                debug
        )

        variant = assign_genotype_to_variant(variant, sample, result)
        variant.write(out_vcf)

        # special BND processing: the second mate inherits the first mate's
        # qual, active formats and genotype, then is written too
        if variant.get_svtype() == 'BND':
            variant2.qual = variant.qual
            variant2.active_formats = variant.active_formats
            variant2.genotype = variant.genotype
            variant2.write(out_vcf)

Example #7

Show file

File: classic.py Project: color/svtyper

def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug, alignment_outpath,
                ref_fasta, sum_quals, max_reads, max_ci_dist):

    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b,
                                    mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n' %
                b)
            exit(1)

    min_lib_prevalence = 1e-3  # only consider libraries that constitute at least this fraction of the BAM

    # parse lib_info_path JSON
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        lib_info_file = open(lib_info_path, 'r')
        lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or empirically from the BAMs
    sample_list = list()
    for i in xrange(len(bam_list)):
        if lib_info is None:
            logging.info('Calculating library metrics from %s...' %
                         bam_list[i].filename)
            sample = Sample.from_bam(bam_list[i], num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...' % lib_info_path)
            sample = Sample.from_lib_info(bam_list[i], lib_info,
                                          min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

        supporting_reads_file = alignment_outpath.replace('bam', 'tsv')
        supporting_reads_fh = open(supporting_reads_file, 'w')
        supporting_reads_fh.write(
            'chromA,chromB,posA,posB,svtype,read_id,read_chrom,read_reference_start,read_reference_end,start_ref_support,start_ref_pe_support,start_alt_sr_support,start_alt_pe_support,end_ref_support,end_ref_pe_support,end_alt_sr_support,end_alt_pe_support\n'
        )
    else:
        out_bam = None
        supporting_reads_fh = None
    out_bam_written_reads = set()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...' % lib_info_path)
        lib_info_file = open(lib_info_path, 'w')
        write_sample_json(sample_list, lib_info_file)
        lib_info_file.close()
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    in_header = True
    header = []
    breakend_dict = {
    }  # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_custom_svtyper_headers()

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        # genotype generic breakends
        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n' %
                (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n' %
                (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            if var.info['MATEID'] in breakend_dict:
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
                ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95',
                                          max_ci_dist)

                # infer the strands from the alt allele
                if var.alt[-1] == '[' or var.alt[-1] == ']':
                    o1_is_reverse = False
                else:
                    o1_is_reverse = True
                if var2.alt[-1] == '[' or var2.alt[-1] == ']':
                    o2_is_reverse = False
                else:
                    o2_is_reverse = True

                # remove the BND from the breakend_dict
                # to free up memory
                del breakend_dict[var.var_id]
            else:
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)
            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1_is_reverse: posA += 1
        if o2_is_reverse: posB += 1

        for sample in sample_list:
            # grab reads for start and end of breakpoints
            start_read_batch = {}
            end_read_batch = {}
            start_read_batch, many = gather_reads(sample, chromA, posA, ciA, z,
                                                  start_read_batch, max_reads)
            end_read_batch, many = gather_reads(sample, chromB, posB, ciB, z,
                                                end_read_batch, max_reads)
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            metrics = {'start': {}, 'end': {}}
            # run metrics for each breakpoint separately
            metrics['start'] = calculate_metrics(
                start_read_batch, 'start', var, chromA, chromB, posA, posB,
                ciA, ciB, min_aligned, o1_is_reverse, o2_is_reverse,
                var_length, svtype, out_bam_written_reads, supporting_reads_fh,
                out_bam)
            metrics['end'] = calculate_metrics(end_read_batch, 'end', var,
                                               chromA, chromB, posA, posB, ciA,
                                               ciB, min_aligned, o1_is_reverse,
                                               o2_is_reverse, var_length,
                                               svtype, out_bam_written_reads,
                                               supporting_reads_fh, out_bam)

            # run metrics for both sides of breakpoints
            # no bam file and supporting reads files is written here since they are already written in running the method for each breakpoint separately
            metrics['both'] = calculate_metrics(
                read_batch, 'both', var, chromA, chromB, posA, posB, ciA, ciB,
                min_aligned, o1_is_reverse, o2_is_reverse, var_length, svtype,
                out_bam_written_reads, None, None)

            # set these for metrics from both sides of breakpoints
            ref_span = metrics['both']['ref_span']
            alt_span = metrics['both']['alt_span']
            ref_seq = metrics['both']['ref_seq']
            alt_seq = metrics['both']['alt_seq']
            alt_clip = metrics['both']['alt_clip']

            if debug:
                print '--------------------------'
                print 'ref_span:', ref_span
                print 'alt_span:', alt_span
                print 'ref_seq:', ref_seq
                print 'alt_seq:', alt_seq
                print 'alt_clip:', alt_clip

            # in the absence of evidence for a particular type, ignore the reference
            # support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                if var.info['SVTYPE'] == "DUP":
                    is_dup = True
                else:
                    is_dup = False

                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = int(split_weight * alt_splitters) + int(
                    disc_weight * alt_span)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                best, second_best = sorted([(i, e)
                                            for i, e in enumerate(gt_lplist)],
                                           key=lambda (x): x[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print gt_lplist

                # set the overall variant QUAL score and sample specific fields
                var.genotype(sample.name).set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format('RO',
                                                     int(ref_seq + ref_span))
                var.genotype(sample.name).set_format(
                    'AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # if detailed:
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format(
                        'AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    var.genotype(sample.name).set_format('AB', '.')
                var.genotype(sample.name).set_format(
                    'SRC', int(metrics['start']['ref_count']))
                var.genotype(sample.name).set_format(
                    'SRPC', int(metrics['start']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'SASC', int(metrics['start']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'SAPC', int(metrics['start']['alt_pe_count']))
                var.genotype(sample.name).set_format(
                    'ERC', int(metrics['end']['ref_count']))
                var.genotype(sample.name).set_format(
                    'ERPC', int(metrics['end']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'EASC', int(metrics['end']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'EAPC', int(metrics['end']['alt_pe_count']))
                var.genotype(sample.name).set_format(
                    'BRC', int(metrics['both']['ref_count']))
                var.genotype(sample.name).set_format(
                    'BRPC', int(metrics['both']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'BASC', int(metrics['both']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'BAPC', int(metrics['both']['alt_pe_count']))
                var.genotype(sample.name).set_format(
                    'ISM', sample.get_mean_insert_size())
                var.genotype(sample.name).set_format(
                    'ISSD', sample.get_stddev_insert_size())

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(
                        -10 * (gt_lplist[0] - gt_sum_log)
                    )  # phred-scaled probability site is non-reference in this sample
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        var.genotype(sample.name).set_format('GT', '0/1')
                    elif gt_idx == 2:
                        var.genotype(sample.name).set_format('GT', '1/1')
                    elif gt_idx == 0:
                        var.genotype(sample.name).set_format('GT', '0/0')
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                var.genotype(sample.name).set_format('GT', './.')
                var.qual = 0
                var.genotype(sample.name).set_format('GQ', '.')
                var.genotype(sample.name).set_format('SQ', '.')
                var.genotype(sample.name).set_format('GL', '.')
                var.genotype(sample.name).set_format('DP', 0)
                var.genotype(sample.name).set_format('AO', 0)
                var.genotype(sample.name).set_format('RO', 0)
                # if detailed:
                var.genotype(sample.name).set_format('AS', 0)
                var.genotype(sample.name).set_format('ASC', 0)
                var.genotype(sample.name).set_format('RS', 0)
                var.genotype(sample.name).set_format('AP', 0)
                var.genotype(sample.name).set_format('RP', 0)
                var.genotype(sample.name).set_format('QR', 0)
                var.genotype(sample.name).set_format('QA', 0)
                var.genotype(sample.name).set_format('AB', '.')
                var.genotype(sample.name).set_format(
                    'SRC', int(metrics['start']['ref_count']))
                var.genotype(sample.name).set_format(
                    'SRPC', int(metrics['start']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'SASC', int(metrics['start']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'SAPC', int(metrics['start']['alt_pe_count']))
                var.genotype(sample.name).set_format(
                    'ERC', int(metrics['end']['ref_count']))
                var.genotype(sample.name).set_format(
                    'ERPC', int(metrics['end']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'EASC', int(metrics['end']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'EAPC', int(metrics['end']['alt_pe_count']))
                var.genotype(sample.name).set_format(
                    'BRC', int(metrics['both']['ref_count']))
                var.genotype(sample.name).set_format(
                    'BRPC', int(metrics['both']['ref_pe_count']))
                var.genotype(sample.name).set_format(
                    'BASC', int(metrics['both']['alt_clip_count']))
                var.genotype(sample.name).set_format(
                    'BAPC', int(metrics['both']['alt_pe_count']))

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning(
            'Unpaired breakends found in file. These will not be present in output.'
        )

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()
        supporting_reads_fh.close()

    return