def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug,
                alignment_outpath, ref_fasta, sum_quals, max_reads,
                max_ci_dist):
    """Genotype the supported structural variants of a VCF in every sample.

    Each alignment file becomes one sample.  For every BND, DEL, DUP or INV
    record the split-read and read-pair evidence for the reference and the
    alternate allele is tallied per sample, turned into genotype
    likelihoods with ``bayes_gt`` and written to ``vcf_out`` as FORMAT
    fields.  Records with a missing or unsupported SVTYPE are echoed
    unchanged.  The two records of a BND pair are genotyped together once
    the second mate has been read.

    Args:
        bam_string: Comma-separated alignment paths; each must end in
            ``.bam`` or ``.cram``.
        vcf_in: Iterable of VCF text lines that also offers ``close()``.
            When ``None`` only the library metrics are processed.
        vcf_out: Object offering ``write()`` and ``close()`` that receives
            the genotyped VCF.
        min_aligned: Forwarded to the sample depth setup and to the
            fragment evidence checks.
        split_weight: Multiplier applied to split-read evidence when the
            QR/QA values are computed.
        disc_weight: Multiplier applied to spanning read-pair evidence when
            the QR/QA values are computed.
        num_samp: Forwarded to ``Sample.from_bam``.
        lib_info_path: Path of the library metrics JSON.  It is loaded if
            the file exists and written otherwise; ``None`` disables both.
        debug: When true, print the evidence tallies and the genotype
            log-probabilities of every sample.
        alignment_outpath: Path of a BAM file that receives the reads which
            contributed evidence; ``None`` disables the dump.
        ref_fasta: Reference file name handed to pysam for CRAM input.
        sum_quals: When false, QUAL of each variant is reset to 0 before
            the per-sample qualities are added to it.
        max_reads: Forwarded to ``gather_all_reads``; a sample for which it
            reports too many reads gets the genotype ``./.``.
        max_ci_dist: Forwarded to ``confidence_interval``.

    Returns:
        None.

    Raises:
        SystemExit: With code 1 when an entry of ``bam_string`` is neither
            a ``.bam`` nor a ``.cram`` path.
    """
    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b, mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n'
                % b)
            # sys.exit instead of the site-provided exit() helper, which is
            # not guaranteed to exist (e.g. under ``python -S``)
            sys.exit(1)

    # only consider libraries that constitute at least this fraction of
    # the BAM
    min_lib_prevalence = 1e-3

    # parse lib_info_path JSON; the context manager closes the handle,
    # which used to be leaked
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        with open(lib_info_path, 'r') as lib_info_file:
            lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or
    # empirically from the BAMs
    sample_list = []
    for bam in bam_list:
        if lib_info is None:
            logging.info('Calculating library metrics from %s...',
                         bam.filename)
            sample = Sample.from_bam(bam, num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...', lib_info_path)
            sample = Sample.from_lib_info(bam, lib_info, min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping, borrowing the
        # header of the first input file
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...', lib_info_path)
        with open(lib_info_path, 'w') as lib_info_file:
            write_sample_json(sample_list, lib_info_file)
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3  # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {}  # cache of unmatched generic breakends
    vcf = Vcf()

    gt_strings = {0: '0/0', 1: '0/1', 2: '1/1'}
    # FORMAT values of a sample without any evidence, in output order
    no_evidence_formats = (
        ('GQ', '.'), ('SQ', '.'), ('GL', '.'),
        ('DP', 0), ('AO', 0), ('RO', 0),
        ('AS', 0), ('ASC', 0), ('RS', 0), ('AP', 0), ('RP', 0),
        ('QR', 0), ('QA', 0), ('AB', '.'),
    )

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line.startswith('#'):
                header.append(line)
                continue

            # first record: the header is complete and can be emitted
            in_header = False
            vcf.add_header(header)
            vcf.add_custom_svtyper_headers()

            # add the samples in the BAM files to the VCF output
            for sample in sample_list:
                if sample.name not in vcf.sample_list:
                    vcf.add_sample(sample.name)

            # write the output header
            vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n'
                % (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n'
                % (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            # genotype generic breakends once both mates have been seen
            if var.info['MATEID'] not in breakend_dict:
                breakend_dict[var.var_id] = var
                continue

            var2 = var
            var = breakend_dict[var.info['MATEID']]
            chromA = var.chrom
            chromB = var2.chrom
            posA = var.pos
            posB = var2.pos

            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95', max_ci_dist)

            # infer the strands from the alt allele: an allele ending in a
            # bracket is forward, anything else is reverse
            o1_is_reverse = var.alt[-1] not in ('[', ']')
            o2_is_reverse = var2.alt[-1] not in ('[', ']')

            # remove the BND from the breakend_dict to free up memory
            del breakend_dict[var.var_id]
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))

            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)

            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should
        # be the base immediately left of the breakpoint junction)
        if o1_is_reverse:
            posA += 1
        if o2_is_reverse:
            posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # reference straddlers are evaluated with a zero-width
            # confidence interval rather than with ciA / ciB
            ref_ciA = [0, 0]
            ref_ciB = [0, 0]

            for query_name in sorted(read_batch):
                fragment = read_batch[query_name]

                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(
                        read, var, chromA, posA, ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(
                        read, var, chromB, posB, ciB, min_aligned)
                    if is_ref_seq_A or is_ref_seq_B:
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference
                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:
                    split_lr = split.is_split_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        o1_is_reverse, o2_is_reverse, svtype, split_slop)
                    # average of the two sides, each counted only if it
                    # straddles its breakpoint
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] +
                             prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt
                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # deletions shorter than two insert-size standard deviations
                # get no read-pair evidence at all
                small_deletion = (svtype == 'DEL' and
                                  posB - posA < 2 * fragment.lib.sd)

                # tally spanning alternate pairs
                if small_deletion:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        o1_is_reverse, o2_is_reverse, min_aligned,
                        fragment.lib)

                # check both sides if inversion (perhaps should do this for
                # BND as well?).  The original ``svtype in ('INV')`` was a
                # substring test against a plain string.
                if svtype == 'INV':
                    alt_straddle_reciprocal = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        not o1_is_reverse, not o2_is_reverse, min_aligned,
                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = ((1 - p_conc) *
                                     prob_mapq(fragment.readA) *
                                     prob_mapq(fragment.readB))
                            alt_span += p_alt
                            # tagging stays inside the guard so that p_alt
                            # is always the value computed for this pair
                            fragment.tag_span(p_alt)
                            write_fragment = True
                    else:
                        p_alt = (prob_mapq(fragment.readA) *
                                 prob_mapq(fragment.readB))
                        alt_span += p_alt
                        fragment.tag_span(p_alt)
                        write_fragment = True

                # tally spanning reference pairs
                if small_deletion:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(
                        chromA, posA, ref_ciA, chromA, posA, ref_ciA,
                        False, True, min_aligned, fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(
                        chromB, posB, ref_ciB, chromB, posB, ref_ciB,
                        False, True, min_aligned, fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant,
                    # except for length-changing SVs like deletions
                    if not (ref_straddle_A and ref_straddle_B) \
                            or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = (p_conc *
                                           prob_mapq(fragment.readA) *
                                           prob_mapq(fragment.readB))
                            # a pair spanning one breakpoint counts half,
                            # a pair spanning both counts fully
                            ref_span += ((ref_straddle_A + ref_straddle_B) *
                                         p_reference / 2)
                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and write_fragment:
                    split_alignments = [s.read for s in fragment.split_reads]
                    for read in fragment.primary_reads + split_alignments:
                        out_bam_written_reads = write_alignment(
                            read, out_bam, out_bam_written_reads)

            if debug:
                # single-argument print calls give the same output whether
                # print is a statement or a function
                print('--------------------------')
                print('ref_span: %s' % (ref_span,))
                print('alt_span: %s' % (alt_span,))
                print('ref_seq: %s' % (ref_seq,))
                print('alt_seq: %s' % (alt_seq,))
                print('alt_clip: %s' % (alt_clip,))

            # in the absence of evidence for a particular type, ignore the
            # reference support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                is_dup = var.info['SVTYPE'] == "DUP"
                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = (int(split_weight * alt_splitters) +
                      int(disc_weight * alt_span))
                gt_lplist = bayes_gt(QR, QA, is_dup)
                # (index, log-probability) pairs, most likely first
                best, second_best = sorted(enumerate(gt_lplist),
                                           key=lambda pair: pair[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print(gt_lplist)

                # set the overall variant QUAL score and sample specific
                # fields
                var.genotype(sample.name).set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format(
                    'RO', int(ref_seq + ref_span))
                var.genotype(sample.name).set_format(
                    'AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # detailed per-evidence-type counts
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format(
                        'AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    # both weighted counts were truncated to zero
                    var.genotype(sample.name).set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        # a term too large to represent is left out
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    # phred-scaled probability site is non-reference in
                    # this sample
                    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log))
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx in gt_strings:
                        var.genotype(sample.name).set_format(
                            'GT', gt_strings[gt_idx])
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                # no evidence at all for this sample
                var.genotype(sample.name).set_format('GT', './.')
                # NOTE(review): this also discards the quality accumulated
                # from earlier samples of the same variant - confirm that
                # this is intended.
                var.qual = 0
                for key, value in no_evidence_formats:
                    var.genotype(sample.name).set_format(key, value)

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            # the mate record shares the genotypes of the first breakend
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning('Unpaired breakends found in file. '
                        'These will not be present in output.')

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()

    return
def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight,
                disc_weight, num_samp, lib_info_path, debug,
                alignment_outpath, ref_fasta, sum_quals, max_reads,
                max_ci_dist):
    """Genotype the supported structural variants of a VCF in every sample.

    Each alignment file becomes one sample.  For every BND, DEL, DUP or INV
    record the split-read and read-pair evidence for the reference and the
    alternate allele is tallied per sample, turned into genotype
    likelihoods with ``bayes_gt`` and written to ``vcf_out`` as FORMAT
    fields.  Records with a missing or unsupported SVTYPE are echoed
    unchanged.  The two records of a BND pair are genotyped together once
    the second mate has been read.

    Args:
        bam_string: Comma-separated alignment paths; each must end in
            ``.bam`` or ``.cram``.
        vcf_in: Iterable of VCF text lines that also offers ``close()``.
            When ``None`` only the library metrics are processed.
        vcf_out: Object offering ``write()`` and ``close()`` that receives
            the genotyped VCF.
        min_aligned: Forwarded to the sample depth setup and to the
            fragment evidence checks.
        split_weight: Multiplier applied to split-read evidence when the
            QR/QA values are computed.
        disc_weight: Multiplier applied to spanning read-pair evidence when
            the QR/QA values are computed.
        num_samp: Forwarded to ``Sample.from_bam``.
        lib_info_path: Path of the library metrics JSON.  It is loaded if
            the file exists and written otherwise; ``None`` disables both.
        debug: When true, print the evidence tallies and the genotype
            log-probabilities of every sample.
        alignment_outpath: Path of a BAM file that receives the reads which
            contributed evidence; ``None`` disables the dump.
        ref_fasta: Reference file name handed to pysam for CRAM input.
        sum_quals: When false, QUAL of each variant is reset to 0 before
            the per-sample qualities are added to it.
        max_reads: Forwarded to ``gather_all_reads``; a sample for which it
            reports too many reads gets the genotype ``./.``.
        max_ci_dist: Forwarded to ``confidence_interval``.

    Returns:
        None.

    Raises:
        SystemExit: With code 1 when an entry of ``bam_string`` is neither
            a ``.bam`` nor a ``.cram`` path.
    """
    # parse the comma separated inputs
    bam_list = []
    for b in bam_string.split(','):
        if b.endswith('.bam'):
            bam_list.append(pysam.AlignmentFile(b, mode='rb'))
        elif b.endswith('.cram'):
            bam_list.append(
                pysam.AlignmentFile(b, mode='rc',
                                    reference_filename=ref_fasta,
                                    format_options=["required_fields=7167"]))
        else:
            sys.stderr.write(
                'Error: %s is not a valid alignment file (*.bam or *.cram)\n'
                % b)
            # sys.exit instead of the site-provided exit() helper, which is
            # not guaranteed to exist (e.g. under ``python -S``)
            sys.exit(1)

    # only consider libraries that constitute at least this fraction of
    # the BAM
    min_lib_prevalence = 1e-3

    # parse lib_info_path JSON; the context manager closes the handle,
    # which used to be leaked
    lib_info = None
    if lib_info_path is not None and os.path.isfile(lib_info_path):
        with open(lib_info_path, 'r') as lib_info_file:
            lib_info = json.load(lib_info_file)

    if vcf_in is None:
        sys.stderr.write('Warning: VCF not found.\n')

    # build the sample libraries, either from the lib_info JSON or
    # empirically from the BAMs
    sample_list = []
    for bam in bam_list:
        if lib_info is None:
            logging.info('Calculating library metrics from %s...',
                         bam.filename)
            sample = Sample.from_bam(bam, num_samp, min_lib_prevalence)
        else:
            logging.info('Reading library metrics from %s...', lib_info_path)
            sample = Sample.from_lib_info(bam, lib_info, min_lib_prevalence)

        sample.set_exp_seq_depth(min_aligned)
        sample.set_exp_spanning_depth(min_aligned)
        sample_list.append(sample)
    logging.info('done')

    # diagnostic dump of relevant BAM reads
    if alignment_outpath is not None:
        # create a BAM file of the reads used for genotyping, borrowing the
        # header of the first input file
        out_bam_written_reads = set()
        template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb')
        out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam)
        template_bam.close()

    # write the JSON for each sample's libraries
    if lib_info_path is not None and not os.path.isfile(lib_info_path):
        logging.info('Writing library metrics to %s...', lib_info_path)
        with open(lib_info_path, 'w') as lib_info_file:
            write_sample_json(sample_list, lib_info_file)
        logging.info('done')

    # quit early if VCF absent
    if vcf_in is None:
        if alignment_outpath is not None:
            out_bam.close()
        return

    # set variables for genotyping
    z = 3
    split_slop = 3  # amount of slop around breakpoint to count splitters
    in_header = True
    header = []
    breakend_dict = {}  # cache of unmatched generic breakends
    vcf = Vcf()

    gt_strings = {0: '0/0', 1: '0/1', 2: '1/1'}
    # FORMAT values of a sample without any evidence, in output order
    no_evidence_formats = (
        ('GQ', '.'), ('SQ', '.'), ('GL', '.'),
        ('DP', 0), ('AO', 0), ('RO', 0),
        ('AS', 0), ('ASC', 0), ('RS', 0), ('AP', 0), ('RP', 0),
        ('QR', 0), ('QA', 0), ('AB', '.'),
    )

    # read input VCF
    for line in vcf_in:
        if in_header:
            if line.startswith('#'):
                header.append(line)
                continue

            # first record: the header is complete and can be emitted
            in_header = False
            vcf.add_header(header)
            vcf.add_custom_svtyper_headers()

            # add the samples in the BAM files to the VCF output
            for sample in sample_list:
                if sample.name not in vcf.sample_list:
                    vcf.add_sample(sample.name)

            # write the output header
            vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)
        var_length = None  # var_length should be None except for deletions
        if not sum_quals:
            var.qual = 0

        try:
            svtype = var.get_info('SVTYPE')
        except KeyError:
            sys.stderr.write(
                'Warning: SVTYPE missing at variant %s. Skipping.\n'
                % (var.var_id))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # print original line if unsupported svtype
        if svtype not in ('BND', 'DEL', 'DUP', 'INV'):
            sys.stderr.write(
                'Warning: Unsupported SVTYPE at variant %s (%s). Skipping.\n'
                % (var.var_id, svtype))
            vcf_out.write(var.get_var_string() + '\n')
            continue

        if svtype == 'BND':
            # genotype generic breakends once both mates have been seen
            if var.info['MATEID'] not in breakend_dict:
                breakend_dict[var.var_id] = var
                continue

            var2 = var
            var = breakend_dict[var.info['MATEID']]
            chromA = var.chrom
            chromB = var2.chrom
            posA = var.pos
            posB = var2.pos

            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95', max_ci_dist)

            # infer the strands from the alt allele: an allele ending in a
            # bracket is forward, anything else is reverse
            o1_is_reverse = var.alt[-1] not in ('[', ']')
            o2_is_reverse = var2.alt[-1] not in ('[', ']')

            # remove the BND from the breakend_dict to free up memory
            del breakend_dict[var.var_id]
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))

            # confidence intervals
            ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist)
            ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist)

            if svtype == 'DEL':
                var_length = posB - posA
                o1_is_reverse, o2_is_reverse = False, True
            elif svtype == 'DUP':
                o1_is_reverse, o2_is_reverse = True, False
            elif svtype == 'INV':
                o1_is_reverse, o2_is_reverse = False, False

        # increment the negative strand values (note position in VCF should
        # be the base immediately left of the breakpoint junction)
        if o1_is_reverse:
            posA += 1
        if o2_is_reverse:
            posB += 1

        for sample in sample_list:
            # grab reads from both sides of breakpoint
            read_batch, many = gather_all_reads(sample, chromA, posA, ciA,
                                                chromB, posB, ciB, z,
                                                max_reads)
            if many:
                var.genotype(sample.name).set_format('GT', './.')
                continue

            # initialize counts to zero
            ref_span, alt_span = 0, 0
            ref_seq, alt_seq = 0, 0
            alt_clip = 0

            # reference straddlers are evaluated with a zero-width
            # confidence interval rather than with ciA / ciB
            ref_ciA = [0, 0]
            ref_ciB = [0, 0]

            for query_name in sorted(read_batch):
                fragment = read_batch[query_name]

                # boolean on whether to write the fragment
                write_fragment = False

                # -------------------------------------
                # Check for split-read evidence
                # -------------------------------------

                # get reference sequences
                for read in fragment.primary_reads:
                    is_ref_seq_A = fragment.is_ref_seq(
                        read, var, chromA, posA, ciA, min_aligned)
                    is_ref_seq_B = fragment.is_ref_seq(
                        read, var, chromB, posB, ciB, min_aligned)
                    if is_ref_seq_A or is_ref_seq_B:
                        p_reference = prob_mapq(read)
                        ref_seq += p_reference
                        read.set_tag('XV', 'R')
                        write_fragment = True

                # get non-reference split-read support
                for split in fragment.split_reads:
                    split_lr = split.is_split_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        o1_is_reverse, o2_is_reverse, svtype, split_slop)
                    # average of the two sides, each counted only if it
                    # straddles its breakpoint
                    p_alt = (prob_mapq(split.query_left) * split_lr[0] +
                             prob_mapq(split.query_right) * split_lr[1]) / 2.0
                    if split.is_soft_clip:
                        alt_clip += p_alt
                    else:
                        alt_seq += p_alt
                    if p_alt > 0:
                        split.tag_split(p_alt)
                        write_fragment = True

                # -------------------------------------
                # Check for paired-end evidence
                # -------------------------------------

                # deletions shorter than two insert-size standard deviations
                # get no read-pair evidence at all
                small_deletion = (svtype == 'DEL' and
                                  posB - posA < 2 * fragment.lib.sd)

                # tally spanning alternate pairs
                if small_deletion:
                    alt_straddle = False
                else:
                    alt_straddle = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        o1_is_reverse, o2_is_reverse, min_aligned,
                        fragment.lib)

                # check both sides if inversion (perhaps should do this for
                # BND as well?).  The original ``svtype in ('INV')`` was a
                # substring test against a plain string.
                if svtype == 'INV':
                    alt_straddle_reciprocal = fragment.is_pair_straddle(
                        chromA, posA, ciA, chromB, posB, ciB,
                        not o1_is_reverse, not o2_is_reverse, min_aligned,
                        fragment.lib)
                else:
                    alt_straddle_reciprocal = False

                if alt_straddle or alt_straddle_reciprocal:
                    if svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_alt = ((1 - p_conc) *
                                     prob_mapq(fragment.readA) *
                                     prob_mapq(fragment.readB))
                            alt_span += p_alt
                            # tagging stays inside the guard so that p_alt
                            # is always the value computed for this pair
                            fragment.tag_span(p_alt)
                            write_fragment = True
                    else:
                        p_alt = (prob_mapq(fragment.readA) *
                                 prob_mapq(fragment.readB))
                        alt_span += p_alt
                        fragment.tag_span(p_alt)
                        write_fragment = True

                # tally spanning reference pairs
                if small_deletion:
                    ref_straddle_A = False
                    ref_straddle_B = False
                else:
                    ref_straddle_A = fragment.is_pair_straddle(
                        chromA, posA, ref_ciA, chromA, posA, ref_ciA,
                        False, True, min_aligned, fragment.lib)
                    ref_straddle_B = fragment.is_pair_straddle(
                        chromB, posB, ref_ciB, chromB, posB, ref_ciB,
                        False, True, min_aligned, fragment.lib)

                if ref_straddle_A or ref_straddle_B:
                    # don't allow the pair to jump the entire variant,
                    # except for length-changing SVs like deletions
                    if not (ref_straddle_A and ref_straddle_B) \
                            or svtype == 'DEL':
                        p_conc = fragment.p_concordant(var_length)
                        if p_conc is not None:
                            p_reference = (p_conc *
                                           prob_mapq(fragment.readA) *
                                           prob_mapq(fragment.readB))
                            # a pair spanning one breakpoint counts half,
                            # a pair spanning both counts fully
                            ref_span += ((ref_straddle_A + ref_straddle_B) *
                                         p_reference / 2)
                            fragment.tag_span(1 - p_conc)
                            write_fragment = True

                # write to BAM if requested
                if alignment_outpath is not None and write_fragment:
                    split_alignments = [s.read for s in fragment.split_reads]
                    for read in fragment.primary_reads + split_alignments:
                        out_bam_written_reads = write_alignment(
                            read, out_bam, out_bam_written_reads)

            if debug:
                # single-argument print calls give the same output whether
                # print is a statement or a function
                print('--------------------------')
                print('ref_span: %s' % (ref_span,))
                print('alt_span: %s' % (alt_span,))
                print('ref_seq: %s' % (ref_seq,))
                print('alt_seq: %s' % (alt_seq,))
                print('alt_clip: %s' % (alt_clip,))

            # in the absence of evidence for a particular type, ignore the
            # reference support for that type as well
            if (alt_seq + alt_clip) < 0.5 and alt_span >= 1:
                alt_seq = 0
                alt_clip = 0
                ref_seq = 0
            if alt_span < 0.5 and (alt_seq + alt_clip) >= 1:
                alt_span = 0
                ref_span = 0

            if alt_span + alt_seq == 0 and alt_clip > 0:
                # discount any SV that's only supported by clips.
                alt_clip = 0

            if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0:
                # get bayesian classifier
                is_dup = var.info['SVTYPE'] == "DUP"
                alt_splitters = alt_seq + alt_clip
                QR = int(split_weight * ref_seq) + int(disc_weight * ref_span)
                QA = (int(split_weight * alt_splitters) +
                      int(disc_weight * alt_span))
                gt_lplist = bayes_gt(QR, QA, is_dup)
                # (index, log-probability) pairs, most likely first
                best, second_best = sorted(enumerate(gt_lplist),
                                           key=lambda pair: pair[1],
                                           reverse=True)[0:2]
                gt_idx = best[0]

                # print log probabilities of homref, het, homalt
                if debug:
                    print(gt_lplist)

                # set the overall variant QUAL score and sample specific
                # fields
                var.genotype(sample.name).set_format(
                    'GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format(
                    'DP',
                    int(ref_seq + alt_seq + alt_clip + ref_span + alt_span))
                var.genotype(sample.name).set_format(
                    'RO', int(ref_seq + ref_span))
                var.genotype(sample.name).set_format(
                    'AO', int(alt_seq + alt_clip + alt_span))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # detailed per-evidence-type counts
                var.genotype(sample.name).set_format('RS', int(ref_seq))
                var.genotype(sample.name).set_format('AS', int(alt_seq))
                var.genotype(sample.name).set_format('ASC', int(alt_clip))
                var.genotype(sample.name).set_format('RP', int(ref_span))
                var.genotype(sample.name).set_format('AP', int(alt_span))
                try:
                    var.genotype(sample.name).set_format(
                        'AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    # both weighted counts were truncated to zero
                    var.genotype(sample.name).set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        # a term too large to represent is left out
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    # phred-scaled probability site is non-reference in
                    # this sample
                    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log))
                    phred_gq = min(-10 * (second_best[1] - best[1]), 200)
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx in gt_strings:
                        var.genotype(sample.name).set_format(
                            'GT', gt_strings[gt_idx])
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                # no evidence at all for this sample
                var.genotype(sample.name).set_format('GT', './.')
                # NOTE(review): this also discards the quality accumulated
                # from earlier samples of the same variant - confirm that
                # this is intended.
                var.qual = 0
                for key, value in no_evidence_formats:
                    var.genotype(sample.name).set_format(key, value)

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            # the mate record shares the genotypes of the first breakend
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    # throw warning if we've lost unpaired breakends
    if breakend_dict:
        logging.warning('Unpaired breakends found in file. '
                        'These will not be present in output.')

    # close the files
    vcf_in.close()
    vcf_out.close()
    if alignment_outpath is not None:
        out_bam.close()

    return
def sv_genotype(bam_string, vcf_in, vcf_out, min_aligned, split_weight, disc_weight, num_samp, lib_info_path, debug, alignment_outpath, ref_fasta, sum_quals, max_reads, max_ci_dist): # parse the comma separated inputs bam_list = [] for b in bam_string.split(','): if b.endswith('.bam'): bam_list.append(pysam.AlignmentFile(b, mode='rb')) elif b.endswith('.cram'): bam_list.append( pysam.AlignmentFile(b, mode='rc', reference_filename=ref_fasta, format_options=["required_fields=7167"])) else: sys.stderr.write( 'Error: %s is not a valid alignment file (*.bam or *.cram)\n' % b) exit(1) min_lib_prevalence = 1e-3 # only consider libraries that constitute at least this fraction of the BAM # parse lib_info_path JSON lib_info = None if lib_info_path is not None and os.path.isfile(lib_info_path): lib_info_file = open(lib_info_path, 'r') lib_info = json.load(lib_info_file) if vcf_in is None: sys.stderr.write('Warning: VCF not found.\n') # build the sample libraries, either from the lib_info JSON or empirically from the BAMs sample_list = list() for i in xrange(len(bam_list)): if lib_info is None: logging.info('Calculating library metrics from %s...' % bam_list[i].filename) sample = Sample.from_bam(bam_list[i], num_samp, min_lib_prevalence) else: logging.info('Reading library metrics from %s...' 
% lib_info_path) sample = Sample.from_lib_info(bam_list[i], lib_info, min_lib_prevalence) sample.set_exp_seq_depth(min_aligned) sample.set_exp_spanning_depth(min_aligned) sample_list.append(sample) logging.info('done') # diagnostic dump of relevant BAM reads if alignment_outpath is not None: # create a BAM file of the reads used for genotyping template_bam = pysam.AlignmentFile(bam_string.split(',')[0], 'rb') out_bam = pysam.AlignmentFile(alignment_outpath, 'wb', template_bam) template_bam.close() supporting_reads_file = alignment_outpath.replace('bam', 'tsv') supporting_reads_fh = open(supporting_reads_file, 'w') supporting_reads_fh.write( 'chromA,chromB,posA,posB,svtype,read_id,read_chrom,read_reference_start,read_reference_end,start_ref_support,start_ref_pe_support,start_alt_sr_support,start_alt_pe_support,end_ref_support,end_ref_pe_support,end_alt_sr_support,end_alt_pe_support\n' ) else: out_bam = None supporting_reads_fh = None out_bam_written_reads = set() # write the JSON for each sample's libraries if lib_info_path is not None and not os.path.isfile(lib_info_path): logging.info('Writing library metrics to %s...' 
% lib_info_path) lib_info_file = open(lib_info_path, 'w') write_sample_json(sample_list, lib_info_file) lib_info_file.close() logging.info('done') # quit early if VCF absent if vcf_in is None: if alignment_outpath is not None: out_bam.close() return # set variables for genotyping z = 3 in_header = True header = [] breakend_dict = { } # cache to hold unmatched generic breakends for genotyping vcf = Vcf() # read input VCF for line in vcf_in: if in_header: if line[0] == '#': header.append(line) if line[1] != '#': vcf_samples = line.rstrip().split('\t')[9:] continue else: in_header = False vcf.add_header(header) # if detailed: vcf.add_custom_svtyper_headers() # add the samples in the BAM files to the VCF output for sample in sample_list: if sample.name not in vcf.sample_list: vcf.add_sample(sample.name) # write the output header vcf_out.write(vcf.get_header() + '\n') v = line.rstrip().split('\t') var = Variant(v, vcf) var_length = None # var_length should be None except for deletions if not sum_quals: var.qual = 0 # genotype generic breakends try: svtype = var.get_info('SVTYPE') except KeyError: sys.stderr.write( 'Warning: SVTYPE missing at variant %s. Skipping.\n' % (var.var_id)) vcf_out.write(var.get_var_string() + '\n') continue # print original line if unsupported svtype if svtype not in ('BND', 'DEL', 'DUP', 'INV'): sys.stderr.write( 'Warning: Unsupported SVTYPE at variant %s (%s). 
Skipping.\n' % (var.var_id, svtype)) vcf_out.write(var.get_var_string() + '\n') continue if svtype == 'BND': if var.info['MATEID'] in breakend_dict: var2 = var var = breakend_dict[var.info['MATEID']] chromA = var.chrom chromB = var2.chrom posA = var.pos posB = var2.pos # confidence intervals ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist) ciB = confidence_interval(var2, 'CIPOS', 'CIPOS95', max_ci_dist) # infer the strands from the alt allele if var.alt[-1] == '[' or var.alt[-1] == ']': o1_is_reverse = False else: o1_is_reverse = True if var2.alt[-1] == '[' or var2.alt[-1] == ']': o2_is_reverse = False else: o2_is_reverse = True # remove the BND from the breakend_dict # to free up memory del breakend_dict[var.var_id] else: breakend_dict[var.var_id] = var continue else: chromA = var.chrom chromB = var.chrom posA = var.pos posB = int(var.get_info('END')) # confidence intervals ciA = confidence_interval(var, 'CIPOS', 'CIPOS95', max_ci_dist) ciB = confidence_interval(var, 'CIEND', 'CIEND95', max_ci_dist) if svtype == 'DEL': var_length = posB - posA o1_is_reverse, o2_is_reverse = False, True elif svtype == 'DUP': o1_is_reverse, o2_is_reverse = True, False elif svtype == 'INV': o1_is_reverse, o2_is_reverse = False, False # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction) if o1_is_reverse: posA += 1 if o2_is_reverse: posB += 1 for sample in sample_list: # grab reads for start and end of breakpoints start_read_batch = {} end_read_batch = {} start_read_batch, many = gather_reads(sample, chromA, posA, ciA, z, start_read_batch, max_reads) end_read_batch, many = gather_reads(sample, chromB, posB, ciB, z, end_read_batch, max_reads) # grab reads from both sides of breakpoint read_batch, many = gather_all_reads(sample, chromA, posA, ciA, chromB, posB, ciB, z, max_reads) if many: var.genotype(sample.name).set_format('GT', './.') continue metrics = {'start': {}, 'end': {}} # run metrics for 
each breakpoint separately metrics['start'] = calculate_metrics( start_read_batch, 'start', var, chromA, chromB, posA, posB, ciA, ciB, min_aligned, o1_is_reverse, o2_is_reverse, var_length, svtype, out_bam_written_reads, supporting_reads_fh, out_bam) metrics['end'] = calculate_metrics(end_read_batch, 'end', var, chromA, chromB, posA, posB, ciA, ciB, min_aligned, o1_is_reverse, o2_is_reverse, var_length, svtype, out_bam_written_reads, supporting_reads_fh, out_bam) # run metrics for both sides of breakpoints # no bam file and supporting reads files is written here since they are already written in running the method for each breakpoint separately metrics['both'] = calculate_metrics( read_batch, 'both', var, chromA, chromB, posA, posB, ciA, ciB, min_aligned, o1_is_reverse, o2_is_reverse, var_length, svtype, out_bam_written_reads, None, None) # set these for metrics from both sides of breakpoints ref_span = metrics['both']['ref_span'] alt_span = metrics['both']['alt_span'] ref_seq = metrics['both']['ref_seq'] alt_seq = metrics['both']['alt_seq'] alt_clip = metrics['both']['alt_clip'] if debug: print '--------------------------' print 'ref_span:', ref_span print 'alt_span:', alt_span print 'ref_seq:', ref_seq print 'alt_seq:', alt_seq print 'alt_clip:', alt_clip # in the absence of evidence for a particular type, ignore the reference # support for that type as well if (alt_seq + alt_clip) < 0.5 and alt_span >= 1: alt_seq = 0 alt_clip = 0 ref_seq = 0 if alt_span < 0.5 and (alt_seq + alt_clip) >= 1: alt_span = 0 ref_span = 0 if alt_span + alt_seq == 0 and alt_clip > 0: # discount any SV that's only supported by clips. 
alt_clip = 0 if ref_seq + alt_seq + ref_span + alt_span + alt_clip > 0: # get bayesian classifier if var.info['SVTYPE'] == "DUP": is_dup = True else: is_dup = False alt_splitters = alt_seq + alt_clip QR = int(split_weight * ref_seq) + int(disc_weight * ref_span) QA = int(split_weight * alt_splitters) + int( disc_weight * alt_span) gt_lplist = bayes_gt(QR, QA, is_dup) best, second_best = sorted([(i, e) for i, e in enumerate(gt_lplist)], key=lambda (x): x[1], reverse=True)[0:2] gt_idx = best[0] # print log probabilities of homref, het, homalt if debug: print gt_lplist # set the overall variant QUAL score and sample specific fields var.genotype(sample.name).set_format( 'GL', ','.join(['%.0f' % x for x in gt_lplist])) var.genotype(sample.name).set_format( 'DP', int(ref_seq + alt_seq + alt_clip + ref_span + alt_span)) var.genotype(sample.name).set_format('RO', int(ref_seq + ref_span)) var.genotype(sample.name).set_format( 'AO', int(alt_seq + alt_clip + alt_span)) var.genotype(sample.name).set_format('QR', QR) var.genotype(sample.name).set_format('QA', QA) # if detailed: var.genotype(sample.name).set_format('RS', int(ref_seq)) var.genotype(sample.name).set_format('AS', int(alt_seq)) var.genotype(sample.name).set_format('ASC', int(alt_clip)) var.genotype(sample.name).set_format('RP', int(ref_span)) var.genotype(sample.name).set_format('AP', int(alt_span)) try: var.genotype(sample.name).set_format( 'AB', '%.2g' % (QA / float(QR + QA))) except ZeroDivisionError: var.genotype(sample.name).set_format('AB', '.') var.genotype(sample.name).set_format( 'SRC', int(metrics['start']['ref_count'])) var.genotype(sample.name).set_format( 'SRPC', int(metrics['start']['ref_pe_count'])) var.genotype(sample.name).set_format( 'SASC', int(metrics['start']['alt_clip_count'])) var.genotype(sample.name).set_format( 'SAPC', int(metrics['start']['alt_pe_count'])) var.genotype(sample.name).set_format( 'ERC', int(metrics['end']['ref_count'])) var.genotype(sample.name).set_format( 'ERPC', 
int(metrics['end']['ref_pe_count'])) var.genotype(sample.name).set_format( 'EASC', int(metrics['end']['alt_clip_count'])) var.genotype(sample.name).set_format( 'EAPC', int(metrics['end']['alt_pe_count'])) var.genotype(sample.name).set_format( 'BRC', int(metrics['both']['ref_count'])) var.genotype(sample.name).set_format( 'BRPC', int(metrics['both']['ref_pe_count'])) var.genotype(sample.name).set_format( 'BASC', int(metrics['both']['alt_clip_count'])) var.genotype(sample.name).set_format( 'BAPC', int(metrics['both']['alt_pe_count'])) var.genotype(sample.name).set_format( 'ISM', sample.get_mean_insert_size()) var.genotype(sample.name).set_format( 'ISSD', sample.get_stddev_insert_size()) # assign genotypes gt_sum = 0 for gt in gt_lplist: try: gt_sum += 10**gt except OverflowError: gt_sum += 0 if gt_sum > 0: gt_sum_log = math.log(gt_sum, 10) sample_qual = abs( -10 * (gt_lplist[0] - gt_sum_log) ) # phred-scaled probability site is non-reference in this sample phred_gq = min(-10 * (second_best[1] - best[1]), 200) var.genotype(sample.name).set_format('GQ', int(phred_gq)) var.genotype(sample.name).set_format('SQ', sample_qual) var.qual += sample_qual if gt_idx == 1: var.genotype(sample.name).set_format('GT', '0/1') elif gt_idx == 2: var.genotype(sample.name).set_format('GT', '1/1') elif gt_idx == 0: var.genotype(sample.name).set_format('GT', '0/0') else: var.genotype(sample.name).set_format('GQ', '.') var.genotype(sample.name).set_format('SQ', '.') var.genotype(sample.name).set_format('GT', './.') else: var.genotype(sample.name).set_format('GT', './.') var.qual = 0 var.genotype(sample.name).set_format('GQ', '.') var.genotype(sample.name).set_format('SQ', '.') var.genotype(sample.name).set_format('GL', '.') var.genotype(sample.name).set_format('DP', 0) var.genotype(sample.name).set_format('AO', 0) var.genotype(sample.name).set_format('RO', 0) # if detailed: var.genotype(sample.name).set_format('AS', 0) var.genotype(sample.name).set_format('ASC', 0) 
var.genotype(sample.name).set_format('RS', 0) var.genotype(sample.name).set_format('AP', 0) var.genotype(sample.name).set_format('RP', 0) var.genotype(sample.name).set_format('QR', 0) var.genotype(sample.name).set_format('QA', 0) var.genotype(sample.name).set_format('AB', '.') var.genotype(sample.name).set_format( 'SRC', int(metrics['start']['ref_count'])) var.genotype(sample.name).set_format( 'SRPC', int(metrics['start']['ref_pe_count'])) var.genotype(sample.name).set_format( 'SASC', int(metrics['start']['alt_clip_count'])) var.genotype(sample.name).set_format( 'SAPC', int(metrics['start']['alt_pe_count'])) var.genotype(sample.name).set_format( 'ERC', int(metrics['end']['ref_count'])) var.genotype(sample.name).set_format( 'ERPC', int(metrics['end']['ref_pe_count'])) var.genotype(sample.name).set_format( 'EASC', int(metrics['end']['alt_clip_count'])) var.genotype(sample.name).set_format( 'EAPC', int(metrics['end']['alt_pe_count'])) var.genotype(sample.name).set_format( 'BRC', int(metrics['both']['ref_count'])) var.genotype(sample.name).set_format( 'BRPC', int(metrics['both']['ref_pe_count'])) var.genotype(sample.name).set_format( 'BASC', int(metrics['both']['alt_clip_count'])) var.genotype(sample.name).set_format( 'BAPC', int(metrics['both']['alt_pe_count'])) # after all samples have been processed, write vcf_out.write(var.get_var_string() + '\n') if var.info['SVTYPE'] == 'BND': var2.qual = var.qual var2.active_formats = var.active_formats var2.genotype = var.genotype vcf_out.write(var2.get_var_string() + '\n') # throw warning if we've lost unpaired breakends if breakend_dict: logging.warning( 'Unpaired breakends found in file. These will not be present in output.' ) # close the files vcf_in.close() vcf_out.close() if alignment_outpath is not None: out_bam.close() supporting_reads_fh.close() return