def test_get_blocked_alignment():
    """Exercise get_blocked_alignment() on plain and split-read alignments."""
    # NOTE(review): depends on a hard-coded local BAM path -- not portable.
    bam = pysam.AlignmentFile(
        '/home/jgarthur/sv/analysis/alignments/bwa_mem/short-reads/jun_jul.mdup.merge.mdup.bam',
        'rb')
    blocks = [
        GenomeInterval('1', 0, 100),
        GenomeInterval('1', 110, 210),
        GenomeInterval('1', 210, 2000)
    ]
    # simple 50M forward read at the start of block 0
    aln = pysam.AlignedSegment()
    aln.pos = 0
    aln.cigarstring = '50M'
    aln.seq = 'A' * 50
    aln.is_reverse = False
    print(get_blocked_alignment(aln, blocks, 0, bam))
    assert (get_blocked_alignment(aln, blocks, 0, bam) == ([1], 0))
    assert (get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([0], 50))
    # same read on the reverse strand: orientation/offset should flip
    aln.is_reverse = True
    print(get_blocked_alignment(aln, blocks, 0, bam))
    assert (get_blocked_alignment(aln, blocks, 0, bam) == ([0], 50))
    assert (get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([1], 0))
    # split alignment: 20M20S primary plus a minus-strand supplementary (SA tag)
    aln = pysam.AlignedSegment()
    aln.rname = 0
    aln.pos = 90
    aln.seq = 'A' * 40
    aln.cigarstring = '20M20S'
    aln.set_tag('SA', '1,191,-,20M20S,60,0;', 'Z')
    print(get_blocked_alignment(aln, blocks, 0, bam))
    assert (get_blocked_alignment(aln, blocks, 0, bam) == ([1, 2], -90))
    assert (get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([3, 0], -80))
def genome_blocks_gaps(blocks, path):
    """Interleave gap intervals with the blocks visited by *path*.

    Returns a list [gap, block, gap, block, ..., gap]: a leading gap for the
    first path node, the block it lies in, then for every junction a gap whose
    size is the floored mean of the two adjacent nodes' gaps, and a trailing
    gap for the last node. Gap intervals carry is_gap=True.
    """
    chrom = blocks[0].chrom
    result = [
        GenomeInterval(chrom, 0, block_gap(blocks, path[0]), is_gap=True),
        blocks[int(floor(path[0] / 2))],
    ]
    for j in range(1, len(path) - 1, 2):
        # floored average of the gaps on either side of this junction
        mid_gap = int(
            floor((block_gap(blocks, path[j]) +
                   block_gap(blocks, path[j + 1])) / 2))
        result.append(GenomeInterval(chrom, 0, mid_gap, is_gap=True))
        result.append(blocks[int(floor(path[j + 1] / 2))])
    result.append(
        GenomeInterval(chrom, 0, block_gap(blocks, path[-1]), is_gap=True))
    return result
def test_intersects():
    """GenomeInterval.intersects() against neighboring half-open ranges."""
    iv = GenomeInterval('20', 10, 20)
    cases = [
        ((20, 30), False),  # touching on the right: no overlap
        ((19, 20), True),
        ((0, 10), False),   # touching on the left: no overlap
        ((0, 11), True),
        ((11, 12), True),   # fully contained
    ]
    for rng, expected in cases:
        assert iv.intersects(rng) == expected
def simplify_blocks_diploid(blocks, path1, path2):
    """Merge adjacent blocks that both haplotype paths traverse contiguously.

    Diploid analogue of simplify_blocks(): a pair of neighboring genomic
    blocks is combined only when the junction between them is unaltered in
    BOTH path1 and path2. Insertion blocks are deep-copied and appended
    unchanged. Returns (new_blocks, new_path1, new_path2) with node indices
    remapped onto the reduced block list.
    """
    # genomic block index visited at each odd path position, over both paths
    block_nums = [int(floor(path1[i] / 2)) for i in range(1, len(path1), 2)]
    block_nums.extend(
        [int(floor(path2[i] / 2)) for i in range(1, len(path2), 2)])
    # neighbors[node] = set of nodes adjacent to `node` across both paths;
    # -1 / -2 are sentinels for the path start / end
    neighbors = defaultdict(set)
    for path in (path1, path2):
        neighbors[path[0]].add(-1)
        neighbors[path[-1]].add(-2)
        for i in range(1, len(path) - 1):
            if i % 2 == 0:
                neighbors[path[i]].add(path[i - 1])
            else:
                neighbors[path[i]].add(path[i + 1])
    # print(neighbors)
    min_block = min([b for b in block_nums if not blocks[b].is_insertion()])
    max_block = max([b for b in block_nums if not blocks[b].is_insertion()])
    ins_blocks = [b for b in block_nums if blocks[b].is_insertion()]
    new_blocks = []
    path_map = {}  # old node id -> new node id (None = absorbed by a merge)
    idx = 0
    merging = False
    for b in range(min_block, max_block + 1):
        if not merging:
            # remember where the (possibly multi-block) merged interval starts
            block_start = blocks[b].start
        right_node = 2 * b + 1
        # merge b with b+1 iff, in both paths, b's right node only ever joins
        # b+1's left node and vice versa (i.e. reference adjacency preserved)
        if all(n == right_node+1 for n in neighbors[right_node]) and \
                all(n == right_node for n in neighbors[right_node+1]) and \
                b < max_block:
            # combine after
            merging = True
            path_map[2 * b] = None
            path_map[2 * b + 1] = None
        else:
            newblock = GenomeInterval(blocks[b].chrom, block_start,
                                      blocks[b].end)
            new_blocks.append(newblock)
            path_map[2 * b] = 2 * idx
            path_map[2 * b + 1] = 2 * idx + 1
            merging = False
            idx += 1
    # insertion blocks keep their identity, appended after the genomic blocks
    new_blocks.extend([deepcopy(blocks[b]) for b in ins_blocks])
    for b in ins_blocks:
        path_map[2 * b] = 2 * idx
        path_map[2 * b + 1] = 2 * idx + 1
        idx += 1
    new_path1 = [path_map[p] for p in path1 if path_map[p] is not None]
    new_path2 = [path_map[p] for p in path2 if path_map[p] is not None]
    # print(new_path1)
    # print(new_path2)
    # print(new_blocks)
    return new_blocks, new_path1, new_path2
def test_genome_blocks_gaps():
    """Smoke test: print gap-interleaved block lists for two example paths."""
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 105, 200),
        GenomeInterval(1, 200, 300),
        GenomeInterval(1, 305, 400),
        GenomeInterval(1, 420, 500)
    ]
    print(blocks)
    # full reference traversal, then a path skipping blocks 1 and 3
    for path in (list(range(10)), [0, 1, 4, 5, 8, 9]):
        print(path)
        print(genome_blocks_gaps(blocks, path))
        print('')
def test_affected_len():
    """sv_affected_len() over rearranged paths (plus align_strings demos)."""
    print(align_strings('abcde', 'abdef'))
    print('-' * 50)
    print(align_strings('aab', 'ab'))
    print('-' * 50)
    print(align_strings('sjdioa', 'ssjjdioa'))
    print('=' * 50)
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 100, 150),
        GenomeInterval(1, 150, 300),
        GenomeInterval(1, 300, 325),
        GenomeInterval(1, 325, 425)
    ]
    cases = [
        (0, range(len(blocks) * 2)),                       # ABCDE -> ABCDE
        (25, [0, 1, 2, 3, 4, 5, 7, 6, 8, 9]),              # ABCD'E
        (225, [0, 1, 8, 9]),                               # AE
        (175, [0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9]),  # ABCDCDE
        (225, [0, 1, 3, 2, 4, 5, 4, 5, 8, 9]),             # AB'CCE
    ]
    for expected, path in cases:
        assert expected == sv_affected_len(path, blocks)
    # add a 500 bp de novo insertion block (nodes 10, 11)
    blocks.append(GenomeInterval(None, 0, 500, is_de_novo=True))
    assert (0 == sv_affected_len(range((len(blocks) - 1) * 2), blocks))
    assert (500 == sv_affected_len([0, 1, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9],
                                   blocks))
    assert (525 == sv_affected_len([0, 1, 10, 11, 2, 3, 4, 5, 8, 9], blocks))
def test_get_gap_overlap_positions():
    """get_gap_overlap_positions() against hand-computed truth intervals."""
    rlen = 50
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 100, 200),
        GenomeInterval(1, 249, 300),
        GenomeInterval(1, 350, 400),
        GenomeInterval(1, 500, 600)
    ]
    cases = [
        ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [(299, 301), (399, 451)]),
        ([0, 1, 2, 3, 4, 5, 7, 6, 8, 9], [(299, 326), (424, 451)]),
    ]
    for path, expected in cases:
        result = get_gap_overlap_positions(path, blocks, rlen)
        want = pyinter.IntervalSet()
        for lo, hi in expected:
            want.add(pyinter.open(lo, hi))
        print('truth: {0}\nresult: {1}\n'.format(want, result))
        assert result == want
    # now with insertion blocks (nodes 6,7 and 8,9) spliced into the path
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True)
    ]
    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    result = get_gap_overlap_positions(path, blocks, rlen)
    want = pyinter.IntervalSet()
    for lo, hi in [(99, 131), (169, 201), (349, 356), (394, 401)]:
        want.add(pyinter.open(lo, hi))
    print('truth: {0}\nresult: {1}\n'.format(want, result))
    assert result == want
def sv_classify_test():
    """Print classify_svs() output for a series of example haploid paths."""
    blocks = [GenomeInterval('1', 100*i, 100*i + 100) for i in range(10)]
    num_genome_blocks = 10
    # insertion block -> nodes 20, 21
    blocks.append(GenomeInterval('1', 0, 100, True))
    example_paths = [
        [0, 1, 2, 3, 6, 7, 8, 9],                  # deletion
        [0, 1, 2, 3, 20, 21, 4, 5],                # insertion
        [0, 1, 2, 3, 5, 4, 6, 7],                  # inversion
        [0, 1, 2, 3, 7, 6, 5, 4, 8, 9],            # two-block inversion
        [0, 1, 2, 3, 2, 3, 4, 5],                  # tandem duplication
        [0, 1, 0, 1, 2, 3, 4, 5],                  # duplicated first block
        [0, 1, 2, 3, 2, 3, 2, 3, 4, 5],            # triplication
        [0, 1, 2, 3, 2, 3, 4, 5, 20, 21, 6, 7],    # duplication + insertion
    ]
    for path in example_paths:
        print(path)
        print(classify_svs(path, blocks, num_genome_blocks))
def test_plot_rearrangement():
    """Render example rearrangement plots into ~/tmp/ for manual inspection.

    Cleanup: the no-op `p2 = p2` self-assignments were removed and the
    repeated print/plot boilerplate factored into a local helper; the
    rendered output is unchanged.
    """
    blocks = [
        GenomeInterval('1', 0, 1000),
        GenomeInterval('1', 1010, 1012),  # 1500),
        GenomeInterval('1', 1505, 2000),
        GenomeInterval('1', 2000, 4000),
        GenomeInterval('1', 4000, 20000),
        GenomeInterval('1', 0, 10, is_de_novo=True),
        GenomeInterval('1', 0, 1000, is_de_novo=True)
    ]
    # NOTE(review): '~' is not expanded by os.path.join -- confirm
    # plot_rearrangement (or its writer) expands it.
    outdir = '~/tmp/'

    def render(fn, p1, p2, end_block):
        # announce the file, then plot blocks [0, end_block] with paths p1/p2
        print(fn)
        plot_rearrangement(os.path.join(outdir, fn), blocks, 0, end_block,
                           p1, p2, True)

    render('ACD.png', [0, 1, 4, 5, 6, 7], None, 4)
    render('AICD.png', [0, 1, 10, 11, 4, 5, 6, 7], None, 4)
    p2 = [0, 1, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9]
    render('ABB-DE_ACBCDE.png', [0, 1, 2, 3, 3, 2, 6, 7, 8, 9], p2, 4)
    render('ABB--.png',
           [0, 1, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 2, 3, 6, 7, 7, 6, 8, 9],
           p2, 4)
    # larger uniform block set; render() sees the rebinding at call time
    blocks = [GenomeInterval('1', i, i + 100) for i in range(0, 2000, 100)]
    render('ABCD---.png', list(range(0, 39)), p2, 19)
def test_altered_reference_sequence():
    """altered_reference_sequence() for del/dup/inv/ins/dispersed-dup paths.

    Compares the rearranged sequence (and its block coordinates / insertion
    lists / deletion-size lists) against sequences fetched directly from the
    reference.
    """
    # NOTE(review): depends on a hard-coded local reference path -- not portable.
    ref = pysam.FastaFile('/home/jgarthur/sv/reference/GRCh37.fa')
    # ten 100 bp blocks on chr20 starting at 100000, plus one insertion block
    blocks = [
        GenomeInterval('20', 100000 + 100 * i, 100000 + 100 * (i + 1))
        for i in range(10)
    ] + [GenomeInterval('20', 0, 1000, True)]
    refpath = [0, 1, 2, 3, 4, 5, 6, 7]
    delpath = [0, 1, 2, 3, 6, 7, 8, 9]
    del2path = [0, 1, 2, 3, 8, 9, 10, 11]
    inspath = [0, 1, 2, 3, 20, 21, 4, 5, 6, 7]
    duppath = [0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9]
    dup2path = [0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11]
    dupend = [0, 1, 2, 3, 4, 5, 4, 5]
    dupstartdel = [0, 1, 0, 1, 4, 5]
    invpath = [0, 1, 2, 3, 5, 4, 6, 7, 8, 9]
    inv2path = [0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11]
    dduppath = [0, 1, 2, 3, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11]
    flank_size = 1000
    # reference path: nothing altered
    out = altered_reference_sequence(refpath, blocks, ref, flank_size)
    assert (out[0] == [])
    # single-block deletion
    out = altered_reference_sequence(delpath, blocks, ref, flank_size)
    print(len(out[0][0]))
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100300, 100300 + 200)))
    assert (out[1][0] == [(0, 200), (200, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [100, 0])
    # two-block deletion
    out = altered_reference_sequence(del2path, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100400, 100400 + 200)))
    assert (out[1][0] == [(0, 200), (200, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [200, 0])
    # tandem duplication
    out = altered_reference_sequence(duppath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100300, 100300 + 200)))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400), (400, 600)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0, 0])
    # duplication at the end of the path (no right flank)
    out = altered_reference_sequence(dupend, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100200, 100300)))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0])
    # two-block tandem duplication
    out = altered_reference_sequence(dup2path, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100400) +
                          fetch_seq(ref, '20', 100200, 100400) +
                          fetch_seq(ref, '20', 100400, 100400 + 200)))
    # duplicated first block followed by a deletion
    out = altered_reference_sequence(dupstartdel, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 100100) * 2 +
                          fetch_seq(ref, '20', 100200, 100200 + 100)))
    assert (out[1][0] == [(0, 100), (100, 200), (200, 300)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 100, 0])
    # single-block inversion
    out = altered_reference_sequence(invpath, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100200 - 200, 100200) +
        reverse_complement(fetch_seq(ref, '20', 100200, 100300)) +
        fetch_seq(ref, '20', 100300, 100300 + 200)))
    # two-block inversion
    out = altered_reference_sequence(inv2path, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100200 - 200, 100200) +
        reverse_complement(fetch_seq(ref, '20', 100200, 100400)) +
        fetch_seq(ref, '20', 100400, 100400 + 200)))
    # novel insertion: output is split around the unknown inserted sequence
    out = altered_reference_sequence(inspath, blocks, ref, flank_size)
    assert (out[0] == [
        fetch_seq(ref, '20', 100200 - 200, 100200),
        fetch_seq(ref, '20', 100200, 100200 + 200)
    ])
    # dispersed duplication
    out = altered_reference_sequence(dduppath, blocks, ref, flank_size)
    assert (out[0][0] == fetch_seq(ref, '20', 100200 - 200, 100200) +
            fetch_seq(ref, '20', 100300, 100400) +
            fetch_seq(ref, '20', 100200, 100400 + 200))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400), (400, 500),
                          (500, 700)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0, 0, 0])
    # second setup: irregular block sizes with inter-block gaps
    blocks = [
        GenomeInterval('20', 100000, 101000),
        GenomeInterval('20', 101025, 101125),
        GenomeInterval('20', 101130, 102500)
    ]
    refpath = [0, 1, 2, 3, 4, 5]
    delpath = [0, 1, 4, 5]
    duppath = [0, 1, 2, 3, 2, 3, 4, 5]
    invpath = [0, 1, 3, 2, 4, 5]
    out = altered_reference_sequence(refpath, blocks, ref, flank_size)
    assert (out[0] == [])
    out = altered_reference_sequence(delpath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 101013) +
                          fetch_seq(ref, '20', 101128, 102130)))
    out = altered_reference_sequence(duppath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 101013) +
                          fetch_seq(ref, '20', 101013, 101128) * 2 +
                          fetch_seq(ref, '20', 101128, 102130)))
    out = altered_reference_sequence(invpath, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100000, 101013) +
        reverse_complement(fetch_seq(ref, '20', 101013, 101128)) +
        fetch_seq(ref, '20', 101128, 102130)))
def test_simplify_blocks():
    """simplify_blocks() path remapping and flank flags across SV types.

    Each assert compares (new_path, has_left_flank, has_right_flank) -- the
    [1:] slice drops the returned block list.
    """
    blocks = [GenomeInterval(1, 100 * i, 100 * (i + 1)) for i in range(20)]
    # two insertion blocks -> nodes 40,41 and 42,43
    blocks.append(GenomeInterval(1, 0, 100, is_de_novo=True))
    blocks.append(GenomeInterval(1, 0, 100, is_de_novo=True))
    # deletion
    assert (simplify_blocks(blocks, [0, 1, 4, 5],
                            flank_size=100)[1:] == ([0, 1, 4, 5], True, True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 6, 7],
                            flank_size=100)[1:] == ([0, 1, 4, 5], True, True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 8, 9],
                            flank_size=100)[1:] == ([0, 1, 4, 5], True, True))
    # inversion
    assert (simplify_blocks(blocks, [0, 1, 3, 2, 4, 5],
                            flank_size=100)[1:] == ([0, 1, 3, 2, 4, 5], True,
                                                    True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 5, 4, 6, 7, 8, 9],
                            flank_size=100)[1:] == ([0, 1, 3, 2, 4, 5], True,
                                                    True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11],
                            flank_size=100)[1:] == ([0, 1, 3, 2, 4, 5], True,
                                                    True))
    # duplication
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 2, 3, 4, 5],
                            flank_size=100)[1:] == ([0, 1, 2, 3, 2, 3, 4, 5],
                                                    True, True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9],
                            flank_size=100)[1:] == ([0, 1, 2, 3, 2, 3, 4, 5],
                                                    True, True))
    # duplicated first and last blocks: neither flank can be trimmed
    assert (simplify_blocks(
        blocks, [0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 7, 8, 9],
        flank_size=100)[1:] == ([0, 1, 0, 1, 2, 3, 4, 5, 4, 5], False, False))
    # dispersed duplication
    assert (simplify_blocks(blocks, [0, 1, 4, 5, 2, 3, 4, 5, 6, 7],
                            flank_size=100)[1:] == ([
                                0, 1, 4, 5, 2, 3, 4, 5, 6, 7
                            ], True, True))
    assert (simplify_blocks(
        blocks, [0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
        flank_size=100)[1:] == ([0, 1, 4, 5, 2, 3, 4, 5, 6, 7], True, True))
    # inverted dispersed duplication
    print(
        simplify_blocks(blocks, [0, 1, 5, 4, 2, 3, 4, 5, 6, 7],
                        flank_size=100))
    assert (simplify_blocks(blocks, [0, 1, 5, 4, 2, 3, 4, 5, 6, 7],
                            flank_size=100)[1:] == ([
                                0, 1, 5, 4, 2, 3, 4, 5, 6, 7
                            ], True, True))
    assert (simplify_blocks(
        blocks, [0, 1, 2, 3, 11, 10, 9, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
        flank_size=100)[1:] == ([0, 1, 5, 4, 2, 3, 4, 5, 6, 7], True, True))
    # insertion
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 40, 41, 4, 5, 42, 43, 6, 7],
                            flank_size=100)[1:] == ([
                                0, 1, 6, 7, 2, 3, 8, 9, 4, 5
                            ], True, True))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 40, 41, 42, 43, 4, 5, 6, 7],
                            flank_size=100)[1:] == ([0, 1, 4, 5, 6, 7, 2, 3],
                                                    True, True))
    # some other cases
    assert (simplify_blocks(blocks, [2, 3, 0, 1, 2, 3],
                            flank_size=100)[1:] == ([2, 3, 0, 1, 2, 3], False,
                                                    False))
    assert (simplify_blocks(blocks, [0, 1, 2, 3, 0, 1],
                            flank_size=100)[1:] == ([0, 1, 2, 3, 0, 1], False,
                                                    False))
    assert (simplify_blocks(
        blocks, [0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 8, 9, 2, 3, 10, 11],
        flank_size=100)[1:] == ([
            0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 8, 9, 2, 3, 10, 11
        ], True, True))
def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False,
                        refgapfile=None, caller=None, flank_size=1000,
                        verbosity=0):
    """Convert a VCF of SV calls into internal (paths, blocks, ...) records.

    Parses every non-header record of *vcffile*, normalizes breakpoint
    uncertainty per caller ('lumpy' and 'pindel' have special handling;
    anything else gets generic INFO/FORMAT parsing), builds breakpoints and
    blocks, and hands the collected events to do_sv_processing(). Events with
    unsupported SVTYPE (e.g. BND/TRA) are counted and skipped; counts go to a
    log written under *outdir*.

    Fixes vs. previous revision:
    - removed the stray ``vcf = open(vcffile, 'r')`` whose handle was leaked
      when the later ``with open(...) as vcf`` shadowed it (and whose final
      ``vcf.close()`` re-closed the already-closed with-handle);
    - ``os.system('mkdir -p ...')`` replaced by ``os.makedirs`` (no shell);
    - split-read qnames are set as strings (pysam query names are str);
    - locals no longer shadow the ``id``/``format`` builtins.
    """
    os.makedirs(outdir, exist_ok=True)
    data = []
    svtype_skipped = {}
    seen_coords_count = {}
    skipped_refgap = 0
    write_extra = False  # need to write FORMAT or INFO to file?
    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t')
                     for line in vcf if line[0] != '#']
    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom)
                      for chrom in chroms}
    else:
        chrom_gaps = None
    # NOTE(review): if vcffile contains path separators this log path nests
    # under outdir oddly -- confirm callers pass a bare filename.
    with open(os.path.join(outdir,
                           'convert_{0}.log'.format(vcffile)), 'w') as log:
        for toks in toks_list:
            # NOTE not parsing qual; do filtering beforehand for DELLY
            (chrom, pos, record_id, ref, alt, qual, filterstring, info,
             format_str, sample1) = toks
            # VCF is 1-indexed, but specifies pos/end positions
            # which are to the left of breakpoints, so no adjustment
            pos = int(pos)
            tags = info.split(';')
            if 'PRECISE' in tags:
                filterstring += ':PRECISE'
            elif 'IMPRECISE' in tags:
                filterstring += ':IMPRECISE'
            elif caller == 'lumpy':
                # only includes tags for imprecise events
                filterstring += ':PRECISE'
            tags = [t for t in tags if '=' in t]
            tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
            end = int(tagd.get('END', -99999))
            svtype = tagd['SVTYPE']
            if caller == 'pindel' and svtype == 'INS':
                inslen = int(tagd['SVLEN'])
            else:
                inslen = int(tagd.get('INSLEN', 0))
            # breakpoint confidence intervals
            if caller == 'pindel':
                homlen = int(tagd['HOMLEN'])
                if pos + homlen > end or svtype == 'INS':
                    print('pos + homlen > end: positions {0}'.format((pos,
                                                                      end)))
                    cipos = (0, 0)
                    ciend = (0, 0)
                else:
                    # pindel uncertainty comes from breakpoint homology
                    cipos = (0, homlen)
                    ciend = (0, homlen)
            else:
                if 'CIPOS95' in tagd:  # LUMPY
                    tmp = tagd['CIPOS95'].split(',')
                    cipos = (int(tmp[0]), int(tmp[1]))
                elif 'CIPOS' in tagd:
                    tmp = tagd['CIPOS'].split(',')
                    cipos = (int(tmp[0]), int(tmp[1]))
                else:
                    cipos = (0, 0)
                if 'CIEND95' in tagd:  # LUMPY
                    tmp = tagd['CIEND95'].split(',')
                    ciend = (int(tmp[0]), int(tmp[1]))
                elif 'CIEND' in tagd:
                    tmp = tagd['CIEND'].split(',')
                    ciend = (int(tmp[0]), int(tmp[1]))
                else:
                    ciend = (0, 0)
            split_support = int(tagd.get('SR', 0))
            pe_support = int(tagd.get('PE', 0))
            # lumpy STRANDS only relevant for inversions
            if caller == 'lumpy' and svtype == 'INV':
                tmp = tagd['STRANDS'].split(',')
                tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
                tagd['INV_PLUS'] = tmpd['++']
                tagd['INV_MINUS'] = tmpd['--']
            tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                         'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
            tagd_extra = {k: v for (k, v) in tagd.items()
                          if k not in tagd_used}
            # per-sample FORMAT fields
            tags2 = {k: v for (k, v) in zip(format_str.split(':'),
                                            sample1.split(':'))}
            if 'AD' in tags2:  # pindel
                split_support = int(tags2['AD'].split(',')[1])
            gt = tags2['GT']
            if gt == './.' or gt == '.|.':
                is_het = False
                filterstring += ':NOGT'
            elif gt in ('0/0', '0|0'):
                is_het = False
                filterstring += ':ZEROGT'
            elif gt in ('0/1', '1/0', '0|1', '1|0'):
                is_het = True
            else:
                assert gt in ('1/1', '1|1')
                is_het = False
            tags2_used = ('AD', 'SR', 'PE', 'SU')
            tags2_extra = {k: v for (k, v) in tags2.items()
                           if k not in tags2_used}
            if len(tagd_extra) + len(tags2_extra) > 0:
                write_extra = True
            # cases: map SVTYPE to alt/ref node paths
            if svtype == 'DEL':
                path = (0, 1, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'Del'
            elif svtype == 'INV':
                path = (0, 1, 3, 2, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'InvL'
            elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
                path = (0, 1, 2, 3, 2, 3, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'Dup'
            elif svtype == 'INS':
                # INSERTIONS parse inslen, add insertion block to blocks
                path = (0, 1, 4, 5, 2, 3)
                refpath = (0, 1, 2, 3)
                supptype = 'Ins'
            else:
                # skipping delly TRA
                # skipping BND events as they may be ambiguous,
                # in terms of the path
                svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
                continue
            # check ref gap overlap
            if filter_gaps and end > pos:
                sv_interval = pyinter.closedopen(pos, end)
                sv_gap_intersection = chrom_gaps[chrom].intersection(
                    [sv_interval])
                if len(sv_gap_intersection) > 0:
                    skipped_refgap += 1
                    continue
            # create breakpoints and blocks, keeping in mind uncertainty and
            # possible insertion
            if caller == 'lumpy' and svtype != 'INS':
                # lumpy intervals are not symmetric. POS and END are each the
                # "best guess" for the breakpoints
                bp = [(pos, pos), (end, end)]
            elif svtype != 'INS':
                # if (cipos[1] != -cipos[0] or ciend[1] != -ciend[0]) and \
                #         (pos + cipos[1] < end + ciend[0]):
                if (pos + cipos[1] < end + ciend[0]):
                    bp = [(pos + cipos[0], pos + cipos[1]),
                          (end + ciend[0], end + ciend[1])]
                else:
                    # CIs overlap: fall back to the point estimates
                    bp = [(pos, pos), (end, end)]
                    filterstring += ':BPOVERLAP'
            else:
                # if cipos[1] != -cipos[0]:
                if cipos[1] > cipos[0]:
                    bp = [(pos + cipos[0], pos + cipos[1])]
                else:
                    bp = [(pos, pos)]
            pe = [(x, supptype) for x in range(pe_support)]
            # TODO SupportingSplit
            splits = []
            for i in range(split_support):
                aln_tmp = pysam.AlignedSegment()
                # pysam query names must be strings, not ints
                aln_tmp.qname = str(i)
                aln_tmp.is_read1 = True
                split_type = supptype + '+'
                splits.append(SupportingSplit(aln_tmp, None, None, None,
                                              None, split_type))
            breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
            slop_left, slop_right = flank_size, flank_size
            start = bp[0][0] - slop_left
            end = bp[-1][1] + slop_right
            cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom,
                                  start, end, verbosity)
            blocks, _, left_bp, right_bp = cbout
            if svtype == 'INS':
                blocks.append(GenomeInterval(chrom, 0, inslen,
                                             is_de_novo=True))
            paths = [path, refpath] if is_het else [path, path]
            score = 0
            # disambiguate repeated coordinates with an 'a', 'b', ... suffix
            coords = (start, end)
            scc = seen_coords_count.get(coords, 0)
            if scc > 0:
                id_extra = chr(ord('a') + scc)
            else:
                id_extra = ''
            seen_coords_count[coords] = scc + 1
            this_data = (paths, blocks, left_bp, right_bp, score,
                         filterstring, id_extra, tagd_extra, tags2_extra)
            data.append(this_data)
        for svtype, count in svtype_skipped.items():
            log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
        log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
        do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)
def path_classify_test():
    """Print classify_paths() events and SVs for diploid path pairs.

    Block letters in the comments follow the original naming (A = block 0,
    B = block 1, ...; primes denote inversion, I an insertion block).
    """
    blocks = [GenomeInterval('1', 100*i, 100*i + 100) for i in range(10)]
    num_genome_blocks = 10
    blocks.append(GenomeInterval('1', 0, 100, True))  # insertion: nodes 20, 21
    # ABC/ABC
    p1 = [0, 1, 2, 3, 4, 5]
    p2 = p1
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # NOTE(review): hard-coded local reference path -- not portable.
    ref = pysam.FastaFile('/home/jgarthur/sv/reference/GRCh37.fa')
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ABC/AC
    p1 = [0, 1, 2, 3, 4, 5]
    p2 = [0, 1, 4, 5]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBC/ABC
    p1 = [0, 1, 4, 5, 2, 3, 4, 5]
    p2 = [0, 1, 2, 3, 4, 5]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBC/ACBC
    p1 = [0, 1, 4, 5, 2, 3, 4, 5]
    p2 = p1
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBC/AC'BC
    p1 = [0, 1, 4, 5, 2, 3, 4, 5]
    p2 = [0, 1, 5, 4, 2, 3, 4, 5]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBAC/AC
    p1 = [0, 1, 4, 5, 2, 3, 0, 1, 4, 5]
    p2 = [0, 1, 4, 5]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBD/ACB'D
    p1 = [0, 1, 4, 5, 2, 3, 6, 7]
    p2 = [0, 1, 4, 5, 3, 2, 6, 7]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACBD
    # AC'BD
    p1 = [0, 1, 4, 5, 2, 3, 6, 7]
    p2 = [0, 1, 5, 4, 2, 3, 6, 7]
    print(p1)
    print(p2)
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # AIC
    # AIBC
    p1 = [0, 1, 20, 21, 4, 5]
    p2 = [0, 1, 20, 21, 2, 3, 4, 5]
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ACD
    # ACDD'E
    p1 = [0, 1, 4, 5, 6, 7, 8, 9]
    p2 = [0, 1, 4, 5, 6, 7, 7, 6, 8, 9]
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ABBBC
    # ABBBC
    p1 = [0, 1, 2, 3, 2, 3, 2, 3, 4, 5]
    p2 = p1
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
    # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))
    # ABBB'C
    # ABBB'C
    p1 = [0, 1, 2, 3, 2, 3, 3, 2, 4, 5]
    p2 = p1
    ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
    print(ev)
    print('\n'.join([repr(s) for s in sv]))
def test_compute_normalizing_constant():
    """compute_normalizing_constant() sanity checks on simple block layouts."""

    def create_insert_cs(ins):
        # Cumulative sum of the insert-size CDF; beyond the distribution's
        # support the returned function continues linearly with slope 1.
        # (cs is bound as a default argument to freeze the current value.)
        cdf = np.cumsum(ins)
        cs = np.cumsum(cdf)
        return lambda x, cs=cs: 0 if x < 0 else (cs[-1] + (x - len(ins)) + 1
                                                 ) if x >= len(ins) else cs[x]

    # uniform insert distribution over 1..200
    ins = np.array([0] + ([1 / 200] * 200))
    cdf_sum = create_insert_cs(ins)
    # adjacent blocks, no gap
    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1000, 2000)]
    nc1 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    print(blocks)
    print(nc1)
    print('')
    # 99 bp gap between blocks
    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1099, 2000)]
    nc2 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    print(blocks)
    print(nc2)
    print('')
    # 100 bp gap
    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1100, 2000)]
    nc3 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    print(blocks)
    print(nc3)
    print('')
    assert (0 < nc3 < nc1)
    # 60 bp insertion block: constant should equal the no-gap case nc1
    blocks = [
        GenomeInterval(1, 0, 1000),
        GenomeInterval(1, 1000, 1940),
        GenomeInterval(1, 0, 60, True)
    ]
    path = [0, 1, 4, 5, 2, 3]
    nc4 = compute_normalizing_constant(path, blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    print(blocks)
    print(nc4)
    print('')
    assert (0 < nc4 == nc1)
    # slightly larger insertion: constant strictly smaller than nc1
    blocks = [
        GenomeInterval(1, 0, 1000),
        GenomeInterval(1, 1000, 1938),
        GenomeInterval(1, 0, 62, True)
    ]
    path = [0, 1, 4, 5, 2, 3]
    nc4 = compute_normalizing_constant(path, blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    print(blocks)
    print(nc4)
    print('')
    assert (0 < nc4 < nc1)
    # try large gaps
    ins = np.array([0] + ([1 / 10000] * 10000))
    cdf_sum = create_insert_cs(ins)
    blocks = [
        GenomeInterval(1, 45024, 64579),
        GenomeInterval(1, 65306, 65307),
        GenomeInterval(1, 66018, 79509),
        GenomeInterval(1, 0, 1000, True)
    ]
    path = [0, 1, 2, 3, 4, 5]
    pm = [.97, .01, .01, .01]
    nc = compute_normalizing_constant(path, blocks, 1, cdf_sum, pm, 100, 100)
    print(blocks)
    print(nc)
    path = [2, 3, 6, 7]
    nc2 = compute_normalizing_constant(path, blocks, 1, cdf_sum, pm, 100, 100)
    print(path)
    print(nc2)
def simplify_blocks(blocks, path, flank_size):
    """Merge reference-adjacent blocks along *path* and trim outer flanks.

    Neighboring genomic blocks are combined when the junction between them is
    traversed only in reference order by *path*. The first/last blocks are
    then trimmed to at most *flank_size* bp when they form clean, uninverted,
    unduplicated flanks. Returns (new_blocks, new_path, has_left_flank,
    has_right_flank).
    """
    # genomic block index visited at each odd path position
    block_nums = [int(floor(path[i] / 2)) for i in range(1, len(path), 2)]
    # neighbors[node] = set of nodes adjacent to `node` in the path;
    # -1 / -2 are sentinels for the path start / end
    neighbors = defaultdict(set)
    neighbors[path[0]].add(-1)
    neighbors[path[-1]].add(-2)
    for i in range(1, len(path) - 1):
        if i % 2 == 0:
            neighbors[path[i]].add(path[i - 1])
        else:
            neighbors[path[i]].add(path[i + 1])
    min_block = min([b for b in block_nums if not blocks[b].is_insertion()])
    max_block = max([b for b in block_nums if not blocks[b].is_insertion()])
    ins_blocks = [b for b in block_nums if blocks[b].is_insertion()]
    new_blocks = []
    path_map = {}  # old node id -> new node id (None = absorbed by a merge)
    idx = 0
    merging = False
    for b in range(min_block, max_block + 1):
        if not merging:
            # remember where the (possibly multi-block) merged interval starts
            block_start = blocks[b].start
        right_node = 2 * b + 1
        # merge b with b+1 iff b's right node only ever joins b+1's left node
        # and vice versa (i.e. reference adjacency is preserved by the path)
        if all(n == right_node+1 for n in neighbors[right_node]) and \
                all(n == right_node for n in neighbors[right_node+1]) and \
                b < max_block:
            # combine after
            merging = True
            path_map[2 * b] = None
            path_map[2 * b + 1] = None
        else:
            newblock = GenomeInterval(blocks[b].chrom, block_start,
                                      blocks[b].end)
            new_blocks.append(newblock)
            path_map[2 * b] = 2 * idx
            path_map[2 * b + 1] = 2 * idx + 1
            merging = False
            idx += 1
    # insertion blocks keep their identity, appended after the genomic blocks
    new_blocks.extend([deepcopy(blocks[b]) for b in ins_blocks])
    for b in ins_blocks:
        path_map[2 * b] = 2 * idx
        path_map[2 * b + 1] = 2 * idx + 1
        idx += 1
    new_path = [path_map[p] for p in path if path_map[p] is not None]
    # adjust flanks if the outermost blocks are clean reference flanks
    new_block_nums = [
        int(floor(new_path[i] / 2)) for i in range(1, len(new_path), 2)
    ]
    new_block_counts = Counter(new_block_nums)
    new_min_block = min(
        [b for b in new_block_nums if not new_blocks[b].is_insertion()])
    new_max_block = max(
        [b for b in new_block_nums if not new_blocks[b].is_insertion()])
    left_block, right_block = new_block_nums[0], new_block_nums[-1]
    # check that left block is minimum in reference, properly oriented, and not duplicated
    if left_block == new_min_block and new_path[0] % 2 == 0 and \
            new_block_counts[left_block] == 1:
        # trim the left flank to at most flank_size bp
        new_blocks[left_block].start = max(
            new_blocks[left_block].start,
            new_blocks[left_block].end - flank_size)
        has_left_flank = True
    else:
        has_left_flank = False
    # same check, mirrored, for the right flank
    if right_block == new_max_block and new_path[-1] % 2 == 1 and \
            new_block_counts[right_block] == 1:
        new_blocks[right_block].end = min(
            new_blocks[right_block].end,
            new_blocks[right_block].start + flank_size)
        has_right_flank = True
    else:
        has_right_flank = False
    return new_blocks, new_path, has_left_flank, has_right_flank
def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    """Partition [start, end) into blocks at breakpoints, excluding gaps.

    Parameters: *breakpoints* maps (lo, hi) location tuples to Breakpoint
    objects; *gaps* is a pyinter.IntervalSet of reference gaps to cut out of
    the blocks. Returns (blocks, gap_indices, left_breakpoints,
    right_breakpoints): the GenomeInterval blocks, the sorted block indices
    adjacent to a gap or region edge, and each block's flanking Breakpoints.
    Side effect: inserts a sentinel (end, end) entry into *breakpoints*.
    """
    # create list of blocks between breakpoints
    # while adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []
    # sentinel breakpoint at the region end so the final block gets emitted
    breakpoints[(end, end)] = Breakpoint((end, end))
    bploc = list(breakpoints.keys())
    bploc.sort()
    last_end = start
    last_breakpoint = Breakpoint((start, start))
    for bpl in bploc:
        breakpoint = breakpoints[bpl]
        # ignore breakpoints outside (start, end]
        if bpl[0] <= start or bpl[1] > end:
            continue
        # candidate block runs from the previous breakpoint to this one
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])
        iset.add(blockinterval)
        # carve reference gaps out of the candidate block
        adjusted_blocks = iset.difference(gaps)
        adjusted_blocks = sorted(list(adjusted_blocks))
        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))
        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:
                # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                # a trimmed edge means a gap abuts this block: record the gap
                # and synthesize a zero-width breakpoint at the trimmed edge
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(
                    GenomeInterval(chrom_name, ab.lower_value,
                                   ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(list(gap_indices))
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints
def test_get_insertion_overlap_positions():
    """get_insertion_overlap_positions() against hand-computed intervals."""
    blocks = [
        GenomeInterval(1, 0, 100),        # nodes 0, 1
        GenomeInterval(1, 100, 200),      # nodes 2, 3
        GenomeInterval(1, 210, 300),      # nodes 4, 5
        GenomeInterval(1, 350, 360),      # nodes 6, 7
        GenomeInterval(1, 370, 400),      # nodes 8, 9
        GenomeInterval(1, 0, 100, True),  # nodes 10, 11
        GenomeInterval(1, 0, 10, True)    # nodes 12, 13
    ]
    rlen = 50
    m = 20
    cases = [
        (list(range(10)), ()),
        ([0, 1, 10, 11, 2, 3], ((80, 170),)),
        ([0, 1, 2, 3, 10, 11, 2, 3], ((185, 275),)),
        ([0, 1, 2, 3, 12, 13, 2, 3], ()),
        ([0, 1, 2, 3, 4, 5, 10, 11, 6, 7], ((305, 395),)),
        ([0, 1, 2, 3, 4, 5, 12, 13, 6, 7], ()),
    ]
    for path, expected in cases:
        result, _, _ = get_insertion_overlap_positions(path, blocks, rlen, m)
        want = pyinter.IntervalSet()
        for lo, hi in expected:
            want.add(pyinter.open(lo, hi))
        print('truth: {0}\nresult: {1}\n'.format(want, result))
        assert result == want
    # second setup: two insertion blocks spliced into the path
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True)
    ]
    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    result, _, _ = get_insertion_overlap_positions(path, blocks, rlen, m)
    want = pyinter.IntervalSet()
    for lo, hi in [(130, 170), (355, 395)]:
        want.add(pyinter.open(lo, hi))
    print('truth: {0}\nresult: {1}\n'.format(want, result))
    assert result == want
def test_get_overlapping_blocks():
    """Exercise get_overlapping_blocks on two-block references.

    Verifies which reference blocks an alignment's blocks overlap, in both
    forward and reverse orientation, and the offset returned when
    find_offset=True.

    Fix: removed a dead assignment ``aln_gaps = [10, 10]`` that was
    immediately overwritten by ``aln_gaps = [0, 0, 0, 0]`` and never read.
    """
    blocks = [GenomeInterval('1', 0, 100), GenomeInterval('1', 100, 200)]

    # Alignment entirely before the first block: no overlaps.
    aln_blocks = [(-10, 0)]
    aln_gaps = [0, 0]
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
    assert (ov == [])

    # Alignment inside the first block.
    aln_blocks = [(10, 20)]
    aln_gaps = [0, 0]
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
    assert (ov == [1])
    assert (offset == -10)

    # Three aligned pieces spanning both reference blocks, no gaps.
    aln_blocks = [(10, 20), (110, 120), (200, 250)]
    aln_gaps = [0, 0, 0, 0]
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
    assert (ov == [1, 3])
    assert (offset == -10)
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        True, find_offset=True)
    assert (ov == [2, 0])
    assert (offset == 70)

    # Same aligned pieces but with leading gaps in the alignment.
    aln_gaps = [10, 10, 0, 0]
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
    assert (ov == [1, 3])
    assert (offset == 0)
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        True, find_offset=True)
    assert (ov == [2, 0])
    assert (offset == 70)

    # Alignment straddling the boundary between the two blocks.
    aln_blocks = [(-10, 0), (90, 110)]
    aln_gaps = [0, 0, 0]
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
    assert (ov == [1, 3])
    assert (offset == -(90 - 0) + 10)

    # Reference blocks separated by a small gap; one long aligned piece.
    blocks = [GenomeInterval('1', 0, 100), GenomeInterval('1', 110, 200)]
    aln_blocks = [(0, 111)]
    aln_gaps = [0, 0]
    # TODO(review): result of this call is never asserted -- add expected
    # values or drop the call.
    ov, offset = get_overlapping_blocks(blocks, aln_blocks, aln_gaps, 0,
                                        False, find_offset=True)
def sv_output(path1, path2, blocks, event1, event2, frac1, frac2, sv_list,
              complex_types, event_lh, ref_lh, next_best_lh,
              next_best_pathstring, num_paths, filter_criteria,
              filterstring_manual=None, id_extra='', output_vcf=False,
              reference=False, output_split_support=False):
    """Format called SVs for the two haplotype paths as tabular, VCF, and
    split-read-support output.

    For each haplotype (skipping the second when path1 == path2), builds one
    tab-separated summary line; optionally one VCF line per SV (two lines for
    BND records); and optionally one line per supporting split read.

    Parameters (as evidenced by use below):
        path1, path2: block-node paths for the two haplotypes.
        blocks: list of GenomeInterval (reference segments + insertions).
        event1, event2: per-haplotype events (carried in the loop, unused here).
        frac1, frac2: allele-fraction estimates reported in AF.
        sv_list: SV objects with .genotype in {'1/1', '1/0', '0/1'}.
        complex_types: pair of complex-event type labels.
        event_lh, ref_lh, next_best_lh: log-likelihoods for scoring.
        filter_criteria / filterstring_manual: filter config or override string.
        reference: FASTA handle passed to fetch_seq when output_vcf is set.

    Returns:
        (lines, vcflines, splitlines): summary text, list of VCF lines,
        split-support text.
    """
    lines = ''
    splitlines = ''
    vcflines = []
    # Haplotype-specific SV subsets by genotype.
    sv1 = [
        sv for sv in sv_list if sv.genotype == '1/1' or sv.genotype == '1/0'
    ]
    sv2 = [
        sv for sv in sv_list if sv.genotype == '1/1' or sv.genotype == '0/1'
    ]
    compound_het = (path1 != path2) and (len(sv1) > 0) and (len(sv2) > 0)
    is_het = (path1 != path2)
    num_paths = str(num_paths)
    for (k, path, event, svs, complex_type,
         frac) in [(0, path1, event1, sv1, complex_types[0], frac1),
                   (1, path2, event2, sv2, complex_types[1], frac2)]:
        # Homozygous call: the second pass would duplicate the first.
        if k == 1 and path1 == path2:
            continue
        if len(svs) == 0:
            continue
        chrom = blocks[int(floor(path1[0] / 2))].chrom
        # CLEANUP this code is duplicated up above -- should be merged
        id = '_'.join(svs[0].event_id.split(',')[0:2])
        if compound_het:
            id = id + '_' + str(k + 1)
        id += id_extra
        num_sv = len(svs)
        # FILTER column: merge per-SV filter strings unless overridden.
        if filterstring_manual is None:
            fs = sorted(
                set((get_filter_string(sv, filter_criteria) for sv in svs)))
            if all(x == 'PASS' for x in fs):
                filters = 'PASS'
            else:
                filters = ','.join(x for x in fs if x != 'PASS')
        else:
            filters = filterstring_manual
        # Median breakpoint positions per SV; overall event span.
        all_sv_bp1 = [int(floor(np.median(sv.bp1))) for sv in svs]
        all_sv_bp2 = [int(floor(np.median(sv.bp2))) for sv in svs]
        all_sv_bp = all_sv_bp1 + all_sv_bp2
        minbp, maxbp = min(all_sv_bp), max(all_sv_bp)
        total_span = maxbp - minbp
        # sv_span = maxbp - minbp
        # bp_cis = bp_ci for sv in svs
        # (bp1, bp2) in bp_cis
        sv_bp_joined = ';'.join(get_bp_string(sv) for sv in svs)
        sv_bp_uncertainty_joined = ';'.join(
            get_bp_uncertainty_string(sv) for sv in svs)
        sv_bp_ci = [get_bp_ci(sv) for sv in svs]
        svtypes = list(sv.type.split(':')[0]
                       for sv in svs)  # use DUP not DUP:TANDEM
        svtypes_joined = ','.join(svtypes)
        # Segment endpoints: midpoints between adjacent non-insertion blocks.
        nonins_blocks = [b for b in blocks if not b.is_insertion()]
        nni = len(nonins_blocks)
        # NOTE(review): the midpoint comprehension indexes ``blocks``, not
        # ``nonins_blocks`` -- looks inconsistent if insertion blocks are
        # interleaved; confirm insertions always come after the real blocks.
        block_bp = [nonins_blocks[0].start] + \
            [int(floor(np.median((blocks[i-1].end, blocks[i].start)))) for i in range(1, nni)] + \
            [nonins_blocks[-1].end]
        block_bp_joined = ','.join(str(x) for x in block_bp)
        block_bp_uncertainty = [0] + \
            [block_gap(blocks, 2*i) for i in range(1, nni)] + \
            [0]
        block_bp_uncertainty_joined = ','.join(
            str(x) for x in block_bp_uncertainty)
        # Blocks re-cut at the midpoint boundaries, for affected-length calc.
        blocks_midpoints = [
            GenomeInterval(chrom, block_bp[i], block_bp[i + 1])
            for i in range(nni)
        ]
        blocks_midpoints.extend([b for b in blocks if b.is_insertion()])
        len_affected = sv_affected_len(path, blocks_midpoints)
        pathstring = path_to_string(path, blocks=blocks)
        nblocks = len([b for b in blocks if not b.is_insertion()])
        refpath = list(range(2 * nblocks))
        ref_string = path_to_string(refpath, blocks=blocks)
        gt = 'HET' if is_het else 'HOM'
        insertion_lengths = [
            get_sv_ins(sv) for sv in svs if get_sv_ins(sv) > 0
        ]
        if len(insertion_lengths) == 0:
            inslen_joined = 'NA'
        else:
            inslen_joined = ','.join(str(l) for l in insertion_lengths)
        # Split-read / paired-end support counts.
        sr = list(sv.split_support for sv in svs)
        pe = list(sv.pe_support for sv in svs)
        sr_joined = ','.join(map(str, sr))
        pe_joined = ','.join(map(str, pe))
        # Likelihood-ratio scores vs reference and vs next-best structure.
        lhr = '%.2f' % (event_lh - ref_lh)
        lhr_next = '%.2f' % (event_lh - next_best_lh)
        frac_str = '%.3f' % frac
        line = '\t'.join(
            str(x)
            for x in (chrom, minbp, maxbp, id, svtypes_joined, complex_type,
                      num_sv, block_bp_joined, block_bp_uncertainty_joined,
                      ref_string, pathstring, len_affected, filters,
                      sv_bp_joined, sv_bp_uncertainty_joined, gt, frac_str,
                      inslen_joined, sr_joined, pe_joined, lhr, lhr_next,
                      next_best_pathstring, num_paths))
        # num_sv
        # block_bp_joined
        # block_bp_uncertainty_joined
        line += '\n'
        lines = lines + line
        if output_vcf:
            template = vcf_line_template()
            # Canonical INFO tag order for emitted VCF records.
            info_tags_ordered = [
                'SV_TYPE', 'HAPLOID_CN', 'COMPLEX_TYPE', 'MATE_ID', 'END',
                'CI_POS', 'CI_END', 'INS_LEN', 'SR', 'PE', 'SV_SPAN',
                'EVENT_SPAN', 'EVENT_START', 'EVENT_END',
                'EVENT_AFFECTED_LEN', 'EVENT_NUM_SV', 'REF_STRUCTURE',
                'ALT_STRUCTURE', 'SEGMENT_ENDPTS',
                'SEGMENT_ENDPTS_CIWIDTH', 'AF', 'SCORE_VS_REF',
                'SCORE_VS_NEXT', 'NEXT_BEST_STRUCTURE', 'NUM_PATHS'
            ]
            info_tags_ordering = {
                y: x
                for x, y in enumerate(info_tags_ordered)
            }
            for (i, sv) in enumerate(svs):
                info_list = []
                sv_chrom = sv.ref_chrom
                # pos
                pos = all_sv_bp1[i] + 1  # VCF is 1-indexed
                if num_sv > 1:
                    id_vcf = id + '_' + str(i + 1)
                else:
                    id_vcf = id
                ref_base = fetch_seq(reference, sv_chrom, pos - 1,
                                     pos)  # pysam is 0-indexed
                alt = '<{0}>'.format(sv.type)
                qual = '.'
                svtype = svtypes[i]
                info_list.append(('SV_TYPE', svtype))
                end = all_sv_bp2[i] + 1
                info_list.append(('END', end))
                block_bp_vcf = ','.join(str(x + 1) for x in block_bp)
                info_list.append(('SEGMENT_ENDPTS', block_bp_vcf))
                info_list.append(
                    ('SEGMENT_ENDPTS_CIWIDTH', block_bp_uncertainty_joined))
                # Insertions report their own length; others span pos..end.
                if svtype == 'INS':
                    svlen = sv.length
                else:
                    svlen = end - pos
                info_list.append(('SV_SPAN', svlen))
                info_list.append(('EVENT_SPAN', total_span))
                info_list.append(('EVENT_AFFECTED_LEN', len_affected))
                if svtype == 'DUP':
                    info_list.append(('HAPLOID_CN', sv.copynumber))
                # Breakpoint confidence intervals; '0,0' means exact.
                bp1_ci, bp2_ci = sv_bp_ci[i]
                bp1_ci_str = str(bp1_ci[0]) + ',' + str(bp1_ci[1])
                bp2_ci_str = str(bp2_ci[0]) + ',' + str(bp2_ci[1])
                if bp1_ci_str != '0,0':
                    info_list.append(('CI_POS', bp1_ci_str))
                if bp2_ci_str != '0,0' and svtype != 'INS':
                    info_list.append(('CI_END', bp2_ci_str))
                info_list.extend([
                    ('REF_STRUCTURE', ref_string),
                    ('ALT_STRUCTURE', pathstring), ('AF', frac_str),
                    ('SR', sr[i]), ('PE', pe[i]), ('SCORE_VS_REF', lhr),
                    ('SCORE_VS_NEXT', lhr_next),
                    ('NEXT_BEST_STRUCTURE', next_best_pathstring),
                    ('NUM_PATHS', num_paths), ('EVENT_START', minbp + 1),
                    ('EVENT_END', maxbp), ('EVENT_NUM_SV', num_sv)
                ])
                # FORMAT/GT
                format_str = 'GT'
                gt_vcf = sv.genotype
                if svtype != 'BND':
                    # write line
                    info_list.sort(key=lambda x: info_tags_ordering[x[0]])
                    info = ';'.join(
                        ['{0}={1}'.format(el[0], el[1]) for el in info_list])
                    line = template.format(chr=chrom,
                                           pos=pos,
                                           id=id_vcf,
                                           ref=ref_base,
                                           alt=alt,
                                           qual=qual,
                                           filter=filters,
                                           info=info,
                                           format_str=format_str,
                                           gt=gt_vcf)
                    vcflines.append(line)
                else:
                    # breakend type --> 2 lines in vcf
                    id_bnd1, id_bnd2 = id_vcf + 'A', id_vcf + 'B'
                    mateid_bnd1, mateid_bnd2 = id_bnd2, id_bnd1
                    orientation_bnd1, orientation_bnd2 = sv.bnd_orientation
                    pos_bnd1 = all_sv_bp1[i] + 1
                    pos_bnd2 = all_sv_bp2[i] + 1
                    # '-' orientation anchors the base before the breakend.
                    if orientation_bnd1 == '-':
                        pos_bnd1 -= 1
                    if orientation_bnd2 == '-':
                        pos_bnd2 -= 1
                    ref_bnd1 = fetch_seq(reference, sv_chrom, pos_bnd1 - 1,
                                         pos_bnd1)
                    ref_bnd2 = fetch_seq(reference, sv_chrom, pos_bnd2 - 1,
                                         pos_bnd2)
                    alt_bnd1 = bnd_alt_string(orientation_bnd1,
                                              orientation_bnd2, sv.ref_chrom,
                                              pos_bnd2, ref_bnd1)
                    alt_bnd2 = bnd_alt_string(orientation_bnd2,
                                              orientation_bnd1, sv.ref_chrom,
                                              pos_bnd1, ref_bnd2)
                    ctype_str = complex_type.upper().replace('.', '_')
                    info_list_bnd1 = [('MATE_ID', mateid_bnd1)]
                    info_list_bnd2 = [('MATE_ID', mateid_bnd2)]
                    if bp1_ci_str != '0,0':
                        info_list_bnd1.append(('CI_POS', bp1_ci_str))
                    if bp2_ci_str != '0,0':
                        info_list_bnd2.append(('CI_POS', bp2_ci_str))
                    if sv.bnd_ins > 0:
                        info_list_bnd1.append(('INS_LEN', sv.bnd_ins))
                        info_list_bnd2.append(('INS_LEN', sv.bnd_ins))
                    # Tags shared by both mate records.
                    common_tags = [('SV_TYPE', svtype),
                                   ('COMPLEX_TYPE', ctype_str),
                                   ('EVENT_SPAN', total_span),
                                   ('EVENT_START', minbp + 1),
                                   ('EVENT_END', maxbp),
                                   ('EVENT_AFFECTED_LEN', len_affected),
                                   ('EVENT_NUM_SV', num_sv),
                                   ('SEGMENT_ENDPTS', block_bp_vcf),
                                   ('SEGMENT_ENDPTS_CIWIDTH',
                                    block_bp_uncertainty_joined),
                                   ('REF_STRUCTURE', ref_string),
                                   ('ALT_STRUCTURE', pathstring),
                                   ('AF', frac_str), ('SR', sr[i]),
                                   ('PE', pe[i]), ('SCORE_VS_REF', lhr),
                                   ('SCORE_VS_NEXT', lhr_next),
                                   ('NEXT_BEST_STRUCTURE',
                                    next_best_pathstring),
                                   ('NUM_PATHS', num_paths)]
                    info_list_bnd1.extend(common_tags)
                    info_list_bnd2.extend(common_tags)
                    info_list_bnd1.sort(key=lambda x: info_tags_ordering[x[0]])
                    info_list_bnd2.sort(key=lambda x: info_tags_ordering[x[0]])
                    info_bnd1 = ';'.join([
                        '{0}={1}'.format(el[0], el[1])
                        for el in info_list_bnd1
                    ])
                    info_bnd2 = ';'.join([
                        '{0}={1}'.format(el[0], el[1])
                        for el in info_list_bnd2
                    ])
                    line1 = template.format(chr=chrom,
                                            pos=pos_bnd1,
                                            id=id_bnd1,
                                            ref=ref_bnd1,
                                            alt=alt_bnd1,
                                            qual=qual,
                                            filter=filters,
                                            info=info_bnd1,
                                            format_str=format_str,
                                            gt=gt_vcf)
                    line2 = template.format(chr=chrom,
                                            pos=pos_bnd2,
                                            id=id_bnd2,
                                            ref=ref_bnd2,
                                            alt=alt_bnd2,
                                            qual=qual,
                                            filter=filters,
                                            info=info_bnd2,
                                            format_str=format_str,
                                            gt=gt_vcf)
                    vcflines.append(line1)
                    vcflines.append(line2)
        if output_split_support:
            split_line_list = []
            # Expected breakpoint orientations by split type.
            bp_orientations = {
                'Del': ('-', '+'),
                'Dup': ('+', '-'),
                'InvL': ('-', '-'),
                'InvR': ('+', '+')
            }
            bp_idx = 1
            for sv in svs:
                bp1 = str(int(floor(np.median(sv.bp1))))
                bp2 = str(int(floor(np.median(sv.bp2))))
                for split in sv.supporting_splits:
                    # split_type is e.g. 'Del+' -- last char is the strand.
                    orientation = bp_orientations[split.split_type[:-1]]
                    orientation = ','.join(orientation)
                    strand = split.split_type[-1]
                    qname = split.aln.qname
                    seq = split.aln.seq
                    mapq = str(split.aln.mapq)
                    if split.mate is not None:
                        mate_seq = split.mate.seq
                        mate_mapq = str(split.mate.mapq)
                        mate_has_split = str(split.mate_has_split)
                    else:
                        mate_seq = 'NA'
                        mate_mapq = 'NA'
                        mate_has_split = 'NA'
                    line = '\t'.join(
                        str(x)
                        for x in (id, block_bp_joined, ref_string, pathstring,
                                  sv_bp_joined, 'split', qname, bp_idx, bp1,
                                  bp2, orientation, qname, strand, seq, mapq,
                                  mate_seq, mate_mapq, mate_has_split))
                    split_line_list.append(line)
                bp_idx += 1
            if len(split_line_list) > 0:
                splitlines = splitlines + '\n'.join(split_line_list) + '\n'
    return lines, vcflines, splitlines