Example #1
0
def test_get_blocked_alignment():
    """Exercise get_blocked_alignment on plain and split reads.

    Requires a local BAM file; checks both read orientations and the
    is_rf (read orientation flip) option.
    """
    bam = pysam.AlignmentFile(
        '/home/jgarthur/sv/analysis/alignments/bwa_mem/short-reads/jun_jul.mdup.merge.mdup.bam',
        'rb')
    blocks = [
        GenomeInterval('1', 0, 100),
        GenomeInterval('1', 110, 210),
        GenomeInterval('1', 210, 2000),
    ]

    # Simple 50M read starting at position 0 of the first block.
    read = pysam.AlignedSegment()
    read.pos = 0
    read.cigarstring = '50M'
    read.seq = 'A' * 50
    read.is_reverse = False
    print(get_blocked_alignment(read, blocks, 0, bam))
    assert get_blocked_alignment(read, blocks, 0, bam) == ([1], 0)
    assert get_blocked_alignment(read, blocks, 0, bam, is_rf=True) == ([0], 50)
    read.is_reverse = True
    print(get_blocked_alignment(read, blocks, 0, bam))
    assert get_blocked_alignment(read, blocks, 0, bam) == ([0], 50)
    assert get_blocked_alignment(read, blocks, 0, bam, is_rf=True) == ([1], 0)

    # Split read: 20M20S here plus a supplementary alignment via the SA tag.
    read = pysam.AlignedSegment()
    read.rname = 0
    read.pos = 90
    read.seq = 'A' * 40
    read.cigarstring = '20M20S'
    read.set_tag('SA', '1,191,-,20M20S,60,0;', 'Z')
    print(get_blocked_alignment(read, blocks, 0, bam))
    assert get_blocked_alignment(read, blocks, 0, bam) == ([1, 2], -90)
    assert get_blocked_alignment(read, blocks, 0, bam, is_rf=True) == ([3, 0], -80)
Example #2
0
def genome_blocks_gaps(blocks, path):
    """Interleave the blocks visited by `path` with gap intervals.

    Returns [gap, block, gap, block, ..., gap]: the list opens with the gap
    before the first path block, alternates blocks with gaps sized as the
    floored mean of the two adjacent block gaps, and closes with the gap
    after the last path block.
    """
    chrom = blocks[0].chrom

    def make_gap(size):
        # Gap intervals carry only a length; their genomic position is unused.
        return GenomeInterval(chrom, 0, size, is_gap=True)

    out = [make_gap(block_gap(blocks, path[0])),
           blocks[int(floor(path[0] / 2))]]
    for i in range(1, len(path) - 1, 2):
        mean_gap = int(
            floor((block_gap(blocks, path[i]) +
                   block_gap(blocks, path[i + 1])) / 2))
        out.append(make_gap(mean_gap))
        out.append(blocks[int(floor(path[i + 1] / 2))])
    out.append(make_gap(block_gap(blocks, path[-1])))
    return out
Example #3
0
def test_intersects():
    """GenomeInterval.intersects treats intervals as half-open [start, end)."""
    interval = GenomeInterval('20', 10, 20)
    cases = [
        ((20, 30), False),   # touches right end only
        ((19, 20), True),
        ((0, 10), False),    # touches left end only
        ((0, 11), True),
        ((11, 12), True),    # fully contained
    ]
    for query, expect_hit in cases:
        assert bool(interval.intersects(query)) == expect_hit
Example #4
0
def simplify_blocks_diploid(blocks, path1, path2):
    """Merge adjacent genome blocks whose shared boundary is untouched by
    both paths, and renumber the paths to match.

    Path entries are block-graph nodes: nodes 2*b and 2*b+1 are the left and
    right ends of block b. Blocks b and b+1 are merged when, across both
    paths, node 2b+1 is only ever adjacent to node 2b+2 and vice versa --
    i.e. neither haplotype rearranges across that boundary.

    Returns:
        (new_blocks, new_path1, new_path2) where new_blocks contains the
        merged genome blocks followed by copies of any insertion blocks,
        and both paths are rewritten to index into new_blocks.
    """
    # Block indices visited by each path (node n belongs to block n // 2).
    block_nums = [int(floor(path1[i] / 2)) for i in range(1, len(path1), 2)]
    block_nums.extend(
        [int(floor(path2[i] / 2)) for i in range(1, len(path2), 2)])

    # neighbors[node] = set of nodes seen adjacent to `node` in either path;
    # sentinel -1 marks a path start and -2 a path end.
    neighbors = defaultdict(set)
    for path in (path1, path2):
        neighbors[path[0]].add(-1)
        neighbors[path[-1]].add(-2)
        for i in range(1, len(path) - 1):
            if i % 2 == 0:
                neighbors[path[i]].add(path[i - 1])
            else:
                neighbors[path[i]].add(path[i + 1])

    # print(neighbors)
    # Genome blocks span min_block..max_block; insertion blocks are handled
    # separately at the end.
    min_block = min([b for b in block_nums if not blocks[b].is_insertion()])
    max_block = max([b for b in block_nums if not blocks[b].is_insertion()])
    ins_blocks = [b for b in block_nums if blocks[b].is_insertion()]
    new_blocks = []
    path_map = {}        # old node id -> new node id (None = merged away)
    idx = 0              # index of the next block appended to new_blocks
    merging = False      # True while extending a run of merged blocks
    for b in range(min_block, max_block + 1):
        if not merging:
            # Remember where the current (possibly merged) block starts.
            block_start = blocks[b].start
        right_node = 2 * b + 1
        # Boundary between b and b+1 is unused iff its two nodes only ever
        # see each other in both paths.
        if all(n == right_node+1 for n in neighbors[right_node]) and \
           all(n == right_node for n in neighbors[right_node+1]) and \
           b < max_block:
            # combine after
            merging = True
            path_map[2 * b] = None
            path_map[2 * b + 1] = None
        else:
            newblock = GenomeInterval(blocks[b].chrom, block_start,
                                      blocks[b].end)
            new_blocks.append(newblock)
            path_map[2 * b] = 2 * idx
            path_map[2 * b + 1] = 2 * idx + 1
            merging = False
            idx += 1
    # Insertion blocks are copied unchanged after the merged genome blocks.
    # NOTE(review): a block index repeated in ins_blocks overwrites its
    # path_map entry, collapsing all occurrences onto the last copy --
    # confirm duplicates are intended to behave this way.
    new_blocks.extend([deepcopy(blocks[b]) for b in ins_blocks])
    for b in ins_blocks:
        path_map[2 * b] = 2 * idx
        path_map[2 * b + 1] = 2 * idx + 1
        idx += 1
    # Rewrite both paths, dropping nodes that were merged away.
    new_path1 = [path_map[p] for p in path1 if path_map[p] is not None]
    new_path2 = [path_map[p] for p in path2 if path_map[p] is not None]

    # print(new_path1)
    # print(new_path2)
    # print(new_blocks)

    return new_blocks, new_path1, new_path2
Example #5
0
def test_genome_blocks_gaps():
    """Print genome_blocks_gaps output for a full path and a deletion path."""
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 105, 200),
        GenomeInterval(1, 200, 300),
        GenomeInterval(1, 305, 400),
        GenomeInterval(1, 420, 500),
    ]
    print(blocks)

    # Full reference path, then a path skipping blocks 1 and 3.
    for path in (list(range(10)), [0, 1, 4, 5, 8, 9]):
        print(path)
        print(genome_blocks_gaps(blocks, path))
        print('')
Example #6
0
def test_affected_len():
    """Smoke-test align_strings, then check sv_affected_len on known paths."""
    for left, right in (('abcde', 'abdef'), ('aab', 'ab')):
        print(align_strings(left, right))
        print('-' * 50)
    print(align_strings('sjdioa', 'ssjjdioa'))

    print('=' * 50)

    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 100, 150),
        GenomeInterval(1, 150, 300),
        GenomeInterval(1, 300, 325),
        GenomeInterval(1, 325, 425),
    ]

    # (expected affected length, path) -- reference is ABCDE.
    cases = [
        (0, range(len(blocks) * 2)),                         # ABCDE
        (25, [0, 1, 2, 3, 4, 5, 7, 6, 8, 9]),                # ABCD'E
        (225, [0, 1, 8, 9]),                                 # AE
        (175, [0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9]),   # ABCDCDE
        (225, [0, 1, 3, 2, 4, 5, 4, 5, 8, 9]),               # AB'CCE
    ]
    for expected, path in cases:
        assert expected == sv_affected_len(path, blocks)

    # Add a 500 bp de novo insertion block and re-check.
    blocks.append(GenomeInterval(None, 0, 500, is_de_novo=True))
    cases = [
        (0, range((len(blocks) - 1) * 2)),
        (500, [0, 1, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9]),
        (525, [0, 1, 10, 11, 2, 3, 4, 5, 8, 9]),
    ]
    for expected, path in cases:
        assert expected == sv_affected_len(path, blocks)
Example #7
0
def test_get_gap_overlap_positions():
    """Check read-gap overlap positions against hand-computed intervals."""
    rlen = 50

    def check(path, blocks, expected_intervals):
        # Build the expected IntervalSet of open intervals and compare.
        result = get_gap_overlap_positions(path, blocks, rlen)
        expected = pyinter.IntervalSet()
        for lo, hi in expected_intervals:
            expected.add(pyinter.open(lo, hi))
        print('truth: {0}\nresult: {1}\n'.format(expected, result))
        assert result == expected

    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 100, 200),
        GenomeInterval(1, 249, 300),
        GenomeInterval(1, 350, 400),
        GenomeInterval(1, 500, 600),
    ]
    # Reference path, then the same path with block 3 inverted.
    check([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], blocks, [(299, 301), (399, 451)])
    check([0, 1, 2, 3, 4, 5, 7, 6, 8, 9], blocks, [(299, 326), (424, 451)])

    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        # NOTE(review): chrom is 0 here while the others use 1 -- confirm
        # this is intentional.
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True),
    ]
    # Path with two insertion blocks interleaved.
    check([0, 1, 6, 7, 2, 3, 8, 9, 4, 5], blocks,
          [(99, 131), (169, 201), (349, 356), (394, 401)])
Example #8
0
def sv_classify_test():
    """Print classify_svs output for a collection of example paths."""
    blocks = [GenomeInterval('1', 100 * i, 100 * i + 100) for i in range(10)]
    num_genome_blocks = 10
    blocks.append(GenomeInterval('1', 0, 100, True))

    example_paths = [
        [0, 1, 2, 3, 6, 7, 8, 9],
        [0, 1, 2, 3, 20, 21, 4, 5],
        [0, 1, 2, 3, 5, 4, 6, 7],
        [0, 1, 2, 3, 7, 6, 5, 4, 8, 9],
        [0, 1, 2, 3, 2, 3, 4, 5],
        [0, 1, 0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 2, 3, 2, 3, 4, 5],
        [0, 1, 2, 3, 2, 3, 4, 5, 20, 21, 6, 7],
    ]
    for path in example_paths:
        print(path)
        print(classify_svs(path, blocks, num_genome_blocks))
Example #9
0
def test_plot_rearrangement():
    """Render rearrangement plots for several example paths.

    Output PNGs are written under the user's ~/tmp/ directory.
    """
    blocks = [
        GenomeInterval('1', 0, 1000),
        GenomeInterval('1', 1010, 1012),
        GenomeInterval('1', 1505, 2000),
        GenomeInterval('1', 2000, 4000),
        GenomeInterval('1', 4000, 20000),
        GenomeInterval('1', 0, 10, is_de_novo=True),
        GenomeInterval('1', 0, 1000, is_de_novo=True)
    ]
    # BUG FIX: os.path.join does not expand '~'; without expanduser the
    # plots were written to a literal './~/tmp' directory.
    outdir = os.path.expanduser('~/tmp/')

    p1 = [0, 1, 4, 5, 6, 7]
    fn = 'ACD.png'
    print(fn)
    plot_rearrangement(os.path.join(outdir, fn), blocks, 0, 4, p1, None, True)

    p1 = [0, 1, 10, 11, 4, 5, 6, 7]
    fn = 'AICD.png'
    print(fn)
    plot_rearrangement(os.path.join(outdir, fn), blocks, 0, 4, p1, None, True)

    p1 = [0, 1, 2, 3, 3, 2, 6, 7, 8, 9]
    p2 = [0, 1, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9]
    fn = 'ABB-DE_ACBCDE.png'
    print(fn)
    plot_rearrangement(os.path.join(outdir, fn), blocks, 0, 4, p1, p2, True)

    # p2 carries over unchanged from the previous case (the original
    # 'p2 = p2' self-assignments were dead code and have been removed).
    p1 = [0, 1, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 2, 3, 6, 7, 7, 6, 8, 9]
    fn = 'ABB--.png'
    print(fn)
    plot_rearrangement(os.path.join(outdir, fn), blocks, 0, 4, p1, p2, True)

    blocks = [GenomeInterval('1', i, i + 100) for i in range(0, 2000, 100)]
    p1 = list(range(0, 39))
    fn = 'ABCD---.png'
    print(fn)
    plot_rearrangement(os.path.join(outdir, fn), blocks, 0, 19, p1, p2, True)
Example #10
0
def test_altered_reference_sequence():
    """Check altered_reference_sequence for deletion, duplication, inversion,
    and insertion paths.

    Expected sequences are rebuilt with fetch_seq against the same local
    GRCh37 reference, so the test verifies coordinate arithmetic rather
    than raw sequence content. out unpacks as:
    (sequences, coordinate ranges, <unused here>, offsets).
    """
    ref = pysam.FastaFile('/home/jgarthur/sv/reference/GRCh37.fa')
    # Ten contiguous 100 bp blocks on chr20 from 100000, plus one
    # 1000 bp insertion block.
    blocks = [
        GenomeInterval('20', 100000 + 100 * i, 100000 + 100 * (i + 1))
        for i in range(10)
    ] + [GenomeInterval('20', 0, 1000, True)]

    # Paths named for the rearrangement they encode (node n = block n // 2;
    # nodes 20/21 refer to the insertion block).
    refpath = [0, 1, 2, 3, 4, 5, 6, 7]
    delpath = [0, 1, 2, 3, 6, 7, 8, 9]
    del2path = [0, 1, 2, 3, 8, 9, 10, 11]
    inspath = [0, 1, 2, 3, 20, 21, 4, 5, 6, 7]
    duppath = [0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9]
    dup2path = [0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11]
    dupend = [0, 1, 2, 3, 4, 5, 4, 5]
    dupstartdel = [0, 1, 0, 1, 4, 5]
    invpath = [0, 1, 2, 3, 5, 4, 6, 7, 8, 9]
    inv2path = [0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11]
    dduppath = [0, 1, 2, 3, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11]

    flank_size = 1000

    # Reference path: no altered sequence is produced.
    out = altered_reference_sequence(refpath, blocks, ref, flank_size)
    assert (out[0] == [])
    # Single-block deletion: 200 bp flank on each side of the removed block.
    out = altered_reference_sequence(delpath, blocks, ref, flank_size)
    print(len(out[0][0]))
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100300, 100300 + 200)))
    assert (out[1][0] == [(0, 200), (200, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [100, 0])
    # Two-block deletion.
    out = altered_reference_sequence(del2path, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100400, 100400 + 200)))
    assert (out[1][0] == [(0, 200), (200, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [200, 0])
    # Tandem duplication of one block.
    out = altered_reference_sequence(duppath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100300, 100300 + 200)))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400), (400, 600)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0, 0])
    # Duplication at the end of the path (no right flank beyond the copy).
    out = altered_reference_sequence(dupend, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100300) +
                          fetch_seq(ref, '20', 100200, 100300)))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0])
    # Two-block tandem duplication (sequence check only).
    out = altered_reference_sequence(dup2path, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100200 - 200, 100200) +
                          fetch_seq(ref, '20', 100200, 100400) +
                          fetch_seq(ref, '20', 100200, 100400) +
                          fetch_seq(ref, '20', 100400, 100400 + 200)))
    # Duplication of the first block followed by a deletion.
    out = altered_reference_sequence(dupstartdel, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 100100) * 2 +
                          fetch_seq(ref, '20', 100200, 100200 + 100)))
    assert (out[1][0] == [(0, 100), (100, 200), (200, 300)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 100, 0])
    # Inversions: inverted segment appears reverse-complemented.
    out = altered_reference_sequence(invpath, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100200 - 200, 100200) +
        reverse_complement(fetch_seq(ref, '20', 100200, 100300)) +
        fetch_seq(ref, '20', 100300, 100300 + 200)))
    out = altered_reference_sequence(inv2path, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100200 - 200, 100200) +
        reverse_complement(fetch_seq(ref, '20', 100200, 100400)) +
        fetch_seq(ref, '20', 100400, 100400 + 200)))
    # Insertion: output splits into the two flanking sequences.
    out = altered_reference_sequence(inspath, blocks, ref, flank_size)
    assert (out[0] == [
        fetch_seq(ref, '20', 100200 - 200, 100200),
        fetch_seq(ref, '20', 100200, 100200 + 200)
    ])
    # Dispersed duplication.
    out = altered_reference_sequence(dduppath, blocks, ref, flank_size)
    assert (out[0][0] == fetch_seq(ref, '20', 100200 - 200, 100200) +
            fetch_seq(ref, '20', 100300, 100400) +
            fetch_seq(ref, '20', 100200, 100400 + 200))
    assert (out[1][0] == [(0, 200), (200, 300), (300, 400), (400, 500),
                          (500, 700)])
    assert (out[2] == [])
    assert (out[3][0] == [0, 0, 0, 0, 0])

    # Second block set with small inter-block gaps: flanks now cross gap
    # midpoints (e.g. 101013 / 101128 rather than block boundaries).
    blocks = [
        GenomeInterval('20', 100000, 101000),
        GenomeInterval('20', 101025, 101125),
        GenomeInterval('20', 101130, 102500)
    ]
    refpath = [0, 1, 2, 3, 4, 5]
    delpath = [0, 1, 4, 5]
    duppath = [0, 1, 2, 3, 2, 3, 4, 5]
    invpath = [0, 1, 3, 2, 4, 5]
    out = altered_reference_sequence(refpath, blocks, ref, flank_size)
    assert (out[0] == [])
    out = altered_reference_sequence(delpath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 101013) +
                          fetch_seq(ref, '20', 101128, 102130)))
    out = altered_reference_sequence(duppath, blocks, ref, flank_size)
    assert (out[0][0] == (fetch_seq(ref, '20', 100000, 101013) +
                          fetch_seq(ref, '20', 101013, 101128) * 2 +
                          fetch_seq(ref, '20', 101128, 102130)))
    out = altered_reference_sequence(invpath, blocks, ref, flank_size)
    assert (out[0][0] == (
        fetch_seq(ref, '20', 100000, 101013) +
        reverse_complement(fetch_seq(ref, '20', 101013, 101128)) +
        fetch_seq(ref, '20', 101128, 102130)))
Example #11
0
def test_simplify_blocks():
    """simplify_blocks should reduce each path to its canonical form.

    Each expected value is the (path, left_flank_ok, right_flank_ok) tail
    of the simplify_blocks result tuple.
    """
    blocks = [GenomeInterval(1, 100 * i, 100 * (i + 1)) for i in range(20)]
    blocks.append(GenomeInterval(1, 0, 100, is_de_novo=True))
    blocks.append(GenomeInterval(1, 0, 100, is_de_novo=True))

    def check(path, expected):
        assert simplify_blocks(blocks, path, flank_size=100)[1:] == expected

    # deletions
    check([0, 1, 4, 5], ([0, 1, 4, 5], True, True))
    check([0, 1, 2, 3, 6, 7], ([0, 1, 4, 5], True, True))
    check([0, 1, 2, 3, 8, 9], ([0, 1, 4, 5], True, True))
    # inversions
    check([0, 1, 3, 2, 4, 5], ([0, 1, 3, 2, 4, 5], True, True))
    check([0, 1, 2, 3, 5, 4, 6, 7, 8, 9], ([0, 1, 3, 2, 4, 5], True, True))
    check([0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11],
          ([0, 1, 3, 2, 4, 5], True, True))
    # duplications
    check([0, 1, 2, 3, 2, 3, 4, 5], ([0, 1, 2, 3, 2, 3, 4, 5], True, True))
    check([0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9],
          ([0, 1, 2, 3, 2, 3, 4, 5], True, True))
    check([0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 7, 8, 9],
          ([0, 1, 0, 1, 2, 3, 4, 5, 4, 5], False, False))
    # dispersed duplications
    check([0, 1, 4, 5, 2, 3, 4, 5, 6, 7],
          ([0, 1, 4, 5, 2, 3, 4, 5, 6, 7], True, True))
    check([0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
          ([0, 1, 4, 5, 2, 3, 4, 5, 6, 7], True, True))
    print(
        simplify_blocks(blocks, [0, 1, 5, 4, 2, 3, 4, 5, 6, 7],
                        flank_size=100))
    check([0, 1, 5, 4, 2, 3, 4, 5, 6, 7],
          ([0, 1, 5, 4, 2, 3, 4, 5, 6, 7], True, True))
    check([0, 1, 2, 3, 11, 10, 9, 8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
          ([0, 1, 5, 4, 2, 3, 4, 5, 6, 7], True, True))
    # insertions (nodes 40-43 refer to the de novo blocks)
    check([0, 1, 2, 3, 40, 41, 4, 5, 42, 43, 6, 7],
          ([0, 1, 6, 7, 2, 3, 8, 9, 4, 5], True, True))
    check([0, 1, 2, 3, 40, 41, 42, 43, 4, 5, 6, 7],
          ([0, 1, 4, 5, 6, 7, 2, 3], True, True))
    # some other cases
    check([2, 3, 0, 1, 2, 3], ([2, 3, 0, 1, 2, 3], False, False))
    check([0, 1, 2, 3, 0, 1], ([0, 1, 2, 3, 0, 1], False, False))
    check([0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 8, 9, 2, 3, 10, 11],
          ([0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 8, 9, 2, 3, 10, 11],
           True, True))
Example #12
0
def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False, refgapfile=None,
                        caller=None, flank_size=1000, verbosity=0):
    """Convert SV calls from a generic VCF into block/path records.

    Each non-header record is parsed into an SV type; DEL/INV/DUP/INS events
    become (paths, blocks, breakpoints, ...) tuples handed to
    do_sv_processing, while all other SVTYPEs (e.g. TRA, BND) are skipped
    and counted in the log.

    Args:
        vcffile: input VCF path; records must have exactly one sample column.
        outdir: output directory, created if missing.
        reffile: reference file forwarded to do_sv_processing.
        filter_gaps: if True, skip events overlapping reference gaps.
        refgapfile: genome gap annotations (required when filter_gaps=True).
        caller: 'lumpy', 'pindel', or None; enables caller-specific parsing.
        flank_size: bp of flank added around the outermost breakpoints.
        verbosity: verbosity forwarded to create_blocks.
    """
    # FIX: create the directory directly instead of shelling out with
    # os.system (portable, no quoting/injection issues).
    os.makedirs(outdir, exist_ok=True)

    data = []
    svtype_skipped = {}         # SVTYPE -> number of skipped records
    seen_coords_count = {}      # (start, end) -> multiplicity, for id suffixes
    skipped_refgap = 0
    write_extra = False         # need to write FORMAT or INFO to file?

    # FIX: the original opened vcffile a second time here and leaked the
    # first handle; a single context-managed open suffices.
    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t') for line in vcf if line[0] != '#']

    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom) for chrom in chroms}
    else:
        chrom_gaps = None

    log = open(os.path.join(outdir, 'convert_{0}.log'.format(vcffile)), 'w')
    try:
        for toks in toks_list:
            # NOTE not parsing qual; do filtering beforehand for DELLY.
            # (Fields renamed from id/ref/format to avoid shadowing builtins.)
            (chrom, pos, _vcf_id, _ref_allele, _alt, _qual, filterstring,
             info, format_str, sample1) = toks

            # VCF is 1-indexed, but specifies pos/end positions
            # which are to the left of breakpoints, so no adjustment
            pos = int(pos)

            tags = info.split(';')
            if 'PRECISE' in tags:
                filterstring += ':PRECISE'
            elif 'IMPRECISE' in tags:
                filterstring += ':IMPRECISE'
            elif caller == 'lumpy':  # only includes tags for imprecise events
                filterstring += ':PRECISE'
            tags = [t for t in tags if '=' in t]
            tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
            end = int(tagd.get('END', -99999))
            svtype = tagd['SVTYPE']
            if caller == 'pindel' and svtype == 'INS':
                inslen = int(tagd['SVLEN'])
            else:
                inslen = int(tagd.get('INSLEN', 0))

            # Breakpoint confidence intervals (caller-specific conventions).
            if caller == 'pindel':
                homlen = int(tagd['HOMLEN'])
                if pos + homlen > end or svtype == 'INS':
                    print('pos + homlen > end: positions {0}'.format((pos, end)))
                    cipos = (0, 0)
                    ciend = (0, 0)
                else:
                    cipos = (0, homlen)
                    ciend = (0, homlen)
            else:
                if 'CIPOS95' in tagd:   # LUMPY
                    tmp = tagd['CIPOS95'].split(',')
                    cipos = (int(tmp[0]), int(tmp[1]))
                elif 'CIPOS' in tagd:
                    tmp = tagd['CIPOS'].split(',')
                    cipos = (int(tmp[0]), int(tmp[1]))
                else:
                    cipos = (0, 0)
                if 'CIEND95' in tagd:   # LUMPY
                    tmp = tagd['CIEND95'].split(',')
                    ciend = (int(tmp[0]), int(tmp[1]))
                elif 'CIEND' in tagd:
                    tmp = tagd['CIEND'].split(',')
                    ciend = (int(tmp[0]), int(tmp[1]))
                else:
                    ciend = (0, 0)
            split_support = int(tagd.get('SR', 0))
            pe_support = int(tagd.get('PE', 0))
            # lumpy STRANDS only relevant for inversions
            if caller == 'lumpy' and svtype == 'INV':
                tmp = tagd['STRANDS'].split(',')
                tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
                tagd['INV_PLUS'] = tmpd['++']
                tagd['INV_MINUS'] = tmpd['--']
            tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                         'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
            tagd_extra = {k: v for (k, v) in tagd.items() if k not in tagd_used}

            tags2 = {k: v for (k, v) in zip(format_str.split(':'), sample1.split(':'))}
            if 'AD' in tags2:       # pindel
                split_support = int(tags2['AD'].split(',')[1])

            gt = tags2['GT']
            # Genotype -> zygosity; missing / hom-ref genotypes are flagged
            # in filterstring but treated as homozygous for path building.
            if gt == './.' or gt == '.|.':
                is_het = False
                filterstring += ':NOGT'
            elif gt in ('0/0', '0|0'):
                is_het = False
                filterstring += ':ZEROGT'
            elif gt in ('0/1', '1/0', '0|1', '1|0'):
                is_het = True
            else:
                assert gt in ('1/1', '1|1')
                is_het = False

            tags2_used = ('AD', 'SR', 'PE', 'SU')
            tags2_extra = {k: v for (k, v) in tags2.items() if k not in tags2_used}
            if len(tagd_extra) + len(tags2_extra) > 0:
                write_extra = True

            # Map the SV type onto a rearranged path and its reference path.
            if svtype == 'DEL':
                path = (0, 1, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'Del'
            elif svtype == 'INV':
                path = (0, 1, 3, 2, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'InvL'
            elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
                path = (0, 1, 2, 3, 2, 3, 4, 5)
                refpath = (0, 1, 2, 3, 4, 5)
                supptype = 'Dup'
            elif svtype == 'INS':
                # INSERTIONS parse inslen, add insertion block to blocks
                path = (0, 1, 4, 5, 2, 3)
                refpath = (0, 1, 2, 3)
                supptype = 'Ins'
            else:
                # skipping delly TRA
                # skipping BND events as they may be ambiguous, in terms of the path
                svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
                continue

            # check ref gap overlap
            if filter_gaps and end > pos:
                sv_interval = pyinter.closedopen(pos, end)
                sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
                if len(sv_gap_intersection) > 0:
                    skipped_refgap += 1
                    continue

            # create breakpoints and blocks, keeping in mind uncertainty
            # and possible insertion
            if caller == 'lumpy' and svtype != 'INS':
                # lumpy intervals are not symmetric. POS and END are each
                # the "best guess" for the breakpoints
                bp = [(pos, pos), (end, end)]
            elif svtype != 'INS':
                if (pos + cipos[1] < end + ciend[0]):
                    bp = [(pos + cipos[0], pos + cipos[1]),
                          (end + ciend[0], end + ciend[1])]
                else:
                    # CIs overlap: fall back to point breakpoints and flag it.
                    bp = [(pos, pos), (end, end)]
                    filterstring += ':BPOVERLAP'
            else:
                if cipos[1] > cipos[0]:
                    bp = [(pos + cipos[0], pos + cipos[1])]
                else:
                    bp = [(pos, pos)]
            pe = [(x, supptype) for x in range(pe_support)]
            # TODO SupportingSplit
            splits = []
            for i in range(split_support):
                aln_tmp = pysam.AlignedSegment()
                # NOTE(review): pysam qname is normally a str -- confirm an
                # int is accepted here.
                aln_tmp.qname = i
                aln_tmp.is_read1 = True
                split_type = supptype + '+'
                splits.append(SupportingSplit(aln_tmp, None, None, None, None, split_type))
            breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
            slop_left, slop_right = flank_size, flank_size
            start = bp[0][0] - slop_left
            end = bp[-1][1] + slop_right
            cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom, start, end, verbosity)
            blocks, _, left_bp, right_bp = cbout

            if svtype == 'INS':
                blocks.append(GenomeInterval(chrom, 0, inslen, is_de_novo=True))

            # Het events keep the reference path as the second haplotype.
            paths = [path, refpath] if is_het else [path, path]
            score = 0

            # Disambiguate repeated coordinates with suffixes '', 'b', 'c', ...
            coords = (start, end)
            scc = seen_coords_count.get(coords, 0)
            id_extra = chr(ord('a') + scc) if scc > 0 else ''
            seen_coords_count[coords] = scc + 1

            data.append((paths, blocks, left_bp, right_bp, score, filterstring,
                         id_extra, tagd_extra, tags2_extra))
        for svtype, count in svtype_skipped.items():
            log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
        log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
        do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)
    finally:
        # FIX: log is now always closed, and the stale vcf.close() (which
        # double-closed an already-closed handle) is gone.
        log.close()
Example #13
0
def path_classify_test():
    """Print classify_paths events and SV calls for diploid path pairs."""
    blocks = [GenomeInterval('1', 100 * i, 100 * i + 100) for i in range(10)]
    num_genome_blocks = 10
    blocks.append(GenomeInterval('1', 0, 100, True))

    def run(p1, p2, show_paths=True):
        # Classify one haplotype pair and dump the events and SV calls.
        if show_paths:
            print(p1)
            print(p2)
        ev, sv = classify_paths(p1, p2, blocks, num_genome_blocks)
        print(ev)
        print('\n'.join([repr(s) for s in sv]))
        # print('\n'.join([sv_to_vcf(s, ref) for s in sv]))

    # ABC/ABC
    p = [0, 1, 2, 3, 4, 5]
    run(p, p)
    ref = pysam.FastaFile('/home/jgarthur/sv/reference/GRCh37.fa')

    # ABC/AC
    run([0, 1, 2, 3, 4, 5], [0, 1, 4, 5])
    # ACBC/ABC
    run([0, 1, 4, 5, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5])
    # ACBC/ACBC
    p = [0, 1, 4, 5, 2, 3, 4, 5]
    run(p, p)
    # ACBC/AC'BC
    run([0, 1, 4, 5, 2, 3, 4, 5], [0, 1, 5, 4, 2, 3, 4, 5])
    # ACBAC/AC
    run([0, 1, 4, 5, 2, 3, 0, 1, 4, 5], [0, 1, 4, 5])
    # ACBD/ACB'D
    run([0, 1, 4, 5, 2, 3, 6, 7], [0, 1, 4, 5, 3, 2, 6, 7])
    # ACBD/AC'BD
    run([0, 1, 4, 5, 2, 3, 6, 7], [0, 1, 5, 4, 2, 3, 6, 7])
    # AIC/AIBC
    run([0, 1, 20, 21, 4, 5], [0, 1, 20, 21, 2, 3, 4, 5], show_paths=False)
    # ACD/ACDD'E
    run([0, 1, 4, 5, 6, 7, 8, 9], [0, 1, 4, 5, 6, 7, 7, 6, 8, 9],
        show_paths=False)
    # ABBBC/ABBBC
    p = [0, 1, 2, 3, 2, 3, 2, 3, 4, 5]
    run(p, p, show_paths=False)
    # ABBB'C/ABBB'C
    p = [0, 1, 2, 3, 2, 3, 3, 2, 4, 5]
    run(p, p, show_paths=False)
Example #14
0
def test_compute_normalizing_constant():
    """Smoke-test compute_normalizing_constant on small block layouts.

    Checks that widening the gap between two reference blocks lowers the
    normalizing constant, and that replacing reference sequence with an
    equivalently-sized insertion block keeps it unchanged.
    """

    def create_insert_cs(dist):
        # Cumulative sum of the insert-size CDF; extended linearly past the
        # end of the distribution's support.
        cdf = np.cumsum(dist)
        cs = np.cumsum(cdf)

        def cdf_sum(x, cs=cs):
            if x < 0:
                return 0
            if x >= len(dist):
                return cs[-1] + (x - len(dist)) + 1
            return cs[x]

        return cdf_sum

    def report(blks, nc):
        # Same console trace as the hand-written prints this replaces.
        print(blks)
        print(nc)
        print('')

    cdf_sum = create_insert_cs(np.array([0] + [1 / 200] * 200))

    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1000, 2000)]
    nc1 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    report(blocks, nc1)

    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1099, 2000)]
    nc2 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    report(blocks, nc2)

    blocks = [GenomeInterval(1, 0, 1000), GenomeInterval(1, 1100, 2000)]
    nc3 = compute_normalizing_constant(list(range(4)), blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    report(blocks, nc3)
    # a wider gap must strictly reduce the constant
    assert (0 < nc3 < nc1)

    # reference shortened by 60bp but a 60bp insertion added: same constant
    blocks = [
        GenomeInterval(1, 0, 1000),
        GenomeInterval(1, 1000, 1940),
        GenomeInterval(1, 0, 60, True)
    ]
    nc4 = compute_normalizing_constant([0, 1, 4, 5, 2, 3], blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    report(blocks, nc4)
    assert (0 < nc4 == nc1)

    # slightly larger insertion / shorter reference: constant must drop
    blocks = [
        GenomeInterval(1, 0, 1000),
        GenomeInterval(1, 1000, 1938),
        GenomeInterval(1, 0, 62, True)
    ]
    nc4 = compute_normalizing_constant([0, 1, 4, 5, 2, 3], blocks, 1, cdf_sum,
                                       [1, 0, 0, 0], 100, 100)
    report(blocks, nc4)
    assert (0 < nc4 < nc1)

    # try large gaps
    cdf_sum = create_insert_cs(np.array([0] + [1 / 10000] * 10000))

    blocks = [
        GenomeInterval(1, 45024, 64579),
        GenomeInterval(1, 65306, 65307),
        GenomeInterval(1, 66018, 79509),
        GenomeInterval(1, 0, 1000, True)
    ]
    pm = [.97, .01, .01, .01]
    nc = compute_normalizing_constant([0, 1, 2, 3, 4, 5], blocks, 1, cdf_sum,
                                      pm, 100, 100)
    print(blocks)
    print(nc)

    path = [2, 3, 6, 7]
    nc2 = compute_normalizing_constant(path, blocks, 1, cdf_sum, pm, 100, 100)
    print(path)
    print(nc2)
Example #15
0
def simplify_blocks(blocks, path, flank_size):
    """Merge blocks whose shared boundary the path never breaks, then trim
    the outermost flanking blocks to at most flank_size.

    blocks: GenomeIntervals; block b corresponds to path nodes 2b and 2b+1.
    path: alternating node sequence describing the rearranged structure.
    flank_size: maximum retained length for the left/right flanking blocks.

    Returns (new_blocks, new_path, has_left_flank, has_right_flank), where
    the two booleans record whether the corresponding flank was trimmed.
    """
    # Blocks visited by the path, one entry per traversal (may repeat).
    block_nums = [int(floor(path[i] / 2)) for i in range(1, len(path), 2)]

    # neighbors[node]: nodes seen adjacent to `node` across block junctions;
    # -1 / -2 are sentinels for the path's open left / right ends.
    neighbors = defaultdict(set)
    neighbors[path[0]].add(-1)
    neighbors[path[-1]].add(-2)
    for i in range(1, len(path) - 1):
        if i % 2 == 0:
            neighbors[path[i]].add(path[i - 1])
        else:
            neighbors[path[i]].add(path[i + 1])

    min_block = min([b for b in block_nums if not blocks[b].is_insertion()])
    max_block = max([b for b in block_nums if not blocks[b].is_insertion()])
    ins_blocks = [b for b in block_nums if blocks[b].is_insertion()]
    new_blocks = []
    path_map = {}  # old node -> new node, or None for merged-away nodes
    idx = 0        # index of the next emitted (possibly merged) block
    merging = False
    for b in range(min_block, max_block + 1):
        if not merging:
            # start of a (potential) merged run of blocks
            block_start = blocks[b].start
        right_node = 2 * b + 1
        # Merge b into b+1 when the b/b+1 junction is only ever traversed in
        # reference order (each junction node's only neighbor is the other),
        # i.e. no breakpoint falls on that boundary.
        if all(n == right_node+1 for n in neighbors[right_node]) and \
           all(n == right_node for n in neighbors[right_node+1]) and \
           b < max_block:
            # combine after
            merging = True
            path_map[2 * b] = None
            path_map[2 * b + 1] = None
        else:
            newblock = GenomeInterval(blocks[b].chrom, block_start,
                                      blocks[b].end)
            new_blocks.append(newblock)
            path_map[2 * b] = 2 * idx
            path_map[2 * b + 1] = 2 * idx + 1
            merging = False
            idx += 1
    # Insertion blocks are copied as-is and renumbered after the reference
    # blocks.
    # NOTE(review): an insertion traversed more than once appears multiple
    # times in ins_blocks, duplicating it in new_blocks while path_map keeps
    # only the last numbering -- confirm this is intended for repeated
    # insertions.
    new_blocks.extend([deepcopy(blocks[b]) for b in ins_blocks])
    for b in ins_blocks:
        path_map[2 * b] = 2 * idx
        path_map[2 * b + 1] = 2 * idx + 1
        idx += 1
    new_path = [path_map[p] for p in path if path_map[p] is not None]

    # adjust flanks: trim the outermost blocks to flank_size where they act
    # as plain (unrearranged) flanking sequence
    new_block_nums = [
        int(floor(new_path[i] / 2)) for i in range(1, len(new_path), 2)
    ]
    new_block_counts = Counter(new_block_nums)
    new_min_block = min(
        [b for b in new_block_nums if not new_blocks[b].is_insertion()])
    new_max_block = max(
        [b for b in new_block_nums if not new_blocks[b].is_insertion()])
    left_block, right_block = new_block_nums[0], new_block_nums[-1]

    # check that left block is minimum in reference, properly oriented, and not duplicated
    if left_block == new_min_block and new_path[0] % 2 == 0 and \
       new_block_counts[left_block] == 1:
        new_blocks[left_block].start = max(
            new_blocks[left_block].start,
            new_blocks[left_block].end - flank_size)
        has_left_flank = True
    else:
        has_left_flank = False
    # symmetric check for the right flank (maximum block, forward, unique)
    if right_block == new_max_block and new_path[-1] % 2 == 1 and \
       new_block_counts[right_block] == 1:
        new_blocks[right_block].end = min(
            new_blocks[right_block].end,
            new_blocks[right_block].start + flank_size)
        has_right_flank = True
    else:
        has_right_flank = False

    return new_blocks, new_path, has_left_flank, has_right_flank
Example #16
0
def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    """Partition [start, end) into GenomeInterval blocks at breakpoint
    locations, carving out reference assembly gaps.

    breakpoints: dict mapping (lo, hi) location tuples to Breakpoint objects.
        NOTE: mutated here -- a sentinel entry at (end, end) is added so the
        final block (last breakpoint to `end`) is emitted.
    gaps: pyinter.IntervalSet of assembly-gap intervals to subtract.
    verbosity: values > 1 enable debug printing.

    Returns (blocks, gap_indices, left_breakpoints, right_breakpoints):
    gap_indices are indices into `blocks` where adjacency with the previous
    block is broken (by a gap or the region boundary); left/right_breakpoints
    give the Breakpoint bounding each block on either side.
    """
    # create list of blocks between breakpoints
    # while adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []

    # sentinel breakpoint at the region end (see docstring)
    breakpoints[(end, end)] = Breakpoint((end, end))

    bploc = list(breakpoints.keys())
    bploc.sort()

    last_end = start
    last_breakpoint = Breakpoint((start, start))

    for bpl in bploc:
        breakpoint = breakpoints[bpl]

        # skip breakpoints outside the region of interest
        if bpl[0] <= start or bpl[1] > end:
            continue
        # candidate block spans from the previous breakpoint to this one
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])

        iset.add(blockinterval)
        # subtract gap-covered portions; may split the block or empty it
        adjusted_blocks = iset.difference(gaps)
        adjusted_blocks = sorted(list(adjusted_blocks))

        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))

        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:  # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                # if a gap trimmed an edge, synthesize a zero-width
                # breakpoint at the trimmed coordinate and record the break
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(
                    GenomeInterval(chrom_name, ab.lower_value, ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(list(gap_indices))
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints
Example #17
0
def test_get_insertion_overlap_positions():
    """Behavior-preserving checks for get_insertion_overlap_positions."""

    def as_interval_set(pairs):
        # Build the expected open-interval set from (lo, hi) pairs.
        expected = pyinter.IntervalSet()
        for lo, hi in pairs:
            expected.add(pyinter.open(lo, hi))
        return expected

    def check(path, blks, expected_pairs, read_len, min_ov):
        out, _, _ = get_insertion_overlap_positions(path, blks, read_len,
                                                    min_ov)
        expected = as_interval_set(expected_pairs)
        print('truth: {0}\nresult: {1}\n'.format(expected, out))
        assert out == expected

    read_len = 50
    min_ov = 20

    # Node numbering: block b occupies nodes 2b (left end) and 2b+1 (right).
    blocks = [
        GenomeInterval(1, 0, 100),        # nodes 0, 1
        GenomeInterval(1, 100, 200),      # nodes 2, 3
        GenomeInterval(1, 210, 300),      # nodes 4, 5
        GenomeInterval(1, 350, 360),      # nodes 6, 7
        GenomeInterval(1, 370, 400),      # nodes 8, 9
        GenomeInterval(1, 0, 100, True),  # nodes 10, 11 (insertion)
        GenomeInterval(1, 0, 10, True),   # nodes 12, 13 (insertion)
    ]
    cases = [
        (list(range(10)), ()),
        ([0, 1, 10, 11, 2, 3], ((80, 170), )),
        ([0, 1, 2, 3, 10, 11, 2, 3], ((185, 275), )),
        ([0, 1, 2, 3, 12, 13, 2, 3], ()),
        ([0, 1, 2, 3, 4, 5, 10, 11, 6, 7], ((305, 395), )),
        ([0, 1, 2, 3, 4, 5, 12, 13, 6, 7], ()),
    ]
    for path, expected_pairs in cases:
        check(path, blocks, expected_pairs, read_len, min_ov)

    # NOTE(review): the third block's chrom is 0 while the rest use 1 --
    # reproduced from the original; confirm whether that was intentional.
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True),
    ]
    check([0, 1, 6, 7, 2, 3, 8, 9, 4, 5], blocks, ((130, 170), (355, 395)),
          read_len, min_ov)
Example #18
0
def test_get_overlapping_blocks():
    """Checks for get_overlapping_blocks with find_offset=True.

    Covers: an alignment entirely left of the blocks, alignments inside a
    single block, multi-segment alignments with and without per-junction
    gap allowances, reverse orientation, and an alignment spanning the gap
    between two non-adjacent blocks.
    """
    blocks = [GenomeInterval('1', 0, 100), GenomeInterval('1', 100, 200)]

    # alignment entirely before the first block: no overlapping blocks
    aln_blocks = [(-10, 0)]
    aln_gaps = [0, 0]
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
    assert (ov == [])

    # alignment contained in block 0
    aln_blocks = [(10, 20)]
    aln_gaps = [0, 0]
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
    assert (ov == [1])
    assert (offset == -10)
    # (removed a dead `aln_gaps = [10, 10]` assignment here -- it was
    # overwritten below before ever being used)

    # three alignment segments hitting both blocks, no gap allowance
    aln_blocks = [(10, 20), (110, 120), (200, 250)]
    aln_gaps = [0, 0, 0, 0]
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
    assert (ov == [1, 3])
    assert (offset == -10)
    # same alignment, reverse orientation
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        True,
                                        find_offset=True)
    assert (ov == [2, 0])
    assert (offset == 70)

    # leading gap allowances absorb the offset in forward orientation
    aln_gaps = [10, 10, 0, 0]
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
    assert (ov == [1, 3])
    assert (offset == 0)
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        True,
                                        find_offset=True)
    assert (ov == [2, 0])
    assert (offset == 70)

    # segment straddling the block boundary at 100
    aln_blocks = [(-10, 0), (90, 110)]
    aln_gaps = [0, 0, 0]
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
    assert (ov == [1, 3])
    assert (offset == -(90 - 0) + 10)

    # blocks separated by a gap; single segment spans the gap
    blocks = [GenomeInterval('1', 0, 100), GenomeInterval('1', 110, 200)]
    aln_blocks = [(0, 111)]
    aln_gaps = [0, 0]
    # TODO(review): result is never asserted -- this looks truncated; add
    # the expected (ov, offset) check.
    ov, offset = get_overlapping_blocks(blocks,
                                        aln_blocks,
                                        aln_gaps,
                                        0,
                                        False,
                                        find_offset=True)
Example #19
0
def sv_output(path1,
              path2,
              blocks,
              event1,
              event2,
              frac1,
              frac2,
              sv_list,
              complex_types,
              event_lh,
              ref_lh,
              next_best_lh,
              next_best_pathstring,
              num_paths,
              filter_criteria,
              filterstring_manual=None,
              id_extra='',
              output_vcf=False,
              reference=False,
              output_split_support=False):
    """Render the called SVs for one event as text, VCF, and split-support
    output.

    Emits one tab-delimited summary line per haplotype (one line total for a
    homozygous call, a second line for a compound het), plus optional VCF
    records (two records per BND-type SV) and optional split-read support
    lines.

    path1, path2 : node paths for the two haplotypes
    blocks : GenomeIntervals underlying the paths
    sv_list : called SV objects; partitioned by genotype into haplotypes
    filterstring_manual : if given, overrides the per-SV filter aggregation
    reference : pysam-style FASTA used to fetch REF bases (VCF output only)

    Returns (lines, vcflines, splitlines): the concatenated summary lines,
    the list of VCF record strings, and the split-support text block.
    """
    lines = ''
    splitlines = ''
    vcflines = []
    # partition by genotype: '1/0' -> haplotype 1 only, '0/1' -> haplotype 2
    # only, '1/1' -> both
    sv1 = [
        sv for sv in sv_list if sv.genotype == '1/1' or sv.genotype == '1/0'
    ]
    sv2 = [
        sv for sv in sv_list if sv.genotype == '1/1' or sv.genotype == '0/1'
    ]
    compound_het = (path1 != path2) and (len(sv1) > 0) and (len(sv2) > 0)
    is_het = (path1 != path2)
    num_paths = str(num_paths)
    for (k, path, event, svs, complex_type,
         frac) in [(0, path1, event1, sv1, complex_types[0], frac1),
                   (1, path2, event2, sv2, complex_types[1], frac2)]:
        # homozygous structure: emit only once
        if k == 1 and path1 == path2:
            continue
        if len(svs) == 0:
            continue

        # NOTE(review): uses path1 for both haplotypes -- fine as long as
        # both paths start on the same chromosome; confirm.
        chrom = blocks[int(floor(path1[0] / 2))].chrom

        # CLEANUP this code is duplicated up above -- should be merged
        # event id comes from the first SV; compound hets get a _1/_2 suffix
        id = '_'.join(svs[0].event_id.split(',')[0:2])
        if compound_het:
            id = id + '_' + str(k + 1)
        id += id_extra

        num_sv = len(svs)

        # aggregate per-SV filter strings into one FILTER value
        if filterstring_manual is None:
            fs = sorted(
                set((get_filter_string(sv, filter_criteria) for sv in svs)))
            if all(x == 'PASS' for x in fs):
                filters = 'PASS'
            else:
                filters = ','.join(x for x in fs if x != 'PASS')
        else:
            filters = filterstring_manual

        # median position of each SV's two breakpoints
        all_sv_bp1 = [int(floor(np.median(sv.bp1))) for sv in svs]
        all_sv_bp2 = [int(floor(np.median(sv.bp2))) for sv in svs]
        all_sv_bp = all_sv_bp1 + all_sv_bp2

        minbp, maxbp = min(all_sv_bp), max(all_sv_bp)
        total_span = maxbp - minbp
        # sv_span = maxbp - minbp

        # bp_cis = bp_ci for sv in svs
        # (bp1, bp2) in bp_cis

        sv_bp_joined = ';'.join(get_bp_string(sv) for sv in svs)
        sv_bp_uncertainty_joined = ';'.join(
            get_bp_uncertainty_string(sv) for sv in svs)
        sv_bp_ci = [get_bp_ci(sv) for sv in svs]

        svtypes = list(sv.type.split(':')[0]
                       for sv in svs)  # use DUP not DUP:TANDEM
        svtypes_joined = ','.join(svtypes)

        # segment endpoints, reported at the midpoint of each inter-block gap
        nonins_blocks = [b for b in blocks if not b.is_insertion()]
        nni = len(nonins_blocks)
        # NOTE(review): indexes `blocks` with positions derived from
        # nonins_blocks -- assumes insertion blocks all come after the
        # reference blocks in `blocks`; confirm.
        block_bp = [nonins_blocks[0].start] + \
                   [int(floor(np.median((blocks[i-1].end, blocks[i].start))))
                    for i in range(1, nni)] + \
                   [nonins_blocks[-1].end]
        block_bp_joined = ','.join(str(x) for x in block_bp)
        block_bp_uncertainty = [0] + \
                               [block_gap(blocks, 2*i) for i in range(1, nni)] + \
                               [0]
        block_bp_uncertainty_joined = ','.join(
            str(x) for x in block_bp_uncertainty)

        # blocks rebuilt on the midpoint coordinates, for affected-length
        blocks_midpoints = [
            GenomeInterval(chrom, block_bp[i], block_bp[i + 1])
            for i in range(nni)
        ]
        blocks_midpoints.extend([b for b in blocks if b.is_insertion()])
        len_affected = sv_affected_len(path, blocks_midpoints)

        # human-readable structure strings for ALT and REF paths
        pathstring = path_to_string(path, blocks=blocks)
        nblocks = len([b for b in blocks if not b.is_insertion()])
        refpath = list(range(2 * nblocks))
        ref_string = path_to_string(refpath, blocks=blocks)
        gt = 'HET' if is_het else 'HOM'

        insertion_lengths = [
            get_sv_ins(sv) for sv in svs if get_sv_ins(sv) > 0
        ]
        if len(insertion_lengths) == 0:
            inslen_joined = 'NA'
        else:
            inslen_joined = ','.join(str(l) for l in insertion_lengths)
        # split-read (SR) and paired-end (PE) support counts per SV
        sr = list(sv.split_support for sv in svs)
        pe = list(sv.pe_support for sv in svs)
        sr_joined = ','.join(map(str, sr))
        pe_joined = ','.join(map(str, pe))
        # log-likelihood ratios vs. reference and vs. next-best structure
        lhr = '%.2f' % (event_lh - ref_lh)
        lhr_next = '%.2f' % (event_lh - next_best_lh)
        frac_str = '%.3f' % frac

        line = '\t'.join(
            str(x)
            for x in (chrom, minbp, maxbp, id, svtypes_joined, complex_type,
                      num_sv, block_bp_joined, block_bp_uncertainty_joined,
                      ref_string, pathstring, len_affected, filters,
                      sv_bp_joined, sv_bp_uncertainty_joined, gt, frac_str,
                      inslen_joined, sr_joined, pe_joined, lhr, lhr_next,
                      next_best_pathstring, num_paths))
        # num_sv
        # block_bp_joined
        # block_bp_uncertainty_joined
        line += '\n'
        lines = lines + line

        if output_vcf:
            template = vcf_line_template()
            # canonical ordering for INFO tags in emitted records
            info_tags_ordered = [
                'SV_TYPE', 'HAPLOID_CN', 'COMPLEX_TYPE', 'MATE_ID', 'END',
                'CI_POS', 'CI_END', 'INS_LEN', 'SR', 'PE', 'SV_SPAN',
                'EVENT_SPAN', 'EVENT_START', 'EVENT_END', 'EVENT_AFFECTED_LEN',
                'EVENT_NUM_SV', 'REF_STRUCTURE', 'ALT_STRUCTURE',
                'SEGMENT_ENDPTS', 'SEGMENT_ENDPTS_CIWIDTH', 'AF',
                'SCORE_VS_REF', 'SCORE_VS_NEXT', 'NEXT_BEST_STRUCTURE',
                'NUM_PATHS'
            ]
            info_tags_ordering = {
                y: x
                for x, y in enumerate(info_tags_ordered)
            }
            for (i, sv) in enumerate(svs):
                info_list = []
                sv_chrom = sv.ref_chrom
                # pos: VCF is 1-based
                pos = all_sv_bp1[i] + 1
                if num_sv > 1:
                    id_vcf = id + '_' + str(i + 1)
                else:
                    id_vcf = id
                ref_base = fetch_seq(reference, sv_chrom, pos - 1,
                                     pos)  # pysam is 0-indexed
                alt = '<{0}>'.format(sv.type)
                qual = '.'
                svtype = svtypes[i]
                info_list.append(('SV_TYPE', svtype))
                end = all_sv_bp2[i] + 1
                info_list.append(('END', end))
                block_bp_vcf = ','.join(str(x + 1) for x in block_bp)
                info_list.append(('SEGMENT_ENDPTS', block_bp_vcf))
                info_list.append(
                    ('SEGMENT_ENDPTS_CIWIDTH', block_bp_uncertainty_joined))

                # insertions report their own length; others span pos..end
                if svtype == 'INS':
                    svlen = sv.length
                else:
                    svlen = end - pos
                info_list.append(('SV_SPAN', svlen))
                info_list.append(('EVENT_SPAN', total_span))
                info_list.append(('EVENT_AFFECTED_LEN', len_affected))

                if svtype == 'DUP':
                    info_list.append(('HAPLOID_CN', sv.copynumber))

                # breakpoint confidence intervals; omit degenerate '0,0'
                bp1_ci, bp2_ci = sv_bp_ci[i]
                bp1_ci_str = str(bp1_ci[0]) + ',' + str(bp1_ci[1])
                bp2_ci_str = str(bp2_ci[0]) + ',' + str(bp2_ci[1])
                if bp1_ci_str != '0,0':
                    info_list.append(('CI_POS', bp1_ci_str))
                if bp2_ci_str != '0,0' and svtype != 'INS':
                    info_list.append(('CI_END', bp2_ci_str))
                info_list.extend([
                    ('REF_STRUCTURE', ref_string),
                    ('ALT_STRUCTURE', pathstring), ('AF', frac_str),
                    ('SR', sr[i]), ('PE', pe[i]), ('SCORE_VS_REF', lhr),
                    ('SCORE_VS_NEXT', lhr_next),
                    ('NEXT_BEST_STRUCTURE', next_best_pathstring),
                    ('NUM_PATHS', num_paths), ('EVENT_START', minbp + 1),
                    ('EVENT_END', maxbp), ('EVENT_NUM_SV', num_sv)
                ])

                # FORMAT/GT
                format_str = 'GT'
                gt_vcf = sv.genotype
                if svtype != 'BND':
                    # write line
                    info_list.sort(key=lambda x: info_tags_ordering[x[0]])
                    info = ';'.join(
                        ['{0}={1}'.format(el[0], el[1]) for el in info_list])
                    line = template.format(chr=chrom,
                                           pos=pos,
                                           id=id_vcf,
                                           ref=ref_base,
                                           alt=alt,
                                           qual=qual,
                                           filter=filters,
                                           info=info,
                                           format_str=format_str,
                                           gt=gt_vcf)
                    vcflines.append(line)
                else:  # breakend type --> 2 lines in vcf
                    id_bnd1, id_bnd2 = id_vcf + 'A', id_vcf + 'B'
                    mateid_bnd1, mateid_bnd2 = id_bnd2, id_bnd1
                    orientation_bnd1, orientation_bnd2 = sv.bnd_orientation
                    pos_bnd1 = all_sv_bp1[i] + 1
                    pos_bnd2 = all_sv_bp2[i] + 1
                    # '-' orientation: breakend is anchored one base earlier
                    if orientation_bnd1 == '-':
                        pos_bnd1 -= 1
                    if orientation_bnd2 == '-':
                        pos_bnd2 -= 1
                    ref_bnd1 = fetch_seq(reference, sv_chrom, pos_bnd1 - 1,
                                         pos_bnd1)
                    ref_bnd2 = fetch_seq(reference, sv_chrom, pos_bnd2 - 1,
                                         pos_bnd2)
                    # VCF BND-style ALT strings, each pointing at its mate
                    alt_bnd1 = bnd_alt_string(orientation_bnd1,
                                              orientation_bnd2, sv.ref_chrom,
                                              pos_bnd2, ref_bnd1)
                    alt_bnd2 = bnd_alt_string(orientation_bnd2,
                                              orientation_bnd1, sv.ref_chrom,
                                              pos_bnd1, ref_bnd2)

                    ctype_str = complex_type.upper().replace('.', '_')

                    info_list_bnd1 = [('MATE_ID', mateid_bnd1)]
                    info_list_bnd2 = [('MATE_ID', mateid_bnd2)]
                    if bp1_ci_str != '0,0':
                        info_list_bnd1.append(('CI_POS', bp1_ci_str))
                    if bp2_ci_str != '0,0':
                        info_list_bnd2.append(('CI_POS', bp2_ci_str))
                    if sv.bnd_ins > 0:
                        info_list_bnd1.append(('INS_LEN', sv.bnd_ins))
                        info_list_bnd2.append(('INS_LEN', sv.bnd_ins))
                    common_tags = [('SV_TYPE', svtype),
                                   ('COMPLEX_TYPE', ctype_str),
                                   ('EVENT_SPAN', total_span),
                                   ('EVENT_START', minbp + 1),
                                   ('EVENT_END', maxbp),
                                   ('EVENT_AFFECTED_LEN', len_affected),
                                   ('EVENT_NUM_SV', num_sv),
                                   ('SEGMENT_ENDPTS', block_bp_vcf),
                                   ('SEGMENT_ENDPTS_CIWIDTH',
                                    block_bp_uncertainty_joined),
                                   ('REF_STRUCTURE', ref_string),
                                   ('ALT_STRUCTURE', pathstring),
                                   ('AF', frac_str), ('SR', sr[i]),
                                   ('PE', pe[i]), ('SCORE_VS_REF', lhr),
                                   ('SCORE_VS_NEXT', lhr_next),
                                   ('NEXT_BEST_STRUCTURE',
                                    next_best_pathstring),
                                   ('NUM_PATHS', num_paths)]
                    info_list_bnd1.extend(common_tags)
                    info_list_bnd2.extend(common_tags)

                    info_list_bnd1.sort(key=lambda x: info_tags_ordering[x[0]])
                    info_list_bnd2.sort(key=lambda x: info_tags_ordering[x[0]])
                    info_bnd1 = ';'.join([
                        '{0}={1}'.format(el[0], el[1]) for el in info_list_bnd1
                    ])
                    info_bnd2 = ';'.join([
                        '{0}={1}'.format(el[0], el[1]) for el in info_list_bnd2
                    ])
                    line1 = template.format(chr=chrom,
                                            pos=pos_bnd1,
                                            id=id_bnd1,
                                            ref=ref_bnd1,
                                            alt=alt_bnd1,
                                            qual=qual,
                                            filter=filters,
                                            info=info_bnd1,
                                            format_str=format_str,
                                            gt=gt_vcf)
                    line2 = template.format(chr=chrom,
                                            pos=pos_bnd2,
                                            id=id_bnd2,
                                            ref=ref_bnd2,
                                            alt=alt_bnd2,
                                            qual=qual,
                                            filter=filters,
                                            info=info_bnd2,
                                            format_str=format_str,
                                            gt=gt_vcf)
                    vcflines.append(line1)
                    vcflines.append(line2)

        if output_split_support:
            # one line per supporting split read, per SV
            split_line_list = []
            bp_orientations = {
                'Del': ('-', '+'),
                'Dup': ('+', '-'),
                'InvL': ('-', '-'),
                'InvR': ('+', '+')
            }
            bp_idx = 1
            for sv in svs:
                bp1 = str(int(floor(np.median(sv.bp1))))
                bp2 = str(int(floor(np.median(sv.bp2))))
                for split in sv.supporting_splits:
                    # split_type is e.g. 'Del+' -- type prefix plus strand
                    orientation = bp_orientations[split.split_type[:-1]]
                    orientation = ','.join(orientation)
                    strand = split.split_type[-1]
                    qname = split.aln.qname
                    seq = split.aln.seq
                    mapq = str(split.aln.mapq)
                    if split.mate is not None:
                        mate_seq = split.mate.seq
                        mate_mapq = str(split.mate.mapq)
                        mate_has_split = str(split.mate_has_split)
                    else:
                        mate_seq = 'NA'
                        mate_mapq = 'NA'
                        mate_has_split = 'NA'
                    line = '\t'.join(
                        str(x)
                        for x in (id, block_bp_joined, ref_string, pathstring,
                                  sv_bp_joined, 'split', qname, bp_idx, bp1,
                                  bp2, orientation, qname, strand, seq, mapq,
                                  mate_seq, mate_mapq, mate_has_split))
                    split_line_list.append(line)
                bp_idx += 1
            if len(split_line_list) > 0:
                splitlines = splitlines + '\n'.join(split_line_list) + '\n'
    return lines, vcflines, splitlines