Beispiel #1
0
def test_banded_sw_leak():
    """Test that the banded sw implementation is not leaking memory.

    The process VmSize is sampled from ``/proc/<pid>/status`` (so this only
    works where that file exists) at three points:

    1. after a warm-up of 5000 alignments (the baseline),
    2. after 5000 throw-away numpy allocations of the alignment-matrix shape
       (a control for growth that is unrelated to ``banded_sw``),
    3. after another 5000 alignments.

    Raises:
        ValueError: if the alignments grew VmSize by more than 40 while the
            control allocations grew it by less than 5.
    """
    import os
    import gc
    from guppy import hpy
    h = hpy()
    _Example = namedtuple('TestExample', ('ref', 'read', 'expected_cigar'))
    ref1 = 'CCCTTTAGTTATGCTTTCTCTTCGGCGGGCGTGGGAC' + 'CGTAATGAGAACTGTACATCAGTCTG'
    read1 = 'TATGCTTTCTCTTCGGCGGGCGTGGGACAAAATCGTAATGAGAACTGTAC'

    cig1 = '28M5I17M'
    example = _Example(ref1, read1, cig1)

    def _get_mem():
        """Return the integer VmSize figure of the current process."""
        # Use a context manager so the status file is closed right away; a
        # dangling handle per call would itself disturb a leak measurement.
        with open('/proc/{}/status'.format(os.getpid())) as status:
            fields = [u.split('\t') for u in status.read().split('\n')]
        size = [t for t in fields if t[0] == 'VmSize:']
        print(size)
        return int(size[0][1].split()[0])

    # Warm-up: let any one-off allocations happen before taking the baseline.
    for _ in xrange(5000):
        _ = banded_sw(example.ref, example.read)
    _ = None
    gc.collect()
    heap_init = h.heap()
    initmem = _get_mem()
    # Control: plain numpy allocations that are released again.
    for _ in xrange(5000):
        _ = np.zeros((len(example.ref), len(example.read)), dtype=int)
    _ = None
    gc.collect()
    slmem = _get_mem()
    # Measurement: the same number of alignments as the warm-up.
    for _ in xrange(5000):
        _ = banded_sw(example.ref, example.read)
    _ = None
    gc.collect()
    curmem = _get_mem()
    if curmem - initmem > 40 and slmem - initmem < 5:
        # Dump both heap snapshots to help locate what is being retained.
        print('Initial heap:\n{}'.format(heap_init))
        print('-' * 40)
        print(h.heap())
        raise ValueError(
            'There is a memory leak. Mem difference: {}kb'.format(curmem -
                                                                  initmem))
Beispiel #2
0
def test_shady_haplotype2():
    """Check that the full aligner reports a large deletion as one event.

    The haplotype consists of the first and last 15 bases of the reference,
    so the expected alignment is 15 matches, a 75-base deletion and 15 more
    matches.
    """
    reference = 'CAATCCCCTAGCGGCTCAATCACTGAACCTCCTCCTCTCCGGGGCGTTGGCGTCTTCTTTTATGTGAGAAGAATAATTACCCCTAGCGGCGTTAACAGTTGGGTG'
    haplotype = 'CAATCCCCTAGCGGC' + 'GTTAACAGTTGGGTG'

    # The banded aligner is run on the same input, but only the result of the
    # full aligner is checked below.
    _offset, _cigar, _score, _mismatch = banded_sw(
        reference, haplotype, not_in_ref_penalty=40)

    _full_offset, full_cigar, _full_score, _full_mismatch = full_sw(
        reference, haplotype, lenient=True)
    wanted_cigar = '15M75D15M'
    assert full_cigar == wanted_cigar
Beispiel #3
0
def main(args):
    """Assemble haplotypes per active region and emit the variants they imply.

    Reads come from ``args.input_bam`` when given (optionally restricted to
    ``args.start``/``args.stop``), otherwise from
    ``get_sorted_aligned_reads``. For every active region a haplotype is
    built and aligned back to the region's reference; the haplotype is
    written to ``args.haplotype_out`` (if set) and each variant is written to
    ``args.out_vcf`` (if set) and printed in m260 format.

    Args:
        args: parsed command-line options. The attributes read here are
            ``debug``, ``reference_file``, ``input_bam``, ``start``, ``stop``,
            ``haplotype_out`` and ``out_vcf``.
    """
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    # Contig name: the FASTA header without its first character.
    chrom = ref_header[1:].strip()

    input_bam = None
    haplo_out = None
    vcf_stream = None
    try:
        if args.input_bam:
            input_bam = Samfile(args.input_bam)
            reads = input_bam
            # NOTE(review): a start/stop of 0 is falsy and disables the
            # restriction -- confirm that is intended.
            if args.start and args.stop:
                reads = input_bam.fetch(chrom, args.start, args.stop)
        else:
            reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)

        if args.haplotype_out:
            haplo_out = Samfile(args.haplotype_out,
                                'wb',
                                header=SAM_HEADER(ref_header, ref_sequence))
        if args.out_vcf:
            vcf_stream = VCFWriter(open(args.out_vcf, 'wb'),
                                   make_vcf_header(args))

        for region, region_reads in active_regions(reads,
                                                   ref_sequence,
                                                   chrom,
                                                   start_offset=0,
                                                   flank=30,
                                                   dfrac=1.0):
            haplotype = build_haplotype(region.reference,
                                        region_reads,
                                        k=11,
                                        min_kmer_count=2)
            if haplotype.fail_reason:
                print('Failure {} at window\n{}'.format(haplotype.fail_reason,
                                                        region))
                continue
            # align the haplotype to the reference sequence
            offset, cigar, score, mismatch = banded_sw(region.reference,
                                                       haplotype.seq)
            haplotype_start = region.start + offset
            _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
            haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                                  id='Haplotype{}'.format(region.start))
            # Written through dict.__setitem__ rather than the mapping's own
            # item assignment -- presumably to bypass a check it performs;
            # verify before changing.
            dict.__setitem__(haplo_seq._per_letter_annotations,
                             'phred_quality', [40] * len(haplotype.seq))
            haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate',
                                               None, 'hw2_rg', False)
            if haplo_out:
                haplo_out.write(haplo_read)
            for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME,
                                              chrom):
                if vcf_stream:
                    vcf_stream.write_record(variant)
                print(vcf2m260(variant))
    finally:
        # Release every handle even when a region fails part-way through, so
        # whatever was already written reaches disk and the files are closed.
        if vcf_stream is not None:
            vcf_stream.flush()
            vcf_stream.close()
        if haplo_out is not None:
            haplo_out.close()
        if input_bam is not None:
            input_bam.close()
Beispiel #4
0
def test_shady_haplotype_alignment():
    """Test a real example of a bad alignment.

    The haplotype carries a six-base insertion and one mismatching base
    relative to the reference; the banded aligner must report exactly that.
    """
    ref = 'AACAACAAC' + 'AACCTGGTCAGGAGTTGAGCCTCCATACTATACTT'
    h1 = 'AACAACAACAACGACAACCTGGTCAGGAGTTGAGCCTCCTTACTATACTT'
    expected_cigar, expected_mismatch = '12M6I32M', 1
    # Only the cigar and mismatch count are checked; offset and score are not.
    _, cigar, _, mismatch = banded_sw(ref, h1)
    # Also build the raw banded matrices so that an exception raised there is
    # reported by this test; the matrices themselves are not inspected.
    _banded_sw_matrix(ref, h1)
    assert cigar == expected_cigar, 'E={} != O={}'.format(
        expected_cigar, cigar)
    assert mismatch == expected_mismatch, 'E={} != O={}'.format(
        expected_mismatch, mismatch)
Beispiel #5
0
def split_haplotype(region, haplotype, max_edit_per_10bp=1):
    """Yield the individual events implied by aligning a haplotype to a region.

    The haplotype is aligned to ``region.reference`` with ``banded_sw``; if
    that alignment starts at a positive offset it is redone with ``full_sw``.
    The resulting CIGAR is then walked base by base, yielding one
    ``CalledEvent`` per mismatching base ('SNP') and one per insertion
    ('INS') or deletion ('DEL') operation.

    Args:
        region: object providing ``reference``, ``start`` and ``stop``.
        haplotype: object providing ``seq`` and ``event_scores``.
        max_edit_per_10bp: if (number of CIGAR elements + mismatches) per 10
            haplotype bases exceeds this, the haplotype is reported as a bad
            alignment and no events are yielded.

    Yields:
        CalledEvent(position, bases, kind, score), kind being 'SNP', 'INS'
        or 'DEL'.
    """
    offset, cigar, score, mismatch = banded_sw(region.reference, haplotype.seq, not_in_ref_penalty=40)
    if offset > 0:
        offset, cigar, score, mismatch = full_sw(region.reference, haplotype.seq, lenient=True)
    cigar_elements = [ (int(size), oper) for size, oper in CIGAR_REGEX.findall(cigar) ]
    if (len(cigar_elements) + mismatch) / (len(haplotype.seq)/10.) > max_edit_per_10bp:
        # bad haplotype; kill the events (an empty seq means the loop below
        # never runs)
        print('Bad haplotype alignment at {}'.format((region.start, region.stop)))
        haplotype = haplotype.__class__('bad_alignment', seq=[], event_scores=[])
    cigar_elements = iter(cigar_elements)
    # Running out of CIGAR elements ends the generator with an explicit
    # return. Letting next() raise StopIteration inside a generator is turned
    # into RuntimeError by PEP 479 (Python 3.7+).
    element = next(cigar_elements, None)
    if element is None:
        return
    cur_size, cur_oper = element
    ref_adj = 0  # shift between haplotype offset and reference offset
    score = -2  # from here on, the running event score fed to _get_score
    for h_offset, h_base in enumerate(haplotype.seq):
        if cur_oper == 'M':
            if h_base != region.reference[offset + h_offset + ref_adj]:
                score = _get_score(score, h_offset, haplotype.event_scores)
                # SNP (or MNP but we'll split those up)
                yield CalledEvent(region.start + h_offset + ref_adj, h_base, 'SNP', score)
            cur_size -= 1
            # NOTE(review): h_offset indexes the haplotype while region.stop
            # is read from the region -- confirm this comparison is intended.
            if cur_size == 0 and h_offset < region.stop:
                # there's some next, non-M cigar string, so yield it as an event
                element = next(cigar_elements, None)
                if element is None:
                    # the alignment ended on this match run
                    return
                cur_size, cur_oper = element
                if cur_oper == 'I':
                    score = _get_score(score, h_offset, haplotype.event_scores, indel_adj=1)  # insertion is next base
                    yield CalledEvent(region.start + h_offset + ref_adj, 
                                      h_base + haplotype.seq[1+h_offset:(1+h_offset+cur_size)],
                                      'INS', score)
                elif cur_oper == 'D':
                    score = _get_score(score, h_offset, haplotype.event_scores, indel_adj=1)  # deletion is next base
                    # NOTE(review): unlike the SNP comparison above, this
                    # slice does not add `offset` -- verify for offset > 0.
                    yield CalledEvent(region.start + h_offset + ref_adj,
                                      region.reference[(h_offset+ref_adj):(ref_adj+1+h_offset+cur_size)],
                                      'DEL', score)
        else:
            if cur_oper == 'I':
                # an inserted base consumes haplotype but no reference
                ref_adj -= 1
                cur_size -= 1
            else:
                # any other operation is skipped on the reference in one step
                ref_adj += cur_size
                cur_size = 0
            if cur_size == 0:
                # must be an 'M' here
                element = next(cigar_elements, None)
                if element is None:
                    return
                cur_size, cur_oper = element
Beispiel #6
0
def test_banded_sw():
    """Test that the banded smith waterman captures the same events as the full.

    These are real examples of screw-ups. When the observed cigar differs
    from the expected one, the full and banded matrices are included,
    untruncated, in the error message.

    Raises:
        ValueError: if ``banded_sw`` returns an unexpected cigar.
    """
    _Example = namedtuple('TestExample', ('ref', 'read', 'expected_cigar'))
    ref1 = 'CCCTTTAGTTATGCTTTCTCTTCGGCGGGCGTGGGAC' + 'CGTAATGAGAACTGTACATCAGTCTG'
    read1 = 'TATGCTTTCTCTTCGGCGGGCGTGGGACAAAATCGTAATGAGAACTGTAC'

    cig1 = '28M5I17M'
    examples = [_Example(ref1, read1, cig1)]
    for example in examples:
        _, cigar, _, _ = banded_sw(example.ref, example.read)
        if cigar != example.expected_cigar:
            import sys
            import numpy
            # Print whole matrices. sys.maxsize is the documented way to
            # disable summarisation; the string 'nan' is rejected by current
            # numpy releases.
            numpy.set_printoptions(threshold=sys.maxsize)
            full_mat, _ = _full_sw_matrix(example.ref, example.read)
            banded_mat, _ = _banded_sw_matrix(example.ref, example.read)
            raise ValueError(
                'Expected cigar: {}, observed: {}'.format(
                    example.expected_cigar, cigar) +
                '\nFull:\n{}\n\nBanded:\n{}'.format(full_mat, banded_mat))