def test_banded_sw_leak():
    """Test that the banded sw implementation is not leaking memory.

    Procedure:
      1. Warm up with 5000 ``banded_sw`` calls and record the process VmSize.
      2. Allocate 5000 throw-away numpy matrices as a control and record
         VmSize again (``slmem``).
      3. Run another 5000 ``banded_sw`` calls and record VmSize (``curmem``).

    The test fails with ``ValueError`` when the aligner grew the process by
    more than 40 (units as reported by ``/proc/<pid>/status``) while the
    control allocations grew it by fewer than 5.

    Requires ``guppy`` and a ``/proc`` filesystem.
    """
    import gc
    import os

    from guppy import hpy

    heap_profiler = hpy()
    _Example = namedtuple('TestExample', ('ref', 'read', 'expected_cigar'))
    ref1 = 'CCCTTTAGTTATGCTTTCTCTTCGGCGGGCGTGGGAC' + 'CGTAATGAGAACTGTACATCAGTCTG'
    read1 = 'TATGCTTTCTCTTCGGCGGGCGTGGGACAAAATCGTAATGAGAACTGTAC'
    cig1 = '28M5I17M'
    example = _Example(ref1, read1, cig1)

    def _get_mem():
        """Return the integer VmSize value from /proc/<pid>/status."""
        # The context manager closes the handle promptly; a leak test should
        # not itself leave file descriptors dangling.
        with open('/proc/{}/status'.format(os.getpid())) as status:
            fields = [line.split('\t') for line in status.read().split('\n')]
        size = [entry for entry in fields if entry[0] == 'VmSize:']
        print(size)
        return int(size[0][1].split()[0])

    # Warm-up: let any one-time allocations happen before the baseline.
    for _ in xrange(5000):
        banded_sw(example.ref, example.read)
    gc.collect()
    heap_init = heap_profiler.heap()
    initmem = _get_mem()

    # Control: same-shaped numpy allocations that are dropped immediately.
    for _ in xrange(5000):
        np.zeros((len(example.ref), len(example.read)), dtype=int)
    gc.collect()
    slmem = _get_mem()

    # Measurement: the aligner should not grow the process any further.
    for _ in xrange(5000):
        banded_sw(example.ref, example.read)
    gc.collect()
    curmem = _get_mem()

    if curmem - initmem > 40 and slmem - initmem < 5:
        print('Initial heap:\n{}'.format(heap_init))
        print('-' * 40)
        print(heap_profiler.heap())
        raise ValueError(
            'There is a memory leak. Mem difference: {}kb'.format(curmem - initmem))
def test_shady_haplotype2():
    """Check that the lenient full aligner reports a large deletion.

    The haplotype is the first and last 15 bases given below, and the
    alignment produced by ``full_sw(..., lenient=True)`` is expected to be
    ``15M75D15M``.  ``banded_sw`` is also run on the same pair, but its
    result is not asserted on.
    """
    reference = 'CAATCCCCTAGCGGCTCAATCACTGAACCTCCTCCTCTCCGGGGCGTTGGCGTCTTCTTTTATGTGAGAAGAATAATTACCCCTAGCGGCGTTAACAGTTGGGTG'
    haplotype = 'CAATCCCCTAGCGGC' + 'GTTAACAGTTGGGTG'
    wanted = '15M75D15M'

    # Banded alignment is exercised only to confirm it runs and returns a
    # 4-tuple; none of its fields are checked.
    _b_offset, _b_cigar, _b_score, _b_mismatch = banded_sw(
        reference, haplotype, not_in_ref_penalty=40)

    full_result = full_sw(reference, haplotype, lenient=True)
    _f_offset, full_cigar, _f_score, _f_mismatch = full_result
    assert full_cigar == wanted
def main(args):
    """Assemble haplotypes over active regions and report their variants.

    Reads used from ``args``: ``debug``, ``reference_file``, ``input_bam``,
    ``start``, ``stop``, ``haplotype_out`` and ``out_vcf``.

    For every active region a haplotype is built and aligned back to the
    region's reference.  The aligned haplotype is written to
    ``args.haplotype_out`` (when given), and every variant derived from it is
    printed via ``vcf2m260`` and also written to ``args.out_vcf`` (when
    given).  Regions whose haplotype has a ``fail_reason`` are reported on
    stdout and skipped.

    All opened files are closed on exit, including when an exception
    propagates.
    """
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    # Contig name: the header line minus its first character, stripped.
    chrom = ref_header[1:].strip()

    input_bam = None
    haplo_out = None
    vcf_stream = None
    try:
        if args.input_bam:
            input_bam = Samfile(args.input_bam)
            reads = input_bam
            # NOTE(review): truthiness check means start == 0 disables the
            # region fetch -- confirm whether 0 is a legal start here.
            if args.start and args.stop:
                reads = input_bam.fetch(chrom, args.start, args.stop)
        else:
            reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)

        if args.haplotype_out:
            haplo_out = Samfile(args.haplotype_out, 'wb',
                                header=SAM_HEADER(ref_header, ref_sequence))
        if args.out_vcf:
            vcf_stream = VCFWriter(open(args.out_vcf, 'wb'), make_vcf_header(args))

        regions = active_regions(reads, ref_sequence, chrom,
                                 start_offset=0, flank=30, dfrac=1.0)
        for region, region_reads in regions:
            haplotype = build_haplotype(region.reference, region_reads,
                                        k=11, min_kmer_count=2)
            if haplotype.fail_reason:
                print('Failure {} at window\n{}'.format(haplotype.fail_reason, region))
                continue

            # align the haplotype to the reference sequence
            offset, cigar, _score, mismatch = banded_sw(region.reference, haplotype.seq)
            haplotype_start = region.start + offset
            _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
            haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                                  id='Haplotype{}'.format(region.start))
            # Uses dict.__setitem__ directly, bypassing whatever __setitem__
            # the annotations mapping defines -- presumably to skip a
            # validation step; TODO confirm.
            dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality',
                             [40] * len(haplotype.seq))
            haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None,
                                               'hw2_rg', False)
            if haplo_out is not None:
                haplo_out.write(haplo_read)

            for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chrom):
                if vcf_stream is not None:
                    vcf_stream.write_record(variant)
                print(vcf2m260(variant))
    finally:
        # Release outputs first so buffered records reach disk, then the input.
        if vcf_stream is not None:
            vcf_stream.flush()
            vcf_stream.close()
        if haplo_out is not None:
            haplo_out.close()
        if input_bam is not None:
            input_bam.close()
def test_shady_haplotype_alignment():
    """Test a real example of a bad alignment.

    The reference/haplotype pair is a trimmed prefix of a longer real case.
    ``banded_sw`` is expected to align it as ``12M6I32M`` with exactly one
    mismatch.
    """
    ref = 'AACAACAAC' + 'AACCTGGTCAGGAGTTGAGCCTCCATACTATACTT'
    h1 = 'AACAACAACAACGACAACCTGGTCAGGAGTTGAGCCTCCTTACTATACTT'
    expected_cigar, expected_mismatch = '12M6I32M', 1

    _offset, cigar, _score, mismatch = banded_sw(ref, h1)
    # Result is not inspected; the call is kept so the matrix routine is
    # still exercised on this input.
    _banded_sw_matrix(ref, h1)

    assert cigar == expected_cigar, 'E={} != O={}'.format(
        expected_cigar, cigar)
    assert mismatch == expected_mismatch, 'E={} != O={}'.format(
        expected_mismatch, mismatch)
def split_haplotype(region, haplotype, max_edit_per_10bp=1):
    """Yield the individual SNP / INS / DEL events implied by a haplotype.

    The haplotype is aligned to ``region.reference`` with ``banded_sw``; if
    that alignment reports a positive offset it is redone with
    ``full_sw(..., lenient=True)``.  The CIGAR is then walked one haplotype
    base at a time.

    Args:
        region: object exposing ``reference``, ``start`` and ``stop``.
        haplotype: object exposing ``seq`` and ``event_scores``.
        max_edit_per_10bp: if (number of CIGAR elements + mismatches) per 10
            haplotype bases exceeds this, the haplotype is treated as a bad
            alignment and no events are yielded.

    Yields:
        ``CalledEvent(position, bases, kind, score)`` where ``kind`` is
        ``'SNP'``, ``'INS'`` or ``'DEL'``.

    The generator ends with an explicit ``return`` when the CIGAR is
    exhausted rather than letting ``StopIteration`` escape from ``next()``,
    which would surface as ``RuntimeError`` under PEP 479.
    """
    offset, cigar, _aln_score, mismatch = banded_sw(
        region.reference, haplotype.seq, not_in_ref_penalty=40)
    if offset > 0:
        offset, cigar, _aln_score, mismatch = full_sw(
            region.reference, haplotype.seq, lenient=True)

    cigar_elements = [(int(size), oper) for size, oper in CIGAR_REGEX.findall(cigar)]
    if (len(cigar_elements) + mismatch) / (len(haplotype.seq) / 10.) > max_edit_per_10bp:
        # bad haplotype; kill the events
        print('Bad haplotype alignment at {}'.format((region.start, region.stop)))
        haplotype = haplotype.__class__('bad_alignment', seq=[], event_scores=[])

    cigar_elements = iter(cigar_elements)
    current = next(cigar_elements, None)
    if current is None:
        return
    cur_size, cur_oper = current

    # ref_adj: running difference between reference and haplotype coordinates
    # (decremented per inserted base, increased by each deletion's length).
    ref_adj = 0
    # Seed value handed to _get_score for the first event; its meaning is
    # defined by that helper.
    score = -2
    for h_offset, h_base in enumerate(haplotype.seq):
        if cur_oper == 'M':
            if h_base != region.reference[offset + h_offset + ref_adj]:
                score = _get_score(score, h_offset, haplotype.event_scores)
                # SNP (or MNP but we'll split those up)
                yield CalledEvent(region.start + h_offset + ref_adj, h_base, 'SNP', score)
            cur_size -= 1
            # NOTE(review): h_offset is haplotype-relative while region.stop
            # looks like a genomic coordinate -- confirm this comparison.
            if cur_size == 0 and h_offset < region.stop:
                # there's some next, non-M cigar string, so yield it as an event
                current = next(cigar_elements, None)
                if current is None:
                    return
                cur_size, cur_oper = current
                if cur_oper == 'I':
                    score = _get_score(score, h_offset, haplotype.event_scores, indel_adj=1)
                    # insertion is next base; emit anchor base + inserted bases
                    yield CalledEvent(
                        region.start + h_offset + ref_adj,
                        h_base + haplotype.seq[1 + h_offset:(1 + h_offset + cur_size)],
                        'INS', score)
                elif cur_oper == 'D':
                    score = _get_score(score, h_offset, haplotype.event_scores, indel_adj=1)
                    # deletion is next base; emit anchor base + deleted bases
                    # NOTE(review): unlike the SNP lookup above, this slice
                    # does not add `offset` -- verify for offset > 0.
                    yield CalledEvent(
                        region.start + h_offset + ref_adj,
                        region.reference[(h_offset + ref_adj):(ref_adj + 1 + h_offset + cur_size)],
                        'DEL', score)
        else:
            if cur_oper == 'I':
                # each inserted haplotype base shifts the reference index back
                ref_adj -= 1
                cur_size -= 1
            else:
                # skip over the whole deleted reference span at once
                # NOTE(review): this consumes one haplotype base without
                # comparing it or counting it against the following 'M' run
                # -- confirm this is intended.
                ref_adj += cur_size
                cur_size = 0
            if cur_size == 0:
                # must be an 'M' here
                current = next(cigar_elements, None)
                if current is None:
                    return
                cur_size, cur_oper = current
def test_banded_sw():
    """\
    Test that the banded smith waterman captures the same events as the full.
    These are real examples of screw-ups.

    Each case's CIGAR from ``banded_sw`` is compared with the expected one.
    On a mismatch, ``ValueError`` is raised with the full and banded
    matrices printed untruncated.
    """
    Example = namedtuple('TestExample', ('ref', 'read', 'expected_cigar'))
    ref1 = 'CCCTTTAGTTATGCTTTCTCTTCGGCGGGCGTGGGAC' + 'CGTAATGAGAACTGTACATCAGTCTG'
    read1 = 'TATGCTTTCTCTTCGGCGGGCGTGGGACAAAATCGTAATGAGAACTGTAC'
    cig1 = '28M5I17M'
    cases = [Example(ref1, read1, cig1)]

    for case in cases:
        _, cigar, _, _ = banded_sw(case.ref, case.read)
        if cigar != case.expected_cigar:
            import sys

            import numpy
            # threshold must be numeric; sys.maxsize disables summarisation
            # so the matrices in the error message are printed in full.
            numpy.set_printoptions(threshold=sys.maxsize)
            full_mat, _ = _full_sw_matrix(case.ref, case.read)
            banded_mat, _ = _banded_sw_matrix(case.ref, case.read)
            raise ValueError(
                'Expected cigar: {}, observed: {}'.format(
                    case.expected_cigar, cigar) +
                '\nFull:\n{}\n\nBanded:\n{}'.format(full_mat, banded_mat))