def test_init(self):
    """Check the fields a Breakpoint exposes after construction.

    The first half builds a Breakpoint from the fixture entry with
    ``fixed_slop=1`` and checks the SV type, chromosomes, strands, the
    +/-11 interval around each position, and the probability vectors.

    The second half (previously covered in l_bp_tests) checks how the
    ``percent_slop`` and ``fixed_slop`` arguments pad the PRPOS/PREND
    probability vectors with 1e-100 entries.
    """
    bp = Breakpoint(self.entry, fixed_slop=1)
    self.assertEqual(bp.l, self.entry)
    self.assertEqual(bp.sv_type, 'BND')
    self.assertEqual(bp.left.chrom, '1')
    self.assertEqual(bp.right.chrom, '10')
    self.assertEqual(bp.strands, '++:5')
    self.assertEqual(bp.left.start, 9572383 - 11)
    self.assertEqual(bp.left.end, 9572383 + 11)
    self.assertEqual(bp.right.start, 94079366 - 11)
    self.assertEqual(bp.right.end, 94079366 + 11)
    self.assertEqual(bp.left.p, self.prpos)
    self.assertEqual(bp.right.p, self.prend)

    # This was previously implemented in l_bp_tests, adding in here too
    test_line = '1 1000 2345_1 N [2:1100[N 0.00 . SVTYPE=BND;STRANDS=--:7;IMPRECISE;CIPOS=-2,2;CIEND=-2,2;CIPOS95=-1,1;CIEND95=-1,1;MATEID=2345_2;EVENT=2345;SU=7;PE=7;SR=0;PRPOS=0.025,0.25,0.45,0.25,0.025;PREND=0.025,0.25,0.45,0.25,0.025'

    # Without slop the probability vectors are taken as given.
    no_slop = Breakpoint(test_line)
    self.assertEqual(no_slop.left.p, [0.025, 0.25, 0.45, 0.25, 0.025])
    self.assertEqual(no_slop.right.p, [0.025, 0.25, 0.45, 0.25, 0.025])

    # fixed_slop=1 is expected to add one 1e-100 entry on each side.
    fixed_slop = Breakpoint(test_line, fixed_slop=1)
    self.assertEqual(fixed_slop.left.p, [1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100])
    self.assertEqual(fixed_slop.right.p, [1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100])

    # percent_slop=0.2 on a five-element vector is expected to pad the same way.
    percent_slop = Breakpoint(test_line, percent_slop=0.2)
    self.assertEqual(percent_slop.left.p, [1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100])
    self.assertEqual(percent_slop.right.p, [1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100])

    # With both given, two 1e-100 entries are expected on each side.
    percent_and_fixed_slop = Breakpoint(test_line, percent_slop=0.2, fixed_slop=2)
    self.assertEqual(percent_and_fixed_slop.left.p, [1e-100, 1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100, 1e-100])
    self.assertEqual(percent_and_fixed_slop.right.p, [1e-100, 1e-100, 0.025, 0.25, 0.45, 0.25, 0.025, 1e-100, 1e-100])
def test_str(self):
    """str(Breakpoint) must be the tab-joined interval, type, strand and probability fields."""
    bp = Breakpoint(self.entry, fixed_slop=1)

    left_interval = [str(9572383 - 11), str(9572383 + 11)]
    right_interval = [str(94079366 - 11), str(94079366 + 11)]
    fields = (
        ['1'] + left_interval
        + ['10'] + right_interval
        + ['BND', '++:5', str(self.prpos), str(self.prend)]
    )

    self.assertEqual(str(bp), '\t'.join(fields))
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    """Stream a VCF, emit its header, and cluster breakpoints line by line.

    Header handling: every ``##`` line is collected. When the ``#CHROM``
    line is reached, sample names are pulled from the collected
    ``##SAMPLE`` lines (characters 13 up to the last one, which presumably
    assumes the form ``##SAMPLE=<ID=name>``), the column line is rebuilt,
    an ``ALG`` INFO field is registered, and the header is written to
    stdout.

    Record handling: each record becomes a ``Breakpoint``. A record joins
    the current group when the group is empty, or when its left start is
    not past the largest left end in the group and it shares the group's
    left chromosome and SV type. Otherwise the group is passed to
    ``r_cluster`` and a new group starts with this record.

    Args:
        file_name: input passed to ``InputStream``.
        tempdir: second argument passed to ``InputStream``.
        percent_slop, fixed_slop: forwarded to ``Breakpoint``.
        use_product, weighting_scheme: forwarded to ``r_cluster``.
        include_genotypes: when true the sample names are appended to the
            column line; otherwise the column line is cut to eight fields.
    """
    next_id = 0
    header_lines = []
    awaiting_header = True
    vcf = Vcf()
    out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:
        group = []
        group_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if awaiting_header and line.startswith('##'):
                header_lines.append(line)
                continue

            if awaiting_header and line.startswith('#CHROM'):
                columns = line.rstrip().split('\t')
                sample_order.extend(
                    meta.rstrip()[13:-1]
                    for meta in header_lines
                    if meta[:8] == '##SAMPLE'
                )
                if include_genotypes:
                    columns.extend(sample_order)
                else:
                    columns = columns[:8]
                header_lines.append('\t'.join(columns))
                awaiting_header = False

                vcf.add_header(header_lines)
                vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                header_text = vcf.get_header() if include_genotypes else vcf.get_header(False)
                out.write(header_text + '\n')
                continue

            bp = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)
            extends_group = not group or (
                bp.left.start <= group_max_end
                and bp.left.chrom == group_chrom
                and bp.sv_type == group_type
            )
            if extends_group:
                group.append(bp)
                group_max_end = max(group_max_end, bp.left.end)
            else:
                # The current group is complete: cluster it, then restart from bp.
                next_id = r_cluster(group, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
                group = [bp]
                group_max_end = bp.left.end
            group_chrom = bp.left.chrom
            group_type = bp.sv_type

        # Flush whatever is left once the stream is exhausted.
        if group:
            next_id = r_cluster(group, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False):
    """Print the merged VCF header, then cluster breakpoints record by record.

    The leading ``#`` lines of ``file_name`` are collected, dropping
    ``##fileDate`` lines, exact duplicates and the file's own column
    line. A fixed eight-column ``#CHROM`` line is appended and the header
    is written to stdout.

    Every following line is parsed into a ``Breakpoint``. Consecutive
    breakpoints stay in one group while their left start does not exceed
    the largest left end seen in the group and they share the group's
    left chromosome and SV type. Each finished group is handed to
    ``r_cluster``.

    An input with no records (empty or header-only) now just prints the
    header and returns, instead of trying to parse a header line as a
    record. The file is closed even if parsing or clustering raises.

    Args:
        file_name: path of the VCF file to read.
        percent_slop, fixed_slop: forwarded to ``Breakpoint``.
        use_product: forwarded to ``r_cluster``.

    Returns:
        None.
    """
    # Function-scope import keeps this block independent of file-level imports.
    import sys

    v_id = 0
    vcf_headers = []
    first_record = None

    with open(file_name, 'r') as infile:
        for l in infile:
            if not l.startswith('#'):
                # First non-header line: remember it and stop header parsing.
                first_record = l
                break
            if not l.startswith('##'):
                # The input's own column line is replaced by the fixed one below.
                continue
            # ignore fileDate
            if l[:10] == '##fileDate':
                continue
            if l not in vcf_headers:
                vcf_headers.append(l)

        vcf_headers.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

        sample_order = []
        for header in vcf_headers:
            if header[:8] == '##SAMPLE':
                # Presumably the form is ##SAMPLE=<ID=name>; keep the name part.
                sample_order.append(header.rstrip()[13:-1])
            sys.stdout.write(header)

        if first_record is None:
            # Nothing to cluster.
            return

        # Seed the first group with the record that ended header parsing.
        b = Breakpoint(l_bp.parse_vcf_record(first_record), percent_slop=percent_slop, fixed_slop=fixed_slop)
        BP_l = [b]
        BP_max_end_l = max(-1, b.end_l)
        BP_chr_l = b.chr_l
        BP_sv_type = b.sv_type

        for l in infile:
            b = Breakpoint(l_bp.parse_vcf_record(l), percent_slop=percent_slop, fixed_slop=fixed_slop)
            if (b.start_l <= BP_max_end_l) and (b.chr_l == BP_chr_l) and (b.sv_type == BP_sv_type):
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.end_l)
            else:
                v_id = r_cluster(BP_l, sample_order, v_id, use_product)
                BP_l = [b]
                BP_max_end_l = b.end_l
                BP_sv_type = b.sv_type
                BP_chr_l = b.chr_l

        # The group is never empty here, so the last one always needs flushing.
        v_id = r_cluster(BP_l, sample_order, v_id, use_product)
def test_ovl(self):
    """Regression check on the overlap score between two slop variants of one entry."""
    narrow = Breakpoint(self.entry, fixed_slop=1)
    wide = Breakpoint(self.entry, fixed_slop=2)

    # Regression test only: the expected value was produced by the existing
    # code, and its correctness has not been independently established.
    overlap = narrow.ovl(wide)
    self.assertEqual(overlap, 1.0)
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    """Stream a VCF, emit its header, and cluster breakpoints line by line.

    Header handling: every ``##`` line is collected. At the ``#CHROM``
    line, sample names are read from the collected ``##SAMPLE`` lines
    (characters 13 up to the last one, which presumably assumes the form
    ``##SAMPLE=<ID=name>``), skipping any sample named ``VARIOUS``. The
    column line is rebuilt, an ``ALG`` INFO field is registered, and the
    header is written to stdout.

    Record handling: each record becomes a ``Breakpoint``. A record joins
    the current group when the group is empty, or when its left start is
    not past the largest left end in the group and it shares the group's
    left chromosome and SV type. Otherwise the group is passed to
    ``r_cluster`` and a new group starts with this record.

    Args:
        file_name: input passed to ``InputStream``.
        tempdir: second argument passed to ``InputStream``.
        percent_slop, fixed_slop: forwarded to ``Breakpoint``.
        use_product, weighting_scheme: forwarded to ``r_cluster``.
        include_genotypes: when true the column line keeps its first nine
            fields followed by the sample names; otherwise it is cut to
            eight fields and the header is requested without samples.
    """
    next_id = 0
    header_lines = []
    awaiting_header = True
    vcf = Vcf()
    out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:
        group = []
        group_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if awaiting_header and line.startswith('##'):
                header_lines.append(line)
                continue

            if awaiting_header and line.startswith('#CHROM'):
                columns = line.rstrip().split('\t')
                for meta in header_lines:
                    if meta[:8] != '##SAMPLE':
                        continue
                    sample_name = meta.rstrip()[13:-1]
                    if sample_name != 'VARIOUS':
                        sample_order.append(sample_name)

                if include_genotypes:
                    # Keep the fixed columns through FORMAT, dropping any
                    # existing sample column such as VARIOUS.
                    columns = columns[:9]
                    columns.extend(sample_order)
                else:
                    # No FORMAT column when genotypes are left out.
                    columns = columns[:8]
                header_lines.append('\t'.join(columns))
                awaiting_header = False

                vcf.add_header(header_lines)
                vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                if include_genotypes:
                    header_text = vcf.get_header()
                else:
                    header_text = vcf.get_header(include_samples=False)
                out.write(header_text + '\n')
                continue

            # Header is done; everything from here on is a record.
            bp = Breakpoint(
                l_bp.parse_vcf_record(line),
                percent_slop=percent_slop,
                fixed_slop=fixed_slop)
            extends_group = not group or (
                bp.left.start <= group_max_end
                and bp.left.chrom == group_chrom
                and bp.sv_type == group_type
            )
            if extends_group:
                group.append(bp)
                group_max_end = max(group_max_end, bp.left.end)
            else:
                # The current group is complete: cluster it, then restart from bp.
                next_id = r_cluster(group, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
                group = [bp]
                group_max_end = bp.left.end
            group_chrom = bp.left.chrom
            group_type = bp.sv_type

        # Flush whatever is left once the stream is exhausted.
        if group:
            next_id = r_cluster(group, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
def test_floats_from_tag(self):
    """floats_from_tag parses a comma-separated tag and raises RuntimeError for a missing one."""
    bp = Breakpoint(self.entry, fixed_slop=1)
    info = {'TAG': '1.2,1.3'}

    parsed = bp.floats_from_tag(info, 'TAG')
    self.assertEqual(parsed, [1.2, 1.3])

    # 'AG' is not a key of info, so the lookup is expected to fail.
    self.assertRaises(RuntimeError, bp.floats_from_tag, info, 'AG')