Beispiel #1
0
    def test_init(self):
        # Verifies the attributes a Breakpoint exposes after construction:
        # first for the shared fixture entry, then for an inline record built
        # with each combination of the slop arguments.
        bp = Breakpoint(self.entry, fixed_slop=1)
        self.assertEqual(bp.l, self.entry)
        self.assertEqual(bp.sv_type, 'BND')
        self.assertEqual(bp.left.chrom, '1')
        self.assertEqual(bp.right.chrom, '10')
        self.assertEqual(bp.strands, '++:5')
        # NOTE(review): the +/-11 window presumably combines the fixture's
        # confidence interval with fixed_slop=1 -- confirm against setUp.
        self.assertEqual(bp.left.start, 9572383 - 11)
        self.assertEqual(bp.left.end, 9572383 + 11)
        self.assertEqual(bp.right.start, 94079366 - 11)
        self.assertEqual(bp.right.end, 94079366 + 11)
        self.assertEqual(bp.left.p, self.prpos)
        self.assertEqual(bp.right.p, self.prend)

        # This was previously implemented in l_bp_tests, adding in here too
        test_line = '\t'.join([
            '1', '1000', '2345_1', 'N', '[2:1100[N', '0.00', '.',
            'SVTYPE=BND;STRANDS=--:7;IMPRECISE;CIPOS=-2,2;CIEND=-2,2;'
            'CIPOS95=-1,1;CIEND95=-1,1;MATEID=2345_2;EVENT=2345;SU=7;PE=7;'
            'SR=0;PRPOS=0.025,0.25,0.45,0.25,0.025;'
            'PREND=0.025,0.25,0.45,0.25,0.025',
        ])
        # Probability vector carried by PRPOS/PREND in the record above, and
        # the single bin the assertions expect for each position of padding.
        core = [0.025, 0.25, 0.45, 0.25, 0.025]
        pad = [1e-100]

        # Without slop the vectors are expected unchanged.
        no_slop = Breakpoint(test_line)
        self.assertEqual(no_slop.left.p, core)
        self.assertEqual(no_slop.right.p, core)

        # fixed_slop=1: one padding bin expected on each side.
        fixed_slop = Breakpoint(test_line, fixed_slop=1)
        self.assertEqual(fixed_slop.left.p, pad + core + pad)
        self.assertEqual(fixed_slop.right.p, pad + core + pad)

        # percent_slop=0.2: also one padding bin expected on each side.
        # (A leftover debug print of this object used to sit here; it only
        # added noise to the test output and was Python 2-only syntax.)
        percent_slop = Breakpoint(test_line, percent_slop=0.2)
        self.assertEqual(percent_slop.left.p, pad + core + pad)
        self.assertEqual(percent_slop.right.p, pad + core + pad)

        # Both arguments together with fixed_slop=2: two padding bins expected
        # on each side.
        percent_and_fixed_slop = Breakpoint(test_line, percent_slop=0.2,
                                            fixed_slop=2)
        self.assertEqual(percent_and_fixed_slop.left.p,
                         pad * 2 + core + pad * 2)
        self.assertEqual(percent_and_fixed_slop.right.p,
                         pad * 2 + core + pad * 2)
Beispiel #2
0
    def test_str(self):
        # str(Breakpoint) is expected to be a single tab-separated line.
        # The columns checked here are, in order: '1', a start/end pair
        # around 9572383, '10', a start/end pair around 94079366, 'BND',
        # '++:5', and the fixture's prpos and prend values.
        bp = Breakpoint(self.entry, fixed_slop=1)

        left_pos = 9572383
        right_pos = 94079366
        # NOTE(review): the 11-base margin presumably comes from the fixture's
        # confidence interval plus fixed_slop=1 -- confirm against setUp.
        margin = 11

        columns = [
            '1', left_pos - margin, left_pos + margin,
            '10', right_pos - margin, right_pos + margin,
            'BND', '++:5',
            self.prpos, self.prend,
        ]
        rendered = '\t'.join(str(column) for column in columns)

        self.assertEqual(str(bp), rendered)
Beispiel #3
0
def l_cluster_by_line(file_name,
                      tempdir,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False,
                      include_genotypes=False,
                      weighting_scheme='unweighted'):
    """Stream a VCF and pass runs of overlapping breakpoints to r_cluster.

    The header is collected, given an extra ``ALG`` INFO definition and
    written to stdout.  Every following line is turned into a Breakpoint.
    Consecutive breakpoints are gathered into one run while they share the
    same left chromosome and SV type and their left start does not exceed
    the largest left end seen in the run; each finished run is handed to
    r_cluster together with the running id it returns.

    NOTE(review): grouping only looks at adjacent records, so the input is
    presumably expected to be sorted -- confirm with the caller.

    Args:
        file_name: input passed to InputStream.
        tempdir: second argument passed to InputStream.
        percent_slop: forwarded to Breakpoint.
        fixed_slop: forwarded to Breakpoint.
        use_product: forwarded to r_cluster.
        include_genotypes: when true, the sample names found in the
            ``##SAMPLE`` header lines are appended to the ``#CHROM`` line and
            the full header is written; otherwise the ``#CHROM`` line is cut
            to its first eight columns and ``get_header(False)`` is written.
        weighting_scheme: forwarded to r_cluster.

    Returns:
        None.  All output goes through ``sys.stdout``.
    """
    next_id = 0
    meta_lines = []
    reading_header = True
    vcf = Vcf()
    vcf_out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:

        sample_order = []
        cluster = []
        cluster_chrom = ''
        cluster_type = ''
        cluster_max_end = -1

        for line in vcf_stream:

            if reading_header:
                if line.startswith('##'):
                    meta_lines.append(line)
                elif line.startswith('#CHROM'):
                    # The [13:-1] slice drops the first 13 characters and the
                    # last one -- presumably '##SAMPLE=<ID=' and a closing '>'.
                    sample_order.extend(
                        meta.rstrip()[13:-1] for meta in meta_lines
                        if meta[:8] == '##SAMPLE')

                    columns = line.rstrip().split('\t')
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    meta_lines.append('\t'.join(columns))
                    reading_header = False

                    vcf.add_header(meta_lines)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')

                    header_text = (vcf.get_header() if include_genotypes
                                   else vcf.get_header(False))
                    vcf_out.write(header_text + '\n')

                # Any line seen while still in the header is never treated
                # as a record, whether or not it was recognised above.
                continue

            bp = Breakpoint(l_bp.parse_vcf_record(line),
                            percent_slop=percent_slop,
                            fixed_slop=fixed_slop)

            joins_cluster = not cluster or (
                bp.left.start <= cluster_max_end and
                bp.left.chrom == cluster_chrom and
                bp.sv_type == cluster_type)

            if joins_cluster:
                cluster.append(bp)
                cluster_max_end = max(cluster_max_end, bp.left.end)
            else:
                # The current run is complete: emit it, then start a new run
                # seeded with this breakpoint.
                next_id = r_cluster(cluster, sample_order, next_id,
                                    use_product, vcf, vcf_out,
                                    include_genotypes, weighting_scheme)
                cluster = [bp]
                cluster_max_end = bp.left.end

            # In both cases the run now ends with this breakpoint.
            cluster_chrom = bp.left.chrom
            cluster_type = bp.sv_type

        # Emit whatever run is still open once the stream is exhausted.
        if cluster:
            r_cluster(cluster, sample_order, next_id, use_product, vcf,
                      vcf_out, include_genotypes, weighting_scheme)
Beispiel #4
0
def l_cluster_by_line(file_name,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False):
    """Read a VCF file and pass runs of overlapping breakpoints to r_cluster.

    The ``##`` header lines are echoed to stdout once each (``##fileDate``
    lines are dropped), followed by a fixed eight-column ``#CHROM`` line that
    replaces the one in the file.  Every remaining line is turned into a
    Breakpoint.  Consecutive breakpoints are gathered into one run while they
    share ``chr_l`` and ``sv_type`` and their ``start_l`` does not exceed the
    largest ``end_l`` seen in the run; each finished run is handed to
    r_cluster together with the running id it returns.

    An input with no record lines (empty, or header only) now just prints the
    header and returns; previously a header line was fed to the record
    parser, or the function failed on an unbound variable.

    NOTE(review): grouping only looks at adjacent records, so the input is
    presumably expected to be sorted -- confirm with the caller.

    Args:
        file_name: path of the VCF file to read.
        percent_slop: forwarded to Breakpoint.
        fixed_slop: forwarded to Breakpoint.
        use_product: forwarded to r_cluster.

    Returns:
        None.  The header is written to ``sys.stdout``.
    """
    # Imported locally so the function does not rely on module-level imports.
    import itertools
    import sys

    v_id = 0
    vcf_headers = []
    first_record = None

    # The context manager guarantees the file is closed even when parsing or
    # clustering raises.
    with open(file_name, 'r') as infile:
        for l in infile:
            if not l.startswith('#'):
                # First non-header line: remember it and stop header parsing.
                first_record = l
                break
            if not l.startswith('##'):
                # Column-header line; a fixed replacement is emitted below.
                continue
            # ignore fileDate
            if l.startswith('##fileDate'):
                continue
            if l not in vcf_headers:
                vcf_headers.append(l)

        vcf_headers.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

        sample_order = []
        for header in vcf_headers:
            if header.startswith('##SAMPLE'):
                # The [13:-1] slice drops the first 13 characters and the
                # last one -- presumably '##SAMPLE=<ID=' and a closing '>'.
                sample_order.append(header.rstrip()[13:-1])
            # Header lines keep their own newline, so write them verbatim.
            sys.stdout.write(header)

        if first_record is None:
            # Nothing to cluster.
            return

        BP_l = []
        BP_sv_type = ''
        BP_max_end_l = -1
        BP_chr_l = ''

        # The line that ended header parsing is processed first, followed by
        # the rest of the file.
        for l in itertools.chain([first_record], infile):
            b = Breakpoint(l_bp.parse_vcf_record(l),
                           percent_slop=percent_slop,
                           fixed_slop=fixed_slop)
            if not BP_l or ((b.start_l <= BP_max_end_l) and
                            (b.chr_l == BP_chr_l) and
                            (b.sv_type == BP_sv_type)):
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.end_l)
            else:
                # The current run is complete: emit it and start a new one.
                v_id = r_cluster(BP_l, sample_order, v_id, use_product)
                BP_l = [b]
                BP_max_end_l = b.end_l
            BP_chr_l = b.chr_l
            BP_sv_type = b.sv_type

        # Emit the run that is still open at end of file.
        if BP_l:
            r_cluster(BP_l, sample_order, v_id, use_product)
Beispiel #5
0
 def test_ovl(self):
     # Regression test: the expected value was obtained by running the
     # existing implementation. Its correctness is unknown.
     narrow = Breakpoint(self.entry, fixed_slop=1)
     wide = Breakpoint(self.entry, fixed_slop=2)
     overlap = narrow.ovl(wide)
     self.assertEqual(overlap, 1.0)
Beispiel #6
0
def l_cluster_by_line(file_name,
                      tempdir,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False,
                      include_genotypes=False,
                      weighting_scheme='unweighted'):
    """Stream a VCF and pass runs of overlapping breakpoints to r_cluster.

    The header is collected, given an extra ``ALG`` INFO definition and
    written to stdout.  Every following line is turned into a Breakpoint.
    Consecutive breakpoints are gathered into one run while they share the
    same left chromosome and SV type and their left start does not exceed
    the largest left end seen in the run; each finished run is handed to
    r_cluster together with the running id it returns.

    NOTE(review): grouping only looks at adjacent records, so the input is
    presumably expected to be sorted -- confirm with the caller.

    Args:
        file_name: input passed to InputStream.
        tempdir: second argument passed to InputStream.
        percent_slop: forwarded to Breakpoint.
        fixed_slop: forwarded to Breakpoint.
        use_product: forwarded to r_cluster.
        include_genotypes: when true, the ``#CHROM`` line is cut to its first
            nine columns, the sample names from the ``##SAMPLE`` header lines
            are appended and the full header is written; otherwise the line
            is cut to eight columns and the header is written with
            ``include_samples=False``.
        weighting_scheme: forwarded to r_cluster.

    Returns:
        None.  All output goes through ``sys.stdout``.
    """
    running_id = 0

    header_open = True
    header_lines = []
    vcf = Vcf()
    vcf_out = sys.stdout

    with InputStream(file_name, tempdir) as vcf_stream:
        members = []
        members_type = ''
        furthest_end = -1
        members_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if header_open:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')

                    # Collect sample names from the ##SAMPLE lines, skipping
                    # the placeholder name 'VARIOUS'.  The [13:-1] slice drops
                    # the first 13 characters and the last one -- presumably
                    # '##SAMPLE=<ID=' and a closing '>'.
                    for meta in header_lines:
                        if meta[:8] != '##SAMPLE':
                            continue
                        sample_name = meta.rstrip()[13:-1]
                        if sample_name != 'VARIOUS':
                            sample_order.append(sample_name)

                    if include_genotypes:
                        # Keep the columns up to and including the ninth,
                        # then list the collected samples.
                        columns = columns[:9] + sample_order
                    else:
                        # First eight columns only.
                        columns = columns[:8]
                    header_lines.append('\t'.join(columns))
                    header_open = False

                    vcf.add_header(header_lines)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')

                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(include_samples=False)
                    vcf_out.write(header_text + '\n')

                # Any line seen while still in the header is never treated
                # as a record, whether or not it was recognised above.
                continue

            bp = Breakpoint(
                l_bp.parse_vcf_record(line),
                percent_slop=percent_slop,
                fixed_slop=fixed_slop)

            # A breakpoint closes the current run when the run is non-empty
            # and it differs in chromosome or SV type, or starts beyond the
            # furthest left end collected so far.
            closes_run = bool(members) and not (
                (bp.left.start <= furthest_end) and
                (bp.left.chrom == members_chrom) and
                (bp.sv_type == members_type))

            if closes_run:
                running_id = r_cluster(members, sample_order, running_id,
                                       use_product, vcf, vcf_out,
                                       include_genotypes, weighting_scheme)
                members = [bp]
                furthest_end = bp.left.end
            else:
                members.append(bp)
                furthest_end = max(furthest_end, bp.left.end)

            # In both cases the run now ends with this breakpoint.
            members_chrom = bp.left.chrom
            members_type = bp.sv_type

        # Emit whatever run is still open once the stream is exhausted.
        if members:
            r_cluster(members, sample_order, running_id, use_product, vcf,
                      vcf_out, include_genotypes, weighting_scheme)
Beispiel #7
0
 def test_ovl(self):
     # Regression test only: 1.0 is what the existing code produced for the
     # same entry built with slop 1 versus slop 2. Its correctness is
     # unknown.
     first = Breakpoint(self.entry, fixed_slop=1)
     second = Breakpoint(self.entry, fixed_slop=2)
     observed = first.ovl(second)
     self.assertEqual(observed, 1.0)
Beispiel #8
0
 def test_floats_from_tag(self):
     # floats_from_tag should turn a comma-separated INFO value into a list
     # of floats, and raise RuntimeError when the requested tag is absent.
     bp = Breakpoint(self.entry, fixed_slop=1)
     tags = {'TAG': '1.2,1.3'}

     parsed = bp.floats_from_tag(tags, 'TAG')
     self.assertEqual(parsed, [1.2, 1.3])

     # 'AG' is not a key of the mapping above.
     self.assertRaises(RuntimeError, bp.floats_from_tag, tags, 'AG')