コード例 #1
0
ファイル: genotype_tests.py プロジェクト: jeldred/svtools
class TestGenotype(TestCase):
    """Tests for Genotype format handling on a single-sample BND variant."""

    def setUp(self):
        """Parse a small header and one variant line that uses only GT and SU."""
        meta_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        ]
        column_line = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA12878',
        ])
        self.vcf = Vcf()
        self.vcf.add_header(meta_lines + [column_line])

        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_format(self):
        """Setting a declared-but-unused format stores it and activates it."""
        genotype = Genotype(self.variant, '0/1')
        self.assertNotIn('INACTIVE', self.variant.active_formats)
        genotype.set_format('INACTIVE', 10)
        self.assertEqual(genotype.format['INACTIVE'], 10)
        self.assertIn('INACTIVE', self.variant.active_formats)

    def test_get_format(self):
        """A value written with set_format is returned by get_format."""
        genotype = Genotype(self.variant, '0/1')
        genotype.set_format('INACTIVE', 10)
        self.assertEqual(genotype.get_format('INACTIVE'), 10)

    def test_get_gt_string(self):
        """The rendered genotype string is expected to be '0/1:.:10'."""
        genotype = Genotype(self.variant, '0/1')
        genotype.set_format('INACTIVE', 10)
        rendered = genotype.get_gt_string()
        self.assertEqual(rendered, '0/1:.:10')
コード例 #2
0
    def test_duplicate_sample(self):
        """add_header must exit when the column line repeats a sample name."""
        meta_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##source=myImputationProgramV3.1',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
            '##phasing=partial',
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
            '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
            '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
            '##ALT=<ID=DEL,Description="DELETION">',
            '##FILTER=<ID=q10,Description="Quality below 10">',
            '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
        ]
        # NA00001 appears twice among the sample columns.
        column_line = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA00001', 'NA00002', 'NA00001',
        ])

        vcf = Vcf()
        with self.assertRaises(SystemExit):
            vcf.add_header(meta_lines + [column_line])
コード例 #3
0
ファイル: file_tests.py プロジェクト: hall-lab/svtools
    def test_duplicate_sample(self):
        """A header whose sample columns repeat NA00001 makes add_header exit."""
        sample_columns = ['NA00001', 'NA00002', 'NA00001']
        fixed_columns = [
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT',
        ]
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##source=myImputationProgramV3.1',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
            '##phasing=partial',
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
            '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
            '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
            '##ALT=<ID=DEL,Description="DELETION">',
            '##FILTER=<ID=q10,Description="Quality below 10">',
            '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
        ]
        header_lines.append('\t'.join(fixed_columns + sample_columns))

        parser = Vcf()
        with self.assertRaises(SystemExit):
            parser.add_header(header_lines)
コード例 #4
0
ファイル: pairwise_ld.py プロジェクト: zyworship/svtools
def calc_ld(vcf_in, exclude_file, ld_outfile, winsz, minpos):
    """Stream a VCF and hand windows of variants to ``ld_calc``.

    Variants whose ``NSAMP`` INFO value exceeds ``minpos`` are accumulated
    per chromosome. The accumulated list is passed to ``ld_calc`` when the
    chromosome changes, when the list grows past an internal limit of 100
    entries, and once more at end of input.

    Args:
        vcf_in: Iterable of VCF text lines that also provides ``close()``.
        exclude_file: Iterable of sample names (one per line) with
            ``close()``, or None to keep every sample.
        ld_outfile: Path of the output file, or None. When given, a column
            header line is written to it here; the same value is forwarded
            to ``ld_calc``.
        winsz: Window size forwarded to ``ld_calc``; also determines how many
            trailing variants are retained when the list is trimmed.
        minpos: Threshold that ``var.info['NSAMP']`` must exceed.

    Returns:
        None.
    """
    vcf = Vcf()
    header = []
    in_header = True
    maxwin = 100

    # A set gives O(1) membership tests when filtering samples.
    exclude = set()
    if exclude_file is not None:
        exclude = set(line.rstrip() for line in exclude_file)

    outf = None
    if ld_outfile is not None:
        # NOTE(review): ld_calc receives the path, not this handle; confirm
        # how it writes to the same file.
        outf = open(ld_outfile, 'w', 4096)
        outf.write("id1\tid2\tnp1\tnp2\tr2\n")

    keep = []
    keep_built = False
    curlist = []
    curchr = None

    try:
        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                in_header = False
                vcf.add_header(header)

            var = Variant(line.rstrip().split('\t'), vcf)

            # Build the kept-sample list once; rebuilding it for every
            # variant would append each sample again and again.
            if not keep_built:
                keep = [s for s in var.sample_list if s not in exclude]
                keep_built = True

            if var.info['NSAMP'] > minpos:
                # Compare chromosome names by value, not identity.
                if curchr is not None and var.chr != curchr:
                    ld_calc(curlist, keep, ld_outfile, winsz)
                    curlist = [var]
                elif len(curlist) > maxwin:
                    ld_calc(curlist, keep, ld_outfile, winsz)
                    # Retain the tail so windows spanning the cut are kept.
                    curlist = curlist[(maxwin - 1 - winsz):]
                    curlist.append(var)
                else:
                    curlist.append(var)
                # Track the chromosome of the last accepted variant so that a
                # change is detected on the next one.
                curchr = var.chr

        ld_calc(curlist, keep, ld_outfile, winsz)
    finally:
        # Release every handle even if parsing or ld_calc fails.
        if outf is not None:
            outf.close()
        vcf_in.close()
        if exclude_file is not None:
            exclude_file.close()
コード例 #5
0
 def test_parse_meta(self):
     """parse_meta splits a FILTER meta line into its ID and Description fields."""
     meta_line = '##FILTER=<ID=MSQ_20,Description="Variant without read-depth support with MSQ > 20">'
     parsed = Vcf().parse_meta(meta_line)
     self.assertEqual(
         parsed,
         [
             'ID=MSQ_20',
             'Description="Variant without read-depth support with MSQ > 20"',
         ])
コード例 #6
0
ファイル: filter_del.py プロジェクト: zyworship/svtools
 def __init__(self, stream):
     """Consume the header from ``stream`` and load it into a new Vcf.

     Lines are read up to and including the one starting with '#CHROM'
     followed by a tab. A line that does not begin with '#' before that
     point raises RuntimeError.
     """
     self.vcf_obj = Vcf()
     self.stream = stream
     collected = []
     for raw_line in stream:
         if raw_line[0] != '#':
             message = 'Error parsing VCF header. Line is not a header line. {}'.format(raw_line)
             raise RuntimeError(message)
         collected.append(raw_line)
         reached_column_line = raw_line.startswith('#CHROM\t')
         if reached_column_line:
             # The column line is the last header line.
             break
     self.vcf_obj.add_header(collected)
コード例 #7
0
 def setUp(self):
     """Create a converter and a Vcf loaded with a one-sample header."""
     self.converter = VcfToBedpeConverter()
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20090805',
         '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
         '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
     ]
     column_line = '\t'.join([
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT', 'NA00001',
     ])
     self.vcf = Vcf()
     self.vcf.add_header(meta_lines + [column_line])
コード例 #8
0
 def setUp(self):
     """Load a two-sample header and parse one BND variant line."""
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20151202',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
         '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
     ]
     column_line = '\t'.join([
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT', 'NA12878', 'NA0001',
     ])
     self.vcf = Vcf()
     self.vcf.add_header(meta_lines + [column_line])

     # One genotype column per sample declared above.
     self.variant_line = '\t'.join([
         '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
         'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9', '1/1:15',
     ])
     self.variant = Variant(self.variant_line.split('\t'), self.vcf)
コード例 #9
0
 def test_add_genotype(self):
     """A variant whose FORMAT lacks GT is expected to render as './.:9'."""
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20151202',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
         '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
         '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
     ]
     column_line = '\t'.join([
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT', 'NA12878',
     ])
     parser = Vcf()
     parser.add_header(meta_lines + [column_line])

     # The only FORMAT key on the variant line is SU.
     record = '\t'.join([
         '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
         'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'SU', '9',
     ])
     parsed = Variant(record.split('\t'), parser)
     self.assertEqual(parsed.get_gt_string(), './.:9')
コード例 #10
0
ファイル: variant_tests.py プロジェクト: abelhj/svtools
 def test_add_genotype(self):
     """With SU as the only FORMAT key, get_gt_string should give './.:9'."""
     fixed_columns = [
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT',
     ]
     header = [
         '##fileformat=VCFv4.2',
         '##fileDate=20151202',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
         '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
         '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
     ]
     header.append('\t'.join(fixed_columns + ['NA12878']))

     vcf_file = Vcf()
     vcf_file.add_header(header)

     fields = [
         '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
         'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'SU', '9',
     ]
     text = '\t'.join(fields)
     rendered = Variant(text.split('\t'), vcf_file).get_gt_string()
     self.assertEqual(rendered, './.:9')
コード例 #11
0
 def test_init(self):
     """Vcf.Alt exposes id, an unquoted desc, and the ##ALT header string."""
     alt = Vcf.Alt('DEL:ME:ALU', '"Deletion of ALU element"')
     expected_header = '##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">'
     self.assertEqual(alt.id, 'DEL:ME:ALU')
     self.assertEqual(alt.desc, 'Deletion of ALU element')
     self.assertEqual(alt.hstring, expected_header)
コード例 #12
0
 def test_eight_column_vcf(self):
     """Check header round-trip and sample addition for a sample-less VCF.

     The input column line stops at INFO. After loading, get_header() is
     expected to reproduce the input with fileDate replaced by today's
     date. After add_sample('ScottPilgrim') the sample should map to
     column 9 and the column line should gain FORMAT and the sample name.
     """
     base_columns = [
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
     ]
     header_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20090805',
         '##source=myImputationProgramV3.1',
         '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
         '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
         '##phasing=partial',
         '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
         '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
         '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
         '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
         '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
         '##ALT=<ID=DEL,Description="DELETION">',
         '##FILTER=<ID=q10,Description="Quality below 10">',
         '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
         '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
         '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
         '\t'.join(base_columns),
     ]
     # Compute the date once so both expectations use the same value.
     today_line = '##fileDate=' + time.strftime('%Y%m%d')

     v = Vcf()
     v.add_header(header_lines)

     # Copy instead of aliasing so the input fixture is not mutated.
     expected_header_lines = list(header_lines)
     expected_header_lines[1] = today_line
     self.assertEqual(v.get_header(), '\n'.join(expected_header_lines))

     v.add_sample('ScottPilgrim')
     self.assertEqual(v.sample_to_col('ScottPilgrim'), 9)

     # Only the column line differs once a sample has been added.
     post_sample_add_header_lines = expected_header_lines[:-1] + [
         '\t'.join(base_columns + ['FORMAT', 'ScottPilgrim']),
     ]
     self.assertEqual(v.get_header(),
                      '\n'.join(post_sample_add_header_lines))
コード例 #13
0
ファイル: filter_del.py プロジェクト: zyworship/svtools
class VCFReader(object):
    """Read a VCF stream: header on construction, variants on iteration."""

    def __init__(self, stream):
        """Read header lines from ``stream`` into a new Vcf object.

        Reading stops after the line starting with '#CHROM' and a tab. Any
        earlier line that does not begin with '#' raises RuntimeError.
        """
        self.vcf_obj = Vcf()
        self.stream = stream
        header_lines = []
        for text in stream:
            if text[0] != '#':
                raise RuntimeError(
                    'Error parsing VCF header. Line is not a header line. {}'.format(text))
            header_lines.append(text)
            if text.startswith('#CHROM\t'):
                # The column line closes the header.
                break
        self.vcf_obj.add_header(header_lines)

    def __iter__(self):
        """Yield a Variant for each remaining line of the stream."""
        for text in self.stream:
            columns = text.rstrip().split('\t')
            yield Variant(columns, self.vcf_obj)
0
 def test_init(self):
     """Vcf.Filter exposes id, an unquoted desc, and the ##FILTER header string."""
     flt = Vcf.Filter('s50', '"Less than 50% of samples have data"')
     expected_header = '##FILTER=<ID=s50,Description="Less than 50% of samples have data">'
     self.assertEqual(flt.id, 's50')
     self.assertEqual(flt.desc, 'Less than 50% of samples have data')
     self.assertEqual(flt.hstring, expected_header)
コード例 #15
0
ファイル: sname_overlap.py プロジェクト: zyworship/svtools
def sname_filter(input_stream, filter_file, output_stream, complement):
    '''
    Copy VCF lines from input_stream to output_stream, selected by SNAME overlap.

    The header is written first with an added FOUND INFO definition. For each
    variant, the ids from its SNAME INFO value are compared against the
    entries loaded from filter_file. The variant is written, with FOUND set
    to the comma-joined overlapping ids, when bool(found) != complement;
    otherwise it is dropped.

    input_stream  -- iterable of VCF text lines
    filter_file   -- passed to load_filter_file to build the comparison list
    output_stream -- object with a write() method
    complement    -- compared against bool(found) to decide which lines to emit
    '''
    filter_list = load_filter_file(filter_file)

    vcf = Vcf()
    in_header = True
    header_lines = list()
    for line in input_stream:
        if in_header:
            header_lines.append(line)
            # The column line is the last header line.
            if line[0:6] == '#CHROM':
                in_header = False
                vcf.add_header(header_lines)
                vcf.add_info('FOUND', '.', 'String',
                             'Variant id in other file')
                output_stream.write(vcf.get_header() + '\n')
            continue

        v = Variant(line.rstrip().split('\t'), vcf)
        sname_set = set_from_string(v.get_info('SNAME'))
        found = overlapping_ids(sname_set, filter_list)
        if bool(found) != complement:
            v.set_info('FOUND', ','.join(found))
            output_stream.write(v.get_var_string() + '\n')
コード例 #16
0
ファイル: file_tests.py プロジェクト: mkiwala/svtools
    def test_all(self):
        """Check which header lines get_header() keeps, and sample column lookup.

        The expected header is a subset of the input lines with fileDate
        replaced by today's date. A sample added to the three existing ones
        is expected at column 12.
        """
        column_line = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA00001', 'NA00002', 'NA00003',
        ])
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##source=myImputationProgramV3.1',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
            '##phasing=partial',
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
            '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
            '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
            '##FILTER=<ID=q10,Description="Quality below 10">',
            '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
            '##ALT=<ID=DEL,Description="DELETION">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
            '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
            '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
            column_line,
        ]

        vcf = Vcf()
        vcf.add_header(header_lines)

        # Slices select the lines expected to survive the round trip.
        kept = (header_lines[:2] + header_lines[3:4] + header_lines[6:12]
                + header_lines[14:])
        kept[1] = '##fileDate=' + time.strftime('%Y%m%d')
        self.assertEqual(vcf.get_header(), '\n'.join(kept))

        vcf.add_sample('ScottPilgrim')
        self.assertEqual(vcf.sample_to_col('ScottPilgrim'), 12)
コード例 #17
0
 def test_init(self):
     """Vcf.Format stores its fields; the integer number 1 is expected back as '1'."""
     fmt = Vcf.Format('GT', 1, 'String', '"Genotype"')
     expected_header = '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
     self.assertEqual(fmt.id, 'GT')
     self.assertEqual(fmt.number, '1')
     self.assertEqual(fmt.type, 'String')
     self.assertEqual(fmt.desc, 'Genotype')
     self.assertEqual(fmt.hstring, expected_header)
コード例 #18
0
ファイル: variant_tests.py プロジェクト: jeldred/svtools
class TestVariant(TestCase):
    """Tests for Variant INFO/FORMAT accessors on a single-sample BND record."""

    def setUp(self):
        """Load a minimal header and parse one variant line into self.variant."""
        meta_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        ]
        column_line = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA12878',
        ])
        self.vcf = Vcf()
        self.vcf.add_header(meta_lines + [column_line])

        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_info(self):
        """set_info updates declared keys and exits on an undeclared one."""
        variant = self.variant
        variant.set_info('SVTYPE', 'INV')
        self.assertEqual(variant.info['SVTYPE'], 'INV')
        variant.set_info('IMAFLAG', False)
        self.assertEqual(variant.info['IMAFLAG'], False)
        # SUPER is not declared in the header.
        with self.assertRaises(SystemExit):
            variant.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for a missing key."""
        variant = self.variant
        self.assertEqual(variant.get_info('IMAFLAG'), True)
        self.assertEqual(variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            variant.get_info('CALI')

    def test_get_info_string(self):
        """The INFO string drops a flag once it has been set to False."""
        variant = self.variant
        self.assertEqual(variant.get_info_string(),
                         'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        variant.set_info('IMAFLAG', False)
        self.assertEqual(variant.get_info_string(),
                         'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        """The FORMAT string lists the keys used on the variant line."""
        rendered = self.variant.get_format_string()
        self.assertEqual(rendered, 'GT:SU')

    def test_genotype(self):
        """The NA12878 genotype renders back to its original column text."""
        sample_genotype = self.variant.genotype('NA12878')
        self.assertEqual(sample_genotype.get_gt_string(), '0/0:9')

    def test_var_string(self):
        """get_var_string reproduces the input line unchanged."""
        rendered = self.variant.get_var_string()
        self.assertEqual(rendered, self.variant_line)
コード例 #19
0
 def test_init(self):
     """Vcf.Info stores its fields; the integer number 1 is expected back as '1'."""
     info = Vcf.Info('NS', 1, 'Integer', '"Number of Samples With Data"')
     expected_header = '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">'
     self.assertEqual(info.id, 'NS')
     self.assertEqual(info.number, '1')
     self.assertEqual(info.type, 'Integer')
     self.assertEqual(info.desc, 'Number of Samples With Data')
     self.assertEqual(info.hstring, expected_header)
コード例 #20
0
ファイル: variant_tests.py プロジェクト: hall-lab/svtools
 def test_var_string_format_caching(self):
     """Compare get_var_string output with and without the cached genotype text.

     The input line orders its FORMAT keys GT:AS:SU. After genotypes are
     parsed, the default output is expected in GT:SU:AS order, while
     use_cached_gt_string=True is expected to return the input line.
     """
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20151202',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
         '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
     ]
     column_line = '\t'.join([
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT', 'NA12878',
     ])
     parser = Vcf()
     parser.add_header(meta_lines + [column_line])

     # Both lines share the first eight columns and differ only in key order.
     shared_fields = [
         '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
         'SVTYPE=BND;STRANDS=-+:9;IMAFLAG',
     ]
     cached_text = '\t'.join(shared_fields + ['GT:AS:SU', '0/0:1:9'])
     reparsed_text = '\t'.join(shared_fields + ['GT:SU:AS', '0/0:9:1'])

     record = Variant(cached_text.split('\t'), parser)
     record.genotypes()  # force parsing
     self.assertEqual(record.get_var_string(), reparsed_text)
     self.assertEqual(record.get_var_string(use_cached_gt_string=True),
                      cached_text)
コード例 #21
0
def bedpeToVcf(bedpe_file, vcf_out):
    """Convert BEDPE lines to VCF lines and write them to ``vcf_out``.

    Leading '##' lines are carried over as VCF meta lines. The single-'#'
    column line is replaced by a VCF column line whose tail is whatever
    follows the first 20 tab-separated BEDPE columns. The VCF header is
    written before the first data line, or after the loop when the input
    contains header lines but no data. ``vcf_out`` is closed on return.

    Args:
        bedpe_file: Iterable of BEDPE text lines.
        vcf_out: Writable object with ``write()`` and ``close()``.

    Returns:
        None.
    """
    myvcf = Vcf()
    converter = BedpeToVcfConverter(myvcf)
    header = list()

    def write_header():
        # Load the collected lines and emit the header as VCFv4.2.
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    in_header = True
    # parse the bedpe data
    for line in bedpe_file:
        if in_header:
            if line[0:2] == '##':
                header.append(line)
                continue
            elif line.startswith('#'):
                # '##' was handled above, so this is the column line.
                # startswith avoids indexing past a bare '#'.
                sample_list_str = line.rstrip().split('\t', 20)[-1]
                header.append('\t'.join([
                    '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                    'INFO', sample_list_str
                ]))
                continue
            else:
                in_header = False
                write_header()

        bedpe = Bedpe(line.rstrip().split('\t'))
        for v in converter.convert(bedpe):
            vcf_out.write(v.get_var_string() + '\n')

    # Header-only input never reaches a data line; still emit the header so
    # the output is not an empty file.
    if in_header and header:
        write_header()

    # close the VCF output file
    vcf_out.close()

    return
コード例 #22
0
ファイル: bedpetovcf.py プロジェクト: abelhj/svtools
def bedpeToVcf(bedpe_file, vcf_out):
    """Translate a BEDPE stream into VCF text written to ``vcf_out``.

    '##' lines are kept as meta lines. The '#'-prefixed column line becomes
    a VCF column line ending with the text that follows the first 20
    tab-separated BEDPE columns. Each data line is converted through
    BedpeToVcfConverter and every resulting variant is written on its own
    line. The header is emitted before the first data line, or at end of
    input when only header lines were seen. ``vcf_out`` is closed on return.

    Args:
        bedpe_file: Iterable of BEDPE text lines.
        vcf_out: Writable object with ``write()`` and ``close()``.

    Returns:
        None.
    """
    vcf_columns = [
        '#CHROM',
        'POS',
        'ID',
        'REF',
        'ALT',
        'QUAL',
        'FILTER',
        'INFO',
    ]
    myvcf = Vcf()
    converter = BedpeToVcfConverter(myvcf)
    header = list()
    header_written = False

    for line in bedpe_file:
        if not header_written:
            if line[0:2] == '##':
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: '##' lines were consumed by the branch above.
                # startswith also tolerates a line that is just '#'.
                sample_list_str = line.rstrip().split('\t', 20)[-1]
                header.append('\t'.join(vcf_columns + [sample_list_str]))
                continue
            # First data line: the header is complete.
            myvcf.add_header(header)
            myvcf.file_format = 'VCFv4.2'
            vcf_out.write(myvcf.get_header() + '\n')
            header_written = True

        bedpe = Bedpe(line.rstrip().split('\t'))
        variants = converter.convert(bedpe)
        for v in variants:
            vcf_out.write(v.get_var_string() + '\n')

    # Input with header lines but no records would otherwise produce an
    # empty output file.
    if not header_written and header:
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    # close the VCF output file
    vcf_out.close()

    return
コード例 #23
0
class TestGenotype(TestCase):
    """Tests for Genotype equality and format access on a BND variant."""

    def setUp(self):
        """Load a minimal header and parse one variant that uses GT and SU."""
        meta_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        ]
        column_line = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA12878',
        ])
        self.vcf = Vcf()
        self.vcf.add_header(meta_lines + [column_line])

        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_equal(self):
        """Two genotypes built and modified the same way compare equal."""
        first = Genotype(self.variant, ['0/1'])
        first.set_format('INACTIVE', 10)
        second = Genotype(self.variant, ['0/1'])
        second.set_format('INACTIVE', 10)
        self.assertEqual(first, second)

    def test_set_format(self):
        """set_format stores the value and registers the key on the variant."""
        genotype = Genotype(self.variant, ['0/1'])
        self.assertNotIn('INACTIVE', self.variant.format_dict)
        genotype.set_format('INACTIVE', 10)
        self.assertEqual(genotype.get_format('INACTIVE'), 10)
        self.assertIn('INACTIVE', self.variant.format_dict)

    def test_get_format(self):
        """A value written with set_format is returned by get_format."""
        genotype = Genotype(self.variant, ['0/1'])
        genotype.set_format('INACTIVE', 10)
        stored = genotype.get_format('INACTIVE')
        self.assertEqual(stored, 10)

    def test_get_gt_string(self):
        """The rendered genotype string is expected to be '0/1:.:10'."""
        genotype = Genotype(self.variant, ['0/1'])
        genotype.set_format('INACTIVE', 10)
        rendered = genotype.get_gt_string()
        self.assertEqual(rendered, '0/1:.:10')
コード例 #24
0
 def setUp(self):
     """Prepare a VcfToBedpeConverter and a Vcf with a one-sample header."""
     self.converter = VcfToBedpeConverter()
     header = [
         '##fileformat=VCFv4.2',
         '##fileDate=20090805',
         '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
         '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
         '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
     ]
     header.append('\t'.join([
         '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
         'FORMAT', 'NA00001',
     ]))
     self.vcf = Vcf()
     self.vcf.add_header(header)
コード例 #25
0
ファイル: sname_overlap.py プロジェクト: zyworship/svtools
def load_filter_file(filter_file):
    '''
    Parse the VCF used as a filter into a list of (variant id, sname id set) tuples.
    Header lines are consumed up to and including the '#CHROM' line; every later
    line contributes one tuple built from its variant id and its SNAME INFO value.
    '''
    entries = []
    parser = Vcf()
    pending_header = []
    header_done = False
    for text in filter_file:
        if header_done:
            record = Variant(text.rstrip().split('\t'), parser)
            snames = set_from_string(record.get_info('SNAME'))
            entries.append((record.var_id, snames))
            continue
        pending_header.append(text)
        if text[0:6] == '#CHROM':
            # The column line ends the header.
            header_done = True
            parser.add_header(pending_header)
    return entries
コード例 #26
0
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF with a CN FORMAT value on each non-BND line.

    The output keeps the first nine columns plus the column of ``sample``.
    For every line in which no column contains "SVTYPE=BND", the next value
    from ``cn_list`` is appended as a new CN field, or overwrites the CN
    field if the FORMAT column already lists one. Lines containing
    "SVTYPE=BND" are written unchanged and do not consume a value. The
    header gains a CN FORMAT definition; it is written before the first data
    line, or at end of input when only header lines were seen. ``vcf_out``
    is closed on return.

    Args:
        vcf_file: Iterable of VCF text lines.
        sample: Sample name to look up in the column line.
        vcf_out: Writable object with ``write()`` and ``close()``.
        cn_list: Indexable of copy-number values, one per non-BND line, in
            file order.

    Returns:
        None. Exits with status 1 when ``sample`` is not found.
    """
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1

    def write_header():
        # Load the collected header, declare CN, and emit it.
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float',
                       'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    # go through the VCF and add the read depth annotations
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: locate the sample and keep only its column.
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for " + sample +
                        " not found in VCF")
                    sys.exit(1)
                header.append('\t'.join(columns[:9] + [sample]))
                continue
            in_header = False
            write_header()

        v = line.rstrip().split('\t')
        # Reached when data lines appear without a preceding column line.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample +
                             " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            # Compare whole FORMAT keys; a substring test would treat any
            # key that merely contains "CN" as a match.
            format_keys = v[8].rstrip().split(":")
            if "CN" not in format_keys:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + str(cn_list[i])
            else:
                cn_index = format_keys.index("CN")
                gts = v[9].rstrip().split(":")
                gts[cn_index] = str(cn_list[i])
                v[9] = ":".join(gts)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')

    # Header-only input never reaches a data line; still emit the header.
    if in_header and header:
        write_header()

    vcf_out.close()
    return
コード例 #27
0
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample copy of a VCF with copy-number values filled in.

    The header is re-emitted with a ``CN`` FORMAT definition and with the
    column line trimmed to the nine fixed columns plus ``sample``.  Each
    record is reduced to those same ten columns; every record that does not
    contain ``SVTYPE=BND`` is handed to ``update_line_copynumber`` together
    with ``cn_list`` and a running index that advances once per such record.

    Args:
        vcf_file: iterable of VCF lines (header first).
        sample: sample name that must appear in the ``#CHROM`` column line.
        vcf_out: writable handle; it is closed before returning.
        cn_list: copy-number values.  If any entry is ``-1`` the whole list
            is replaced by ``'.'`` placeholders and a warning goes to stderr.

    Exits with status 1 when ``sample`` is not in the column line, or when a
    record is reached before any column line has been seen.
    """
    vcf = Vcf()
    if -1 in cn_list:
        sys.stderr.write(
            'cnvnator was unable to produce a copynumber value for one or more chromosomes. All copynumber values will be set to missing.'
        )
        cn_list = ['.'] * len(cn_list)

    meta_lines = []
    reading_header = True
    sample_col = -1
    cn_pos = 0
    for line in vcf_file:
        if reading_header:
            if line[0] == '#':
                if line[1] == '#':
                    # '##' meta-information line: keep verbatim.
                    meta_lines.append(line)
                else:
                    # '#CHROM' column line: locate the sample, then keep
                    # only the fixed columns plus that one sample.
                    columns = line.rstrip().split('\t')
                    try:
                        sample_col = columns.index(sample)
                    except ValueError:
                        sys.stderr.write(
                            "Please input valid VCF, format field for {0} not found in VCF"
                            .format(sample))
                        sys.exit(1)
                    kept = columns[:9] + [sample]
                    meta_lines.append('\t'.join([str(c) for c in kept]))
                continue
            # First record: the header is complete, so emit it once.
            reading_header = False
            vcf.add_header(meta_lines)
            vcf.add_format('CN', 1, 'Float',
                           'Copy number of structural variant segment.')
            vcf_out.write(vcf.get_header() + '\n')

        # Reached when a record shows up without a preceding column line.
        if sample_col == -1:
            sys.stderr.write(
                "Input a valid sample name: {0} not found in a provided VCF".
                format(sample))
            sys.exit(1)

        fields = line.rstrip().split('\t')
        record = fields[:9] + [fields[sample_col]]
        if all("SVTYPE=BND" not in field for field in record):
            update_line_copynumber(record, cn_list, cn_pos)
            cn_pos += 1
        vcf_out.write('\t'.join(record) + '\n')

    vcf_out.close()
    return
コード例 #28
0
ファイル: variant_tests.py プロジェクト: abelhj/svtools
 def setUp(self):
     """Parse one BND record against a small two-sample VCF header."""
     meta_lines = [
         '##fileformat=VCFv4.2',
         '##fileDate=20151202',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
         '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
         '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
     ]
     # Column line with two samples, NA12878 and NA0001.
     column_line = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA0001'

     self.vcf = Vcf()
     self.vcf.add_header(meta_lines + [column_line])

     record = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU\t0/0:9\t1/1:15'
     self.variant_line = record
     self.variant = Variant(record.split('\t'), self.vcf)
コード例 #29
0
ファイル: gt_silhouette.py プロジェクト: scchess/svtools
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    """Annotate every VCF record with its average genotype-cluster silhouette.

    Args:
        vcf_in: iterable of VCF lines; closed on return.
        vcf_out: writable handle for the annotated VCF; closed on return.
        diag_outfile: path of a CSV file that receives the per-record data
            frame returned by ``get_silhouette`` (column header written once).
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines;
            closed on return.
    """
    vcf = Vcf()

    # sample name -> integer code taken from the second column
    sex = {}
    for entry in gender_file:
        cols = entry.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    diag = open(diag_outfile, 'w', 4096)
    header_lines = []
    header_done = False
    need_csv_header = True

    for line in vcf_in:
        if not header_done:
            if line[0] == "#":
                header_lines.append(line)
                continue
            # First record reached: emit the header with the new INFO tag.
            header_done = True
            vcf.add_header(header_lines)
            vcf.add_info('SIL_GT_AVG', '1', 'Float',
                         'Average silhouette of genotype clusters')
            vcf_out.write(vcf.get_header() + '\n')

        var = Variant(line.rstrip().split('\t'), vcf)
        silhouettes = get_silhouette(load_df(var, sex))

        # The average is read from the first row of the 'sil_gt_avg' column.
        avg_col = silhouettes.columns.get_loc('sil_gt_avg')
        var.info['SIL_GT_AVG'] = '%0.2f' % silhouettes.iloc[0, avg_col]
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

        # Only the first frame carries the CSV column header.
        silhouettes.to_csv(diag, header=need_csv_header)
        need_csv_header = False

    for handle in (vcf_out, vcf_in, diag, gender_file):
        handle.close()

    return
コード例 #30
0
ファイル: gt_silhouette.py プロジェクト: abelhj/svtools
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    """Annotate every VCF record with its average genotype-cluster silhouette.

    For each record the data frame built by ``load_df`` is passed through
    ``get_silhouette``; the first-row value of its ``sil_gt_avg`` column is
    stored (two decimals) in the ``SIL_GT_AVG`` INFO field, and the frame is
    appended to the diagnostic CSV.

    Args:
        vcf_in: iterable of VCF lines; closed on successful return.
        vcf_out: writable handle for the annotated VCF; closed on successful
            return.
        diag_outfile: path of the diagnostic CSV file.
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines;
            closed on successful return.
    """
    vcf = Vcf()
    header = []
    in_header = True
    sex = {}

    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    def write_header():
        # Emit the collected header plus the INFO tag this tool adds.
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    first_record = True
    # The context manager closes the diagnostic file even when a record
    # fails to parse or score.
    with open(diag_outfile, 'w', 4096) as outf:
        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                write_header()

            var = Variant(line.rstrip().split('\t'), vcf)
            df = load_df(var, sex)
            df1 = get_silhouette(df)

            sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
            var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

            # The CSV column header is written with the first frame only.
            df1.to_csv(outf, header=first_record)
            first_record = False

    # A VCF with header lines but no records would otherwise yield an empty
    # (invalid) output file, because the header is normally emitted when the
    # first record is reached.
    if in_header and header:
        write_header()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()

    return
コード例 #31
0
ファイル: copynumber.py プロジェクト: abelhj/svtools
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample copy of a VCF with copy-number values filled in.

    The header is re-emitted with a ``CN`` FORMAT definition and with the
    column line trimmed to the nine fixed columns plus ``sample``.  Each
    record is reduced to those ten columns.  For every record that does not
    contain ``SVTYPE=BND`` the next value of ``cn_list`` is stored in the
    sample's ``CN`` field (replacing an existing value, or appending a new
    ``CN`` key to FORMAT).

    Args:
        vcf_file: iterable of VCF lines (header first).
        sample: sample name that must appear in the ``#CHROM`` column line.
        vcf_out: writable handle; it is closed before returning.
        cn_list: copy-number values, consumed in order, one per non-BND
            record.

    Exits with status 1 when ``sample`` is not in the column line, or when a
    record is reached before any column line has been seen.
    """
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1
    for line in vcf_file:
        if in_header:
            if line[0] == '#' and line[1] == '#':
                header.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                try:
                    s_index = line.rstrip().split('\t').index(sample)
                except ValueError:
                    sys.stderr.write("Please input valid VCF, format field for " + sample + " not found in VCF")
                    sys.exit(1)
                # Keep only the fixed columns plus the requested sample.
                line = '\t'.join(map(str, line.rstrip().split('\t')[:9] + [sample]))
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_format('CN', 1, 'Float', 'Copy number of structural variant segment.')
                vcf_out.write(vcf.get_header() + '\n')
        v = line.rstrip().split('\t')
        # Reached when a record shows up without a preceding column line.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample + " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            fmt_keys = v[8].rstrip().split(":")
            gts = v[9].rstrip().split(":")
            # VCF allows trailing sample values to be dropped; pad with the
            # missing marker so values stay aligned with their FORMAT keys.
            gts.extend(['.'] * (len(fmt_keys) - len(gts)))
            # Match the key exactly: a substring test would mistake keys
            # such as 'CNQ' for 'CN' and then fail in index().
            if "CN" in fmt_keys:
                gts[fmt_keys.index("CN")] = str(cn_list[i])
            else:
                fmt_keys.append("CN")
                gts.append(str(cn_list[i]))
            v[8] = ":".join(fmt_keys)
            v[9] = ":".join(gts)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')
    vcf_out.close()
    return
コード例 #32
0
ファイル: sv_classifier.py プロジェクト: jeldred/svtools
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
    """Reclassify DEL/DUP records of a VCF and write the result to stdout.

    Records whose SVTYPE is not DEL or DUP are copied through unchanged.
    A DEL that ``annotation_intersect`` matches against ``ae_dict`` is
    rewritten as an MEI.  Every other DEL/DUP is kept verbatim when the
    depth-support check passes and is otherwise replaced by the strings
    produced by ``to_bnd_strings``.

    Args:
        vcf_in: iterable of VCF lines.
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines.
        exclude_file: iterable of sample names to ignore, or None.
        ae_dict: annotation lookup handed to ``annotation_intersect``, or
            None to skip that step.
        f_overlap: overlap argument forwarded to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: forwarded to
            ``has_high_freq_depth_support``.

    Note that ``sys.stdout`` is closed before returning.
    """
    vcf_out = sys.stdout
    vcf = Vcf()
    header_lines = []
    reading_header = True
    # below this many positively genotyped samples the low-frequency test is used
    min_pos_samps_for_regression = 10

    # sample name -> integer code taken from the second column
    gender = {}
    for row in gender_file:
        cols = row.rstrip().split('\t')
        gender[cols[0]] = int(cols[1])

    exclude = [] if exclude_file is None else [row.rstrip() for row in exclude_file]

    for line in vcf_in:
        if reading_header:
            if line[0] == '#':
                header_lines.append(line)
                continue
            reading_header = False
            vcf.add_header(header_lines)
            vcf_out.write(vcf.get_header() + '\n')

        # Cheap SVTYPE lookup on the raw INFO column, done before the full
        # record parse.
        fields = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1] for item in fields[7].split(';')
             if item.startswith('SVTYPE=')),
            None)

        if svtype not in ('DEL', 'DUP'):
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf, True)

        # Deletions overlapping an annotated element become MEI records.
        if ae_dict is not None and var.info['SVTYPE'] == 'DEL':
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith(('SINE', 'LINE')) or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        if var.info['SVTYPE'] not in ('DEL', 'DUP'):
            continue

        # Count non-excluded samples with a called, non-reference genotype.
        num_pos_samps = sum(
            1 for s in var.sample_list
            if s not in exclude
            and var.genotype(s).get_format('GT') not in ["./.", "0/0"])

        if num_pos_samps < min_pos_samps_for_regression:
            supported = has_low_freq_depth_support(var, gender, exclude)
        else:
            supported = has_high_freq_depth_support(
                var, gender, exclude, slope_threshold, rsquared_threshold)

        if supported:
            # depth supports the call: pass the original line through
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var):
                vcf_out.write(m_var + '\n')

    vcf_out.close()
    return
コード例 #33
0
ファイル: file_tests.py プロジェクト: hall-lab/svtools
 def test_parse_meta(self):
     # parse_meta on a ##FILTER line should yield its ID and Description
     # entries, with the quoted description (commas and '>' included) intact.
     meta_line = '##FILTER=<ID=MSQ_20,Description="Variant without read-depth support with MSQ > 20">'
     parsed = Vcf().parse_meta(meta_line)
     self.assertEqual(
         parsed,
         ['ID=MSQ_20', 'Description="Variant without read-depth support with MSQ > 20"'])
コード例 #34
0
ファイル: reclass_combined.py プロジェクト: mkiwala/svtools
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
    """Reclassify DEL/DUP records of a VCF and write the result to stdout.

    Records whose SVTYPE is not DEL or DUP, and records on chromosomes X/Y,
    are copied through unchanged.  A DEL that ``annotation_intersect``
    matches against ``ae_dict`` is rewritten as an MEI.  Remaining records
    with no positively genotyped sample are copied through; the others are
    kept verbatim when the depth-support check passes and are otherwise
    replaced by the strings produced by ``to_bnd_strings``.

    Args:
        vcf_in: iterable of VCF lines.
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines.
        exclude_file: iterable of sample names to ignore, or None.
        ae_dict: annotation lookup handed to ``annotation_intersect``, or
            None to skip that step.
        f_overlap: overlap argument forwarded to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: forwarded to
            ``has_high_freq_depth_support``.
        het_del_fit, hom_del_fit, params: forwarded to
            ``has_rd_support_by_nb``.
        diag_outfile: path of a tab-separated diagnostics file, or None.
            One row (id, svtype, svlen, positive-sample count and the three
            support flags) is written per record that reaches the
            depth-support stage.

    Note that ``sys.stdout`` is closed before returning.
    """
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    # below this many positively genotyped samples the low-frequency test is used
    min_pos_samps_for_regression = 10

    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')

        # split variant line, quick pre-check of the SVTYPE on the raw INFO column
        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        # parse the VCF line
        var = Variant(v, vcf, True)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # for now, don't worry about sex chromosomes
        if (var.chrom == 'X' or var.chrom == 'Y'):
            vcf_out.write(line)
            continue

        # count positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if s in exclude:
                continue
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        high_freq_support = False
        low_freq_support = False
        nb_support = False
        # Reset per record so the diagnostics below can never pick up the
        # data frame of a previous variant.
        df = None

        if num_pos_samps == 0:
            vcf_out.write(line)
        else:
            df = load_df(var, exclude, sex)

            if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
                nb_support = True

            if num_pos_samps < min_pos_samps_for_regression:
                if has_low_freq_depth_support(df):
                    low_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')
            else:
                if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
                    high_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')

        if diag_outfile is not None:
            # No data frame is loaded when no sample is positively genotyped;
            # report the length as missing instead of failing with a
            # NameError or reusing the previous record's value.
            svlen = df['svlen'][0] if df is not None else 'NA'
            outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(high_freq_support)+"\t"+str(low_freq_support)+"\n")

    vcf_out.close()
    if diag_outfile is not None:
        outf.close()
    return
コード例 #35
0
ファイル: reclass_combined.py プロジェクト: mkiwala/svtools
def calc_params(vcf_file):
    """Estimate per-sample copy-number parameters from a training VCF.

    Every autosomal DEL/DUP genotype that is not ``./.`` contributes one
    ``CN_rec1`` row holding, among other fields, the log2 ratio of its CN
    value.  Rows outside the per (sample, svtype, GT) quantile band are
    dropped, the log2 ratio of deletions between 100 bp and 1 kb is
    corrected with a linear fit on log length, and mean / variance / count
    of the adjusted ratio are tabulated per sample, svtype and genotype.

    Side effects: writes ``./train.csv`` and ``./params.csv`` and prints both
    regression summaries.

    Args:
        vcf_file: iterable of VCF lines (header first).

    Returns:
        Tuple ``(params, het_del_fit, hom_del_fit)``: the parameter table and
        the two fitted OLS models for small heterozygous / homozygous
        deletions.
    """
    tSet = list()
    epsilon = 0.1  # added to CN before taking the log, to avoid log(0)
    header = []

    in_header = True
    vcf = Vcf()
    for line in vcf_file:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    # column line: remember the sample names, header is done
                    vcf_samples = line.rstrip().split('\t')[9:]
                    in_header = False
                    vcf.add_header(header)
                continue
        else:
            # split variant line, quick pre-check of the SVTYPE on the raw INFO column
            v = line.rstrip().split('\t')
            info = v[7].split(';')
            svtype = None
            for x in info:
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # only autosomal DEL/DUP records are used for training
            if svtype not in ['DEL', 'DUP'] or v[0] == "X" or v[0] == "Y":
                continue

            var = Variant(v, vcf)

            for sample in vcf_samples:
                if var.gts[sample].get_format('GT') != './.':
                    log2r = math.log((float(var.gts[sample].get_format('CN')) + epsilon)/2, 2)  # to avoid log(0)
                    tSet.append(CN_rec1(var.var_id, sample, var.info['SVTYPE'], abs(float(var.info['SVLEN'])), var.info['AF'],
                        var.gts[sample].get_format('GT'), var.gts[sample].get_format('CN'), var.gts[sample].get_format('AB'), math.log(abs(float(var.info['SVLEN']))), log2r))

    df = pd.DataFrame(tSet, columns=CN_rec1._fields)
    # keep only rows inside the quantile band of their (sample, svtype, GT) group
    df['q_low'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df['q_high'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df = df[(df.log2r >= df.q_low) & (df.log2r <= df.q_high)]
    df.to_csv('./train.csv')

    # adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far

    # Explicit copies: new columns are added to these subsets below, and
    # assigning onto a filtered slice is ambiguous in pandas.
    small_het_dels = df[(df.svtype == "DEL") & (df.GT == "0/1") & (df.svlen < 1000) & (df.svlen >= 100)].copy()
    small_hom_dels = df[(df.svtype == "DEL") & (df.GT == "1/1") & (df.svlen < 1000) & (df.svlen >= 100)].copy()
    het_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "0/1") & (df.svtype == "DEL")]['log2r'])
    hom_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "1/1") & (df.svtype == "DEL")]['log2r'])
    small_het_dels['offset'] = small_het_dels['log2r'] - het_del_mean
    small_hom_dels['offset'] = small_hom_dels['log2r'] - hom_del_mean

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit = smf.ols('offset~log_len', small_hom_dels).fit()
        het_del_fit = smf.ols('offset~log_len', small_het_dels).fit()
        # Call form prints the same text under Python 2 and also parses
        # under Python 3.
        print(hom_del_fit.summary())
        print(het_del_fit.summary())
        small_hom_dels['log2r_adj'] = small_hom_dels['log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels['log2r_adj'] = small_het_dels['log2r'] - het_del_fit.predict(small_het_dels)

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides
    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels = small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]

    # dels of length<100 bp are excluded here
    df1 = df[(df.svtype != "DEL") | (df.GT == "0/0") | (df.svlen >= 1000)].copy()
    df1['log2r_adj'] = df1['log2r']
    df1 = pd.concat([df1, small_dels])

    params = df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean, np.var, len]).reset_index()
    params = pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()

    # NOTE(review): this positional rename assumes the pivot yields exactly
    # three genotype columns per statistic, in this order -- confirm.
    params.columns = ['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
    params['std_pooled'] = np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
    params.to_csv('./params.csv')
    return (params, het_del_fit, hom_del_fit)
コード例 #36
0
ファイル: varlookup.py プロジェクト: jeldred/svtools
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    """Annotate the BEDPE records of ``aFile`` with matches from ``bFile``.

    All records of ``bFile`` are loaded first.  Each record of ``aFile`` is
    then combined with every B record through ``add(a, b, max_distance)`` and
    written with ``get_var_string(a, cohort_name)``.  The header of
    ``aFile`` is re-emitted with two extra INFO definitions,
    ``<cohort_name>_AF`` and ``<cohort_name>_VarID``.

    Args:
        aFile, bFile: path, path ending in ``.gz``, or the string ``"stdin"``.
        bedpe_out: writable handle for the annotated BEDPE.
        max_distance: forwarded to ``add``.
        pass_prefix: lines starting with this prefix are treated as header
            lines; None disables the check.
        cohort_name: label used in the new INFO keys; defaults to the base
            name of ``bFile``.

    Exits with status 1 if a record in either file has no allele frequency.
    """
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    bList = list()
    headerObj = Vcf()  # co-opt the VCF header object
    b_name = str(str(bFile).split('/')[-1])
    if cohort_name is None:
        cohort_name = b_name

    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')
    for bLine in bData:
        # Same None guard as the A-file loop below; startswith(None) raises.
        if pass_prefix is not None and bLine.startswith(pass_prefix):
            continue
        bentry = Bedpe(bLine.rstrip().split('\t'))
        if bentry.af is None:
            sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
            sys.exit(1)
        bList.append(bentry)
    if bData is not sys.stdin:
        bData.close()

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')

    # the 14 fixed BEDPE columns written in the output column line
    column_names = ['#CHROM_A',
                    'START_A',
                    'END_A',
                    'CHROM_B',
                    'START_B',
                    'END_B',
                    'ID',
                    'QUAL',
                    'STRAND_A',
                    'STRAND_B',
                    'TYPE',
                    'FILTER',
                    'INFO_A', 'INFO_B']
    in_header = True
    header_lines = []
    sample_list = None
    for aLine in aData:
        if pass_prefix is not None and aLine.startswith(pass_prefix):
            if aLine[0] == '#' and aLine[1] != '#':
                # Everything after the 14 fixed columns (FORMAT + samples).
                # A header with only the fixed columns has nothing there;
                # taking [-1] would wrongly pick up 'INFO_B'.
                fields = aLine.rstrip().split('\t', 14)
                sample_list = fields[14] if len(fields) > 14 else ''
            else:
                header_lines.append(aLine)
            continue
        else:
            if in_header == True:
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_name + ')')
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_name + ')')

                header = headerObj.get_header()
                # drop the last line of the generated header; the BEDPE
                # column line is written instead
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                columns = list(column_names)
                # sample_list stays None when no column line was seen and is
                # empty when the input has no sample columns.
                if sample_list:
                    columns.append(sample_list)
                bedpe_out.write('\t'.join(columns) + '\n')
                in_header = False
            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name))
    if aData is not sys.stdin:
        aData.close()
コード例 #37
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Re-genotype sufficiently common deletions and annotate the result.

    Records whose SVTYPE is not DEL, and DEL records with AF below 0.01, are
    copied through unchanged.  Every other DEL is passed through
    ``load_df`` and ``recluster``; the resulting frame is appended to the
    diagnostic CSV and supplies the ``MEDGQR`` / ``Q10GQR`` INFO values and
    the per-sample ``GTR`` / ``GQR`` FORMAT values.  The variant id and AF of
    each DEL record are echoed to stderr.

    Args:
        vcf_in: iterable of VCF lines; closed on return.
        vcf_out: writable handle for the annotated VCF; closed on return.
        diag_outfile: path of the diagnostic CSV file.
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines;
            closed on return.
        exclude_file: iterable of sample names to ignore, or None; closed on
            return when given.
    """
    vcf = Vcf()

    # sample name -> integer code taken from the second column
    sex = {}
    for row in gender_file:
        cols = row.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    exclude = [] if exclude_file is None else [row.rstrip() for row in exclude_file]

    diag = open(diag_outfile, 'w', 4096)
    header_lines = []
    reading_header = True
    need_csv_header = True

    for line in vcf_in:
        if reading_header:
            if line[0] == "#":
                header_lines.append(line)
                continue
            # First record reached: emit the header with the new fields.
            reading_header = False
            vcf.add_header(header_lines)
            vcf.add_info('MEDGQR', '1', 'Float',
                         'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float',
                         'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float',
                           'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1] for item in fields[7].split(';')
             if item.startswith('SVTYPE=')),
            None)
        # only deletions are refined; everything else passes through
        if svtype != 'DEL':
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        allele_freq = float(var.get_info('AF'))
        sys.stderr.write("%f\n" % allele_freq)
        if allele_freq < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))

        # Only the first frame carries the CSV column header.
        recdf.to_csv(diag, header=need_csv_header)
        need_csv_header = False

        # The summary qualities are read from the first row of the frame.
        first_row = recdf.iloc[0, :]
        var.set_info("MEDGQR", '{:.2f}'.format(first_row.loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(first_row.loc['q10_gq_re']))

        recdf.set_index('sample', inplace=True)
        for s in var.sample_list:
            genotype = var.genotype(s)
            if s in recdf.index:
                genotype.set_format("GTR", recdf.loc[s, 'GTR'])
                genotype.set_format(
                    "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
            else:
                # sample absent from the reclustered frame: mark as missing
                genotype.set_format("GTR", "./.")
                genotype.set_format("GQR", 0)
        vcf_out.write(
            var.get_var_string(use_cached_gt_string=False) + '\n')

    for handle in (vcf_out, vcf_in, gender_file, diag):
        handle.close()
    if exclude_file is not None:
        exclude_file.close()
    return
コード例 #38
0
class TestVariant(TestCase):
    """Unit tests for Variant, using one BND record with two samples."""

    def setUp(self):
        """Parse the fixture record against a small two-sample header."""
        meta_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        ]
        column_line = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA0001'

        self.vcf = Vcf()
        self.vcf.add_header(meta_lines + [column_line])

        record = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU\t0/0:9\t1/1:15'
        self.variant_line = record
        self.variant = Variant(record.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        # _parse_genotypes should map each sample name, in header order, to
        # a Genotype built from the matching field string.
        raw_fields = ['0/1:20', '0/0:15']
        actual = self.variant._parse_genotypes(raw_fields)

        expected = {
            'NA12878': Genotype(self.variant, raw_fields[0].split(':')),
            'NA0001': Genotype(self.variant, raw_fields[1].split(':')),
        }
        self.assertEqual(actual, expected)

    def test_set_info(self):
        variant = self.variant
        variant.set_info('SVTYPE', 'INV')
        self.assertEqual(variant.info['SVTYPE'], 'INV')
        variant.set_info('IMAFLAG', False)
        self.assertEqual(variant.info['IMAFLAG'], False)
        # 'SUPER' is not declared in the fixture header
        with self.assertRaises(SystemExit):
            variant.set_info('SUPER', True)

    def test_get_info(self):
        variant = self.variant
        self.assertEqual(variant.get_info('IMAFLAG'), True)
        self.assertEqual(variant.get_info('SVTYPE'), 'BND')
        # 'CALI' is not present on the record
        with self.assertRaises(KeyError):
            variant.get_info('CALI')

    def test_get_info_string(self):
        with_flag = 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG'
        without_flag = 'SVTYPE=BND;STRANDS=-+:9'
        self.assertEqual(self.variant.get_info_string(), with_flag)
        # a flag set to False must vanish from the INFO string
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), without_flag)

    def test_get_format_string(self):
        format_string = self.variant.get_format_string()
        self.assertEqual(format_string, 'GT:SU')

    def test_get_gt_string(self):
        gt_string = self.variant.get_gt_string()
        self.assertEqual(gt_string, '0/0:9\t1/1:15')

    def test_genotype(self):
        na12878 = self.variant.genotype('NA12878')
        self.assertEqual(na12878.get_gt_string(), '0/0:9')

    def test_genotypes(self):
        rendered = [gt.get_gt_string() for gt in self.variant.genotypes()]
        self.assertEqual(rendered, ['0/0:9', '1/1:15'])

    def test_var_string(self):
        original = self.variant_line
        self.assertEqual(self.variant.get_var_string(), original)
        # After editing a genotype, the cached string still matches the
        # original while a fresh rendering does not.
        self.variant.genotype('NA12878').set_format('GT', './.')
        cached = self.variant.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, original)
        self.assertNotEqual(self.variant.get_var_string(), original)
コード例 #39
0
ファイル: variant_tests.py プロジェクト: hall-lab/svtools
class TestVariant(TestCase):
    # Metadata and INFO declarations shared by every header assembled below.
    _COMMON_HEADER = (
        "##fileformat=VCFv4.2",
        "##fileDate=20151202",
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
    )
    # FORMAT declarations, looked up by ID when a header is assembled.
    _FORMAT_HEADER = {
        "GT": '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        "SU": '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        "AS": '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        "INACTIVE": '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
    }
    # Column header lines for the one- and two-sample fixtures.
    _ONE_SAMPLE_COLUMNS = "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878"
    _TWO_SAMPLE_COLUMNS = "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878	NA0001"
    # Single-sample record whose FORMAT column lists AS before SU, while the
    # headers used with it declare SU before AS.
    _REORDERED_LINE = "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:AS:SU	0/0:1:9"

    @classmethod
    def _make_variant(cls, format_ids, column_line, variant_line):
        # Assemble a header (common lines, the requested FORMAT lines in the
        # given order, then the column line), load it into a fresh Vcf and
        # parse variant_line against it. Returns (vcf, variant).
        header_lines = list(cls._COMMON_HEADER)
        header_lines.extend(cls._FORMAT_HEADER[fmt] for fmt in format_ids)
        header_lines.append(column_line)
        vcf = Vcf()
        vcf.add_header(header_lines)
        return vcf, Variant(variant_line.split("\t"), vcf)

    def setUp(self):
        # Two-sample BND record used by most tests in this class.
        self.variant_line = (
            "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU	0/0:9	1/1:15"
        )
        self.vcf, self.variant = self._make_variant(
            ("GT", "SU", "INACTIVE"), self._TWO_SAMPLE_COLUMNS, self.variant_line
        )

    def test_parse_genotypes(self):
        raw_fields = ["0/1:20", "0/0:15"]
        parsed = self.variant._parse_genotypes(raw_fields)

        # Expected: one Genotype per sample, built from the split field.
        expected = {
            "NA12878": Genotype(self.variant, raw_fields[0].split(":")),
            "NA0001": Genotype(self.variant, raw_fields[1].split(":")),
        }
        self.assertEqual(parsed, expected)

    def test_set_info(self):
        variant = self.variant
        for key, value in (("SVTYPE", "INV"), ("IMAFLAG", False)):
            variant.set_info(key, value)
            self.assertEqual(variant.info[key], value)
        # SUPER is not declared in the header built by setUp.
        with self.assertRaises(SystemExit):
            variant.set_info("SUPER", True)

    def test_get_info(self):
        lookup = self.variant.get_info
        self.assertEqual(lookup("IMAFLAG"), True)
        self.assertEqual(lookup("SVTYPE"), "BND")
        with self.assertRaises(KeyError):
            lookup("CALI")

    def test_get_info_string(self):
        variant = self.variant
        self.assertEqual(variant.get_info_string(), "SVTYPE=BND;STRANDS=-+:9;IMAFLAG")
        # Clearing the flag must remove it from the rendered INFO string.
        variant.set_info("IMAFLAG", False)
        self.assertEqual(variant.get_info_string(), "SVTYPE=BND;STRANDS=-+:9")

    def test_get_format_string(self):
        format_string = self.variant.get_format_string()
        self.assertEqual(format_string, "GT:SU")

    def test_get_format_string_caching(self):
        _, variant = self._make_variant(
            ("GT", "SU", "AS", "INACTIVE"), self._ONE_SAMPLE_COLUMNS, self._REORDERED_LINE
        )
        # Before genotypes are parsed the FORMAT string matches the input.
        self.assertEqual(variant.get_format_string(), "GT:AS:SU")

        # Parsing genotypes switches the default rendering to GT:SU:AS ...
        variant.genotypes()
        self.assertEqual(variant.get_format_string(), "GT:SU:AS")

        # ... while passing True still yields the original text.
        self.assertEqual(variant.get_format_string(True), "GT:AS:SU")

    def test_get_gt_string(self):
        gt_string = self.variant.get_gt_string()
        self.assertEqual(gt_string, "0/0:9	1/1:15")

    def test_genotype(self):
        na12878 = self.variant.genotype("NA12878")
        self.assertEqual(na12878.get_gt_string(), "0/0:9")

    def test_set_genotype(self):
        replacement = Genotype(self.variant, ["0/1", "9"])
        self.variant.set_genotype("NA12878", replacement)
        stored = self.variant.genotype("NA12878")
        self.assertEqual(stored.get_gt_string(), "0/1:9")

    def test_genotypes(self):
        rendered = []
        for genotype in self.variant.genotypes():
            rendered.append(genotype.get_gt_string())
        self.assertEqual(rendered, ["0/0:9", "1/1:15"])

    def test_var_string(self):
        variant = self.variant
        original_line = self.variant_line
        self.assertEqual(variant.get_var_string(), original_line)
        # Edit a genotype: the cached rendering is unchanged, the fresh one differs.
        variant.genotype("NA12878").set_format("GT", "./.")
        cached = variant.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, original_line)
        self.assertNotEqual(variant.get_var_string(), original_line)

    def test_var_string_format_caching(self):
        # Same record as _REORDERED_LINE with FORMAT and sample fields as GT:SU:AS.
        uncached_line = "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU:AS	0/0:9:1"
        _, variant = self._make_variant(
            ("GT", "SU", "AS", "INACTIVE"), self._ONE_SAMPLE_COLUMNS, self._REORDERED_LINE
        )
        variant.genotypes()  # force parsing
        self.assertEqual(variant.get_var_string(), uncached_line)
        cached = variant.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, self._REORDERED_LINE)

    def test_add_genotype(self):
        # Neither the header nor the record carries GT; the rendered sample
        # field is expected to start with './.'.
        variant_line = "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	SU	9"
        _, variant = self._make_variant(
            ("SU", "INACTIVE"), self._ONE_SAMPLE_COLUMNS, variant_line
        )
        self.assertEqual(variant.get_gt_string(), "./.:9")
コード例 #40
0
ファイル: lmerge.py プロジェクト: thone123/svtools
def l_cluster_by_line(file_name,
                      tempdir,
                      percent_slop=0,
                      fixed_slop=0,
                      use_product=False,
                      include_genotypes=False,
                      weighting_scheme='unweighted'):
    """Stream a VCF and hand runs of overlapping breakpoints to r_cluster.

    The header is rewritten (an ALG INFO field is added, and the column line
    is either extended with the sample IDs found in ``##SAMPLE`` lines or
    cut to its first eight columns) and written to stdout. Each record is
    then turned into a Breakpoint; consecutive breakpoints stay in the same
    run while their left start does not exceed the run's largest left end
    and their chromosome and SV type match the run's.
    """
    next_id = 0
    header_done = False
    header_lines = []
    vcf = Vcf()
    out = sys.stdout

    with InputStream(file_name, tempdir) as records:
        run = []
        run_sv_type = ''
        run_max_end = -1
        run_chrom = ''
        sample_order = []

        for line in records:
            if not header_done:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    # The sample ID is the text between the 13-character
                    # prefix and the final character of each ##SAMPLE line.
                    sample_order.extend(
                        meta.rstrip()[13:-1]
                        for meta in header_lines
                        if meta[:8] == '##SAMPLE')
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    header_lines.append('\t'.join(columns))
                    header_done = True
                    vcf.add_header(header_lines)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')
                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(False)
                    out.write(header_text + '\n')
                # Every line seen while still in the header is consumed here.
                continue

            bp = Breakpoint(l_bp.parse_vcf_record(line),
                            percent_slop=percent_slop,
                            fixed_slop=fixed_slop)
            extends_run = (not run) or (bp.left.start <= run_max_end and
                                        bp.left.chrom == run_chrom and
                                        bp.sv_type == run_sv_type)
            if extends_run:
                run.append(bp)
                run_max_end = max(run_max_end, bp.left.end)
            else:
                # Flush the finished run and start a new one with bp.
                next_id = r_cluster(run, sample_order, next_id, use_product,
                                    vcf, out, include_genotypes,
                                    weighting_scheme)
                run = [bp]
                run_max_end = bp.left.end
            run_chrom = bp.left.chrom
            run_sv_type = bp.sv_type

        if run:
            next_id = r_cluster(run, sample_order, next_id, use_product, vcf,
                                out, include_genotypes, weighting_scheme)
コード例 #41
0
ファイル: lmerge.py プロジェクト: hall-lab/svtools
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    """Stream a VCF and pass runs of overlapping breakpoints to r_cluster.

    The header gains an ALG INFO field; its column line is extended with
    the sample IDs taken from ``##SAMPLE`` lines when include_genotypes is
    set, otherwise it is cut to eight columns. The header goes to stdout.
    A record joins the current run when its left start is not beyond the
    run's largest left end and its chromosome and SV type match the run.
    """
    next_id = 0
    header_done = False
    header_lines = []
    vcf = Vcf()
    out = sys.stdout

    with InputStream(file_name) as records:
        run = []
        run_sv_type = ''
        run_max_end = -1
        run_chrom = ''
        sample_order = []

        for line in records:
            if not header_done:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    for meta in header_lines:
                        if meta[:8] == '##SAMPLE':
                            # Sample ID: text after the 13-character prefix,
                            # without the line's final character.
                            sample_order.append(meta.rstrip()[13:-1])
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    header_lines.append('\t'.join(columns))
                    header_done = True
                    vcf.add_header(header_lines)
                    vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(False)
                    out.write(header_text + '\n')
                # Nothing read during the header phase is a variant record.
                continue

            bp = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)
            extends_run = (not run) or (
                bp.left.start <= run_max_end
                and bp.left.chrom == run_chrom
                and bp.sv_type == run_sv_type
            )
            if extends_run:
                run.append(bp)
                run_max_end = max(run_max_end, bp.left.end)
            else:
                # The run is complete: cluster it, then restart from bp.
                next_id = r_cluster(run, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
                run = [bp]
                run_max_end = bp.left.end
            run_chrom = bp.left.chrom
            run_sv_type = bp.sv_type

        if run:
            next_id = r_cluster(run, sample_order, next_id, use_product, vcf, out, include_genotypes, weighting_scheme)
コード例 #42
0
ファイル: sv_classifier.py プロジェクト: MMesbahU/svtools
def sv_classify(vcf_in, vcf_out, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP calls in a VCF stream and write the result.

    Records whose SVTYPE is not DEL or DUP are copied through unchanged.
    A DEL that intersects an annotation in ``ae_dict`` is rewritten as an
    MEI. Remaining records with at least one positively genotyped sample are
    checked for read-depth support using ``method`` ('large_sample',
    'naive_bayes' or 'hybrid'); supported records are copied through and
    unsupported ones are replaced by the output of ``to_bnd_strings``. Any
    other ``method`` value leaves the support flags False.

    Args:
        vcf_in: iterable of VCF lines; closed on successful completion.
        vcf_out: writable stream for the output VCF; closed on completion.
        gender_file: iterable of tab-separated ``sample<TAB>int`` lines.
        exclude_file: optional iterable of lines passed on to ``load_df``.
        ae_dict, f_overlap: passed to ``annotation_intersect``.
        slope_threshold, rsquared_threshold, p_cnv, het_del_fit,
        hom_del_fit, params: passed to the read-depth support helpers.
        diag_outfile: optional path of a tab-separated diagnostics file.
        method: name of the read-depth support method.
    """
    vcf = Vcf()
    header = []
    in_header = True
    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    try:
        if outf is not None:
            outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                # First non-header line: emit the header, then fall through
                # so this line is processed as a record.
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            # bail if not DEL or DUP prior to reclassification
            if svtype not in ['DEL', 'DUP']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)

            # check intersection with mobile elements
            if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
                ae = annotation_intersect(var, ae_dict, f_overlap)
                if ae is not None:
                    # NOTE(review): the third '|'-separated field is read
                    # whenever ae is neither SINE nor LINE; this assumes the
                    # annotation always has at least three such fields.
                    if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                        ae = 'ME:' + ae
                    var.alt = '<DEL:%s>' % ae
                    var.info['SVTYPE'] = 'MEI'
                    vcf_out.write(var.get_var_string(True) + '\n')
                    continue

            # count positively genotyped samples
            num_pos_samps = 0
            for s in var.sample_list:
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            # Nothing to test when no sample carries the variant.
            if num_pos_samps == 0:
                vcf_out.write(line)
                continue

            nb_support = False
            ls_support = False
            hybrid_support = False
            has_rd_support = False

            df = load_df(var, exclude, sex)
            if method == 'large_sample':
                ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
                has_rd_support = ls_support
            elif method == 'naive_bayes':
                nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
                has_rd_support = nb_support
            elif method == 'hybrid':
                ls_support, nb_support, hybrid_support = has_rd_support_hybrid(df, het_del_fit, hom_del_fit, params, p_cnv, slope_threshold, rsquared_threshold, num_pos_samps)
                has_rd_support = hybrid_support

            if has_rd_support:
                vcf_out.write(line)
            else:
                for m_var in to_bnd_strings(var, True):
                    vcf_out.write(m_var + '\n')

            if outf is not None:
                svlen = df['svlen'][0]
                outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(ls_support)+"\t"+str(hybrid_support)+"\t"+str(has_rd_support)+"\n")

        # A header-only input never reaches the in-loop header write, so
        # emit the header here instead of producing empty output.
        if in_header and header:
            in_header = False
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')
    finally:
        # The diagnostics file is opened here, so release it on failure too.
        if outf is not None:
            outf.close()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()

    return
コード例 #43
0
ファイル: file_tests.py プロジェクト: hall-lab/svtools
 def test_add_info_after(self):
     # add_info_after(anchor, ...) must place the new INFO line directly
     # after the anchor's INFO line; the fileDate line is expected to carry
     # today's date in the regenerated header.
     header_lines = [
             '##fileformat=VCFv4.2',
             '##fileDate=20090805',
             '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
             '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
             '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
             '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
             '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003']
     extra_line = '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">'
     # (anchor INFO id, index in header_lines where the new line must land)
     for anchor, insert_at in (('DP', 4), ('AF', 5)):
         vcf = Vcf()
         vcf.add_header(header_lines)
         vcf.add_info_after(anchor, 'DB', 0, 'Flag', 'dbSNP membership, build 129')
         expected = header_lines[:insert_at] + [extra_line] + header_lines[insert_at:]
         expected[1] = '##fileDate=' + time.strftime('%Y%m%d')
         self.assertEqual(vcf.get_header(), '\n'.join(expected))
コード例 #44
0
    def execute(self, output_handle=sys.stdout):
        """Annotate every variant of self.vcf_stream with AF, NSAMP and MSQ.

        The header (with the three INFO declarations added) and each
        annotated record are written to output_handle, which is closed at
        the end.
        """
        header_pending = True
        header_lines = []
        vcf = Vcf()
        out = output_handle

        for line in self.vcf_stream:
            if header_pending:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    header_lines.append('\t'.join(line.rstrip().split('\t')))
                    header_pending = False
                    vcf.add_header(header_lines)

                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    out.write(vcf.get_header() + '\n')
                # Every line read while the header is pending is consumed here.
                continue

            var = Variant(line.rstrip().split('\t'), vcf)

            # One counter per allele: index 0 is REF, then each ALT in order.
            allele_counts = [0] * (len(var.alt.split(',')) + 1)
            nonref_samples = 0
            sq_total = 0.0

            for genotype in var.genotypes():
                gt_string = genotype.get_format('GT')
                # Genotypes containing '.' are not counted.
                if '.' in gt_string:
                    continue
                indexes = self.numeric_alleles(gt_string)
                for allele_index in indexes:
                    allele_counts[allele_index] += 1

                # A sample is non-reference when any allele index is non-zero.
                if sum(indexes) > 0:
                    nonref_samples += 1
                    try:
                        sq_total += float(genotype.get_format('SQ'))
                    except KeyError:
                        # No SQ value for this sample: it adds nothing to the sum.
                        pass

            total_alleles = float(sum(allele_counts))
            if total_alleles > 0:
                alt_freqs = ['%.4g' % (count / total_alleles)
                             for count in allele_counts[1:]]
            else:
                # No called alleles at all: report '.' for every ALT.
                alt_freqs = ['.'] * (len(allele_counts) - 1)
            var.info['AF'] = ','.join(alt_freqs)

            var.info['NSAMP'] = nonref_samples
            if nonref_samples > 0:
                var.info['MSQ'] = '%0.2f' % (sq_total / nonref_samples)
            else:
                var.info['MSQ'] = '.'

            out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        out.close()
コード例 #45
0
ファイル: afreq.py プロジェクト: jeldred/svtools
    def execute(self, output_handle=sys.stdout):
        """Annotate every variant of self.vcf_stream with AF, NSAMP and MSQ.

        The header (with the three INFO declarations added) and each
        annotated record are written to output_handle, which is closed at
        the end. MSQ is delegated to self.calc_msq.
        """
        header_pending = True
        header_lines = []
        vcf = Vcf()
        out = output_handle

        for line in self.vcf_stream:
            if header_pending:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    header_lines.append('\t'.join(line.rstrip().split('\t')))
                    header_pending = False
                    vcf.add_header(header_lines)

                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    out.write(vcf.get_header() + '\n')
                # Every line read while the header is pending is consumed here.
                continue

            var = Variant(line.rstrip().split('\t'), vcf, fixed_genotypes=True)

            # One counter per allele: index 0 is REF, then each ALT in order.
            allele_counts = [0] * (len(var.alt.split(',')) + 1)
            nonref_samples = 0

            gt_strings = [var.genotype(sample).get_format('GT')
                          for sample in var.sample_list]
            for gt_string in gt_strings:
                # Genotypes containing '.' are not counted.
                if '.' in gt_string:
                    continue
                # Try '/' first; fall back to '|' when nothing was split.
                parts = gt_string.split('/')
                if len(parts) == 1:
                    parts = gt_string.split('|')
                indexes = [int(part) for part in parts]

                for allele_index in indexes:
                    allele_counts[allele_index] += 1

                # A sample is non-reference when any allele index is non-zero.
                if sum(indexes) > 0:
                    nonref_samples += 1

            total_alleles = float(sum(allele_counts))
            if total_alleles > 0:
                alt_freqs = ['%.4g' % (count / total_alleles)
                             for count in allele_counts[1:]]
            else:
                # No called alleles at all: report '.' for every ALT.
                alt_freqs = ['.'] * (len(allele_counts) - 1)
            var.info['AF'] = ','.join(alt_freqs)

            var.info['NSAMP'] = nonref_samples
            var.info['MSQ'] = self.calc_msq(var)

            out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        out.close()
コード例 #46
0
 def test_eq(self):
     # Two filters built from identical arguments must compare equal.
     filter_args = ('s50', '"Less than 50% of samples have data"')
     first = Vcf.Filter(*filter_args)
     second = Vcf.Filter(*filter_args)
     self.assertEqual(first, second)
コード例 #47
0
ファイル: afreq.py プロジェクト: MMesbahU/svtools
    def execute(self, output_handle=sys.stdout):
        """Add AF, NSAMP and MSQ to each variant read from self.vcf_stream.

        Writes the header (extended with the three INFO declarations) and
        every annotated record to output_handle, then closes it.
        """
        seen_column_line = False
        header_lines = []
        vcf = Vcf()
        sink = output_handle

        for line in self.vcf_stream:
            if not seen_column_line:
                if line.startswith('##'):
                    header_lines.append(line)
                elif line.startswith('#CHROM'):
                    column_fields = line.rstrip().split('\t')
                    header_lines.append('\t'.join(column_fields))
                    seen_column_line = True
                    vcf.add_header(header_lines)

                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    sink.write(vcf.get_header() + '\n')
                # Lines read before the column line is handled are never records.
                continue

            record = Variant(line.rstrip().split('\t'), vcf)

            # Slot 0 counts REF alleles; slots 1.. count each ALT in order.
            n_alt = len(record.alt.split(','))
            tallies = [0] * (n_alt + 1)
            positive_samples = 0
            quality_sum = 0.0

            for sample_gt in record.genotypes():
                call = sample_gt.get_format('GT')
                if '.' not in call:
                    indexes = self.numeric_alleles(call)
                    for allele_index in indexes:
                        tallies[allele_index] += 1

                    if sum(indexes) > 0:
                        positive_samples += 1
                        try:
                            quality_sum += float(sample_gt.get_format('SQ'))
                        except KeyError:
                            # Sample has no SQ value; leave the sum untouched.
                            pass

            called = float(sum(tallies))
            if called > 0:
                af_fields = ['%.4g' % (tally / called) for tally in tallies[1:]]
            else:
                # Nothing was called, so every ALT frequency is '.'.
                af_fields = ['.'] * n_alt
            record.info['AF'] = ','.join(af_fields)

            record.info['NSAMP'] = positive_samples
            if positive_samples > 0:
                mean_quality = '%0.2f' % (quality_sum / positive_samples)
            else:
                mean_quality = '.'
            record.info['MSQ'] = mean_quality

            sink.write(record.get_var_string(use_cached_gt_string=True) + '\n')
        sink.close()
コード例 #48
0
ファイル: geno_refine_12.py プロジェクト: cc2qe/svtools
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to the DEL records of a VCF stream.

    Records whose SVTYPE is not DEL, and DEL records whose AF is below
    0.01, are copied through unchanged. For the others the data frame from
    ``load_df`` is passed to ``recluster``; the result is appended to the
    CSV at ``diag_outfile`` and used to set the MEDGQR/Q10GQR INFO fields
    and the per-sample GTR/GQR formats. All four streams (and exclude_file
    when given) are closed before returning.
    """
    vcf = Vcf()
    header_lines = []
    header_pending = True

    # Map sample name -> integer code from the tab-separated gender file.
    sex = {}
    for row in gender_file:
        fields = row.rstrip().split('\t')
        sex[fields[0]] = int(fields[1])

    exclude = []
    if exclude_file is not None:
        exclude = [entry.rstrip() for entry in exclude_file]

    diag = open(diag_outfile, 'w', 4096)
    # Only the first data frame written carries the CSV column header.
    diag_header_written = False

    for line in vcf_in:
        if header_pending:
            if line[0] == "#":
                header_lines.append(line)
                continue
            # First record: emit the extended header, then process the line.
            header_pending = False
            vcf.add_header(header_lines)
            vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = None
        for entry in fields[7].split(';'):
            if entry.startswith('SVTYPE='):
                svtype = entry.split('=')[1]
                break
        # Only DEL records are refined; everything else passes through.
        if svtype != 'DEL':
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        allele_freq = float(var.get_info('AF'))
        sys.stderr.write("%f\n" % allele_freq)
        if allele_freq < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))
        recdf.to_csv(diag, header=not diag_header_written)
        diag_header_written = True

        first_row = recdf.iloc[0, :]
        var.set_info("MEDGQR", '{:.2f}'.format(first_row.loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(first_row.loc['q10_gq_re']))
        recdf.set_index('sample', inplace=True)
        for sample in var.sample_list:
            genotype = var.genotype(sample)
            if sample in recdf.index:
                genotype.set_format("GTR", recdf.loc[sample, 'GTR'])
                genotype.set_format("GQR", '{:.2f}'.format(recdf.loc[sample, 'gq_re']))
            else:
                # Samples absent from the reclustered frame get a no-call.
                genotype.set_format("GTR", "./.")
                genotype.set_format("GQR", 0)
        vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    diag.close()
    if exclude_file is not None:
        exclude_file.close()
    return
コード例 #49
0
ファイル: varlookup.py プロジェクト: jasper1918/svtools
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    """Annotate the BEDPE records of aFile using the records of bFile.

    Every record of bFile is loaded first. Each record of aFile is then
    passed, together with every B record, to ``add`` and written to
    ``bedpe_out`` via ``get_var_string``. Before the first A record the
    header collected from aFile is written with two extra INFO fields,
    ``<cohort_name>_AF`` and ``<cohort_name>_VarID``.

    Args:
        aFile, bFile: path, path ending in '.gz', or the string "stdin".
        bedpe_out: writable stream for the annotated BEDPE.
        max_distance: passed through to ``add``.
        pass_prefix: lines starting with this prefix are treated as header
            lines; None disables header detection in both files.
        cohort_name: prefix of the added INFO fields; defaults to the base
            name of bFile when None.

    Exits with status 1 when a record in either file has no allele
    frequency.
    """
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    bList = list()
    headerObj = Vcf()  # co-opt the VCF header object
    b_name = str(str(bFile).split('/')[-1])
    if cohort_name is None:
        cohort_name = b_name

    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')
    try:
        for bLine in bData:
            # Same guard as the A-file loop: startswith(None) would raise.
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            bList.append(bentry)
    finally:
        # Only close handles opened here; leave stdin alone.
        if bData is not sys.stdin:
            bData.close()

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')
    in_header = True
    header_lines = []
    sample_list = None
    try:
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine[0] == '#' and aLine[1] != '#':
                    # Everything after the 14 fixed BEDPE columns is the
                    # sample part; with no extra columns there is none
                    # (taking [-1] would pick up the last fixed column).
                    column_fields = aLine.rstrip().split('\t', 14)
                    if len(column_fields) > 14:
                        sample_list = column_fields[14]
                    else:
                        sample_list = ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                headerObj.add_header(header_lines)
                headerObj.add_info(
                    cohort_name + '_AF', '.', 'Float',
                    'Allele frequency(ies) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' + b_name + ')')
                headerObj.add_info(
                    cohort_name + '_VarID', '.', 'Integer',
                    'List of Variant ID(s) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' + b_name + ')')

                header = headerObj.get_header()
                # Drop the last line of the VCF-style header; the BEDPE
                # column line below replaces it.
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                columns = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                    'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                    'FILTER', 'INFO_A', 'INFO_B'
                ]
                # sample_list stays None when no column line was seen.
                if sample_list:
                    columns.append(sample_list)
                bedpe_out.write('\t'.join(columns) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name) + '\n')
    finally:
        if aData is not sys.stdin:
            aData.close()
コード例 #50
0
 def test_eq(self):
     # Alts whose descriptions differ only by surrounding double quotes
     # must compare equal.
     quoted = Vcf.Alt('DEL:ME:ALU', '"Deletion of ALU element"')
     unquoted = Vcf.Alt('DEL:ME:ALU', 'Deletion of ALU element')
     self.assertEqual(quoted, unquoted)
コード例 #51
0
 def test_init(self):
     # A fresh Vcf must report VCFv4.2 and hold only the GT format.
     gt_format = Vcf.Format('GT', 1, 'String', 'Genotype')
     fresh = Vcf()
     self.assertEqual(fresh.file_format, 'VCFv4.2')
     self.assertEqual(fresh.format_list, [gt_format])
コード例 #52
0
ファイル: variant_tests.py プロジェクト: abelhj/svtools
class TestVariant(TestCase):
    """Unit tests for Variant, run against a two-sample BND record."""

    def setUp(self):
        """Parse one BND record against a header declaring samples NA12878 and NA0001."""
        column_header = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA12878', 'NA0001',
        ])
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            column_header,
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9', '1/1:15',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        """_parse_genotypes maps each sample name to a Genotype built from its field."""
        raw_fields = ['0/1:20', '0/0:15']
        parsed = self.variant._parse_genotypes(raw_fields)

        # Build the expected genotypes in header sample order.
        expected = {}
        for sample, raw in zip(['NA12878', 'NA0001'], raw_fields):
            expected[sample] = Genotype(self.variant, raw.split(':'))

        self.assertEqual(parsed, expected)

    def test_set_info(self):
        """set_info stores values for declared keys and exits on an undeclared key."""
        info_updates = [('SVTYPE', 'INV'), ('IMAFLAG', False)]
        for key, value in info_updates:
            self.variant.set_info(key, value)
            self.assertEqual(self.variant.info[key], value)
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for an absent key."""
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        """A flag set to False is omitted from the rendered INFO string."""
        with_flag = 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG'
        without_flag = 'SVTYPE=BND;STRANDS=-+:9'
        self.assertEqual(self.variant.get_info_string(), with_flag)
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), without_flag)

    def test_get_format_string(self):
        """Only the formats present in the record appear in the FORMAT string."""
        rendered = self.variant.get_format_string()
        self.assertEqual(rendered, 'GT:SU')

    def test_get_gt_string(self):
        """Genotype columns are tab-joined in sample order."""
        rendered = self.variant.get_gt_string()
        self.assertEqual(rendered, '0/0:9\t1/1:15')

    def test_genotype(self):
        """genotype() looks up a single sample by name."""
        na12878 = self.variant.genotype('NA12878')
        self.assertEqual(na12878.get_gt_string(), '0/0:9')

    def test_genotypes(self):
        """genotypes() yields one genotype per sample, in sample order."""
        rendered = []
        for gt in self.variant.genotypes():
            rendered.append(gt.get_gt_string())
        self.assertEqual(rendered, ['0/0:9', '1/1:15'])

    def test_var_string(self):
        """use_cached_gt_string=True keeps the original genotype text after an edit."""
        original = self.variant_line
        self.assertEqual(self.variant.get_var_string(), original)
        self.variant.genotype('NA12878').set_format('GT', './.')
        cached = self.variant.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, original)
        self.assertNotEqual(self.variant.get_var_string(), original)

    def test_add_genotype(self):
        """A record whose FORMAT has no GT is rendered with a './.' genotype."""
        column_header = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA12878',
        ])
        # This header deliberately declares no GT FORMAT line.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            column_header,
        ]
        vcf = Vcf()
        vcf.add_header(header_lines)
        variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'SU', '9',
        ])
        variant = Variant(variant_line.split('\t'), vcf)
        self.assertEqual(variant.get_gt_string(), './.:9')
コード例 #53
0
 def test_eq(self):
     """Two Info entries compare equal when they differ only in description quoting."""
     quoted = Vcf.Info('NS', 1, 'Integer', '"Number of Samples With Data"')
     unquoted = Vcf.Info('NS', 1, 'Integer', 'Number of Samples With Data')
     self.assertEqual(quoted, unquoted)
コード例 #54
0
class TestVcfToBedpeConverter(TestCase):
    """Unit tests for the coordinate and strand helpers of VcfToBedpeConverter."""

    def setUp(self):
        """Create the converter and a single-sample header used to parse test records."""
        self.converter = VcfToBedpeConverter()
        column_header = '\t'.join([
            '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
            'FORMAT', 'NA00001',
        ])
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20090805',
            '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            column_header,
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)

    def _variant(self, alt, info):
        """Parse a chr1:20000 record with the given ALT and INFO columns."""
        fields = ['1', '20000', '235', 'T', alt, '0.00', '.', info, 'GT', '0/0']
        return Variant(fields, self.vcf)

    def test_bnd_alt_string(self):
        """parse_bnd_alt_string returns (bracket, chrom, pos) and asserts on malformed ALTs."""
        well_formed = [
            ('A[1:6[', ('[', '1', 6)),
            ('A]1:6]', (']', '1', 6)),
            (']1:6]A', (']', '1', 6)),
        ]
        for alt, expected in well_formed:
            self.assertEqual(self.converter.parse_bnd_alt_string(alt), expected)
        for malformed in (']1:6[A', '1'):
            with self.assertRaises(AssertionError):
                self.converter.parse_bnd_alt_string(malformed)

    def test_bnd_breakpoints(self):
        """bnd_breakpoints derives both breakpoints and strands from a BND ALT."""
        forward = self._variant('A[1:6[', '.')
        self.assertEqual(
            self.converter.bnd_breakpoints(forward),
            ('1', 20000, 20000, '1', 5, 5, '+', '-'))
        reverse = self._variant(']1:6]N', '.')
        self.assertEqual(
            self.converter.bnd_breakpoints(reverse),
            ('1', 19999, 19999, '1', 6, 6, '-', '+'))

    def test_simple_breakpoints(self):
        """simple_breakpoints uses END and STRANDS and raises ValueError for the END-less '--' record."""
        default_strands = self._variant('<DEL>', 'END=20500')
        self.assertEqual(
            self.converter.simple_breakpoints(default_strands),
            ('1', 20000, 20000, '1', 20500, 20500, '+', '-'))
        explicit_strands = self._variant('<DEL>', 'END=20500;STRANDS=-+:2')
        self.assertEqual(
            self.converter.simple_breakpoints(explicit_strands),
            ('1', 20000, 20000, '1', 20500, 20500, '-', '+'))
        unsupported = self._variant('<DEL>', 'STRANDS=--:2')
        with self.assertRaises(ValueError):
            self.converter.simple_breakpoints(unsupported)

    def test_adjust_coordinate(self):
        """adjust_coordinate widens by the named interval, ignores an absent tag, rejects one value."""
        with_ciend = self._variant('<DEL>', 'CIEND=-50,50')
        self.assertEqual(
            self.converter.adjust_coordinate(with_ciend, 'CIEND', 500, 1000),
            (450, 1050))
        # CIPOS is not present on the record, so the coordinates come back unchanged.
        self.assertEqual(
            self.converter.adjust_coordinate(with_ciend, 'CIPOS', 500, 1000),
            (500, 1000))
        single_value = self._variant('<DEL>', 'CIEND=50')
        with self.assertRaises(ValueError):
            self.converter.adjust_coordinate(single_value, 'CIEND', 500, 1000)
コード例 #55
0
 def setUp(self):
     """Give each test a BedpeToVcfConverter backed by a default Vcf."""
     self.converter = BedpeToVcfConverter(Vcf())
コード例 #56
0
ファイル: bedpetovcf.py プロジェクト: mkiwala/svtools
# Breakend ALT templates keyed by (strand of this end, strand of the mate end).
_BND_ALT_TEMPLATES = {
    ('+', '-'): '%(ref)s[%(mate)s[',
    ('+', '+'): '%(ref)s]%(mate)s]',
    ('-', '+'): ']%(mate)s]%(ref)s',
    ('-', '-'): '[%(mate)s[%(ref)s',
}


def _bnd_alt(ref, own_strand, mate_strand, mate_chrom, mate_pos):
    """Return the breakend ALT string, or None for an unrecognised strand pair."""
    template = _BND_ALT_TEMPLATES.get((own_strand, mate_strand))
    if template is None:
        return None
    return template % {'ref': ref, 'mate': '%s:%s' % (mate_chrom, mate_pos)}


def _info_field(info, key):
    """Return the ';'-separated INFO field(s) containing ``key=``, or '' if none."""
    marker = key + '='
    return ''.join(field for field in info.split(';') if marker in field)


def _swap_info_values(info, source_info, key_a, key_b):
    """Exchange the values of ``key_a`` and ``key_b`` inside ``info``.

    The values are read from ``source_info`` (the unmodified INFO string).
    When either key is absent ``info`` is returned untouched; previously
    that case raised IndexError or corrupted the string.
    """
    field_a = _info_field(source_info, key_a)
    field_b = _info_field(source_info, key_b)
    if not field_a or not field_b:
        return info
    value_a = field_a.split('=')[1]
    value_b = field_b.split('=')[1]
    info = info.replace(field_a, key_a + '=' + value_b)
    return info.replace(field_b, key_b + '=' + value_a)


def _secondary_info(source_info, name):
    """Derive the INFO string of the secondary ('_2') breakend record.

    Relative to ``source_info`` the STRANDS orientation is reversed, the
    CIPOS/CIEND and CIPOS95/CIEND95 values are exchanged, MATEID is
    rewritten and a SECONDARY flag is added. A rewrite is skipped when its
    field is absent: ``str.replace`` with an empty search string would
    otherwise splice the replacement between every character.
    """
    info = source_info

    strands_field = _info_field(source_info, 'STRANDS')
    if strands_field:
        # NOTE(review): only the first orientation:count pair survives; text
        # after a second ':' in the field is dropped, exactly as before.
        parts = re.split('=|:', strands_field)
        reversed_field = parts[0] + '=' + parts[1][::-1] + ':' + parts[2]
        info = info.replace(strands_field, reversed_field)

    # The secondary record sees the confidence intervals from the other end.
    info = _swap_info_values(info, source_info, 'CIPOS', 'CIEND')
    info = _swap_info_values(info, source_info, 'CIPOS95', 'CIEND95')

    mate_field = _info_field(source_info, 'MATEID')
    if mate_field:
        # NOTE(review): this record's own ID is name + '_2', so MATEID ends up
        # naming the record itself rather than the '_1' mate - confirm intent.
        info = info.replace(mate_field, 'MATEID=' + name + '_2')

    event_field = _info_field(source_info, 'EVENT')
    if event_field:
        info = info.replace(event_field, event_field + ';SECONDARY;')
    elif info:
        # No EVENT field to anchor on: still mark the record as secondary.
        info = info + ';SECONDARY'
    else:
        info = 'SECONDARY'
    return info


def bedpeToVcf(bedpe_file, vcf_out):
    """Convert BEDPE lines from ``bedpe_file`` into VCF text on ``vcf_out``.

    Leading '##' lines are passed to the Vcf header unchanged and the single
    '#' column line is rewritten to VCF column names. Every following line
    is parsed as a Bedpe record: BND records become two breakend records
    (IDs suffixed '_1' and '_2', filtered by ``malformedFlag``), any other
    type becomes one record with a symbolic ALT.

    Args:
        bedpe_file: iterable of BEDPE text lines.
        vcf_out: writable text stream; it is closed before returning, also
            when conversion fails.

    Returns:
        None.
    """
    myvcf = Vcf()
    header = []
    in_header = True

    def write_header():
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    try:
        for line in bedpe_file:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                    continue
                if line.startswith('#'):
                    # Everything after the 14th BEDPE column is carried over.
                    sample_list_str = line.rstrip().split('\t', 14)[-1]
                    header.append('\t'.join([
                        '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL',
                        'FILTER', 'INFO', sample_list_str,
                    ]))
                    continue
                # First data line: the header is complete, emit it once.
                in_header = False
                write_header()

            bedpe = Bedpe(line.rstrip().split('\t'))
            symbolic_alt = '<' + str(bedpe.svtype) + '>'

            if bedpe.svtype != 'BND':
                # Simple event: one record with a symbolic ALT.
                bedpe_list = [
                    bedpe.c1, bedpe.b1 + 1, bedpe.name, 'N', symbolic_alt,
                    bedpe.score, bedpe.filter,
                ]
                bedpe_list.extend(bedpe.misc)
                var = Variant(bedpe_list, myvcf)
                vcf_out.write(var.get_var_string() + '\n')
                continue

            # Primary breakend, located at the first BEDPE position.
            bedpe1_list = [
                bedpe.c1, bedpe.b1 + 1, bedpe.name + '_1', 'N', symbolic_alt,
                bedpe.score, bedpe.filter,
            ]
            bedpe1_list.extend(bedpe.misc)
            var1 = Variant(bedpe1_list, myvcf)
            alt1 = _bnd_alt(var1.ref, bedpe.o1, bedpe.o2,
                            bedpe.c2, bedpe.b2 + 1)
            if alt1 is not None:
                var1.alt = alt1

            # Secondary breakend: same columns, with the INFO column adjusted.
            misc = copy.deepcopy(bedpe.misc)
            misc[0] = _secondary_info(bedpe.misc[0], bedpe.name)
            bedpe2_list = [
                bedpe.c2, bedpe.b2 + 1, bedpe.name + '_2', 'N', symbolic_alt,
                bedpe.score, bedpe.filter,
            ]
            bedpe2_list.extend(misc)
            var2 = Variant(bedpe2_list, myvcf)
            # For the second record the roles of the two strands are switched.
            alt2 = _bnd_alt(var2.ref, bedpe.o2, bedpe.o1,
                            bedpe.c1, bedpe.b1 + 1)
            if alt2 is not None:
                var2.alt = alt2

            # malformedFlag: 0 writes both records, 1 only the second,
            # 2 only the first; any other value writes nothing.
            if bedpe.malformedFlag == 0:
                vcf_out.write(var1.get_var_string() + '\n')
                vcf_out.write(var2.get_var_string() + '\n')
            elif bedpe.malformedFlag == 1:
                vcf_out.write(var2.get_var_string() + '\n')
            elif bedpe.malformedFlag == 2:
                vcf_out.write(var1.get_var_string() + '\n')

        if in_header:
            # Input had no data lines: still emit the VCF header.
            write_header()
    finally:
        # close the VCF output file
        vcf_out.close()

    return