Ejemplo n.º 1
0
class TestVariant(TestCase):
    """Exercise Variant INFO/FORMAT accessors on a single-sample BND record."""

    # Header used by every test: three INFO tags, three FORMAT tags (INACTIVE
    # is declared but does not appear in the record) and one sample column.
    HEADER_LINES = (
        '##fileformat=VCFv4.2',
        '##fileDate=20151202',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878',
    )

    # The tab-separated record parsed into ``self.variant``.
    VARIANT_LINE = '1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU	0/0:9'

    def setUp(self):
        """Build a fresh Vcf header and parse the fixture record before each test."""
        self.vcf = Vcf()
        # Hand over a new list each time so tests never share header state.
        self.vcf.add_header(list(self.HEADER_LINES))
        self.variant_line = self.VARIANT_LINE
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_info(self):
        """set_info stores values for declared tags; an undeclared tag must exit."""
        record = self.variant

        record.set_info('SVTYPE', 'INV')
        self.assertEqual(record.info['SVTYPE'], 'INV')

        record.set_info('IMAFLAG', False)
        self.assertEqual(record.info['IMAFLAG'], False)

        # 'SUPER' is not declared anywhere in the header above.
        with self.assertRaises(SystemExit):
            record.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for absent tags."""
        record = self.variant
        self.assertEqual(record.get_info('IMAFLAG'), True)
        self.assertEqual(record.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            record.get_info('CALI')

    def test_get_info_string(self):
        """The INFO string round-trips and omits a flag once it is set to False."""
        record = self.variant
        before = record.get_info_string()
        self.assertEqual(before, 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')

        record.set_info('IMAFLAG', False)
        after = record.get_info_string()
        self.assertEqual(after, 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        """Only the FORMAT keys present in the record are reported."""
        format_string = self.variant.get_format_string()
        self.assertEqual(format_string, 'GT:SU')

    def test_genotype(self):
        """The sample's genotype serialises back to its original field."""
        sample_gt = self.variant.genotype('NA12878')
        self.assertEqual(sample_gt.get_gt_string(), '0/0:9')

    def test_var_string(self):
        """An unmodified variant serialises to the exact input line."""
        rendered = self.variant.get_var_string()
        self.assertEqual(rendered, self.variant_line)
Ejemplo n.º 2
0
class TestVariant(TestCase):
    """Variant accessor tests against a two-sample (NA12878, NA0001) BND record."""

    def setUp(self):
        """Parse the shared header and the fixture record anew for every test."""
        meta = ['##fileformat=VCFv4.2', '##fileDate=20151202']
        info_defs = [
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
        ]
        # INACTIVE is declared but never appears in the record's FORMAT column.
        format_defs = [
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        ]
        column_line = '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878	NA0001'

        self.vcf = Vcf()
        self.vcf.add_header(meta + info_defs + format_defs + [column_line])
        self.variant_line = '1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU	0/0:9	1/1:15'
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        """_parse_genotypes maps each header sample to a Genotype, in column order."""
        raw_fields = ['0/1:20', '0/0:15']
        parsed = self.variant._parse_genotypes(raw_fields)

        # Build the expectation by hand: one Genotype per sample column.
        expected = {}
        for sample, raw in zip(['NA12878', 'NA0001'], raw_fields):
            expected[sample] = Genotype(self.variant, raw.split(':'))

        self.assertEqual(parsed, expected)

    def test_set_info(self):
        """set_info stores values for declared tags; an undeclared tag must exit."""
        record = self.variant

        record.set_info('SVTYPE', 'INV')
        self.assertEqual(record.info['SVTYPE'], 'INV')

        record.set_info('IMAFLAG', False)
        self.assertEqual(record.info['IMAFLAG'], False)

        # 'SUPER' is not declared in the header built by setUp.
        with self.assertRaises(SystemExit):
            record.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for absent tags."""
        record = self.variant
        self.assertEqual(record.get_info('IMAFLAG'), True)
        self.assertEqual(record.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            record.get_info('CALI')

    def test_get_info_string(self):
        """The INFO string round-trips and omits a flag once it is set to False."""
        record = self.variant
        with_flag = record.get_info_string()
        self.assertEqual(with_flag, 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')

        record.set_info('IMAFLAG', False)
        without_flag = record.get_info_string()
        self.assertEqual(without_flag, 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        """Only the FORMAT keys present in the record are reported."""
        format_string = self.variant.get_format_string()
        self.assertEqual(format_string, 'GT:SU')

    def test_get_gt_string(self):
        """All sample columns are joined with tabs in header order."""
        joined = self.variant.get_gt_string()
        self.assertEqual(joined, '0/0:9	1/1:15')

    def test_genotype(self):
        """A single sample's genotype serialises back to its original field."""
        sample_gt = self.variant.genotype('NA12878')
        self.assertEqual(sample_gt.get_gt_string(), '0/0:9')

    def test_genotypes(self):
        """genotypes() yields every sample's genotype in column order."""
        rendered = []
        for sample_gt in self.variant.genotypes():
            rendered.append(sample_gt.get_gt_string())
        self.assertEqual(rendered, ['0/0:9', '1/1:15'])

    def test_var_string(self):
        """Cached output ignores genotype edits; uncached output reflects them."""
        record = self.variant
        original_line = self.variant_line
        self.assertEqual(record.get_var_string(), original_line)

        # Mutate one genotype after the first serialisation.
        record.genotype('NA12878').set_format('GT', './.')

        cached = record.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, original_line)
        self.assertNotEqual(record.get_var_string(), original_line)
Ejemplo n.º 3
0
class TestVariant(TestCase):
    """Variant tests covering INFO/FORMAT access, genotype edits and caching."""

    # File-level meta lines and INFO declarations common to every header here.
    _META_AND_INFO = (
        "##fileformat=VCFv4.2",
        "##fileDate=20151202",
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
    )

    # Individual FORMAT declarations; each test picks the subset it needs.
    _FMT_GT = '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
    _FMT_SU = '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">'
    _FMT_AS = '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">'
    _FMT_INACTIVE = '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">'

    # Column header lines for the two-sample and single-sample fixtures.
    _COLUMNS_TWO_SAMPLES = "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878	NA0001"
    _COLUMNS_ONE_SAMPLE = "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA12878"

    @staticmethod
    def _parse(format_lines, column_line, variant_line):
        """Return ``(vcf, variant)`` built from the given header parts and record."""
        header_lines = list(TestVariant._META_AND_INFO) + list(format_lines) + [column_line]
        vcf = Vcf()
        vcf.add_header(header_lines)
        return vcf, Variant(variant_line.split("\t"), vcf)

    def _parse_single_sample_with_as(self, variant_line):
        """Parse a record against the one-sample header that also declares AS."""
        formats = [self._FMT_GT, self._FMT_SU, self._FMT_AS, self._FMT_INACTIVE]
        _, variant = self._parse(formats, self._COLUMNS_ONE_SAMPLE, variant_line)
        return variant

    def setUp(self):
        """Parse the default two-sample fixture before every test."""
        self.variant_line = (
            "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU	0/0:9	1/1:15"
        )
        self.vcf, self.variant = self._parse(
            [self._FMT_GT, self._FMT_SU, self._FMT_INACTIVE],
            self._COLUMNS_TWO_SAMPLES,
            self.variant_line,
        )

    def test_parse_genotypes(self):
        """_parse_genotypes maps each header sample to a Genotype, in column order."""
        raw_fields = ["0/1:20", "0/0:15"]
        parsed = self.variant._parse_genotypes(raw_fields)

        expected = {}
        for sample, raw in zip(["NA12878", "NA0001"], raw_fields):
            expected[sample] = Genotype(self.variant, raw.split(":"))

        self.assertEqual(parsed, expected)

    def test_set_info(self):
        """set_info stores values for declared tags; an undeclared tag must exit."""
        record = self.variant

        record.set_info("SVTYPE", "INV")
        self.assertEqual(record.info["SVTYPE"], "INV")

        record.set_info("IMAFLAG", False)
        self.assertEqual(record.info["IMAFLAG"], False)

        # "SUPER" is not declared in the fixture header.
        with self.assertRaises(SystemExit):
            record.set_info("SUPER", True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for absent tags."""
        record = self.variant
        self.assertEqual(record.get_info("IMAFLAG"), True)
        self.assertEqual(record.get_info("SVTYPE"), "BND")
        with self.assertRaises(KeyError):
            record.get_info("CALI")

    def test_get_info_string(self):
        """The INFO string round-trips and omits a flag once it is set to False."""
        record = self.variant
        with_flag = record.get_info_string()
        self.assertEqual(with_flag, "SVTYPE=BND;STRANDS=-+:9;IMAFLAG")

        record.set_info("IMAFLAG", False)
        without_flag = record.get_info_string()
        self.assertEqual(without_flag, "SVTYPE=BND;STRANDS=-+:9")

    def test_get_format_string(self):
        """Only the FORMAT keys present in the record are reported."""
        format_string = self.variant.get_format_string()
        self.assertEqual(format_string, "GT:SU")

    def test_get_format_string_caching(self):
        """FORMAT order changes once genotypes are parsed; the True form keeps the input order."""
        variant = self._parse_single_sample_with_as(
            "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:AS:SU	0/0:1:9"
        )
        self.assertEqual(variant.get_format_string(), "GT:AS:SU")

        # Called only for its effect on the variant; the result is not needed.
        variant.genotypes()
        self.assertEqual(variant.get_format_string(), "GT:SU:AS")

        # Presumably the positional True selects the cached string -- the
        # assertion only pins that the input order is returned.
        self.assertEqual(variant.get_format_string(True), "GT:AS:SU")

    def test_get_gt_string(self):
        """All sample columns are joined with tabs in header order."""
        joined = self.variant.get_gt_string()
        self.assertEqual(joined, "0/0:9	1/1:15")

    def test_genotype(self):
        """A single sample's genotype serialises back to its original field."""
        sample_gt = self.variant.genotype("NA12878")
        self.assertEqual(sample_gt.get_gt_string(), "0/0:9")

    def test_set_genotype(self):
        """set_genotype replaces the stored genotype for the named sample."""
        replacement = Genotype(self.variant, ["0/1", "9"])
        self.variant.set_genotype("NA12878", replacement)
        stored = self.variant.genotype("NA12878")
        self.assertEqual(stored.get_gt_string(), "0/1:9")

    def test_genotypes(self):
        """genotypes() yields every sample's genotype in column order."""
        rendered = []
        for sample_gt in self.variant.genotypes():
            rendered.append(sample_gt.get_gt_string())
        self.assertEqual(rendered, ["0/0:9", "1/1:15"])

    def test_var_string(self):
        """Cached output ignores genotype edits; uncached output reflects them."""
        record = self.variant
        original_line = self.variant_line
        self.assertEqual(record.get_var_string(), original_line)

        record.genotype("NA12878").set_format("GT", "./.")

        cached = record.get_var_string(use_cached_gt_string=True)
        self.assertEqual(cached, original_line)
        self.assertNotEqual(record.get_var_string(), original_line)

    def test_var_string_format_caching(self):
        """After parsing, the uncached line uses GT:SU:AS order; the cached line is the input."""
        variant_line = "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:AS:SU	0/0:1:9"
        uncached_line = "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	GT:SU:AS	0/0:9:1"
        variant = self._parse_single_sample_with_as(variant_line)

        variant.genotypes()  # force parsing

        self.assertEqual(variant.get_var_string(), uncached_line)
        self.assertEqual(variant.get_var_string(use_cached_gt_string=True), variant_line)

    def test_add_genotype(self):
        """With only SU in the record's FORMAT column, the genotype string still gains a './.' GT."""
        # This header deliberately declares no GT FORMAT line.
        _, variant = self._parse(
            [self._FMT_SU, self._FMT_INACTIVE],
            self._COLUMNS_ONE_SAMPLE,
            "1	820915	5838_1	N	]GL000232.1:20940]N	0.00	.	SVTYPE=BND;STRANDS=-+:9;IMAFLAG	SU	9",
        )
        self.assertEqual(variant.get_gt_string(), "./.:9")
Ejemplo n.º 4
0
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
    """Reclassify DEL/DUP records by read-depth support and write VCF to stdout.

    Records are echoed unchanged when their SVTYPE is not DEL or DUP, when
    they lie on chromosome X or Y, or when no non-excluded sample carries a
    positive genotype.  A DEL that intersects an annotation in ``ae_dict`` is
    rewritten as an MEI.  Every other record is kept if the depth test chosen
    by its number of positive samples passes, and is otherwise replaced by
    the strings returned from ``to_bnd_strings``.

    Args:
        vcf_in: Iterable of VCF lines (header lines first).
        gender_file: Iterable of ``sample<TAB>int`` lines.
        exclude_file: Iterable of sample names, one per line, or None.
        ae_dict: Annotation lookup passed to ``annotation_intersect``; None
            disables the mobile-element check.
        f_overlap: Passed through to ``annotation_intersect``.
        slope_threshold: Passed to ``has_high_freq_depth_support``.
        rsquared_threshold: Passed to ``has_high_freq_depth_support``.
        het_del_fit: Passed to ``has_rd_support_by_nb``.
        hom_del_fit: Passed to ``has_rd_support_by_nb``.
        params: Passed to ``has_rd_support_by_nb``.
        diag_outfile: Path of a tab-separated diagnostics file, or None.

    Note:
        ``sys.stdout`` is closed before returning.
    """

    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10

    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    try:
        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                else:
                    in_header = False
                    vcf.add_header(header)
                    vcf_out.write(vcf.get_header() + '\n')

            # split the variant line and pull SVTYPE out of the INFO column
            # without paying for a full Variant parse
            v = line.rstrip().split('\t')
            info = v[7].split(';')
            svtype = None
            for x in info:
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # bail if not DEL or DUP prior to reclassification
            if svtype not in ['DEL', 'DUP']:
                vcf_out.write(line)
                continue

            # parse the VCF line
            var = Variant(v, vcf, True)

            # check intersection with mobile elements
            if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
                ae = annotation_intersect(var, ae_dict, f_overlap)
                if ae is not None:
                    if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                        ae = 'ME:' + ae
                    var.alt = '<DEL:%s>' % ae
                    var.info['SVTYPE'] = 'MEI'
                    vcf_out.write(var.get_var_string(True) + '\n')
                    continue

            # for now, don't worry about sex chromosomes
            if (var.chrom == 'X' or var.chrom == 'Y'):
                vcf_out.write(line)
                continue

            # count positively genotyped samples
            num_pos_samps = 0
            for s in var.sample_list:
                if s in exclude:
                    continue
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            if num_pos_samps == 0:
                # Nothing to test and no depth table to report on.
                vcf_out.write(line)
                continue

            high_freq_support = False
            low_freq_support = False
            nb_support = False

            df = load_df(var, exclude, sex)

            # The naive-Bayes result is only reported in the diagnostics; it
            # does not influence which record is written.
            if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
                nb_support = True

            if num_pos_samps < min_pos_samps_for_regression:
                if has_low_freq_depth_support(df):
                    low_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')
            else:
                if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
                    high_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')

            # Diagnostics are written only here, where ``df`` belongs to the
            # current variant.  Previously this ran for every DEL/DUP and hit
            # an unassigned or stale ``df`` when no sample was positive.
            if outf is not None:
                svlen = df['svlen'][0]
                outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(high_freq_support)+"\t"+str(low_freq_support)+"\n")

        vcf_out.close()
    finally:
        # Release the diagnostics handle even if a record fails to process.
        if outf is not None:
            outf.close()
    return
Ejemplo n.º 5
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to DEL records with AF >= 0.01.

    Non-DEL records and DEL records whose AF is below 0.01 are copied to
    ``vcf_out`` unchanged.  For the remaining DELs the table returned by
    ``recluster`` is appended to the CSV at ``diag_outfile`` and used to set
    the MEDGQR/Q10GQR INFO tags and the per-sample GTR/GQR FORMAT values.

    Args:
        vcf_in: Open VCF input, iterated line by line; closed on return.
        vcf_out: Writable VCF output; closed on return.
        diag_outfile: Path of the CSV file receiving the reclustering tables.
        gender_file: Iterable of ``sample<TAB>int`` lines; closed on return.
        exclude_file: Iterable of sample names, one per line, or None;
            closed on return when given.
    """
    vcf = Vcf()
    header_lines = []
    reading_header = True

    # sample name -> integer sex code
    sex = {
        cols[0]: int(cols[1])
        for cols in (row.rstrip().split('\t') for row in gender_file)
    }

    if exclude_file is None:
        exclude = []
    else:
        exclude = [row.rstrip() for row in exclude_file]

    outf = open(diag_outfile, 'w', 4096)
    # The CSV column header is emitted with the first table only.
    csv_header_pending = True

    for line in vcf_in:
        if reading_header:
            if line[0] == "#":
                header_lines.append(line)
                continue
            # First record line: finalise the header, declare the new tags
            # and emit the header before any record.
            reading_header = False
            vcf.add_header(header_lines)
            vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = next(
            (tag.split('=')[1]
             for tag in fields[7].split(';')
             if tag.startswith('SVTYPE=')),
            None)

        # Only DEL records are refined; everything else passes through.
        if svtype != 'DEL':
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        allele_freq = float(var.get_info('AF'))
        sys.stderr.write("%f\n" % allele_freq)
        if allele_freq < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))
        recdf.to_csv(outf, header=csv_header_pending)
        csv_header_pending = False

        # The summary qualities are read from the first row of the table.
        first_row = recdf.iloc[0, :]
        var.set_info("MEDGQR", '{:.2f}'.format(first_row.loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(first_row.loc['q10_gq_re']))

        recdf.set_index('sample', inplace=True)
        for s in var.sample_list:
            if s in recdf.index:
                refined_gt = recdf.loc[s, 'GTR']
                refined_gq = '{:.2f}'.format(recdf.loc[s, 'gq_re'])
            else:
                # Samples missing from the table get a no-call.
                refined_gt, refined_gq = "./.", 0
            var.genotype(s).set_format("GTR", refined_gt)
            var.genotype(s).set_format("GQR", refined_gq)

        vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
Ejemplo n.º 6
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Re-cluster DEL genotypes and annotate records with refined calls.

    Records that are not DELs, or whose AF is below 0.01, are written to
    ``vcf_out`` untouched.  Every other DEL gets MEDGQR/Q10GQR INFO values
    and per-sample GTR/GQR FORMAT values taken from ``recluster``; the
    reclustering table itself is appended to ``diag_outfile`` as CSV.

    Args:
        vcf_in: Open VCF input, read line by line; closed on return.
        vcf_out: Writable VCF output; closed on return.
        diag_outfile: Path of the diagnostics CSV to create.
        gender_file: Iterable of ``sample<TAB>int`` lines; closed on return.
        exclude_file: Iterable of sample names (one per line) or None;
            closed on return when given.
    """

    def read_svtype(info_column):
        """Return the value of the first SVTYPE= entry, or None if absent."""
        for entry in info_column.split(';'):
            if entry.startswith('SVTYPE='):
                return entry.split('=')[1]
        return None

    def declare_refinement_tags(target):
        """Register the INFO/FORMAT tags this function adds to records."""
        target.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        target.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        target.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        target.add_format('GTR', 1, 'String', 'Refined genotype.')

    vcf = Vcf()
    header = []
    header_done = False

    sex = {}
    for gender_line in gender_file:
        name_and_code = gender_line.rstrip().split('\t')
        sex[name_and_code[0]] = int(name_and_code[1])

    exclude = []
    if exclude_file is not None:
        exclude.extend(name.rstrip() for name in exclude_file)

    outf = open(diag_outfile, 'w', 4096)
    tables_written = 0

    for line in vcf_in:
        if not header_done:
            if line[0] == "#":
                header.append(line)
                continue
            # The first non-header line triggers header output.
            header_done = True
            vcf.add_header(header)
            declare_refinement_tags(vcf)
            vcf_out.write(vcf.get_header() + '\n')

        columns = line.rstrip().split('\t')

        # Only DEL records are candidates for refinement.
        if read_svtype(columns[7]) not in ['DEL']:
            vcf_out.write(line)
            continue

        var = Variant(columns, vcf)
        sys.stderr.write("%s\n" % var.var_id)

        af = float(var.get_info('AF'))
        sys.stderr.write("%f\n" % af)

        if not af < 0.01:
            df = load_df(var, exclude, sex)
            recdf = recluster(df)

            # Column names go out with the first table only.
            recdf.to_csv(outf, header=(tables_written == 0))
            tables_written += 1

            summary = recdf.iloc[0, :]
            var.set_info("MEDGQR", '{:.2f}'.format(summary.loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(summary.loc['q10_gq_re']))

            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                if s not in recdf.index:
                    # No row for this sample: record a no-call.
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
                    continue
                var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))

            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')
        else:
            vcf_out.write(line)

    for handle in (vcf_out, vcf_in, gender_file, outf):
        handle.close()
    if exclude_file is not None:
        exclude_file.close()
    return
Ejemplo n.º 7
0
def sv_classify(vcf_in, vcf_out, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP records by read-depth support using ``method``.

    Records are echoed unchanged when their SVTYPE is not DEL or DUP or when
    no sample carries a positive genotype.  A DEL that intersects an
    annotation in ``ae_dict`` is rewritten as an MEI.  Every other record is
    kept if the selected depth test reports support, and is otherwise
    replaced by the strings returned from ``to_bnd_strings``.

    Args:
        vcf_in: Open VCF input, iterated line by line; closed on return.
        vcf_out: Writable VCF output; closed on return.
        gender_file: Iterable of ``sample<TAB>int`` lines; closed on return.
        exclude_file: Iterable of sample names, one per line, or None;
            closed on return when given.
        ae_dict: Annotation lookup passed to ``annotation_intersect``; None
            disables the mobile-element check.
        f_overlap: Passed through to ``annotation_intersect``.
        slope_threshold: Passed to the large-sample / hybrid tests.
        rsquared_threshold: Passed to the large-sample / hybrid tests.
        p_cnv: Passed to the naive-Bayes / hybrid tests.
        het_del_fit: Passed to the naive-Bayes / hybrid tests.
        hom_del_fit: Passed to the naive-Bayes / hybrid tests.
        params: Passed to the naive-Bayes / hybrid tests.
        diag_outfile: Path of a tab-separated diagnostics file, or None.
        method: One of ``'large_sample'``, ``'naive_bayes'`` or ``'hybrid'``.
            Any other value runs no test, so every candidate record is
            treated as unsupported.
    """

    vcf = Vcf()
    header = []
    in_header = True
    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    try:
        if outf is not None:
            outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                else:
                    in_header = False
                    vcf.add_header(header)
                    vcf_out.write(vcf.get_header() + '\n')

            v = line.rstrip().split('\t')
            info = v[7].split(';')
            svtype = None
            for x in info:
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            # bail if not DEL or DUP prior to reclassification
            if svtype not in ['DEL', 'DUP']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)

            # check intersection with mobile elements
            if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
                ae = annotation_intersect(var, ae_dict, f_overlap)
                if ae is not None:
                    if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                        ae = 'ME:' + ae
                    var.alt = '<DEL:%s>' % ae
                    var.info['SVTYPE'] = 'MEI'
                    vcf_out.write(var.get_var_string(True) + '\n')
                    continue

            # count positively genotyped samples
            num_pos_samps = 0
            for s in var.sample_list:
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            nb_support = False
            ls_support = False
            hybrid_support = False
            has_rd_support = False

            if num_pos_samps == 0:
                vcf_out.write(line)
            else:
                df = load_df(var, exclude, sex)
                if method == 'large_sample':
                    ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
                    has_rd_support = ls_support
                elif method == 'naive_bayes':
                    nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
                    has_rd_support = nb_support
                elif method == 'hybrid':
                    ls_support, nb_support, hybrid_support = has_rd_support_hybrid(df, het_del_fit, hom_del_fit, params, p_cnv, slope_threshold, rsquared_threshold, num_pos_samps)
                    has_rd_support = hybrid_support

                if has_rd_support:
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')

                # Diagnostics need ``df``, so they are only written for
                # records that reached the depth test.
                if outf is not None:
                    svlen = df['svlen'][0]
                    outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(ls_support)+"\t"+str(hybrid_support)+"\t"+str(has_rd_support)+"\n")

        vcf_out.close()
    finally:
        # Release the diagnostics handle even if a record fails to process.
        if outf is not None:
            outf.close()

    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()

    return
Ejemplo n.º 8
0
def calc_params(vcf_path):
    """Estimate per-sample copy-number parameters from a genotyped VCF.

    Autosomal DEL/DUP records are turned into one ``CN_rec`` row per
    genotyped sample, rows outside the per-group quantile bounds are
    dropped, and log2 copy-number ratios of small deletions (50 bp to
    under 1 kb) are adjusted with a linear fit on log length.

    Args:
        vcf_path: Path to the VCF; a name ending in ``.gz`` is opened with
            ``gzip``.

    Returns:
        A tuple ``(params, het_del_fit, hom_del_fit)``: the per-sample,
        per-svtype table of mean/variance/count of the adjusted log2 ratio
        by genotype (plus ``std_pooled``), and the two fitted OLS models
        for heterozygous and homozygous small deletions.
    """

    tSet = list()
    epsilon = 0.1
    header = []

    in_header = True
    vcf = Vcf()
    # NOTE(review): 'rb' yields bytes lines under Python 3, where the
    # ``line[0] == '#'`` test below would never match -- confirm the target
    # interpreter before relying on gzipped input.
    if vcf_path.endswith('.gz'):
        vcf_file = gzip.open(vcf_path, 'rb')
    else:
        vcf_file = open(vcf_path, 'r')

    try:
        for line in vcf_file:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    if line[1] != '#':
                        # The single-'#' column line ends the header and
                        # names the samples.
                        vcf_samples = line.rstrip().split('\t')[9:]
                        in_header = False
                        vcf.add_header(header)
                    continue
            else:
                v = line.rstrip().split('\t')
                info = v[7].split(';')
                svtype = None
                for x in info:
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break

                # Only autosomal DEL/DUP records contribute training data.
                if svtype not in ['DEL', 'DUP'] or v[0] == "X" or v[0] == "Y":
                    continue

                var = Variant(v, vcf)

                for sample in vcf_samples:
                    sample_genotype = var.genotype(sample)
                    if sample_genotype.get_format('GT') != './.':
                        log2r = math.log((float(sample_genotype.get_format('CN')) + epsilon)/2, 2)  # epsilon avoids log(0)
                        tSet.append(CN_rec(var.var_id, sample, var.info['SVTYPE'], abs(float(var.info['SVLEN'])), var.info['AF'],
                            sample_genotype.get_format('GT'), sample_genotype.get_format('CN'), sample_genotype.get_format('AB'), math.log(abs(float(var.info['SVLEN']))), log2r))
    finally:
        # The handle was previously left open for the rest of the process.
        vcf_file.close()

    df = pd.DataFrame(tSet, columns=CN_rec._fields)
    # exclude from training data, DELs and DUPs with CN in the tails of the distribution
    df['q_low'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df['q_high'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df = df[(df.log2r >= df.q_low) & (df.log2r <= df.q_high)]

    # adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far
    # .copy() makes these independent frames, so the column assignments
    # below never write into a slice of ``df``.
    small_het_dels = df[(df.svtype == "DEL") & (df.GT == "0/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    small_hom_dels = df[(df.svtype == "DEL") & (df.GT == "1/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    het_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "0/1") & (df.svtype == "DEL")]['log2r'])
    hom_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "1/1") & (df.svtype == "DEL")]['log2r'])
    small_het_dels['offset'] = small_het_dels['log2r'] - het_del_mean
    small_hom_dels['offset'] = small_hom_dels['log2r'] - hom_del_mean

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit = smf.ols('offset~log_len', small_hom_dels).fit()
        het_del_fit = smf.ols('offset~log_len', small_het_dels).fit()
        small_hom_dels['log2r_adj'] = small_hom_dels['log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels['log2r_adj'] = small_het_dels['log2r'] - het_del_fit.predict(small_het_dels)

    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels = small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]

    # DELs shorter than 50 bp with a non-0/0 genotype are excluded here: they
    # fail the svlen>=1000 test and are not part of small_dels either.
    df1 = df[(df.svtype != "DEL") | (df.GT == "0/0") | (df.svlen >= 1000)].copy()
    df1['log2r_adj'] = df1['log2r']
    df1 = pd.concat([df1, small_dels])

    params = df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean, np.var, len]).reset_index()
    params = pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
    # NOTE(review): the fixed column list assumes exactly three genotype
    # classes survive the pivot -- confirm against real inputs.
    params.columns = ['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
    params['std_pooled'] = np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
    return (params, het_del_fit, hom_del_fit)
Ejemplo n.º 9
0
class TestVariant(TestCase):
    """Exercise ``Variant`` parsing, INFO access and genotype rendering.

    Every test (except ``test_add_genotype``, which builds its own header)
    starts from one BND record carrying two samples, NA12878 and NA0001,
    parsed against a small VCFv4.2 header.
    """

    def setUp(self):
        # Header declares three INFO tags, three FORMAT tags and two samples.
        self.vcf = Vcf()
        self.vcf.add_header([
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA0001',
        ])
        # The raw record is kept so tests can compare round-tripped output.
        self.variant_line = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tGT:SU\t0/0:9\t1/1:15'
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        # Raw per-sample columns, paired with the samples in header order.
        raw_columns = ['0/1:20', '0/0:15']
        actual = self.variant._parse_genotypes(raw_columns)

        expected = {}
        for sample, column in zip(['NA12878', 'NA0001'], raw_columns):
            expected[sample] = Genotype(self.variant, column.split(':'))

        self.assertEqual(actual, expected)

    def test_set_info(self):
        # Declared tags can be overwritten, including switching a flag off.
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        # 'SUPER' is not declared in the header; the test expects SystemExit.
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        # A tag absent from the record is expected to raise KeyError.
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        before = self.variant.get_info_string()
        self.assertEqual(before, 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        # A flag set to False is expected to vanish from the INFO string.
        self.variant.set_info('IMAFLAG', False)
        after = self.variant.get_info_string()
        self.assertEqual(after, 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), 'GT:SU')

    def test_get_gt_string(self):
        self.assertEqual(self.variant.get_gt_string(), '0/0:9\t1/1:15')

    def test_genotype(self):
        na12878 = self.variant.genotype('NA12878')
        self.assertEqual(na12878.get_gt_string(), '0/0:9')

    def test_genotypes(self):
        rendered = [g.get_gt_string() for g in self.variant.genotypes()]
        self.assertEqual(rendered, ['0/0:9', '1/1:15'])

    def test_var_string(self):
        # Untouched record round-trips exactly.
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
        # After editing a genotype, the cached genotype text still yields the
        # original line while a fresh rendering must differ from it.
        self.variant.genotype('NA12878').set_format('GT', './.')
        self.assertEqual(self.variant.get_var_string(use_cached_gt_string=True), self.variant_line)
        self.assertNotEqual(self.variant.get_var_string(), self.variant_line)

    def test_add_genotype(self):
        # This header deliberately omits the GT FORMAT declaration and the
        # record only carries SU; the rendered genotype is expected to gain
        # a './.' GT in front of the SU value.
        vcf = Vcf()
        vcf.add_header([
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878',
        ])
        variant_line = '1\t820915\t5838_1\tN\t]GL000232.1:20940]N\t0.00\t.\tSVTYPE=BND;STRANDS=-+:9;IMAFLAG\tSU\t9'
        variant = Variant(variant_line.split('\t'), vcf)
        self.assertEqual(variant.get_gt_string(), './.:9')
Ejemplo n.º 10
0
def sv_classify(vcf_in, vcf_out, gender_file, sex_chrom_names, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Copy a VCF stream to ``vcf_out``, re-typing DEL/DUP calls by read depth.

    Records whose first ``SVTYPE=`` INFO entry is neither DEL nor DUP are
    copied through unchanged.  A DEL for which ``annotation_intersect``
    returns an annotation is rewritten with SVTYPE ``MEI``.  Every other
    DEL/DUP with at least one positively genotyped sample is kept only if the
    selected ``method`` reports read-depth support; otherwise it is replaced
    by the strings produced by ``to_bnd_strings(var, True)``.

    Args:
        vcf_in: iterable of VCF lines (header first); closed on return.
        vcf_out: writable handle receiving the output VCF; closed on return.
        gender_file: iterable of tab-separated ``sample<TAB>integer`` lines;
            closed on return.
        sex_chrom_names: forwarded to ``load_df``.
        exclude_file: optional iterable with one entry per line, forwarded
            (as a list) to ``load_df``; closed on return when given.
        ae_dict: optional annotation lookup handed to
            ``annotation_intersect``; ``None`` disables the MEI check.
        f_overlap: forwarded to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: forwarded to the large-sample
            and hybrid tests.
        p_cnv, het_del_fit, hom_del_fit, params: forwarded to the naive-Bayes
            and hybrid tests.
        diag_outfile: optional path of a tab-separated diagnostics file with
            one row per tested variant.
        method: ``'large_sample'``, ``'naive_bayes'`` or ``'hybrid'``.  Any
            other value leaves the support flag False, so every tested
            variant is converted through ``to_bnd_strings``.
    """
    vcf = Vcf()
    header = []
    in_header = True

    # read sample genders
    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    # The diagnostics file is opened by this function, so make sure it is
    # closed even when a record fails part-way through the stream.
    try:
        if outf is not None:
            outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                # first record: the header is complete, emit it once
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            # bail if not DEL or DUP prior to reclassification
            if svtype not in ['DEL', 'DUP']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)

            # check intersection with mobile elements
            if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
                ae = annotation_intersect(var, ae_dict, f_overlap)
                if ae is not None:
                    if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                        ae = 'ME:' + ae
                    var.alt = '<DEL:%s>' % ae
                    var.info['SVTYPE'] = 'MEI'
                    vcf_out.write(var.get_var_string(True) + '\n')
                    continue

            # count positively genotyped samples
            num_pos_samps = 0
            for s in var.sample_list:
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            if num_pos_samps == 0:
                # nothing to test against: pass the record through
                vcf_out.write(line)
                continue

            nb_support = False
            ls_support = False
            hybrid_support = False
            has_rd_support = False

            df = load_df(var, exclude, sex, sex_chrom_names)
            if method == 'large_sample':
                ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
                has_rd_support = ls_support
            elif method == 'naive_bayes':
                nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
                has_rd_support = nb_support
            elif method == 'hybrid':
                ls_support, nb_support, hybrid_support = has_rd_support_hybrid(
                        df,
                        het_del_fit,
                        hom_del_fit,
                        params, p_cnv,
                        slope_threshold,
                        rsquared_threshold,
                        num_pos_samps
                        )
                has_rd_support = hybrid_support

            if has_rd_support:
                vcf_out.write(line)
            else:
                for m_var in to_bnd_strings(var, True):
                    vcf_out.write(m_var + '\n')

            if outf is not None:
                svlen = df['svlen'][0]
                outf.write(
                        '\t'.join((
                            var.var_id,
                            svtype,
                            str(svlen),
                            str(num_pos_samps),
                            str(nb_support),
                            str(ls_support),
                            str(hybrid_support),
                            str(has_rd_support)
                            )) + "\n"
                        )

        if in_header and header:
            # The input had header lines but no records; without this the
            # output would be empty instead of a header-only VCF.
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')
    finally:
        if outf is not None:
            outf.close()

    vcf_in.close()
    vcf_out.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()

    return
Ejemplo n.º 11
0
def calc_params(vcf_path, sex_chrom_names):
    """Estimate per-sample log2 copy-ratio parameters from a training VCF.

    DEL and DUP records (by their first ``SVTYPE=`` INFO entry) that are not
    on a chromosome listed in ``sex_chrom_names`` contribute one row per
    sample whose GT is not ``./.``.  Rows outside the per-(sample, svtype,
    GT) quantile band given by ``lowQuantile``/``highQuantile`` are dropped.
    For 0/1 and 1/1 deletions with 50 <= svlen < 1000, a linear fit of the
    log2-ratio offset against log length is subtracted from the log2 ratio.

    Args:
        vcf_path: path of the VCF; a name ending in ``.gz`` is opened with
            ``gzip``.
        sex_chrom_names: container of chromosome names to skip.

    Returns:
        ``(params, het_del_fit, hom_del_fit)`` where ``params`` has one row
        per (sample, svtype) with columns ``mean0..2``, ``var0..2``,
        ``len0..2`` (one per genotype column, in the pivot's sorted GT
        order) plus ``std_pooled``, and the two fits are the fitted OLS
        results for heterozygous and homozygous small deletions.
    """
    tSet = []
    epsilon = 0.1
    header = []

    in_header = True
    vcf = Vcf()
    # NOTE(review): on Python 3 a gzip handle opened with 'rb' yields bytes,
    # so the '#' comparisons below would never match - confirm which
    # interpreter this is expected to run under.
    if vcf_path.endswith('.gz'):
        vcf_file = gzip.open(vcf_path, 'rb')
    else:
        vcf_file = open(vcf_path, 'r')

    # The handle is opened here, so close it here, even on a parse error.
    try:
        for line in vcf_file:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    if line[1] != '#':
                        # the '#CHROM' line ends the header and names the samples
                        vcf_samples = line.rstrip().split('\t')[9:]
                        in_header = False
                        vcf.add_header(header)
                continue

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            if svtype not in ['DEL', 'DUP'] or v[0] in sex_chrom_names:
                continue

            var = Variant(v, vcf)

            for sample in vcf_samples:
                sample_genotype = var.genotype(sample)
                if sample_genotype.get_format('GT') != './.':
                    # epsilon keeps the logarithm defined when CN is 0
                    log2r = math.log((float(sample_genotype.get_format('CN')) + epsilon) / 2, 2)
                    tSet.append(
                            CN_rec(
                                var.var_id,
                                sample,
                                var.info['SVTYPE'],
                                abs(float(var.info['SVLEN'])),
                                var.info['AF'],
                                sample_genotype.get_format('GT'),
                                sample_genotype.get_format('CN'),
                                sample_genotype.get_format('AB'),
                                math.log(abs(float(var.info['SVLEN']))), log2r
                                )
                            )
    finally:
        vcf_file.close()

    df = pd.DataFrame(tSet, columns=CN_rec._fields)
    # exclude from training data, DELs and DUPs with CN in the tails of the distribution
    df.loc[:, 'q_low'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df.loc[:, 'q_high'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df = df[(df.log2r >= df.q_low) & (df.log2r <= df.q_high)]

    # adjust copy number for small deletions (<1kb), no strong relationship
    # b/w cn and size for dups evident so far
    is_del = df.svtype == "DEL"
    is_small = (df.svlen < 1000) & (df.svlen >= 50)
    small_het_dels = df[is_del & (df.GT == "0/1") & is_small].copy()
    small_hom_dels = df[is_del & (df.GT == "1/1") & is_small].copy()
    # reference level: mean log2 ratio of the large (>1kb) deletions
    het_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "0/1") & is_del]['log2r'])
    hom_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "1/1") & is_del]['log2r'])
    small_het_dels.loc[:, 'offset'] = small_het_dels.loc[:, 'log2r'] - het_del_mean
    small_hom_dels.loc[:, 'offset'] = small_hom_dels.loc[:, 'log2r'] - hom_del_mean
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit = smf.ols('offset~log_len', small_hom_dels).fit()
        het_del_fit = smf.ols('offset~log_len', small_het_dels).fit()
        small_hom_dels.loc[:, 'log2r_adj'] = small_hom_dels.loc[:, 'log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels.loc[:, 'log2r_adj'] = small_het_dels.loc[:, 'log2r'] - het_del_fit.predict(small_het_dels)

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels = small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]

    # 0/1 and 1/1 dels shorter than 50 bp match neither this filter nor the
    # small-deletion sets above, so they are excluded here
    df1 = df.loc[(df.svtype != "DEL") | (df.GT == "0/0") | (df.svlen >= 1000), :].copy()
    df1.loc[:, 'log2r_adj'] = df1.loc[:, 'log2r']
    df1 = pd.concat([df1, small_dels])

    # 'mean'/'var' are the string spellings pandas recommends in place of
    # np.mean/np.var; they select the same groupby reductions.
    stats = df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate(['mean', 'var', len]).reset_index()
    pivoted = pd.pivot_table(stats, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len'])

    # Label the statistics by name rather than by position: pivot_table does
    # not keep the order of `values` (newer pandas sorts the columns), so a
    # positional rename could attach 'mean*' labels to the 'len' columns.
    # Assigning three labels still raises if a statistic does not have
    # exactly three genotype columns.
    labelled = []
    for stat in ('mean', 'var', 'len'):
        per_gt = pivoted[stat].copy()
        per_gt.columns = ['%s%d' % (stat, i) for i in range(3)]
        labelled.append(per_gt)
    params = pd.concat(labelled, axis=1).reset_index()

    params['std_pooled'] = np.sqrt((params['var0']*params['len0']+params['var1']*params['len1']+params['var2']*params['len2'])/(params['len0']+params['len1']+params['len2']))
    return (params, het_del_fit, hom_del_fit)
Ejemplo n.º 12
0
    def execute(self, output_handle=sys.stdout):
        """Annotate each record of ``self.vcf_stream`` with AF, NSAMP and MSQ.

        The header is re-emitted with the three INFO definitions added, then
        every record is written with:

        * ``AF``    - per-ALT allele frequency over all fully called
          genotypes (``.`` for each ALT when no allele was counted),
        * ``NSAMP`` - number of samples carrying at least one non-reference
          allele,
        * ``MSQ``   - the value returned by ``self.calc_msq(var)``.

        Args:
            output_handle: writable handle for the annotated VCF.  It is
                closed before returning (this includes the default
                ``sys.stdout``).
        """
        in_header = True
        header = []
        vcf = Vcf()
        vcf_out = output_handle

        # read input VCF
        for line in self.vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                elif line.startswith('#CHROM'):
                    header.append(line.rstrip())

                    in_header = False
                    vcf.add_header(header)

                    vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                    vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                    vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                    # write header
                    vcf_out.write(vcf.get_header() + '\n')
                # every line seen while still in the header is consumed here
                continue

            v = line.rstrip().split('\t')
            var = Variant(v, vcf, fixed_genotypes=True)

            # alleles[0] counts REF, alleles[i] counts the i-th ALT
            num_alt = len(var.alt.split(','))
            alleles = [0] * (num_alt + 1)
            num_samp = 0

            for sample in var.sample_list:
                gt_string = var.genotype(sample).get_format('GT')

                # skip genotypes with any missing allele
                if '.' in gt_string:
                    continue
                # unphased first; fall back to the phased separator
                allele_fields = gt_string.split('/')
                if len(allele_fields) == 1:
                    allele_fields = gt_string.split('|')
                # A real list (not a lazy map) so it can be iterated and
                # summed on both Python 2 and Python 3.
                allele_indices = [int(a) for a in allele_fields]

                for allele_index in allele_indices:
                    alleles[allele_index] += 1

                # count samples with at least one non-reference allele
                if sum(allele_indices) > 0:
                    num_samp += 1

            allele_sum = float(sum(alleles))

            # populate AF
            if allele_sum > 0:
                var.info['AF'] = ','.join('%.4g' % (count / allele_sum) for count in alleles[1:])
            else:
                var.info['AF'] = ','.join(['.'] * num_alt)

            # populate NSAMP
            var.info['NSAMP'] = num_samp
            var.info['MSQ'] = self.calc_msq(var)

            # after all samples have been processed, write
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')
        vcf_out.close()
Ejemplo n.º 13
0
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
    """Write a depth-reclassified copy of ``vcf_in`` to standard output.

    Records whose first ``SVTYPE=`` INFO entry is neither DEL nor DUP pass
    through unchanged.  A DEL for which ``annotation_intersect`` returns an
    annotation is rewritten with SVTYPE ``MEI``.  The remaining DEL/DUP
    records are checked for read-depth support: with fewer than 10
    positively genotyped, non-excluded samples by
    ``has_low_freq_depth_support``, otherwise by
    ``has_high_freq_depth_support``.  Supported records are echoed verbatim;
    unsupported ones are replaced by the output of ``to_bnd_strings``.

    ``sys.stdout`` is closed before returning.
    """
    out = sys.stdout
    vcf = Vcf()
    header_lines = []
    reading_header = True
    regression_min_samples = 10

    # sample name -> integer code, one tab-separated pair per line
    gender = {}
    for row in gender_file:
        cols = row.rstrip().split('\t')
        gender[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        exclude = [row.rstrip() for row in exclude_file]

    for line in vcf_in:
        if reading_header:
            if line[0] == '#':
                header_lines.append(line)
                continue
            # first record reached: the header is complete, emit it once
            reading_header = False
            vcf.add_header(header_lines)
            out.write(vcf.get_header() + '\n')

        # cheap SVTYPE pre-check on the raw INFO column before full parsing
        fields = line.rstrip().split('\t')
        svtype = None
        for entry in fields[7].split(';'):
            if entry.startswith('SVTYPE='):
                svtype = entry.split('=')[1]
                break

        if svtype not in ('DEL', 'DUP'):
            out.write(line)
            continue

        var = Variant(fields, vcf, True)

        # deletions that intersect an annotated element become MEIs
        if ae_dict is not None and var.info['SVTYPE'] == 'DEL':
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith(('SINE', 'LINE')) or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                out.write(var.get_var_string(True) + '\n')
                continue

        # only parsed DEL/DUP records are depth-tested; anything else is
        # dropped from the output at this point
        if var.info['SVTYPE'] not in ('DEL', 'DUP'):
            continue

        # positively genotyped samples, ignoring excluded ones
        positives = sum(
            1
            for s in var.sample_list
            if s not in exclude
            and var.genotype(s).get_format('GT') not in ("./.", "0/0")
        )

        if positives < regression_min_samples:
            supported = has_low_freq_depth_support(var, gender, exclude)
        else:
            supported = has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold)

        if supported:
            out.write(line)
        else:
            for bnd in to_bnd_strings(var):
                out.write(bnd + '\n')

    out.close()
    return
Ejemplo n.º 14
0
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    """Refine DEL/MEI genotypes by reclustering and write the updated VCF.

    Records whose first ``SVTYPE=`` INFO entry is not DEL or MEI, and
    records with AF below 0.01, are copied through unchanged.  For the
    rest, ``recluster(load_df(...))`` supplies refined genotypes: the
    original GT/GQ are saved to GTO/GQO, GT/GQ are replaced from the
    reclustered table (``./.`` and 0 for samples absent from it), and
    MEDGQR/Q10GQR are set on the record.

    Args:
        vcf_in: iterable of VCF lines (header first); closed on return.
        vcf_out: writable handle for the refined VCF; closed on return.
        diag_outfile: path of a CSV file receiving every reclustered table.
        gender_file: iterable of tab-separated ``sample<TAB>integer`` lines;
            closed on return.
        exclude_file: optional iterable, one entry per line; closed on
            return when given.
        batch_file: optional iterable of tab-separated ``sample<TAB>batch``
            lines; closed on return when given.

    Raises:
        RuntimeError: if ``batch_file`` uses the reserved batch label
            ``None``.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]

    batch = dict()
    if batch_file is not None:
        for line in batch_file:
            fields = line.rstrip().split('\t')
            if fields[1] == 'None':
                raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
            batch[fields[0]] = fields[1]

    def _write_header():
        # Register the tags this function adds, then emit the header once.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
        vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
        vcf_out.write(vcf.get_header() + '\n')

    outf = open(diag_outfile, 'w', 4096)
    wrote_diag_header = False

    # The diagnostics file is opened here, so close it even if a record
    # fails part-way through the stream.
    try:
        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                _write_header()

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            # bail if not DEL prior to reclassification
            # DUPs can be quite complicated in their allelic structure
            # and thus less amenable to refinement by clustering in many cases
            # INV and BNDs are also unclear.
            if svtype not in ['DEL', 'MEI']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)
            sys.stderr.write("%s\n" % var.var_id)

            allele_freq = float(var.get_info('AF'))
            sys.stderr.write("%f\n" % allele_freq)
            if allele_freq < 0.01:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex, batch)
            recdf = recluster(df)
            # only the first table written carries the CSV column header
            recdf.to_csv(outf, header=not wrote_diag_header)
            wrote_diag_header = True

            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                g = var.genotype(s)
                # keep the original call before overwriting it
                g.set_format("GTO", g.get_format("GT"))
                g.set_format("GQO", g.get_format("GQ"))
                if s in recdf.index:
                    var.genotype(s).set_format("GT", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GT", "./.")
                    var.genotype(s).set_format("GQ", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

        if in_header and header:
            # The input had header lines but no records; without this the
            # output would be empty instead of a header-only VCF.
            _write_header()
    finally:
        outf.close()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    # batch_file is consumed above just like the other inputs; close it too
    if batch_file is not None:
        batch_file.close()
    return