def run_from_args(args):
    '''
    Convert a Manta VCF and write the result to stdout.

    Header lines are buffered up to and including the '#CHROM' line; the
    header is then extended with the INFO definitions listed below and
    written once. Every following line is parsed into a Variant, handed to
    convert_variant() together with args.max_ins, and written out.

    args -- object exposing `manta_vcf` (passed to su.InputStream) and
            `max_ins` (passed to convert_variant).
    '''
    # (id, number, type, description) of each INFO field added to the header.
    # The STRANDS description now closes its parenthesis.
    info_definitions = (
        ('PRPOS', '1', 'String', 'Breakpoint probability dist'),
        ('PREND', '1', 'String', 'Breakpoint probability dist'),
        ('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)'),
        ('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples'),
        ('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples'),
        ('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples'),
        ('INSLEN_ORIG', '.', 'Integer', 'Original insertion length'),
        ('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants'),
        ('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants'),
        ('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant'),
    )

    vcf = Vcf()
    vcf_out = sys.stdout
    header_lines = []
    in_header = True

    with su.InputStream(args.manta_vcf) as input_stream:
        for line in input_stream:
            if not in_header:
                v = Variant(line.rstrip().split('\t'), vcf)
                convert_variant(v, args.max_ins)
                vcf_out.write(v.get_var_string() + '\n')
                continue

            header_lines.append(line)
            if line[0:6] == '#CHROM':
                # The column line ends the header: register it, add the
                # extra INFO fields and emit the finished header.
                in_header = False
                vcf.add_header(header_lines)
                for definition in info_definitions:
                    vcf.add_info(*definition)
                vcf_out.write(vcf.get_header() + '\n')
def convert(self, bedpe):
    '''
    Build the Variant record(s) that correspond to one bedpe object.

    Returns a list: a single Variant for non-BND entries; for BND entries
    both breakends when malformedFlag is 0, or only the second breakend
    when malformedFlag is 1.
    '''
    # A malformed entry has its confidence-interval tags swapped.
    if bedpe.malformedFlag == 1:
        first_tag, second_tag = 'CIEND', 'CIPOS'
    else:
        first_tag, second_tag = 'CIPOS', 'CIEND'

    pos1 = self.adjust_by_tag(bedpe, first_tag, bedpe.o1, bedpe.s1)
    primary_fields = [
        bedpe.c1,
        pos1,
        bedpe.orig_name1,
        bedpe.orig_ref1,
        bedpe.orig_alt1,
        bedpe.score,
        bedpe.filter,
        bedpe.info1,
    ] + bedpe.misc
    primary = Variant(primary_fields, self.vcf_header)

    if bedpe.svtype != 'BND':
        return [primary]

    pos2 = self.adjust_by_tag(bedpe, second_tag, bedpe.o2, bedpe.s2)
    secondary_fields = [
        bedpe.c2,
        pos2,
        bedpe.orig_name2,
        bedpe.orig_ref2,
        bedpe.orig_alt2,
        bedpe.score,
        bedpe.filter,
        bedpe.info2,
    ] + bedpe.misc
    secondary = Variant(secondary_fields, self.vcf_header)

    records = [primary]
    if bedpe.malformedFlag == 0:
        records.append(secondary)
    elif bedpe.malformedFlag == 1:
        # Only one of the two entries is returned.
        records[0] = secondary
    return records
def test_bnd_breakpoints(self):
    '''bnd_breakpoints() handles both breakend ALT notations used here.'''
    # (ALT string, expected breakpoint tuple)
    cases = (
        ('A[1:6[', ('1', 20000, 20000, '1', 5, 5, '+', '-')),
        (']1:6]N', ('1', 19999, 19999, '1', 6, 6, '-', '+')),
    )
    for alt, expected in cases:
        fields = ['1', '20000', '235', 'T', alt, '0.00', '.', '.', 'GT', '0/0']
        record = Variant(fields, self.vcf)
        self.assertEqual(self.converter.bnd_breakpoints(record), expected)
def test_add_genotype(self):
    '''A record whose FORMAT has no GT is rendered with a './.' genotype.'''
    # Columns are joined with explicit tabs because the record is parsed
    # with split('\t'); space-separated literals would give one field.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20151202',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
        '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                   'INFO', 'FORMAT', 'NA12878']),
    ]
    vcf = Vcf()
    vcf.add_header(header_lines)

    variant_line = '\t'.join([
        '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
        'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'SU', '9',
    ])
    variant = Variant(variant_line.split('\t'), vcf)
    self.assertEqual(variant.get_gt_string(), './.:9')
def setUp(self):
    '''Create a two-sample Vcf header and one BND Variant as fixtures.'''
    # Columns are joined with explicit tabs because the record is parsed
    # with split('\t'); space-separated literals would give one field.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20151202',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                   'INFO', 'FORMAT', 'NA12878', 'NA0001']),
    ]
    self.vcf = Vcf()
    self.vcf.add_header(header_lines)
    self.variant_line = '\t'.join([
        '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
        'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9', '1/1:15',
    ])
    self.variant = Variant(self.variant_line.split('\t'), self.vcf)
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    '''
    Annotate each variant with the average silhouette of its genotype clusters.

    vcf_in       -- iterable of VCF lines; closed on completion
    vcf_out      -- writable stream for the annotated VCF; closed on completion
    diag_outfile -- path of the CSV that receives every per-variant table
    gender_file  -- iterable of "<sample>\\t<int>" lines; closed on completion

    The header is extended with SIL_GT_AVG and written before the first
    variant. A header followed by no variant lines is still written, so
    the output is never left without a header in that case.
    '''
    vcf = Vcf()
    header = []

    sex = {}
    for line in gender_file:
        fields = line.rstrip().split('\t')
        sex[fields[0]] = int(fields[1])

    def write_header():
        # Register the buffered header, add the new INFO field, emit it.
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    in_header = True
    first_table = True
    # The context manager closes the diagnostics file even if a variant fails.
    with open(diag_outfile, 'w', 4096) as outf:
        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                write_header()

            var = Variant(line.rstrip().split('\t'), vcf)
            df1 = get_silhouette(load_df(var, sex))

            sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
            var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

            # Only the first table carries the CSV column names.
            df1.to_csv(outf, header=first_table)
            first_table = False

        if in_header and header:
            # The input had header lines but no variants.
            write_header()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    '''
    Add the SIL_GT_AVG INFO annotation to every variant of a VCF stream.

    vcf_in       -- iterable of VCF lines; closed on completion
    vcf_out      -- writable stream for the annotated VCF; closed on completion
    diag_outfile -- path of the CSV collecting the per-variant silhouette tables
    gender_file  -- iterable of "<sample>\\t<int>" lines; closed on completion

    The extended header is written when the first variant line is seen, or
    after the loop when the input holds header lines only.
    '''
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        sample, code = line.rstrip().split('\t')[:2]
        sex[sample] = int(code)

    def emit_header():
        # Register the buffered header, add the new INFO field, emit it.
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    ct = 1
    outf = open(diag_outfile, 'w', 4096)
    try:
        for line in vcf_in:
            if in_header and line[0] == "#":
                header.append(line)
                continue
            if in_header:
                in_header = False
                emit_header()

            var = Variant(line.rstrip().split('\t'), vcf)
            df = load_df(var, sex)
            df1 = get_silhouette(df)

            sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
            var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
            vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

            # Column names are written with the first table only.
            if ct == 1:
                df1.to_csv(outf, header=True)
                ct += 1
            else:
                df1.to_csv(outf, header=False)

        if in_header and header:
            # Header-only input: still produce a header on output.
            emit_header()
    finally:
        # Release the diagnostics file even when a variant fails to process.
        outf.close()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    return
def calc_ld(vcf_in, exclude_file, ld_outfile, winsz, minpos):
    '''
    Walk a VCF stream and hand windows of variants to ld_calc().

    vcf_in       -- iterable of VCF lines; closed on completion
    exclude_file -- optional iterable of sample names to leave out; closed
                    on completion when given
    ld_outfile   -- optional output path; when given, the column header
                    line is written to it here
    winsz        -- forwarded to ld_calc and used as the window overlap
    minpos       -- a variant is used only when its NSAMP INFO value is
                    greater than this

    Fixes relative to the previous revision:
      * chromosomes are compared with != instead of identity, and the
        current chromosome is tracked from the first accepted variant, so
        a chromosome change actually flushes the window;
      * the chromosome is read from `chrom`
        (NOTE(review): the old code read `chr`; confirm the attribute name
        against the Variant class);
      * the list of kept samples is built once instead of gaining a
        duplicate of every sample per variant;
      * the output handle is closed even when processing fails.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    maxwin = 100

    exclude = set()
    if exclude_file is not None:
        exclude = set(line.rstrip() for line in exclude_file)

    outf = None
    if ld_outfile is not None:
        outf = open(ld_outfile, 'w', 4096)
        outf.write("id1\tid2\tnp1\tnp2\tr2\n")

    keep = []
    have_samples = False
    curlist = []
    curchr = None

    try:
        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                in_header = False
                vcf.add_header(header)

            var = Variant(line.rstrip().split('\t'), vcf)
            if not have_samples:
                keep = [s for s in var.sample_list if s not in exclude]
                have_samples = True

            # NOTE(review): NSAMP is compared as stored in var.info; if it
            # is kept as a string this comparison needs an int() conversion.
            if var.info['NSAMP'] > minpos:
                if curchr is not None and var.chrom != curchr:
                    # New chromosome: flush and start a fresh window.
                    ld_calc(curlist, keep, ld_outfile, winsz)
                    curlist = [var]
                elif len(curlist) > maxwin:
                    # Window is full: flush, keep the tail as overlap.
                    ld_calc(curlist, keep, ld_outfile, winsz)
                    curlist = curlist[(maxwin - 1 - winsz):]
                    curlist.append(var)
                else:
                    curlist.append(var)
                curchr = var.chrom

        # NOTE(review): ld_calc receives the output path, not the open
        # handle, as in the previous revision; verify this is intended.
        ld_calc(curlist, keep, ld_outfile, winsz)
    finally:
        if outf is not None:
            outf.close()

    vcf_in.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def test_adjust_coordinate(self):
    '''adjust_coordinate() applies a two-value tag, ignores an absent one and rejects a one-value tag.'''
    def deletion_with(info):
        # Same <DEL> record each time; only the INFO column varies.
        fields = ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', info, 'GT', '0/0']
        return Variant(fields, self.vcf)

    well_formed = deletion_with('CIEND=-50,50')
    self.assertEqual(
        self.converter.adjust_coordinate(well_formed, 'CIEND', 500, 1000),
        (450, 1050))
    # CIPOS is not present in INFO, so the coordinates come back unchanged.
    self.assertEqual(
        self.converter.adjust_coordinate(well_formed, 'CIPOS', 500, 1000),
        (500, 1000))

    single_value = deletion_with('CIEND=50')
    with self.assertRaises(ValueError):
        self.converter.adjust_coordinate(single_value, 'CIEND', 500, 1000)
def test_var_string_format_caching(self):
    '''The cached genotype string keeps the input FORMAT order; the rebuilt one does not.'''
    # Columns are joined with explicit tabs because the record is parsed
    # with split('\t'); space-separated literals would give one field.
    header_lines = [
        "##fileformat=VCFv4.2",
        "##fileDate=20151202",
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
        '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
        "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
                   "INFO", "FORMAT", "NA12878"]),
    ]
    vcf = Vcf()
    vcf.add_header(header_lines)

    fixed_columns = [
        "1", "820915", "5838_1", "N", "]GL000232.1:20940]N", "0.00", ".",
        "SVTYPE=BND;STRANDS=-+:9;IMAFLAG",
    ]
    # The input lists AS before SU; the regenerated line lists SU first.
    variant_line = "\t".join(fixed_columns + ["GT:AS:SU", "0/0:1:9"])
    uncached_line = "\t".join(fixed_columns + ["GT:SU:AS", "0/0:9:1"])

    variant = Variant(variant_line.split("\t"), vcf)
    variant.genotypes()  # force parsing
    self.assertEqual(variant.get_var_string(), uncached_line)
    self.assertEqual(variant.get_var_string(use_cached_gt_string=True), variant_line)
def sname_filter(input_stream, filter_file, output_stream, complement):
    '''
    Filter a VCF stream by SNAME overlap with the entries of filter_file.

    The header is copied to output_stream with an extra FOUND INFO
    definition. A variant is written, with FOUND set to the comma-joined
    overlapping ids, when "it has an overlap" differs from `complement`.
    '''
    filter_list = load_filter_file(filter_file)
    vcf = Vcf()
    header_lines = []
    header_done = False

    for line in input_stream:
        if header_done:
            record = Variant(line.rstrip().split('\t'), vcf)
            snames = set_from_string(record.get_info('SNAME'))
            found = overlapping_ids(snames, filter_list)
            if bool(found) != complement:
                record.set_info('FOUND', ','.join(found))
                output_stream.write(record.get_var_string() + '\n')
            continue

        header_lines.append(line)
        if line[0:6] == '#CHROM':
            # The column line closes the header.
            header_done = True
            vcf.add_header(header_lines)
            vcf.add_info('FOUND', '.', 'String', 'Variant id in other file')
            output_stream.write(vcf.get_header() + '\n')
def load_filter_file(filter_file):
    '''
    Parse the VCF that serves as the filter.

    Returns a list of (variant id, set of ids from SNAME) tuples, one per
    variant line, in file order.
    '''
    vcf = Vcf()
    header_lines = []
    entries = []
    header_done = False

    for line in filter_file:
        if header_done:
            record = Variant(line.rstrip().split('\t'), vcf)
            sname_ids = set_from_string(record.get_info('SNAME'))
            entries.append((record.var_id, sname_ids))
            continue

        header_lines.append(line)
        if line[0:6] == '#CHROM':
            # The column line closes the header.
            header_done = True
            vcf.add_header(header_lines)

    return entries
def test_simple_breakpoints(self):
    '''simple_breakpoints() uses END and STRANDS, and raises ValueError without END.'''
    def deletion_with(info):
        # Same <DEL> record each time; only the INFO column varies.
        fields = ['1', '20000', '235', 'T', '<DEL>', '0.00', '.', info, 'GT', '0/0']
        return Variant(fields, self.vcf)

    default_strands = deletion_with('END=20500')
    self.assertEqual(
        self.converter.simple_breakpoints(default_strands),
        ('1', 20000, 20000, '1', 20500, 20500, '+', '-'))

    explicit_strands = deletion_with('END=20500;STRANDS=-+:2')
    self.assertEqual(
        self.converter.simple_breakpoints(explicit_strands),
        ('1', 20000, 20000, '1', 20500, 20500, '-', '+'))

    missing_end = deletion_with('STRANDS=--:2')
    with self.assertRaises(ValueError):
        self.converter.simple_breakpoints(missing_end)
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
    '''
    Reclassify DEL/DUP calls of a VCF stream and write the result to stdout.

    Per variant line:
      * anything whose SVTYPE is not DEL or DUP is copied through;
      * a DEL that intersects an annotation in ae_dict (when given) is
        rewritten as an MEI record;
      * variants on X or Y are copied through;
      * a variant with no positively genotyped, non-excluded sample is
        copied through;
      * otherwise the variant is kept when the depth test for its
        frequency class passes, and replaced by the strings from
        to_bnd_strings() when it fails.

    When diag_outfile is given, one tab-separated diagnostic line is
    written for every variant that reaches the depth tests.

    Changes relative to the previous revision: an input made of header
    lines only still gets its header written, and the diagnostics file is
    closed even when processing raises.
    '''
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10

    # read sample genders
    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    try:
        for line in vcf_in:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    continue
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')

            # Cheap pre-check on the raw INFO column so that variants that
            # are not reclassified never pay for full parsing.
            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # bail if not DEL or DUP prior to reclassification
            if svtype not in ['DEL', 'DUP']:
                vcf_out.write(line)
                continue

            # parse the VCF line
            var = Variant(v, vcf, True)

            # check intersection with mobile elements
            if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
                ae = annotation_intersect(var, ae_dict, f_overlap)
                if ae is not None:
                    if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                        ae = 'ME:' + ae
                    var.alt = '<DEL:%s>' % ae
                    var.info['SVTYPE'] = 'MEI'
                    vcf_out.write(var.get_var_string(True) + '\n')
                    continue

            # for now, don't worry about sex chromosomes
            if var.chrom == 'X' or var.chrom == 'Y':
                vcf_out.write(line)
                continue

            # count positively genotyped samples
            num_pos_samps = 0
            for s in var.sample_list:
                if s in exclude:
                    continue
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            high_freq_support = False
            low_freq_support = False
            nb_support = False

            if num_pos_samps == 0:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex)
            if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
                nb_support = True

            if num_pos_samps < min_pos_samps_for_regression:
                if has_low_freq_depth_support(df):
                    low_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')
            else:
                if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
                    high_freq_support = True
                    vcf_out.write(line)
                else:
                    for m_var in to_bnd_strings(var, True):
                        vcf_out.write(m_var + '\n')

            if outf is not None:
                svlen = df['svlen'][0]
                outf.write(var.var_id + "\t" + svtype + "\t" + str(svlen) + "\t" + str(num_pos_samps) + "\t" + str(nb_support) + "\t" + str(high_freq_support) + "\t" + str(low_freq_support) + "\n")

        if in_header and header:
            # The input had header lines but no variants.
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')
    finally:
        if outf is not None:
            outf.close()

    vcf_out.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    '''
    Re-cluster genotypes of DEL variants and record them as GTR/GQR.

    vcf_in       -- iterable of VCF lines; closed on completion
    vcf_out      -- writable stream for the refined VCF; closed on completion
    diag_outfile -- path of the CSV that receives every reclustering table
    gender_file  -- iterable of "<sample>\\t<int>" lines; closed on completion
    exclude_file -- optional iterable of sample names passed on to load_df;
                    closed on completion when given

    Variants that are not DEL, or whose AF INFO value is below 0.01, are
    copied through unchanged. For the rest MEDGQR and Q10GQR are set in
    INFO and every sample receives GTR and GQR ('./.' and 0 when the
    sample is absent from the reclustering table).

    Changes relative to the previous revision: an input made of header
    lines only still gets its header written, and the diagnostics file is
    closed even when processing raises.
    '''
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    def write_header():
        # Register the buffered header, add the new fields, emit it.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    first_table = True
    with open(diag_outfile, 'w', 4096) as outf:
        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                write_header()

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # Only DEL calls are refined; everything else passes through.
            if svtype not in ['DEL']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)
            sys.stderr.write("%s\n" % var.var_id)
            sys.stderr.write("%f\n" % float(var.get_info('AF')))
            if float(var.get_info('AF')) < 0.01:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex)
            recdf = recluster(df)
            # Only the first table carries the CSV column names.
            recdf.to_csv(outf, header=first_table)
            first_table = False

            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)

            for s in var.sample_list:
                if s in recdf.index:
                    var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format(
                        "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
            vcf_out.write(
                var.get_var_string(use_cached_gt_string=False) + '\n')

        if in_header and header:
            # The input had header lines but no variants.
            write_header()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def execute(self, output_handle=sys.stdout):
    '''
    Annotate every variant of self.vcf_stream with AF, NSAMP and MSQ.

    The header is extended with the three INFO definitions and written to
    output_handle at the '#CHROM' line; each variant is written with its
    cached genotype string. output_handle is closed at the end.
    '''
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    # read input VCF
    for line in self.vcf_stream:
        if in_header and line.startswith('##'):
            header.append(line)
            continue
        if in_header and line.startswith('#CHROM'):
            columns = line.rstrip().split('\t')
            header.append('\t'.join(columns))
            in_header = False
            vcf.add_header(header)
            vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
            vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
            vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
            # write header
            vcf_out.write(vcf.get_header() + '\n')
            continue

        var = Variant(line.rstrip().split('\t'), vcf)

        # Tally allele observations: index 0 is REF, then one per ALT.
        alleles = [0] * (len(var.alt.split(',')) + 1)
        num_samp = 0
        sum_sq = 0.0
        for genotype in var.genotypes():
            gt_string = genotype.get_format('GT')
            if '.' in gt_string:
                continue
            indexes = self.numeric_alleles(gt_string)
            for allele_index in indexes:
                alleles[allele_index] += 1
            # Count the sample when it carries a non-reference allele.
            if sum(indexes) > 0:
                num_samp += 1
                try:
                    sum_sq += float(genotype.get_format('SQ'))
                except KeyError:
                    pass

        # populate AF
        allele_sum = float(sum(alleles))
        if allele_sum > 0:
            frequencies = [count / allele_sum for count in alleles]
            var.info['AF'] = ','.join('%.4g' % f for f in frequencies[1:])
        else:
            var.info['AF'] = ','.join(['.'] * (len(alleles) - 1))

        # populate NSAMP and MSQ
        var.info['NSAMP'] = num_samp
        var.info['MSQ'] = '%0.2f' % (sum_sq / num_samp) if num_samp > 0 else '.'

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
def execute(self, output_handle=sys.stdout):
    '''
    Annotate every variant of self.vcf_stream with AF, NSAMP and MSQ.

    The header is extended with the three INFO definitions and written to
    output_handle at the '#CHROM' line. MSQ comes from self.calc_msq().
    Each variant is written with its cached genotype string and
    output_handle is closed at the end.
    '''
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    # read input VCF
    for line in self.vcf_stream:
        if in_header and line.startswith('##'):
            header.append(line)
            continue
        if in_header and line.startswith('#CHROM'):
            columns = line.rstrip().split('\t')
            header.append('\t'.join(columns))
            in_header = False
            vcf.add_header(header)
            vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
            vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
            vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
            # write header
            vcf_out.write(vcf.get_header() + '\n')
            continue

        var = Variant(line.rstrip().split('\t'), vcf, fixed_genotypes=True)

        # Tally allele observations: index 0 is REF, then one per ALT.
        alleles = [0] * (len(var.alt.split(',')) + 1)
        num_samp = 0
        genotype_strings = [var.genotype(s).get_format('GT') for s in var.sample_list]
        for gt_string in genotype_strings:
            if '.' in gt_string:
                continue
            # Unphased genotypes use '/', phased ones '|'.
            parts = gt_string.split('/')
            if len(parts) == 1:
                parts = gt_string.split('|')
            called = [int(part) for part in parts]
            for allele_index in called:
                alleles[allele_index] += 1
            # Count the sample when it carries a non-reference allele.
            if sum(called) > 0:
                num_samp += 1

        # populate AF
        allele_sum = float(sum(alleles))
        if allele_sum > 0:
            frequencies = [count / allele_sum for count in alleles]
            var.info['AF'] = ','.join('%.4g' % f for f in frequencies[1:])
        else:
            var.info['AF'] = ','.join(['.'] * (len(alleles) - 1))

        # populate NSAMP and MSQ
        var.info['NSAMP'] = num_samp
        var.info['MSQ'] = self.calc_msq(var)

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
class TestVariant(TestCase):
    '''Tests for Variant using a two-sample (NA12878, NA0001) BND record.'''

    def setUp(self):
        # Columns are joined with explicit tabs because the record is parsed
        # with split('\t'); space-separated literals would give one field.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                       'INFO', 'FORMAT', 'NA12878', 'NA0001']),
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9', '1/1:15',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        genotype_field_strings = ['0/1:20', '0/0:15']
        parsed_dict = self.variant._parse_genotypes(genotype_field_strings)

        na12878_gt = Genotype(self.variant, genotype_field_strings[0].split(':'))
        na0001_gt = Genotype(self.variant, genotype_field_strings[1].split(':'))
        expected_genotype_dict = {
            'NA12878': na12878_gt,
            'NA0001': na0001_gt,
        }
        self.assertEqual(parsed_dict, expected_genotype_dict)

    def test_set_info(self):
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        # SUPER is not declared in the header.
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        # A flag set to False disappears from the rendered INFO column.
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), 'GT:SU')

    def test_get_gt_string(self):
        self.assertEqual(self.variant.get_gt_string(), '0/0:9\t1/1:15')

    def test_genotype(self):
        self.assertEqual(self.variant.genotype('NA12878').get_gt_string(), '0/0:9')

    def test_genotypes(self):
        self.assertEqual(
            [x.get_gt_string() for x in self.variant.genotypes()],
            ['0/0:9', '1/1:15'])

    def test_var_string(self):
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
        self.variant.genotype('NA12878').set_format('GT', './.')
        # The cached string still reflects the input; the rebuilt one does not.
        self.assertEqual(self.variant.get_var_string(use_cached_gt_string=True), self.variant_line)
        self.assertNotEqual(self.variant.get_var_string(), self.variant_line)

    def test_add_genotype(self):
        # Header without a GT FORMAT definition and a record whose FORMAT is SU only.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                       'INFO', 'FORMAT', 'NA12878']),
        ]
        vcf = Vcf()
        vcf.add_header(header_lines)
        variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'SU', '9',
        ])
        variant = Variant(variant_line.split('\t'), vcf)
        self.assertEqual(variant.get_gt_string(), './.:9')
def merge_single_bp(BP, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes):
    '''
    Build the output Variant for a breakpoint that has nothing to merge with.

    BP                -- sequence whose first element carries the raw VCF
                         line in its `l` attribute
    sample_order      -- sample names, in output column order
    v_id              -- new variant id (stored as a string)
    use_product       -- selects the ALG INFO value: 'PROD' if true, else 'SUM'
    vcf               -- header object used to parse the line
    vcf_out           -- unused here
    include_genotypes -- when true, the cached genotype string is rebuilt
                         for sample_order

    When SNAME is present it is suffixed with ':<original id>'. When SNAME
    is absent and genotypes are requested, every sample now receives the
    null genotype string; previously this case raised NameError because
    `sname` was never bound.
    '''
    A = BP[0].l.rstrip().split('\t')
    var = Variant(A, vcf)

    sname = None
    try:
        sname = var.get_info('SNAME')
        var.set_info('SNAME', sname + ':' + var.var_id)
    except KeyError:
        # No SNAME on this record: nothing to extend.
        pass

    var.var_id = str(v_id)
    var.set_info('ALG', 'PROD' if use_product else 'SUM')

    if include_genotypes:
        null_string = null_format_string(A[8])
        # The only genotype column available belongs to the SNAME sample.
        gt_dict = {sname: A[9]} if sname is not None else {}
        GTS = '\t'.join([gt_dict.get(x, null_string) for x in sample_order])
        # Drop parsed genotypes so the cached string is what gets written.
        var.gts = None
        var.gts_string = GTS

    return var
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    '''
    Re-cluster genotypes of DEL/MEI variants, keeping the originals as GTO/GQO.

    vcf_in       -- iterable of VCF lines; closed on completion
    vcf_out      -- writable stream for the refined VCF; closed on completion
    diag_outfile -- path of the CSV that receives every reclustering table
    gender_file  -- iterable of "<sample>\\t<int>" lines; closed on completion
    exclude_file -- optional iterable of sample names passed on to load_df;
                    closed on completion when given
    batch_file   -- optional iterable of "<sample>\\t<batch>" lines

    Variants that are neither DEL nor MEI, or whose AF INFO value is below
    0.01, are copied through unchanged. For the rest MEDGQR and Q10GQR are
    set in INFO, each sample's GT/GQ are copied to GTO/GQO, and GT/GQ are
    replaced by the reclustered values ('./.' and 0 when the sample is
    absent from the reclustering table).

    Raises RuntimeError when the batch file uses the reserved label 'None'.

    Changes relative to the previous revision: an input made of header
    lines only still gets its header written, and the diagnostics file is
    closed even when processing raises.
    '''
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    batch = dict()
    if batch_file is not None:
        for line in batch_file:
            fields = line.rstrip().split('\t')
            if fields[1] == 'None':
                raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
            batch[fields[0]] = fields[1]

    def write_header():
        # Register the buffered header, add the new fields, emit it.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
        vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
        vcf_out.write(vcf.get_header() + '\n')

    first_table = True
    with open(diag_outfile, 'w', 4096) as outf:
        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                write_header()

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # bail if not DEL prior to reclassification
            # DUPs can be quite complicated in their allelic structure
            # and thus less amenable to refinement by clustering in many cases
            # INV and BNDs are also unclear.
            # See earlier commits for code of previous attempts to refine these.
            if svtype not in ['DEL', 'MEI']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)
            sys.stderr.write("%s\n" % var.var_id)
            sys.stderr.write("%f\n" % float(var.get_info('AF')))
            if float(var.get_info('AF')) < 0.01:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex, batch)
            recdf = recluster(df)
            # Only the first table carries the CSV column names.
            recdf.to_csv(outf, header=first_table)
            first_table = False

            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)

            for s in var.sample_list:
                g = var.genotype(s)
                # Preserve the original call before overwriting it.
                g.set_format("GTO", g.get_format("GT"))
                g.set_format("GQO", g.get_format("GQ"))
                if s in recdf.index:
                    var.genotype(s).set_format("GT", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GT", "./.")
                    var.genotype(s).set_format("GQ", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

        if in_header and header:
            # The input had header lines but no variants.
            write_header()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def calc_params(vcf_path):
    '''
    Derive per-sample copy-number parameters from the DEL/DUP calls of a VCF.

    vcf_path -- path of the VCF; opened through gzip when it ends in '.gz'

    For every autosomal DEL/DUP (chromosomes X and Y are skipped) and every
    sample whose GT is not './.', one CN_rec is collected with
    log2((CN + 0.1) / 2). Records outside the per (sample, svtype, GT)
    quantile bounds given by lowQuantile/highQuantile are dropped. For
    deletions with 50 <= svlen < 1000 the log2 ratio is corrected with a
    linear fit of its offset against log length, fitted separately for
    0/1 and 1/1 calls.

    Returns (params, het_del_fit, hom_del_fit): `params` holds the mean,
    variance and count of the adjusted log2 ratio per sample, svtype and
    genotype plus the pooled standard deviation; the other two are the
    fitted models.

    Changes relative to the previous revision: the input file is closed
    after reading, DataFrame.append (removed in pandas 2.0) is replaced by
    pd.concat, and filtered frames are copied before columns are added.
    '''
    tSet = list()
    epsilon = 0.1
    header = []
    in_header = True
    vcf = Vcf()

    # NOTE(review): 'rb' yields bytes lines under Python 3, where the '#'
    # comparisons below would never match; confirm the target interpreter.
    if vcf_path.endswith('.gz'):
        vcf_file = gzip.open(vcf_path, 'rb')
    else:
        vcf_file = open(vcf_path, 'r')

    try:
        for line in vcf_file:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    if line[1] != '#':
                        # The '#CHROM' line names the samples and ends the header.
                        vcf_samples = line.rstrip().split('\t')[9:]
                        in_header = False
                        vcf.add_header(header)
                continue

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            if svtype not in ['DEL', 'DUP'] or v[0] == "X" or v[0] == "Y":
                continue

            var = Variant(v, vcf)
            for sample in vcf_samples:
                sample_genotype = var.genotype(sample)
                if sample_genotype.get_format('GT') != './.':
                    # epsilon keeps the logarithm defined when CN is 0
                    log2r = math.log((float(sample_genotype.get_format('CN')) + epsilon) / 2, 2)
                    tSet.append(CN_rec(var.var_id, sample, var.info['SVTYPE'], abs(float(var.info['SVLEN'])), var.info['AF'], sample_genotype.get_format('GT'), sample_genotype.get_format('CN'), sample_genotype.get_format('AB'), math.log(abs(float(var.info['SVLEN']))), log2r))
    finally:
        vcf_file.close()

    df = pd.DataFrame(tSet, columns=CN_rec._fields)

    # exclude from training data, DELs and DUPs with CN in the tails of the distribution
    df['q_low'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df['q_high'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df = df[(df.log2r >= df.q_low) & (df.log2r <= df.q_high)]

    # adjust copy number for small deletions (<1kb), no strong relationship b/w cn and size for dups evident so far
    small_het_dels = df[(df.svtype == "DEL") & (df.GT == "0/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    small_hom_dels = df[(df.svtype == "DEL") & (df.GT == "1/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    het_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "0/1") & (df.svtype == "DEL")]['log2r'])
    hom_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "1/1") & (df.svtype == "DEL")]['log2r'])
    small_het_dels['offset'] = small_het_dels['log2r'] - het_del_mean
    small_hom_dels['offset'] = small_hom_dels['log2r'] - hom_del_mean

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit = smf.ols('offset~log_len', small_hom_dels).fit()
        het_del_fit = smf.ols('offset~log_len', small_het_dels).fit()
        small_hom_dels['log2r_adj'] = small_hom_dels['log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels['log2r_adj'] = small_het_dels['log2r'] - het_del_fit.predict(small_het_dels)

    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels = small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]

    # Deletions shorter than 1000 with GT other than 0/0 are left out of df1
    # and come back only through small_dels, which starts at svlen 50.
    df1 = df[(df.svtype != "DEL") | (df.GT == "0/0") | (df.svlen >= 1000)].copy()
    df1['log2r_adj'] = df1['log2r']
    df1 = pd.concat([df1, small_dels])

    params = df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean, np.var, len]).reset_index()
    params = pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
    params.columns = ['sample', 'svtype', 'mean0', 'mean1', 'mean2', 'var0', 'var1', 'var2', 'len0', 'len1', 'len2']
    params['std_pooled'] = np.sqrt((params['var0'] * params['len0'] + params['var1'] * params['len1'] + params['var2'] * params['len2']) / (params['len0'] + params['len1'] + params['len2']))
    return (params, het_del_fit, hom_del_fit)
class TestVariant(TestCase):
    '''Tests for Variant using a single-sample (NA12878) BND record.'''

    def setUp(self):
        # Columns are joined with explicit tabs because the record is parsed
        # with split('\t'); space-separated literals would give one field.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                       'INFO', 'FORMAT', 'NA12878']),
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '\t'.join([
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9',
        ])
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_info(self):
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        # SUPER is not declared in the header.
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        # A flag set to False disappears from the rendered INFO column.
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), 'GT:SU')

    def test_genotype(self):
        self.assertEqual(self.variant.genotype('NA12878').get_gt_string(), '0/0:9')

    def test_var_string(self):
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
def merge_single_bp(BP, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes):
    """Build the output ``Variant`` for a cluster holding a single breakpoint.

    Args:
        BP: sequence of breakpoints; only ``BP[0]`` is read, and its ``l``
            attribute must hold the tab-delimited VCF line.
        sample_order: sample names in the order of the output genotype columns.
        v_id: identifier assigned to the returned variant (stored as ``str``).
        use_product: when true the ``ALG`` INFO tag is ``PROD``, else ``SUM``.
        vcf: header object handed to ``Variant``.
        vcf_out: not used by this function; kept for interface compatibility.
        include_genotypes: when true, the cached genotype string is rebuilt so
            that the record's own sample keeps its genotype column and every
            other sample in ``sample_order`` gets the null genotype.

    Returns:
        The updated ``Variant``.
    """
    A = BP[0].l.rstrip().split('\t')
    var = Variant(A, vcf)

    # sname stays None when the record carries no SNAME tag. Previously the
    # name was left unbound in that case and the genotype branch below failed
    # with UnboundLocalError.
    sname = None
    try:
        sname = var.get_info('SNAME')
        var.set_info('SNAME', sname + ':' + var.var_id)
    except KeyError:
        pass
    var.var_id = str(v_id)

    var.set_info('ALG', 'PROD' if use_product else 'SUM')

    if include_genotypes:
        null_string = null_format_string(A[8])
        # Without an SNAME the genotype column cannot be attributed to a
        # sample, so every sample receives the null genotype.
        gt_dict = {} if sname is None else {sname: A[9]}
        var.gts = None
        var.gts_string = '\t'.join([gt_dict.get(x, null_string) for x in sample_order])

    return var
class TestVariant8Col(TestCase):
    """Tests for ``Variant`` built from an eight-column (sites-only) VCF record."""

    def setUp(self):
        # Columns are joined with explicit tabs because the fixture is parsed
        # with split('\t'); literal whitespace in the strings is not reliable.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join(('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO')),
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '\t'.join((
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG',
        ))
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_set_info(self):
        """set_info updates known INFO keys and exits on a key absent from the header."""
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for an absent key."""
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        """A flag set to False is omitted from the serialised INFO string."""
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), None)

    def test_get_format_string_caching(self):
        """A sites-only record has no FORMAT string before or after genotypes() is called."""
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join(('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO')),
        ]
        vcf = Vcf()
        vcf.add_header(header_lines)
        variant_line = '\t'.join((
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG',
        ))
        variant = Variant(variant_line.split('\t'), vcf)
        self.assertEqual(variant.get_format_string(), None)
        variant.genotypes()
        self.assertEqual(variant.get_format_string(), None)
        self.assertEqual(variant.get_format_string(True), None)

    def test_get_gt_string(self):
        self.assertEqual(self.variant.get_gt_string(), None)

    def test_genotypes(self):
        self.assertEqual(self.variant.genotypes(), [])

    def test_var_string(self):
        """An unmodified record serialises back to the input line."""
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
def __iter__(self):
    """Yield one ``Variant`` per line read from ``self.stream``.

    Each line is stripped of trailing whitespace, split on tabs and parsed
    against ``self.vcf_obj``.
    """
    for record in self.stream:
        columns = record.rstrip().split('\t')
        yield Variant(columns, self.vcf_obj)
def calc_params(vcf_path, sex_chrom_names):
    """Estimate per-sample copy-number parameters from genotyped DEL/DUP calls.

    Reads the VCF at ``vcf_path`` (gzip-compressed when the name ends in
    ``.gz``), collects one training row per called genotype of every DEL/DUP
    record outside ``sex_chrom_names``, trims each (sample, svtype, GT) group
    to the range given by ``lowQuantile``/``highQuantile``, regresses the
    log2 ratio of small deletions (50 <= svlen < 1000) on log length, and
    summarises the adjusted log2 ratios per sample, svtype and genotype.

    Args:
        vcf_path: path to the input VCF.
        sex_chrom_names: container of chromosome names to skip.

    Returns:
        ``(params, het_del_fit, hom_del_fit)``: the summary DataFrame (mean,
        variance and count per genotype plus ``std_pooled``) and the two
        fitted OLS models for heterozygous and homozygous small deletions.
    """
    tSet = []
    epsilon = 0.1
    header = []
    in_header = True
    vcf = Vcf()

    # NOTE(review): 'rb' yields bytes lines on Python 3, which would not match
    # the str comparisons below; left unchanged, confirm the target runtime.
    if vcf_path.endswith('.gz'):
        vcf_file = gzip.open(vcf_path, 'rb')
    else:
        vcf_file = open(vcf_path, 'r')

    # The handle used to be left open; close it even if parsing fails.
    try:
        for line in vcf_file:
            if in_header:
                if line[0] == '#':
                    header.append(line)
                    if line[1] != '#':
                        # the #CHROM line ends the header and names the samples
                        vcf_samples = line.rstrip().split('\t')[9:]
                        in_header = False
                        vcf.add_header(header)
                continue

            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break
            # cheap pre-filter before paying for a full Variant parse
            if svtype not in ['DEL', 'DUP'] or v[0] in sex_chrom_names:
                continue

            var = Variant(v, vcf)
            for sample in vcf_samples:
                sample_genotype = var.genotype(sample)
                if sample_genotype.get_format('GT') != './.':
                    # epsilon avoids log(0) for a copy number of zero
                    log2r = math.log((float(sample_genotype.get_format('CN')) + epsilon) / 2, 2)
                    tSet.append(
                        CN_rec(
                            var.var_id,
                            sample,
                            var.info['SVTYPE'],
                            abs(float(var.info['SVLEN'])),
                            var.info['AF'],
                            sample_genotype.get_format('GT'),
                            sample_genotype.get_format('CN'),
                            sample_genotype.get_format('AB'),
                            math.log(abs(float(var.info['SVLEN']))),
                            log2r
                        )
                    )
    finally:
        vcf_file.close()

    df = pd.DataFrame(tSet, columns=CN_rec._fields)

    # exclude from training data, DELs and DUPs with CN in the tails of the distribution
    df.loc[:, 'q_low'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(lowQuantile)
    df.loc[:, 'q_high'] = df.groupby(['sample', 'svtype', 'GT'])['log2r'].transform(highQuantile)
    df = df[(df.log2r >= df.q_low) & (df.log2r <= df.q_high)]

    # adjust copy number for small deletions (<1kb), no strong relationship
    # b/w cn and size for dups evident so far
    small_het_dels = df[(df.svtype == "DEL") & (df.GT == "0/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    small_hom_dels = df[(df.svtype == "DEL") & (df.GT == "1/1") & (df.svlen < 1000) & (df.svlen >= 50)].copy()
    het_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "0/1") & (df.svtype == "DEL")]['log2r'])
    hom_del_mean = np.mean(df[(df.svlen > 1000) & (df.GT == "1/1") & (df.svtype == "DEL")]['log2r'])
    small_het_dels.loc[:, 'offset'] = small_het_dels.loc[:, 'log2r'] - het_del_mean
    small_hom_dels.loc[:, 'offset'] = small_hom_dels.loc[:, 'log2r'] - hom_del_mean

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        hom_del_fit = smf.ols('offset~log_len', small_hom_dels).fit()
        het_del_fit = smf.ols('offset~log_len', small_het_dels).fit()
        small_hom_dels.loc[:, 'log2r_adj'] = small_hom_dels.loc[:, 'log2r'] - hom_del_fit.predict(small_hom_dels)
        small_het_dels.loc[:, 'log2r_adj'] = small_het_dels.loc[:, 'log2r'] - het_del_fit.predict(small_het_dels)

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
    small_dels = pd.concat([small_hom_dels, small_het_dels])
    small_dels = small_dels[['var_id', 'sample', 'svtype', 'svlen', 'AF', 'GT', 'CN', 'log_len', 'log2r', 'q_low', 'q_high', 'log2r_adj']]

    # deletions shorter than 50 bp end up in neither frame and are dropped here
    df1 = df.loc[(df.svtype != "DEL") | (df.GT == "0/0") | (df.svlen >= 1000), :].copy()
    df1.loc[:, 'log2r_adj'] = df1.loc[:, 'log2r']
    df1 = pd.concat([df1, small_dels])

    params = df1.groupby(['sample', 'svtype', 'GT'])['log2r_adj'].aggregate([np.mean, np.var, len]).reset_index()
    params = pd.pivot_table(params, index=['sample', 'svtype'], columns='GT', values=['mean', 'var', 'len']).reset_index()
    # Name the flattened columns from their (statistic, genotype) labels, not
    # by position: pivot_table may return the value columns in sorted order
    # (len, mean, var), which a positional rename would silently mislabel.
    gt_suffix = {'0/0': '0', '0/1': '1', '1/1': '2'}
    params.columns = [
        stat + gt_suffix[gt] if gt else stat
        for stat, gt in params.columns
    ]
    params['std_pooled'] = np.sqrt(
        (params['var0'] * params['len0'] + params['var1'] * params['len1'] + params['var2'] * params['len2'])
        / (params['len0'] + params['len1'] + params['len2'])
    )
    return (params, het_del_fit, hom_del_fit)
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
    """Reclassify DEL/DUP calls by annotation overlap and read-depth support.

    Output goes to ``sys.stdout``, which is closed before returning.

    * Records whose SVTYPE is not DEL or DUP are copied through unchanged.
    * A DEL for which ``annotation_intersect`` returns an annotation is
      rewritten as SVTYPE ``MEI``.
    * Remaining records are kept when the depth test passes (the low-frequency
      test when fewer than 10 non-excluded samples are positively genotyped,
      the high-frequency test otherwise) and are otherwise replaced by the
      strings returned by ``to_bnd_strings``.

    Args:
        vcf_in: iterable of VCF lines.
        gender_file: iterable of ``sample<TAB>int`` lines.
        exclude_file: iterable of sample names to exclude, or ``None``.
        ae_dict: annotation lookup passed to ``annotation_intersect``, or
            ``None`` to skip that step.
        f_overlap: overlap parameter passed to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: passed to
            ``has_high_freq_depth_support``.
    """
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10

    # read sample genders
    gender = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        gender[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]
    # set for the per-sample membership test; helpers still receive the list
    excluded = set(exclude)

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')

        # quick pre-check of SVTYPE on the raw INFO column
        v = line.rstrip().split('\t')
        svtype = None
        for x in v[7].split(';'):
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf, True)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # annotate based on read depth
        if var.info['SVTYPE'] in ['DEL', 'DUP']:
            # count the number of positively genotyped, non-excluded samples
            num_pos_samps = 0
            for s in var.sample_list:
                if s in excluded:
                    continue
                if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                    num_pos_samps += 1

            if num_pos_samps < min_pos_samps_for_regression:
                supported = has_low_freq_depth_support(var, gender, exclude)
            else:
                supported = has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold)

            if supported:
                vcf_out.write(line)
            else:
                for m_var in to_bnd_strings(var):
                    vcf_out.write(m_var + '\n')

    # Input that held only header lines never reached the header write above.
    if in_header and header:
        vcf.add_header(header)
        vcf_out.write(vcf.get_header() + '\n')

    vcf_out.close()
    return
def sv_classify(vcf_in, vcf_out, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP calls by annotation overlap and read-depth support.

    * Records whose SVTYPE is not DEL or DUP, and records with no positively
      genotyped sample, are copied to ``vcf_out`` unchanged.
    * A DEL for which ``annotation_intersect`` returns an annotation is
      rewritten as SVTYPE ``MEI``.
    * Remaining records are kept when the depth test selected by ``method``
      passes and are otherwise replaced by the strings returned by
      ``to_bnd_strings``.

    All file arguments are closed before returning.

    Args:
        vcf_in: iterable of VCF lines.
        vcf_out: writable output stream.
        gender_file: iterable of ``sample<TAB>int`` lines.
        exclude_file: iterable of sample names to exclude, or ``None``.
        ae_dict: annotation lookup passed to ``annotation_intersect``, or
            ``None`` to skip that step.
        f_overlap: overlap parameter passed to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: large-sample test thresholds.
        p_cnv, het_del_fit, hom_del_fit, params: naive-Bayes test inputs.
        diag_outfile: path for a tab-separated diagnostics table, or ``None``.
        method: ``'large_sample'``, ``'naive_bayes'`` or ``'hybrid'``; any
            other value leaves the record without depth support.
    """
    vcf = Vcf()
    header = []
    in_header = True

    # read sample genders
    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)
        outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')

        # quick pre-check of SVTYPE on the raw INFO column
        v = line.rstrip().split('\t')
        svtype = None
        for x in v[7].split(';'):
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # count positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        if num_pos_samps == 0:
            vcf_out.write(line)
            continue

        nb_support = False
        ls_support = False
        hybrid_support = False
        has_rd_support = False

        df = load_df(var, exclude, sex)
        if method == 'large_sample':
            ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
            has_rd_support = ls_support
        elif method == 'naive_bayes':
            nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
            has_rd_support = nb_support
        elif method == 'hybrid':
            ls_support, nb_support, hybrid_support = has_rd_support_hybrid(df, het_del_fit, hom_del_fit, params, p_cnv, slope_threshold, rsquared_threshold, num_pos_samps)
            has_rd_support = hybrid_support

        if has_rd_support:
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var, True):
                vcf_out.write(m_var + '\n')

        if outf is not None:
            svlen = df['svlen'][0]
            outf.write(var.var_id + "\t" + svtype + "\t" + str(svlen) + "\t" + str(num_pos_samps) + "\t" + str(nb_support) + "\t" + str(ls_support) + "\t" + str(hybrid_support) + "\t" + str(has_rd_support) + "\n")

    # Input that held only header lines never reached the header write above.
    if in_header and header:
        vcf.add_header(header)
        vcf_out.write(vcf.get_header() + '\n')

    if outf is not None:
        outf.close()
    vcf_in.close()
    vcf_out.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
class TestVariant(TestCase):
    """Tests for ``Variant`` parsing, mutation and serialisation with genotype columns."""

    # Header metadata shared by every fixture in this class.
    _META_LINES = (
        "##fileformat=VCFv4.2",
        "##fileDate=20151202",
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
        '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
    )
    _FORMAT_GT = '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
    _FORMAT_SU = '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">'
    _FORMAT_AS = '##FORMAT=<ID=AS,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">'
    _FORMAT_INACTIVE = '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">'
    _FIXED_COLUMNS = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
    # The first eight (site-level) columns of the record used throughout.
    _SITE_FIELDS = (
        "1", "820915", "5838_1", "N", "]GL000232.1:20940]N", "0.00", ".",
        "SVTYPE=BND;STRANDS=-+:9;IMAFLAG",
    )

    @classmethod
    def _make_vcf(cls, format_lines, samples):
        """Return a ``Vcf`` whose header declares ``format_lines`` and ``samples``."""
        header_lines = list(cls._META_LINES) + list(format_lines)
        # Column names are tab-delimited; join explicitly so the separators
        # cannot be lost to whitespace normalisation.
        header_lines.append("\t".join(cls._FIXED_COLUMNS + tuple(samples)))
        vcf = Vcf()
        vcf.add_header(header_lines)
        return vcf

    @classmethod
    def _make_line(cls, *genotype_columns):
        """Return the tab-delimited record with FORMAT/sample columns appended."""
        return "\t".join(cls._SITE_FIELDS + genotype_columns)

    def setUp(self):
        self.vcf = self._make_vcf(
            [self._FORMAT_GT, self._FORMAT_SU, self._FORMAT_INACTIVE],
            ["NA12878", "NA0001"],
        )
        self.variant_line = self._make_line("GT:SU", "0/0:9", "1/1:15")
        self.variant = Variant(self.variant_line.split("\t"), self.vcf)

    def test_parse_genotypes(self):
        """_parse_genotypes maps genotype strings onto samples in header order."""
        genotype_field_strings = ["0/1:20", "0/0:15"]
        parsed_dict = self.variant._parse_genotypes(genotype_field_strings)

        na12878_gt = Genotype(self.variant, genotype_field_strings[0].split(":"))
        na0001_gt = Genotype(self.variant, genotype_field_strings[1].split(":"))
        expected_genotype_dict = {"NA12878": na12878_gt, "NA0001": na0001_gt}

        self.assertEqual(parsed_dict, expected_genotype_dict)

    def test_set_info(self):
        """set_info updates known INFO keys and exits on a key absent from the header."""
        self.variant.set_info("SVTYPE", "INV")
        self.assertEqual(self.variant.info["SVTYPE"], "INV")
        self.variant.set_info("IMAFLAG", False)
        self.assertEqual(self.variant.info["IMAFLAG"], False)
        with self.assertRaises(SystemExit):
            self.variant.set_info("SUPER", True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for an absent key."""
        self.assertEqual(self.variant.get_info("IMAFLAG"), True)
        self.assertEqual(self.variant.get_info("SVTYPE"), "BND")
        with self.assertRaises(KeyError):
            self.variant.get_info("CALI")

    def test_get_info_string(self):
        """A flag set to False is omitted from the serialised INFO string."""
        self.assertEqual(self.variant.get_info_string(), "SVTYPE=BND;STRANDS=-+:9;IMAFLAG")
        self.variant.set_info("IMAFLAG", False)
        self.assertEqual(self.variant.get_info_string(), "SVTYPE=BND;STRANDS=-+:9")

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), "GT:SU")

    def test_get_format_string_caching(self):
        """After genotypes() the FORMAT string is regenerated unless the cached one is requested."""
        vcf = self._make_vcf(
            [self._FORMAT_GT, self._FORMAT_SU, self._FORMAT_AS, self._FORMAT_INACTIVE],
            ["NA12878"],
        )
        variant_line = self._make_line("GT:AS:SU", "0/0:1:9")
        variant = Variant(variant_line.split("\t"), vcf)
        self.assertEqual(variant.get_format_string(), "GT:AS:SU")

        variant.genotypes()  # force parsing
        # regenerated order matches the order of the FORMAT lines in the header
        self.assertEqual(variant.get_format_string(), "GT:SU:AS")
        self.assertEqual(variant.get_format_string(True), "GT:AS:SU")

    def test_get_gt_string(self):
        self.assertEqual(self.variant.get_gt_string(), "0/0:9\t1/1:15")

    def test_genotype(self):
        self.assertEqual(self.variant.genotype("NA12878").get_gt_string(), "0/0:9")

    def test_set_genotype(self):
        new_genotype = Genotype(self.variant, ["0/1", "9"])
        self.variant.set_genotype("NA12878", new_genotype)
        self.assertEqual(self.variant.genotype("NA12878").get_gt_string(), "0/1:9")

    def test_genotypes(self):
        self.assertEqual([x.get_gt_string() for x in self.variant.genotypes()], ["0/0:9", "1/1:15"])

    def test_var_string(self):
        """Genotype edits show up only when the cached genotype string is bypassed."""
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
        self.variant.genotype("NA12878").set_format("GT", "./.")
        self.assertEqual(self.variant.get_var_string(use_cached_gt_string=True), self.variant_line)
        self.assertNotEqual(self.variant.get_var_string(), self.variant_line)

    def test_var_string_format_caching(self):
        """The cached line keeps the input FORMAT order; the regenerated one uses header order."""
        vcf = self._make_vcf(
            [self._FORMAT_GT, self._FORMAT_SU, self._FORMAT_AS, self._FORMAT_INACTIVE],
            ["NA12878"],
        )
        variant_line = self._make_line("GT:AS:SU", "0/0:1:9")
        uncached_line = self._make_line("GT:SU:AS", "0/0:9:1")
        variant = Variant(variant_line.split("\t"), vcf)
        variant.genotypes()  # force parsing
        self.assertEqual(variant.get_var_string(), uncached_line)
        self.assertEqual(variant.get_var_string(use_cached_gt_string=True), variant_line)

    def test_add_genotype(self):
        """A record whose FORMAT lacks GT is serialised with a './.' genotype prepended."""
        vcf = self._make_vcf([self._FORMAT_SU, self._FORMAT_INACTIVE], ["NA12878"])
        variant_line = self._make_line("SU", "9")
        variant = Variant(variant_line.split("\t"), vcf)
        self.assertEqual(variant.get_gt_string(), "./.:9")
def create_merged_variant(BP, c, v_id, vcf, use_product, weighting_scheme='unweighted'):
    """Build one merged ``Variant`` from the breakpoints ``BP[i] for i in c``.

    The merged position on each side is the start returned by
    ``combine_pdfs`` plus the index of the largest probability on that side.
    FORMAT and sample columns are copied from the first breakpoint of the
    cluster; INFO is rebuilt from scratch.

    Args:
        BP: indexable collection of breakpoints.
        c: indices into ``BP`` forming the cluster; ``c[0]`` supplies the
            chromosome, SV type, strands and genotype columns.
        v_id: identifier for the merged variant.
        vcf: header object handed to ``Variant``.
        use_product, weighting_scheme: forwarded to ``combine_pdfs``.

    Returns:
        The merged ``Variant``.
    """
    start_L, start_R, p_L, p_R, ALG = combine_pdfs(BP, c, use_product, weighting_scheme)

    # index of the most probable offset on each side
    peak_L = p_L.index(max(p_L))
    peak_R = p_R.index(max(p_R))
    cipos95, ciend95 = getCI95(p_L, p_R, peak_L, peak_R)

    pos_L = start_L + peak_L
    pos_R = start_R + peak_R

    first = BP[c[0]]
    columns = first.l.rstrip().split('\t', 10)

    if first.sv_type == 'BND':
        # breakend ALT notation keyed by the first strand pair
        mate = first.right.chrom + ':' + str(pos_R)
        orientation = first.strands[:2]
        if orientation == '++':
            ALT = 'N]' + mate + ']'
        elif orientation == '-+':
            ALT = ']' + mate + ']N'
        elif orientation == '+-':
            ALT = 'N[' + mate + '['
        elif orientation == '--':
            ALT = '[' + mate + '[N'
        else:
            ALT = ''
    else:
        ALT = '<' + first.sv_type + '>'

    record = [first.left.chrom, pos_L, str(v_id), 'N', ALT, 0.0, '.', '']
    record.extend(columns[8:])
    var = Variant(record, vcf)

    var.set_info('SVTYPE', first.sv_type)
    var.set_info('ALG', ALG)

    # deletions carry a negative length; inter-chromosomal events get none
    if var.get_info('SVTYPE') == 'DEL':
        var.set_info('SVLEN', pos_L - pos_R)
    elif first.left.chrom == first.right.chrom:
        var.set_info('SVLEN', pos_R - pos_L)

    if var.get_info('SVTYPE') == 'BND':
        var.set_info('EVENT', str(v_id))
    else:
        var.set_info('END', pos_R)

    var.set_info('CIPOS95', cipos95)
    var.set_info('CIEND95', ciend95)

    cipos = [-1 * peak_L, len(p_L) - peak_L - 1]
    ciend = [-1 * peak_R, len(p_R) - peak_R - 1]
    var.set_info('CIPOS', ','.join(str(bound) for bound in cipos))
    var.set_info('CIEND', ','.join(str(bound) for bound in ciend))
    var.set_info('PRPOS', ','.join(str(prob) for prob in p_L))
    var.set_info('PREND', ','.join(str(prob) for prob in p_R))

    return var
def create_merged_variant(BP, c, v_id, vcf, use_product, weighting_scheme='unweighted'):
    """Build one merged ``Variant`` from the breakpoints ``BP[i] for i in c``.

    The merged position on each side is the start returned by
    ``combine_pdfs`` plus the index of the largest probability on that side.
    For non-BND events whose right position lands before the left one, the
    two sides (positions, intervals, probability vectors) are exchanged.
    FORMAT and sample columns are copied from the first breakpoint of the
    cluster; INFO is rebuilt from scratch.

    Args:
        BP: indexable collection of breakpoints.
        c: indices into ``BP`` forming the cluster; ``c[0]`` supplies the
            chromosome, SV type, strands and genotype columns.
        v_id: identifier for the merged variant.
        vcf: header object handed to ``Variant``.
        use_product, weighting_scheme: forwarded to ``combine_pdfs``.

    Returns:
        The merged ``Variant``.
    """
    start_L, start_R, p_L, p_R, ALG = combine_pdfs(BP, c, use_product, weighting_scheme)

    # index of the most probable offset on each side
    peak_L = p_L.index(max(p_L))
    peak_R = p_R.index(max(p_R))
    cipos95, ciend95 = getCI95(p_L, p_R, peak_L, peak_R)

    pos_L = start_L + peak_L
    pos_R = start_R + peak_R

    first = BP[c[0]]

    # The refined peaks can cross over; put the sides back in coordinate
    # order so downstream tools don't break. Breakends keep their own order.
    if first.sv_type != 'BND' and pos_R < pos_L:
        pos_L, pos_R = pos_R, pos_L
        cipos95, ciend95 = ciend95, cipos95
        p_L, p_R = p_R, p_L
        peak_L, peak_R = peak_R, peak_L

    columns = first.l.rstrip().split('\t', 10)

    if first.sv_type == 'BND':
        # breakend ALT notation keyed by the first strand pair
        mate = first.right.chrom + ':' + str(pos_R)
        orientation = first.strands[:2]
        if orientation == '++':
            ALT = 'N]' + mate + ']'
        elif orientation == '-+':
            ALT = ']' + mate + ']N'
        elif orientation == '+-':
            ALT = 'N[' + mate + '['
        elif orientation == '--':
            ALT = '[' + mate + '[N'
        else:
            ALT = ''
    else:
        ALT = '<' + first.sv_type + '>'

    record = [first.left.chrom, pos_L, str(v_id), 'N', ALT, 0.0, '.', '']
    record.extend(columns[8:])
    var = Variant(record, vcf)

    var.set_info('SVTYPE', first.sv_type)
    var.set_info('ALG', ALG)

    # deletions carry a negative length; inter-chromosomal events get none
    if var.get_info('SVTYPE') == 'DEL':
        var.set_info('SVLEN', pos_L - pos_R)
    elif first.left.chrom == first.right.chrom:
        var.set_info('SVLEN', pos_R - pos_L)

    svtype = var.get_info('SVTYPE')
    if svtype == 'BND':
        var.set_info('EVENT', str(v_id))
    elif svtype == 'INS':
        # insertions end where they start
        var.set_info('END', pos_L)
    else:
        var.set_info('END', pos_R)

    var.set_info('CIPOS95', cipos95)
    var.set_info('CIEND95', ciend95)

    cipos = [-1 * peak_L, len(p_L) - peak_L - 1]
    ciend = [-1 * peak_R, len(p_R) - peak_R - 1]
    var.set_info('CIPOS', ','.join(str(bound) for bound in cipos))
    var.set_info('CIEND', ','.join(str(bound) for bound in ciend))
    var.set_info('PRPOS', ','.join(str(prob) for prob in p_L))
    var.set_info('PREND', ','.join(str(prob) for prob in p_R))

    return var
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to deletions with AF >= 0.01.

    Records that are not DEL, and DELs with AF below 0.01, are copied to
    ``vcf_out`` unchanged. For the rest, ``recluster(load_df(...))`` supplies
    per-sample refined genotypes, which are written as the ``GTR``/``GQR``
    FORMAT fields together with the ``MEDGQR``/``Q10GQR`` INFO fields. The
    reclustering tables are appended to ``diag_outfile`` as CSV. The variant
    ID and AF of every processed DEL are echoed to stderr.

    All file arguments are closed before returning.

    Args:
        vcf_in: iterable of VCF lines.
        vcf_out: writable output stream.
        diag_outfile: path of the CSV diagnostics file to create.
        gender_file: iterable of ``sample<TAB>int`` lines.
        exclude_file: iterable of sample names to exclude, or ``None``.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]

    def write_header():
        # declare the fields this step adds before emitting the header
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    # the with-block closes the diagnostics file even if a record fails
    with open(diag_outfile, 'w', 4096) as outf:
        first_table = True

        for line in vcf_in:
            if in_header:
                if line[0] == "#":
                    header.append(line)
                    continue
                in_header = False
                write_header()

            # quick pre-check of SVTYPE on the raw INFO column
            v = line.rstrip().split('\t')
            svtype = None
            for x in v[7].split(';'):
                if x.startswith('SVTYPE='):
                    svtype = x.split('=')[1]
                    break

            # only deletions are refined; everything else passes through
            if svtype not in ['DEL']:
                vcf_out.write(line)
                continue

            var = Variant(v, vcf)
            sys.stderr.write("%s\n" % var.var_id)
            sys.stderr.write("%f\n" % float(var.get_info('AF')))

            if float(var.get_info('AF')) < 0.01:
                vcf_out.write(line)
                continue

            df = load_df(var, exclude, sex)
            recdf = recluster(df)

            # only the first table carries the CSV column header
            recdf.to_csv(outf, header=first_table)
            first_table = False

            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                if s in recdf.index:
                    var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    # samples absent from the reclustering table get a null call
                    var.genotype(s).set_format("GTR", "./.")
                    var.genotype(s).set_format("GQR", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

        # Input that held only header lines never reached the header write above.
        if in_header and header:
            write_header()

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
class TestVariant(TestCase):
    """Tests for ``Variant`` built from a two-sample VCF record."""

    def setUp(self):
        # VCF columns are tab-delimited and the fixture is parsed with
        # split('\t'), so the separators are produced with '\t'.join instead
        # of relying on literal whitespace inside the string literals.
        header_lines = [
            '##fileformat=VCFv4.2',
            '##fileDate=20151202',
            '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
            '##INFO=<ID=STRANDS,Number=.,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">',
            '##INFO=<ID=IMAFLAG,Number=.,Type=Flag,Description="Test Flag code">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            '##FORMAT=<ID=SU,Number=1,Type=Integer,Description="Number of pieces of evidence supporting the variant">',
            '##FORMAT=<ID=INACTIVE,Number=1,Type=Integer,Description="A format not in use">',
            '\t'.join((
                '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
                'FORMAT', 'NA12878', 'NA0001',
            )),
        ]
        self.vcf = Vcf()
        self.vcf.add_header(header_lines)
        self.variant_line = '\t'.join((
            '1', '820915', '5838_1', 'N', ']GL000232.1:20940]N', '0.00', '.',
            'SVTYPE=BND;STRANDS=-+:9;IMAFLAG', 'GT:SU', '0/0:9', '1/1:15',
        ))
        self.variant = Variant(self.variant_line.split('\t'), self.vcf)

    def test_parse_genotypes(self):
        """_parse_genotypes maps genotype strings onto samples in header order."""
        genotype_field_strings = ['0/1:20', '0/0:15']
        parsed_dict = self.variant._parse_genotypes(genotype_field_strings)

        na12878_gt = Genotype(self.variant, genotype_field_strings[0].split(':'))
        na0001_gt = Genotype(self.variant, genotype_field_strings[1].split(':'))
        expected_genotype_dict = {'NA12878': na12878_gt, 'NA0001': na0001_gt}

        self.assertEqual(parsed_dict, expected_genotype_dict)

    def test_set_info(self):
        """set_info updates known INFO keys and exits on a key absent from the header."""
        self.variant.set_info('SVTYPE', 'INV')
        self.assertEqual(self.variant.info['SVTYPE'], 'INV')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.info['IMAFLAG'], False)
        with self.assertRaises(SystemExit):
            self.variant.set_info('SUPER', True)

    def test_get_info(self):
        """get_info returns parsed values and raises KeyError for an absent key."""
        self.assertEqual(self.variant.get_info('IMAFLAG'), True)
        self.assertEqual(self.variant.get_info('SVTYPE'), 'BND')
        with self.assertRaises(KeyError):
            self.variant.get_info('CALI')

    def test_get_info_string(self):
        """A flag set to False is omitted from the serialised INFO string."""
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9;IMAFLAG')
        self.variant.set_info('IMAFLAG', False)
        self.assertEqual(self.variant.get_info_string(), 'SVTYPE=BND;STRANDS=-+:9')

    def test_get_format_string(self):
        self.assertEqual(self.variant.get_format_string(), 'GT:SU')

    def test_get_gt_string(self):
        self.assertEqual(self.variant.get_gt_string(), '0/0:9\t1/1:15')

    def test_genotype(self):
        self.assertEqual(self.variant.genotype('NA12878').get_gt_string(), '0/0:9')

    def test_genotypes(self):
        self.assertEqual([x.get_gt_string() for x in self.variant.genotypes()], ['0/0:9', '1/1:15'])

    def test_var_string(self):
        """Genotype edits show up only when the cached genotype string is bypassed."""
        self.assertEqual(self.variant.get_var_string(), self.variant_line)
        self.variant.genotype('NA12878').set_format('GT', './.')
        self.assertEqual(self.variant.get_var_string(use_cached_gt_string=True), self.variant_line)
        self.assertNotEqual(self.variant.get_var_string(), self.variant_line)
def sv_classify(vcf_in, vcf_out, gender_file, sex_chrom_names, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP calls by annotation overlap and read-depth support.

    * Records whose SVTYPE is not DEL or DUP, and records with no positively
      genotyped sample, are copied to ``vcf_out`` unchanged.
    * A DEL for which ``annotation_intersect`` returns an annotation is
      rewritten as SVTYPE ``MEI``.
    * Remaining records are kept when the depth test selected by ``method``
      passes and are otherwise replaced by the strings returned by
      ``to_bnd_strings``.

    All file arguments are closed before returning.

    Args:
        vcf_in: iterable of VCF lines.
        vcf_out: writable output stream.
        gender_file: iterable of ``sample<TAB>int`` lines.
        sex_chrom_names: sex chromosome names, forwarded to ``load_df``.
        exclude_file: iterable of sample names to exclude, or ``None``.
        ae_dict: annotation lookup passed to ``annotation_intersect``, or
            ``None`` to skip that step.
        f_overlap: overlap parameter passed to ``annotation_intersect``.
        slope_threshold, rsquared_threshold: large-sample test thresholds.
        p_cnv, het_del_fit, hom_del_fit, params: naive-Bayes test inputs.
        diag_outfile: path for a tab-separated diagnostics table, or ``None``.
        method: ``'large_sample'``, ``'naive_bayes'`` or ``'hybrid'``; any
            other value leaves the record without depth support.
    """
    vcf = Vcf()
    header = []
    in_header = True

    # read sample genders
    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        exclude = [line.rstrip() for line in exclude_file]

    outf = None
    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)
        outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')

        # quick pre-check of SVTYPE on the raw INFO column
        v = line.rstrip().split('\t')
        svtype = None
        for x in v[7].split(';'):
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # count positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        if num_pos_samps == 0:
            vcf_out.write(line)
            continue

        nb_support = False
        ls_support = False
        hybrid_support = False
        has_rd_support = False

        df = load_df(var, exclude, sex, sex_chrom_names)
        if method == 'large_sample':
            ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
            has_rd_support = ls_support
        elif method == 'naive_bayes':
            nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
            has_rd_support = nb_support
        elif method == 'hybrid':
            ls_support, nb_support, hybrid_support = has_rd_support_hybrid(
                df,
                het_del_fit,
                hom_del_fit,
                params,
                p_cnv,
                slope_threshold,
                rsquared_threshold,
                num_pos_samps
            )
            has_rd_support = hybrid_support

        if has_rd_support:
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var, True):
                vcf_out.write(m_var + '\n')

        if outf is not None:
            svlen = df['svlen'][0]
            outf.write(
                '\t'.join((
                    var.var_id,
                    svtype,
                    str(svlen),
                    str(num_pos_samps),
                    str(nb_support),
                    str(ls_support),
                    str(hybrid_support),
                    str(has_rd_support)
                )) + "\n"
            )

    # Input that held only header lines never reached the header write above.
    if in_header and header:
        vcf.add_header(header)
        vcf_out.write(vcf.get_header() + '\n')

    if outf is not None:
        outf.close()
    vcf_in.close()
    vcf_out.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
# Breakend ALT templates keyed by (orientation of this end, orientation of
# the mate end).
_BEDPE_BND_ALT_TEMPLATES = {
    ('+', '-'): '%(ref)s[%(chrom)s:%(pos)s[',
    ('+', '+'): '%(ref)s]%(chrom)s:%(pos)s]',
    ('-', '+'): ']%(chrom)s:%(pos)s]%(ref)s',
    ('-', '-'): '[%(chrom)s:%(pos)s[%(ref)s',
}


def _bedpe_set_bnd_alt(var, own_strand, mate_strand, mate_chrom, mate_pos):
    """Set ``var.alt`` to the breakend notation pointing at the mate.

    ``var.alt`` is left untouched when the orientation pair is not one of
    the four ``+``/``-`` combinations.
    """
    template = _BEDPE_BND_ALT_TEMPLATES.get((own_strand, mate_strand))
    if template is not None:
        var.alt = template % {'ref': var.ref, 'chrom': mate_chrom, 'pos': mate_pos}


def _bedpe_secondary_info(info, mate_id):
    """Return the INFO string for the secondary breakend of a BND pair.

    Starting from the primary INFO string this
      * reverses the orientation in ``STRANDS``,
      * swaps the values of ``CIPOS``/``CIEND`` and ``CIPOS95``/``CIEND95``,
      * sets ``MATEID`` to ``mate_id``,
      * marks the record with the ``SECONDARY`` flag.

    Tags that are absent are skipped instead of being run through
    ``str.replace`` with an empty search string (which would insert the
    replacement between every character) or indexed out of range.
    """
    entries = info.split(';')
    updated = list(entries)

    def index_of(tag):
        prefix = tag + '='
        for idx, entry in enumerate(entries):
            if entry.startswith(prefix):
                return idx
        return None

    # STRANDS=<o1><o2>:<count>  ->  STRANDS=<o2><o1>:<count>
    # NOTE(review): only the first orientation:count pair is kept when the
    # value lists several pairs -- confirm BND records never carry more.
    idx = index_of('STRANDS')
    if idx is not None:
        parts = re.split('=|:', entries[idx])
        if len(parts) >= 3:
            updated[idx] = parts[0] + '=' + parts[1][::-1] + ':' + parts[2]

    # what was the END interval of the primary is the POS interval here
    for pos_tag, end_tag in (('CIPOS', 'CIEND'), ('CIPOS95', 'CIEND95')):
        pos_idx = index_of(pos_tag)
        end_idx = index_of(end_tag)
        if pos_idx is not None and end_idx is not None:
            updated[pos_idx] = pos_tag + '=' + entries[end_idx].split('=')[1]
            updated[end_idx] = end_tag + '=' + entries[pos_idx].split('=')[1]

    # Change MATEID
    # NOTE(review): callers pass the secondary record's own ID here
    # (name + '_2'); confirm whether name + '_1' was intended.
    idx = index_of('MATEID')
    if idx is not None:
        updated[idx] = 'MATEID=' + mate_id

    # Add identifier for secondary breakend mate. The exact text appended
    # after EVENT (including the trailing ';') is kept as it was.
    idx = index_of('EVENT')
    if idx is not None:
        updated[idx] = entries[idx] + ';SECONDARY;'
    elif 'SECONDARY' not in entries:
        updated.append('SECONDARY')

    return ';'.join(updated)


def bedpeToVcf(bedpe_file, vcf_out):
    """Convert a BEDPE stream to VCF records written to ``vcf_out``.

    ``##`` lines are carried over as VCF header lines and the single-``#``
    column line is replaced by a VCF column line that keeps the text after
    the 14th tab as the sample columns. Each BND row yields up to two
    breakend records (``<name>_1`` and ``<name>_2``) depending on
    ``bedpe.malformedFlag``: 0 writes both, 1 writes only the second,
    2 writes only the first. Every other row yields one record.
    ``vcf_out`` is closed before returning.
    """
    myvcf = Vcf()
    in_header = True
    header = list()

    def write_header():
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    # parse the bedpe data
    for line in bedpe_file:
        if in_header:
            if line[0:2] == '##':
                header.append(line)
                continue
            elif line.startswith('#'):
                sample_list_str = line.rstrip().split('\t', 14)[-1]
                header.append('\t'.join([
                    '#CHROM',
                    'POS',
                    'ID',
                    'REF',
                    'ALT',
                    'QUAL',
                    'FILTER',
                    'INFO',
                    sample_list_str
                ]))
                continue
            else:
                in_header = False
                write_header()

        bedpe = Bedpe(line.rstrip().split('\t'))
        symbolic_alt = '<' + str(bedpe.svtype) + '>'

        if bedpe.svtype == 'BND':
            # primary breakend: INFO is taken over unchanged
            bedpe1_list = [
                bedpe.c1,
                bedpe.b1 + 1,
                bedpe.name + '_1',  # ID
                'N',
                symbolic_alt,  # ALT
                bedpe.score,
                bedpe.filter
            ]
            bedpe1_list.extend(bedpe.misc)
            var1 = Variant(bedpe1_list, myvcf)
            _bedpe_set_bnd_alt(var1, bedpe.o1, bedpe.o2, bedpe.c2, bedpe.b2 + 1)

            # secondary breakend: same columns with INFO seen from the mate
            misc = copy.deepcopy(bedpe.misc)
            misc[0] = _bedpe_secondary_info(bedpe.misc[0], bedpe.name + '_2')
            bedpe2_list = [
                bedpe.c2,
                bedpe.b2 + 1,
                bedpe.name + '_2',  # ID
                'N',
                symbolic_alt,  # ALT
                bedpe.score,
                bedpe.filter
            ]
            bedpe2_list.extend(misc)
            var2 = Variant(bedpe2_list, myvcf)
            # for variant 2 the roles of the two ends are switched
            _bedpe_set_bnd_alt(var2, bedpe.o2, bedpe.o1, bedpe.c1, bedpe.b1 + 1)

            if bedpe.malformedFlag == 0:
                vcf_out.write(var1.get_var_string() + '\n')
                vcf_out.write(var2.get_var_string() + '\n')
            elif bedpe.malformedFlag == 1:
                vcf_out.write(var2.get_var_string() + '\n')
            elif bedpe.malformedFlag == 2:
                vcf_out.write(var1.get_var_string() + '\n')
        else:
            # set VCF info elements for simple events
            bedpe_list = [
                bedpe.c1,
                bedpe.b1 + 1,
                bedpe.name,  # ID
                'N',
                symbolic_alt,  # ALT
                bedpe.score,
                bedpe.filter
            ]
            bedpe_list.extend(bedpe.misc)
            var = Variant(bedpe_list, myvcf)
            # write the record to the VCF output file
            vcf_out.write(var.get_var_string() + '\n')

    # Input that carried header lines but no data rows never reaches the
    # in-loop header write; emit the header here so it is not dropped.
    if in_header and header:
        write_header()

    # close the VCF output file
    vcf_out.close()

    return