def parse_vcf_record(vcf_line):
    '''
    Rewrite a same-chromosome, single-orientation BND record as an inversion

    A record is rewritten only when all of the following hold:
      * its INFO column does not contain 'SECONDARY'
      * its INFO column contains 'SVTYPE=BND'
      * the mate chromosome parsed from ALT equals the record's chromosome
      * exactly one of the '--:' / '++:' strand entries is present in INFO

    For such a record a zero-count entry for the missing orientation is
    added ('++:0,' in front of the '--:' entry, or ',--:0' behind the '++:'
    entry), SVTYPE=BND becomes SVTYPE=INV, ';END=<mate position>' is
    appended unless INFO already carries an END key, and ALT becomes '<INV>'.

    Returns the rewritten line terminated by a newline, or vcf_line itself,
    unchanged, when the record does not qualify.
    '''
    fields = vcf_line.rstrip().split('\t')
    info = fields[7]
    if 'SECONDARY' in info or 'SVTYPE=BND' not in info:
        return vcf_line

    _, o_chr, o_pos = parse_bnd_alt_string(fields[4])
    # Test for the same '++:' token that is searched for below
    has_minus = '--:' in info
    has_plus = '++:' in info
    if o_chr != fields[0] or has_minus == has_plus:
        return vcf_line

    if has_minus:
        # Splice the zero-count '++' entry directly in front of the '--' entry
        at = info.find('--:')
        info = info[:at] + '++:0,' + info[at:]
    else:
        # Append the zero-count '--' entry at the end of the '++' entry; the
        # entry runs to the end of INFO when no ';' follows it
        at = info.find('++:')
        stop = info.find(';', at)
        if stop == -1:
            stop = len(info)
        info = info[:stop] + ',--:0' + info[stop:]

    # Do not emit a second END key if the record already has one
    if not (info.startswith('END=') or ';END=' in info):
        info += ';END=' + o_pos
    # Replace by content so SVTYPE need not be the first INFO key
    fields[7] = info.replace('SVTYPE=BND', 'SVTYPE=INV')
    fields[4] = '<INV>'
    return '\t'.join(fields) + '\n'
def split_v(l):
    '''
    Break a VCF line into the breakpoint values used for comparison

    Returns the list
        [SVTYPE, left chrom, right chrom, STRANDS,
         left start, left end, right start, right end, info map]
    where the left interval is POS widened by CIPOS and the right interval
    is the right-hand position widened by CIEND.

    The right-hand position comes from the ALT mate for BND records (whose
    position is also stored in the map under END), from POS + SVLEN for INS
    records, and from END otherwise.
    '''
    columns = l.rstrip().split('\t', 8)
    m = to_map(columns[7])

    chr_l = chr_r = columns[0]
    pos_l = int(columns[1])

    svtype = m['SVTYPE']
    if svtype == 'BND':
        _, chr_r, mate_pos = parse_bnd_alt_string(columns[4])
        m['END'] = mate_pos
        pos_r = int(mate_pos)
    elif svtype == 'INS':
        pos_r = pos_l + int(m['SVLEN'])
    else:
        pos_r = int(m['END'])

    # Confidence intervals are 'low,high' offsets around each position
    ci_pos = m['CIPOS'].split(',')
    start_l = pos_l + int(ci_pos[0])
    end_l = pos_l + int(ci_pos[1])

    ci_end = m['CIEND'].split(',')
    start_r = pos_r + int(ci_end[0])
    end_r = pos_r + int(ci_end[1])

    return [svtype, chr_l, chr_r, m['STRANDS'],
            start_l, end_l, start_r, end_r, m]
def convert_bnd(var):
    '''
    Normalise a BND variant in place

    Sets REF to 'N', rewrites ALT so that its base is 'N', marks the variant
    SECONDARY when its mate sorts before it (smaller chromosome string, or
    same chromosome and smaller position), and stores STRANDS as the
    orientation prefix followed by the variant's SU value.
    '''
    var.ref = 'N'
    original_alt = var.alt
    open_at = original_alt.find("[")

    _, mate_chrom, mate_pos = su.parse_bnd_alt_string(original_alt)
    mate_is_first = mate_chrom < var.chrom or (
        mate_chrom == var.chrom and int(mate_pos) < int(var.pos))
    if mate_is_first:
        var.set_info('SECONDARY', True)

    # Work with whichever bracket type the ALT uses; '[' wins if present
    if open_at >= 0:
        bracket, at = "[", open_at
    else:
        bracket, at = "]", original_alt.find("]")

    if at == 0:
        # Bracket leads the ALT: keep up to the closing bracket, then the base
        closing = original_alt.find(bracket, 1)
        rewritten = original_alt[0:(closing + 1)] + 'N'
        orientation = "--:" if bracket == "[" else "-+:"
    else:
        # Base leads the ALT: replace whatever precedes the bracket with 'N'
        rewritten = 'N' + original_alt[at::]
        orientation = "+-:" if bracket == "[" else "++:"

    var.alt = rewritten
    var.info['STRANDS'] = orientation + str(var.info['SU'])
def parse_vcf_record(vcf_line):
    '''
    Rewrite a same-chromosome, single-orientation BND record as an inversion

    A record is rewritten only when all of the following hold:
      * its INFO column does not contain 'SECONDARY'
      * its INFO column contains 'SVTYPE=BND'
      * the mate chromosome parsed from ALT equals the record's chromosome
      * exactly one of the '--:' / '++:' strand entries is present in INFO

    For such a record a zero-count entry for the missing orientation is
    added ('++:0,' in front of the '--:' entry, or ',--:0' behind the '++:'
    entry), SVTYPE=BND becomes SVTYPE=INV, ';END=<mate position>' is
    appended unless INFO already carries an END key, and ALT becomes '<INV>'.

    Returns the rewritten line terminated by a newline, or vcf_line itself,
    unchanged, when the record does not qualify.
    '''
    fields = vcf_line.rstrip().split('\t')
    info = fields[7]
    if 'SECONDARY' in info or 'SVTYPE=BND' not in info:
        return vcf_line

    _, o_chr, o_pos = parse_bnd_alt_string(fields[4])
    # Test for the same '++:' token that is searched for below
    has_minus = '--:' in info
    has_plus = '++:' in info
    if o_chr != fields[0] or has_minus == has_plus:
        return vcf_line

    if has_minus:
        # Splice the zero-count '++' entry directly in front of the '--' entry
        at = info.find('--:')
        info = info[:at] + '++:0,' + info[at:]
    else:
        # Append the zero-count '--' entry at the end of the '++' entry; the
        # entry runs to the end of INFO when no ';' follows it
        at = info.find('++:')
        stop = info.find(';', at)
        if stop == -1:
            stop = len(info)
        info = info[:stop] + ',--:0' + info[stop:]

    # Do not emit a second END key if the record already has one
    if not (info.startswith('END=') or ';END=' in info):
        info += ';END=' + o_pos
    # Replace by content so SVTYPE need not be the first INFO key
    fields[7] = info.replace('SVTYPE=BND', 'SVTYPE=INV')
    fields[4] = '<INV>'
    return '\t'.join(fields) + '\n'
def split_v(l):
    '''
    Break a VCF line into the breakpoint values used for comparison

    Returns the list
        [SVTYPE, left chrom, right chrom, STRANDS,
         left start, left end, right start, right end, info map]
    where the left interval is POS widened by CIPOS and the right interval
    is the right-hand position widened by CIEND.

    The right-hand position comes from the ALT mate for BND records (whose
    position is also stored in the map under END) and from END otherwise.
    '''
    columns = l.rstrip().split('\t', 8)
    m = to_map(columns[7])

    chr_l = chr_r = columns[0]
    pos_l = int(columns[1])

    svtype = m['SVTYPE']
    if svtype == 'BND':
        _, chr_r, mate_pos = parse_bnd_alt_string(columns[4])
        m['END'] = mate_pos
        pos_r = int(mate_pos)
    else:
        pos_r = int(m['END'])

    # Confidence intervals are 'low,high' offsets around each position
    ci_pos = m['CIPOS'].split(',')
    start_l = pos_l + int(ci_pos[0])
    end_l = pos_l + int(ci_pos[1])

    ci_end = m['CIEND'].split(',')
    start_r = pos_r + int(ci_end[0])
    end_r = pos_r + int(ci_end[1])

    return [svtype, chr_l, chr_r, m['STRANDS'],
            start_l, end_l, start_r, end_r, m]
def test_bnd_alt_string(self):
    '''
    Well-formed BND ALT strings yield (separator, chromosome, position);
    malformed ones trip an assertion in the parser
    '''
    well_formed = [
        ('A[1:6[', ('[', '1', '6')),
        ('A]1:6]', (']', '1', '6')),
        (']1:6]A', (']', '1', '6')),
        # Contig names may themselves contain ':'
        (']HLA-DQB1*06:09:01:6]A', (']', 'HLA-DQB1*06:09:01', '6')),
    ]
    for alt, expected in well_formed:
        self.assertEqual(su.parse_bnd_alt_string(alt), expected)

    # Mismatched brackets, then a string with no brackets at all
    for malformed in (']1:6[A', '1'):
        with self.assertRaises(AssertionError):
            su.parse_bnd_alt_string(malformed)
def parse_vcf(vcf_file_stream, vcf_lines, vcf_headers, add_sname=True, include_ref=False):
    '''
    Read a VCF stream, collecting header lines and primary variant records

    vcf_file_stream -- iterable of VCF lines
    vcf_lines       -- list that kept variant lines are appended to
    vcf_headers     -- list that '##' header lines are appended to; lines
                       already in the list and '##fileDate' lines are skipped
    add_sname       -- append ';SNAME=<comma-joined sample names>' to INFO
    include_ref     -- keep records whose FORMAT contains GT even when every
                       sample field starts with '0/0' or './.'

    Records whose INFO contains 'SECONDARY' are dropped. A BND record whose
    mate is on the same chromosome and which carries exactly one of the
    '--:' / '++:' strand entries is rewritten as an inversion: a zero-count
    entry for the missing orientation is added, SVTYPE=BND becomes
    SVTYPE=INV, ';END=<mate position>' is appended unless INFO already has
    an END key, and ALT becomes '<INV>'.

    Returns the sample names taken from the last single-'#' header line, or
    '' when no such line was seen. Note that '' is not equal to [], so in
    that case an empty 'SNAME=' is still appended when add_sname is set.
    '''
    samples = ''

    for l in vcf_file_stream:
        if l[0] == '#':
            if l[1] != '#':
                samples = l.rstrip().split('\t')[9:]
            elif l[:10] != '##fileDate' and l not in vcf_headers:
                vcf_headers.append(l)
            continue

        A = l.rstrip().split('\t')
        if not include_ref and len(A) > 8 and 'GT' in A[8]:
            # Skip records in which no sample carries a non-reference call
            if all(gt.startswith(('0/0', './.')) for gt in A[9:]):
                continue

        info = A[7]
        if 'SECONDARY' in info:
            continue

        if add_sname and samples != []:
            info += ';' + 'SNAME=' + ','.join(samples)
            A[7] = info
            l = '\t'.join(A) + '\n'

        if 'SVTYPE=BND' in info:
            _, o_chr, o_pos = parse_bnd_alt_string(A[4])
            # Test for the same '++:' token that is searched for below
            has_minus = '--:' in info
            has_plus = '++:' in info
            if o_chr == A[0] and has_minus != has_plus:
                if has_minus:
                    # Splice the zero-count '++' entry in front of '--'
                    at = info.find('--:')
                    info = info[:at] + '++:0,' + info[at:]
                else:
                    # Append the zero-count '--' entry behind the '++'
                    # entry, which runs to the end of INFO when no ';'
                    # follows it
                    at = info.find('++:')
                    stop = info.find(';', at)
                    if stop == -1:
                        stop = len(info)
                    info = info[:stop] + ',--:0' + info[stop:]

                # Do not emit a second END key if one is already present
                if not (info.startswith('END=') or ';END=' in info):
                    info += ';END=' + o_pos
                # Replace by content so SVTYPE need not be the first key
                A[7] = info.replace('SVTYPE=BND', 'SVTYPE=INV')
                A[4] = '<INV>'
                l = '\t'.join(A) + '\n'

        vcf_lines.append(l)

    return samples
def bnd_breakpoints(self, vcf_variant):
    '''
    Compute both breakpoints and their orientations for a BND variant

    Returns the tuple
        (chrom1, start1, end1, chrom2, start2, end2, orientation1, orientation2)
    in which start and end of each breakpoint hold the same value.
    '''
    chrom1 = vcf_variant.chrom
    pos1 = vcf_variant.pos

    sep, chrom2, mate_pos = parse_bnd_alt_string(vcf_variant.alt)
    pos2 = int(mate_pos)

    # An ALT that begins with the bracket puts the first breakend on '-'
    # and moves its position back by one
    if vcf_variant.alt.startswith(sep):
        strand1, pos1 = '-', pos1 - 1
    else:
        strand1 = '+'

    # A '[' bracket puts the second breakend on '-' and moves it back by one
    if sep == '[':
        strand2, pos2 = '-', pos2 - 1
    else:
        strand2 = '+'

    return (chrom1, pos1, pos1, chrom2, pos2, pos2, strand1, strand2)
def parse_vcf(vcf_file_stream, vcf_lines, vcf_headers, add_sname=True, include_ref=False):
    '''
    Read a VCF stream, collecting header lines and primary variant records

    vcf_file_stream -- iterable of VCF lines
    vcf_lines       -- list that kept variant lines are appended to
    vcf_headers     -- list that '##' header lines are appended to; lines
                       already in the list and '##fileDate' lines are skipped
    add_sname       -- append ';SNAME=<comma-joined sample names>' to INFO
    include_ref     -- keep records whose FORMAT contains GT even when every
                       sample field starts with '0/0' or './.'

    Records whose INFO contains 'SECONDARY' are dropped. A BND record whose
    mate is on the same chromosome and which carries exactly one of the
    '--:' / '++:' strand entries is rewritten as an inversion: a zero-count
    entry for the missing orientation is added, SVTYPE=BND becomes
    SVTYPE=INV, ';END=<mate position>' is appended unless INFO already has
    an END key, and ALT becomes '<INV>'.

    Returns the sample names taken from the last single-'#' header line, or
    '' when no such line was seen. Note that '' is not equal to [], so in
    that case an empty 'SNAME=' is still appended when add_sname is set.
    '''
    samples = ''

    for l in vcf_file_stream:
        if l[0] == '#':
            if l[1] != '#':
                samples = l.rstrip().split('\t')[9:]
            elif l[:10] != '##fileDate' and l not in vcf_headers:
                vcf_headers.append(l)
            continue

        A = l.rstrip().split('\t')
        if not include_ref and len(A) > 8 and 'GT' in A[8]:
            # Skip records in which no sample carries a non-reference call
            if all(gt.startswith(('0/0', './.')) for gt in A[9:]):
                continue

        info = A[7]
        if 'SECONDARY' in info:
            continue

        if add_sname and samples != []:
            info += ';' + 'SNAME=' + ','.join(samples)
            A[7] = info
            l = '\t'.join(A) + '\n'

        if 'SVTYPE=BND' in info:
            _, o_chr, o_pos = parse_bnd_alt_string(A[4])
            # Test for the same '++:' token that is searched for below
            has_minus = '--:' in info
            has_plus = '++:' in info
            if o_chr == A[0] and has_minus != has_plus:
                if has_minus:
                    # Splice the zero-count '++' entry in front of '--'
                    at = info.find('--:')
                    info = info[:at] + '++:0,' + info[at:]
                else:
                    # Append the zero-count '--' entry behind the '++'
                    # entry, which runs to the end of INFO when no ';'
                    # follows it
                    at = info.find('++:')
                    stop = info.find(';', at)
                    if stop == -1:
                        stop = len(info)
                    info = info[:stop] + ',--:0' + info[stop:]

                # Do not emit a second END key, including when END is the
                # very first INFO key
                if not (info.startswith('END=') or ';END=' in info):
                    info += ';END=' + o_pos
                A[7] = info.replace('SVTYPE=BND', 'SVTYPE=INV')
                A[4] = '<INV>'
                l = '\t'.join(A) + '\n'

        vcf_lines.append(l)

    return samples
def write_var(var, vcf_out, include_genotypes=False):
    '''
    Write a variant to vcf_out, expanding breakends into both mate records

    var               -- variant object; it is modified in place
    vcf_out           -- object with a write() method receiving each line
    include_genotypes -- when false only the first eight columns are written

    IMPRECISE is set from CIPOS95/CIEND95 (false only when both are '0,0').
    An INV whose STRANDS contains '--:0' or '++:0' is first passed to
    invtobnd. A variant whose ALT is '<DEL>', '<DUP>', '<INV>' or '<INS>' is
    written as a single record. Any other variant is written as record
    '<id>_1' and then, with chromosome/position, ALT and the CIPOS/CIEND,
    CIPOS95/CIEND95 and PRPOS/PREND pairs swapped to the mate's point of
    view, as SECONDARY record '<id>_2'.

    Raises MissingProbabilitiesException when reading PRPOS or PREND raises
    KeyError.
    '''
    def emit():
        # Serialise var in its current state and write it as one line
        record = var.get_var_string(use_cached_gt_string=True)
        if not include_genotypes:
            record = '\t'.join(record.split('\t', 10)[:8])
        vcf_out.write(record + '\n')

    base_id = var.var_id

    is_precise = (var.get_info('CIPOS95') == '0,0'
                  and var.get_info('CIEND95') == '0,0')
    var.set_info('IMPRECISE', not is_precise)

    if var.get_info('SVTYPE') == 'INV' and any(
            tag in var.get_info('STRANDS') for tag in ('--:0', '++:0')):
        invtobnd(var)

    if var.alt in ('<DEL>', '<DUP>', '<INV>', '<INS>'):
        emit()
        return

    first_id = str(base_id) + '_1'
    second_id = str(base_id) + '_2'

    # First mate: END and SVLEN are dropped from breakend records
    var.var_id = first_id
    var.set_info('EVENT', base_id)
    var.set_info('MATEID', second_id)
    var.info.pop('END', None)
    var.info.pop('SVLEN', None)
    emit()

    # Pick the bracket layout of the mate's ALT from this record's ALT
    if var.alt[0] == '[':
        lead, tail = '[', '[N'
    elif var.alt[0] == ']':
        lead, tail = 'N[', '['
    elif var.alt[-1] == '[':
        lead, tail = ']', ']N'
    elif var.alt[-1] == ']':
        lead, tail = 'N]', ']'
    else:
        lead = tail = None
    if lead is None:
        mate_alt = ''
    else:
        mate_alt = lead + var.chrom + ':' + str(var.pos) + tail

    # Second mate: move the variant onto the mate's coordinates
    _, mate_chrom, mate_pos = parse_bnd_alt_string(var.alt)
    var.chrom = mate_chrom
    var.pos = int(mate_pos)
    var.var_id = second_id
    var.set_info('MATEID', first_id)
    var.set_info('SECONDARY', True)
    var.alt = mate_alt

    # Swap the POS-side and END-side intervals and probabilities
    old_cipos = var.get_info('CIPOS')
    old_cipos95 = var.get_info('CIPOS95')
    try:
        old_prpos = var.get_info('PRPOS')
    except KeyError:
        raise MissingProbabilitiesException('Required tag PRPOS not found.')
    var.set_info('CIPOS', var.get_info('CIEND'))
    var.set_info('CIEND', old_cipos)
    var.set_info('CIPOS95', var.get_info('CIEND95'))
    var.set_info('CIEND95', old_cipos95)
    try:
        var.set_info('PRPOS', var.get_info('PREND'))
    except KeyError:
        raise MissingProbabilitiesException('Required tag PREND not found.')
    var.set_info('PREND', old_prpos)
    emit()
def write_var(var, vcf_out, include_genotypes=False):
    '''
    Write a variant to vcf_out, expanding breakends into both mate records

    var               -- variant object; it is modified in place
    vcf_out           -- object with a write() method receiving each line
    include_genotypes -- when false only the first eight columns are written

    IMPRECISE is set from CIPOS95/CIEND95 (false only when both are '0,0').
    An INV whose STRANDS contains '--:0' or '++:0' is first passed to
    invtobnd. A variant whose ALT is '<DEL>', '<DUP>', '<INV>' or '<INS>' is
    written as a single record; '<INS>' is included so that insertions are
    not sent down the breakend path, where their ALT cannot be parsed as a
    mate. Any other variant is written as record '<id>_1' and then, with
    chromosome/position, ALT and the CIPOS/CIEND, CIPOS95/CIEND95 and
    PRPOS/PREND pairs swapped to the mate's point of view, as SECONDARY
    record '<id>_2'.

    Raises MissingProbabilitiesException when reading PRPOS or PREND raises
    KeyError.
    '''
    def emit():
        # Serialise var in its current state and write it as one line
        record = var.get_var_string(use_cached_gt_string=True)
        if not include_genotypes:
            record = '\t'.join(record.split('\t', 10)[:8])
        vcf_out.write(record + '\n')

    v_id = var.var_id
    if var.get_info('CIPOS95') != '0,0' or var.get_info('CIEND95') != '0,0':
        var.set_info('IMPRECISE', True)
    else:
        var.set_info('IMPRECISE', False)

    if var.get_info('SVTYPE') == 'INV' and ('--:0' in var.get_info('STRANDS') or '++:0' in var.get_info('STRANDS')):
        invtobnd(var)

    if var.alt in ('<DEL>', '<DUP>', '<INV>', '<INS>'):
        emit()
        return

    # First mate: END and SVLEN are dropped from breakend records
    var.var_id = str(v_id) + '_1'
    var.set_info('EVENT', v_id)
    var.set_info('MATEID', str(v_id) + '_2')
    var.info.pop('END', None)
    var.info.pop('SVLEN', None)
    emit()

    # Build the mate's ALT from this record's coordinates and bracket layout
    new_alt = ''
    if var.alt[0] == '[':
        new_alt = '[' + var.chrom + ':' + str(var.pos) + '[N'
    elif var.alt[0] == ']':
        new_alt = 'N[' + var.chrom + ':' + str(var.pos) + '['
    elif var.alt[-1] == '[':
        new_alt = ']' + var.chrom + ':' + str(var.pos) + ']N'
    elif var.alt[-1] == ']':
        new_alt = 'N]' + var.chrom + ':' + str(var.pos) + ']'

    # Second mate: move the variant onto the mate's coordinates
    _, chrom, pos = parse_bnd_alt_string(var.alt)
    var.chrom = chrom
    var.pos = int(pos)
    var.var_id = str(v_id) + '_2'
    var.set_info('MATEID', str(v_id) + '_1')
    var.set_info('SECONDARY', True)
    var.alt = new_alt

    # Swap the POS-side and END-side intervals and probabilities
    tempci = var.get_info('CIPOS')
    temp95 = var.get_info('CIPOS95')
    try:
        temppr = var.get_info('PRPOS')
    except KeyError:
        raise MissingProbabilitiesException('Required tag PRPOS not found.')
    var.set_info('CIPOS', var.get_info('CIEND'))
    var.set_info('CIEND', tempci)
    var.set_info('CIPOS95', var.get_info('CIEND95'))
    var.set_info('CIEND95', temp95)
    try:
        var.set_info('PRPOS', var.get_info('PREND'))
    except KeyError:
        raise MissingProbabilitiesException('Required tag PREND not found.')
    var.set_info('PREND', temppr)
    emit()