Example #1
0
def parse_vcf_record(vcf_line):
    """Rewrite a same-chromosome, single-orientation BND record as an INV.

    The line is returned unchanged unless all of the following hold: the
    INFO column does not mention ``SECONDARY``, it contains ``SVTYPE=BND``,
    the mate chromosome parsed from ALT equals the record's own chromosome,
    and exactly one of the ``--:`` / ``++:`` strand entries is present.

    For a converted record the missing strand entry is added with a count of
    zero (``++:0,`` in front of an existing ``--:`` entry, ``,--:0`` behind an
    existing ``++:`` entry), ``SVTYPE=BND`` becomes ``SVTYPE=INV``,
    ``END=<mate position>`` is appended when INFO has no END key yet, and ALT
    is set to ``<INV>``.

    Args:
        vcf_line: One tab-separated VCF data line with at least eight columns.

    Returns:
        The original line when no conversion applies, otherwise the rewritten
        line terminated by a single newline.
    """
    fields = vcf_line.rstrip().split('\t')
    info = fields[7]
    if 'SECONDARY' in info or 'SVTYPE=BND' not in info:
        return vcf_line

    _, o_chr, o_pos = parse_bnd_alt_string(fields[4])
    has_neg = '--:' in info
    # Test for the complete '++:' token, the same one that is located below.
    has_pos = '++:' in info
    if o_chr != fields[0] or has_neg == has_pos:
        return vcf_line

    if has_neg:
        neg_s = info.find('--:')
        info = info[:neg_s] + '++:0,' + info[neg_s:]
    else:
        pos_s = info.find('++:')
        pos_e = info.find(';', pos_s)
        if pos_e == -1:
            # The strand entry is the last INFO field, so there is no ';'
            # terminator; the entry runs to the end of the string.
            pos_e = len(info)
        info = info[:pos_e] + ',--:0' + info[pos_e:]

    # Swap the tag wherever it sits instead of assuming it occupies the first
    # ten characters of INFO.
    info = info.replace('SVTYPE=BND', 'SVTYPE=INV', 1)
    if not (info.startswith('END=') or ';END=' in info):
        info += ';END=' + o_pos

    fields[7] = info
    fields[4] = '<INV>'
    return '\t'.join(fields) + '\n'
Example #2
0
def split_v(l):
    '''
    Break one VCF data line into the values describing both ends of a call.

    The INFO column is converted to a mapping with to_map(). The left
    position is POS. The right position depends on SVTYPE:
      * BND: the mate chromosome and position parsed from ALT; the position,
        exactly as returned by parse_bnd_alt_string(), is also stored under
        'END' in the mapping.
      * INS: POS plus SVLEN.
      * anything else: END.

    Returns the list
        [SVTYPE, left chrom, right chrom, STRANDS,
         left start, left end, right start, right end, INFO mapping]
    where each start/end is the matching position shifted by the first/second
    comma-separated value of CIPOS (left side) or CIEND (right side).
    '''
    columns = l.rstrip().split('\t', 8)
    info = to_map(columns[7])

    chr_l = chr_r = columns[0]
    pos_l = int(columns[1])

    svtype = info['SVTYPE']
    if svtype == 'BND':
        _, chr_r, mate_pos = parse_bnd_alt_string(columns[4])
        info['END'] = mate_pos
        pos_r = int(mate_pos)
    elif svtype == 'INS':
        pos_r = pos_l + int(info['SVLEN'])
    else:
        pos_r = int(info['END'])

    left_ci = info['CIPOS'].split(',')
    start_l = pos_l + int(left_ci[0])
    end_l = pos_l + int(left_ci[1])

    right_ci = info['CIEND'].split(',')
    start_r = pos_r + int(right_ci[0])
    end_r = pos_r + int(right_ci[1])

    strands = info['STRANDS']

    return [
        svtype, chr_l, chr_r, strands, start_l, end_l, start_r, end_r, info
    ]
Example #3
0
def convert_bnd(var):
    """Normalise a breakend variant in place.

    REF is set to 'N'. The variant is flagged SECONDARY when the mate parsed
    from ALT compares lower than the variant itself (mate chromosome below
    var.chrom, or the same chromosome with a smaller position). ALT is rebuilt
    with 'N' as its base, and the STRANDS info entry is set to an orientation
    prefix chosen from the bracket layout followed by the SU info value:

        leading '['  -> '--:'      '[' further in -> '+-:'
        leading ']'  -> '-+:'      otherwise      -> '++:'
    """
    var.ref = 'N'
    alt = var.alt
    open_idx = alt.find("[")
    _, mate_chrom, mate_pos = su.parse_bnd_alt_string(alt)

    mate_is_lower = mate_chrom < var.chrom or (
        mate_chrom == var.chrom and int(mate_pos) < int(var.pos))
    if mate_is_lower:
        var.set_info('SECONDARY', True)

    if open_idx >= 0:
        bracket, bracket_idx = "[", open_idx
        leading_tag, trailing_tag = "--:", "+-:"
    else:
        bracket, bracket_idx = "]", alt.find("]")
        leading_tag, trailing_tag = "-+:", "++:"

    if bracket_idx == 0:
        # Bracket pair comes first: keep everything through the second
        # bracket and put the base after it.
        strands = leading_tag
        closing_idx = alt.find(bracket, 1)
        newalt = alt[:closing_idx + 1] + 'N'
    else:
        # Base comes first: keep everything from the first bracket onwards.
        strands = trailing_tag
        newalt = 'N' + alt[bracket_idx:]

    var.alt = newalt
    var.info['STRANDS'] = strands + str(var.info['SU'])
Example #4
0
def parse_vcf_record(vcf_line):
    """Rewrite a same-chromosome, single-orientation BND record as an INV.

    The line is returned unchanged unless all of the following hold: the
    INFO column does not mention ``SECONDARY``, it contains ``SVTYPE=BND``,
    the mate chromosome parsed from ALT equals the record's own chromosome,
    and exactly one of the ``--:`` / ``++:`` strand entries is present.

    For a converted record the missing strand entry is added with a count of
    zero (``++:0,`` in front of an existing ``--:`` entry, ``,--:0`` behind an
    existing ``++:`` entry), ``SVTYPE=BND`` becomes ``SVTYPE=INV``,
    ``END=<mate position>`` is appended when INFO has no END key yet, and ALT
    is set to ``<INV>``.

    Args:
        vcf_line: One tab-separated VCF data line with at least eight columns.

    Returns:
        The original line when no conversion applies, otherwise the rewritten
        line terminated by a single newline.
    """
    fields = vcf_line.rstrip().split('\t')
    info = fields[7]
    if 'SECONDARY' in info or 'SVTYPE=BND' not in info:
        return vcf_line

    _, o_chr, o_pos = parse_bnd_alt_string(fields[4])
    has_neg = '--:' in info
    # Test for the complete '++:' token, the same one that is located below.
    has_pos = '++:' in info
    if o_chr != fields[0] or has_neg == has_pos:
        return vcf_line

    if has_neg:
        neg_s = info.find('--:')
        info = info[:neg_s] + '++:0,' + info[neg_s:]
    else:
        pos_s = info.find('++:')
        pos_e = info.find(';', pos_s)
        if pos_e == -1:
            # The strand entry is the last INFO field, so there is no ';'
            # terminator; the entry runs to the end of the string.
            pos_e = len(info)
        info = info[:pos_e] + ',--:0' + info[pos_e:]

    # Swap the tag wherever it sits instead of assuming it occupies the first
    # ten characters of INFO.
    info = info.replace('SVTYPE=BND', 'SVTYPE=INV', 1)
    if not (info.startswith('END=') or ';END=' in info):
        info += ';END=' + o_pos

    fields[7] = info
    fields[4] = '<INV>'
    return '\t'.join(fields) + '\n'
Example #5
0
def split_v(l):
    '''
    Break one VCF data line into the values describing both ends of a call.

    The INFO column is converted to a mapping with to_map(). The left
    position is POS. For SVTYPE BND the right chromosome and position are the
    mate parsed from ALT, and that position (exactly as returned by
    parse_bnd_alt_string()) is also stored under 'END' in the mapping; for
    every other SVTYPE the right position is END on the same chromosome.

    Returns the list
        [SVTYPE, left chrom, right chrom, STRANDS,
         left start, left end, right start, right end, INFO mapping]
    where each start/end is the matching position shifted by the first/second
    comma-separated value of CIPOS (left side) or CIEND (right side).
    '''
    columns = l.rstrip().split('\t', 8)
    info = to_map(columns[7])

    chr_l = chr_r = columns[0]
    pos_l = int(columns[1])

    svtype = info['SVTYPE']
    if svtype == 'BND':
        _, chr_r, mate_pos = parse_bnd_alt_string(columns[4])
        info['END'] = mate_pos
        pos_r = int(mate_pos)
    else:
        pos_r = int(info['END'])

    left_ci = info['CIPOS'].split(',')
    start_l = pos_l + int(left_ci[0])
    end_l = pos_l + int(left_ci[1])

    right_ci = info['CIEND'].split(',')
    start_r = pos_r + int(right_ci[0])
    end_r = pos_r + int(right_ci[1])

    strands = info['STRANDS']

    return [svtype, chr_l, chr_r, strands, start_l, end_l, start_r, end_r, info]
Example #6
0
 def test_bnd_alt_string(self):
     # Well-formed breakend ALT strings paired with the 3-tuple each one
     # must parse to.
     well_formed = [
         ('A[1:6[', ('[', '1', '6')),
         ('A]1:6]', (']', '1', '6')),
         (']1:6]A', (']', '1', '6')),
         (']HLA-DQB1*06:09:01:6]A', (']', 'HLA-DQB1*06:09:01', '6')),
     ]
     for alt, parsed in well_formed:
         self.assertEqual(su.parse_bnd_alt_string(alt), parsed)

     # Mismatched brackets, and a string with no breakend syntax at all,
     # must both trip an assertion inside the parser.
     for malformed in (']1:6[A', '1'):
         with self.assertRaises(AssertionError):
             su.parse_bnd_alt_string(malformed)
Example #7
0
 def test_bnd_alt_string(self):
     # Each accepted ALT string maps to the 3-tuple the parser must return;
     # the last key has colons inside the contig name.
     accepted = {
         'A[1:6[': ('[', '1', '6'),
         'A]1:6]': (']', '1', '6'),
         ']1:6]A': (']', '1', '6'),
         ']HLA-DQB1*06:09:01:6]A': (']', 'HLA-DQB1*06:09:01', '6'),
     }
     for alt_string in ('A[1:6[', 'A]1:6]', ']1:6]A',
                        ']HLA-DQB1*06:09:01:6]A'):
         self.assertEqual(su.parse_bnd_alt_string(alt_string),
                          accepted[alt_string])

     # Inputs the parser is expected to reject with an AssertionError.
     rejected = [']1:6[A', '1']
     for alt_string in rejected:
         with self.assertRaises(AssertionError):
             su.parse_bnd_alt_string(alt_string)
Example #8
0
def parse_vcf(vcf_file_stream, vcf_lines, vcf_headers, add_sname=True, include_ref=False):
    """Read a VCF stream, collecting header lines and primary data records.

    Header handling: every ``##`` line that is not already in ``vcf_headers``
    is appended to it, except ``##fileDate`` lines, which are ignored. A line
    starting with a single ``#`` is taken as the column header and its
    columns from the tenth onwards become the sample names.

    Record handling: blank lines and records whose INFO mentions
    ``SECONDARY`` are skipped. Same-chromosome BND records carrying exactly
    one of the ``--:`` / ``++:`` strand entries are rewritten as ``<INV>``
    records (missing strand entry added with count zero, ``SVTYPE=BND``
    replaced by ``SVTYPE=INV``, ``END=<mate position>`` appended when INFO has
    no END key). Every kept record is appended to ``vcf_lines``.

    Args:
        vcf_file_stream: Iterable of VCF lines.
        vcf_lines: List extended in place with the kept data lines.
        vcf_headers: List extended in place with new ``##`` header lines.
        add_sname: When true and sample names are known, append
            ``;SNAME=<comma-joined sample names>`` to INFO.
        include_ref: When false, a record that has a FORMAT column containing
            ``GT`` is dropped if every sample field starts with ``0/0`` or
            ``./.``.

    Returns:
        The sample names from the column header line (a list), or ``''`` when
        the stream contained no such line.
    """
    samples = ''

    for line in vcf_file_stream:
        if not line.strip():
            # Nothing to parse; indexing the columns below would fail.
            continue

        if line.startswith('#'):
            if not line.startswith('##'):
                samples = line.rstrip().split('\t')[9:]
            elif not line.startswith('##fileDate') and line not in vcf_headers:
                vcf_headers.append(line)
            continue

        A = line.rstrip().split('\t')
        if not include_ref and len(A) > 8 and 'GT' in A[8]:
            if all(s.startswith(('0/0', './.')) for s in A[9:]):
                continue

        if 'SECONDARY' in A[7]:
            continue

        # Truthiness rather than "!= []": without a column header `samples`
        # is still '', and an empty ';SNAME=' must not be emitted.
        if add_sname and samples:
            A[7] += ';SNAME=' + ','.join(samples)
            line = '\t'.join(A) + '\n'

        if 'SVTYPE=BND' in A[7]:
            _, o_chr, o_pos = parse_bnd_alt_string(A[4])
            info = A[7]
            has_neg = '--:' in info
            # Test for the complete '++:' token that is located below.
            has_pos = '++:' in info

            if o_chr == A[0] and has_neg != has_pos:
                if has_neg:
                    neg_s = info.find('--:')
                    info = info[:neg_s] + '++:0,' + info[neg_s:]
                else:
                    pos_s = info.find('++:')
                    pos_e = info.find(';', pos_s)
                    if pos_e == -1:
                        # Strand entry is the last INFO field.
                        pos_e = len(info)
                    info = info[:pos_e] + ',--:0' + info[pos_e:]

                # Do not assume SVTYPE is the first INFO key.
                info = info.replace('SVTYPE=BND', 'SVTYPE=INV', 1)
                if not (info.startswith('END=') or ';END=' in info):
                    info += ';END=' + o_pos

                A[7] = info
                A[4] = '<INV>'
                line = '\t'.join(A) + '\n'

        vcf_lines.append(line)

    return samples
Example #9
0
    def bnd_breakpoints(self, vcf_variant):
        '''
        Work out both breakpoints and their orientations for a BND variant.

        The first breakpoint starts from the variant's own chrom/pos, the
        second from the mate chromosome and position parsed out of ALT. Both
        orientations default to '+'. When ALT begins with the bracket
        separator the first orientation becomes '-' and the first position is
        lowered by one; when the separator is '[' the second orientation
        becomes '-' and the second position is lowered by one.

        Returns the tuple
        (chrom1, pos1, pos1, chrom2, pos2, pos2, orientation1, orientation2).
        '''
        first_chrom = vcf_variant.chrom
        first_pos = vcf_variant.pos
        alt = vcf_variant.alt
        bracket, second_chrom, second_pos = parse_bnd_alt_string(alt)
        second_pos = int(second_pos)

        if alt.startswith(bracket):
            first_strand, first_pos = '-', first_pos - 1
        else:
            first_strand = '+'

        if bracket == '[':
            second_strand, second_pos = '-', second_pos - 1
        else:
            second_strand = '+'

        return (first_chrom, first_pos, first_pos, second_chrom, second_pos,
                second_pos, first_strand, second_strand)
Example #10
0
def parse_vcf(vcf_file_stream,
              vcf_lines,
              vcf_headers,
              add_sname=True,
              include_ref=False):
    """Read a VCF stream, collecting header lines and primary data records.

    Header handling: every ``##`` line that is not already in ``vcf_headers``
    is appended to it, except ``##fileDate`` lines, which are ignored. A line
    starting with a single ``#`` is taken as the column header and its
    columns from the tenth onwards become the sample names.

    Record handling: blank lines and records whose INFO mentions
    ``SECONDARY`` are skipped. Same-chromosome BND records carrying exactly
    one of the ``--:`` / ``++:`` strand entries are rewritten as ``<INV>``
    records (missing strand entry added with count zero, ``SVTYPE=BND``
    replaced by ``SVTYPE=INV``, ``END=<mate position>`` appended when INFO has
    no END key). Every kept record is appended to ``vcf_lines``.

    Args:
        vcf_file_stream: Iterable of VCF lines.
        vcf_lines: List extended in place with the kept data lines.
        vcf_headers: List extended in place with new ``##`` header lines.
        add_sname: When true and sample names are known, append
            ``;SNAME=<comma-joined sample names>`` to INFO.
        include_ref: When false, a record that has a FORMAT column containing
            ``GT`` is dropped if every sample field starts with ``0/0`` or
            ``./.``.

    Returns:
        The sample names from the column header line (a list), or ``''`` when
        the stream contained no such line.
    """
    samples = ''

    for line in vcf_file_stream:
        if not line.strip():
            # Nothing to parse; indexing the columns below would fail.
            continue

        if line.startswith('#'):
            if not line.startswith('##'):
                samples = line.rstrip().split('\t')[9:]
            elif (not line.startswith('##fileDate')
                  and line not in vcf_headers):
                vcf_headers.append(line)
            continue

        A = line.rstrip().split('\t')
        if not include_ref and len(A) > 8 and 'GT' in A[8]:
            if all(s.startswith(('0/0', './.')) for s in A[9:]):
                continue

        if 'SECONDARY' in A[7]:
            continue

        # Truthiness rather than "!= []": without a column header `samples`
        # is still '', and an empty ';SNAME=' must not be emitted.
        if add_sname and samples:
            A[7] += ';SNAME=' + ','.join(samples)
            line = '\t'.join(A) + '\n'

        if 'SVTYPE=BND' in A[7]:
            _, o_chr, o_pos = parse_bnd_alt_string(A[4])
            info = A[7]
            has_neg = '--:' in info
            # Test for the complete '++:' token that is located below.
            has_pos = '++:' in info

            if o_chr == A[0] and has_neg != has_pos:
                if has_neg:
                    neg_s = info.find('--:')
                    info = info[:neg_s] + '++:0,' + info[neg_s:]
                else:
                    pos_s = info.find('++:')
                    pos_e = info.find(';', pos_s)
                    if pos_e == -1:
                        # Strand entry is the last INFO field.
                        pos_e = len(info)
                    info = info[:pos_e] + ',--:0' + info[pos_e:]

                if not (info.startswith('END=') or ';END=' in info):
                    info += ';END=' + o_pos
                info = info.replace('SVTYPE=BND', 'SVTYPE=INV', 1)

                A[7] = info
                A[4] = '<INV>'
                line = '\t'.join(A) + '\n'

        vcf_lines.append(line)

    return samples
Example #11
0
def write_var(var, vcf_out, include_genotypes=False):
    """Write a variant to ``vcf_out`` as one record, or as a breakend pair.

    IMPRECISE is set to True unless both CIPOS95 and CIEND95 equal '0,0'.
    An INV whose STRANDS contains '--:0' or '++:0' is first passed to
    invtobnd(). Variants whose ALT is '<DEL>', '<DUP>', '<INV>' or '<INS>'
    are written as a single line.

    Anything else is written as two mated records sharing an EVENT equal to
    the original id: '<id>_1' (END and SVLEN removed from info), followed by
    '<id>_2', which is placed at the mate locus parsed from ALT, flagged
    SECONDARY, given an ALT that points back at the first record's locus, and
    has CIPOS/CIEND, CIPOS95/CIEND95 and PRPOS/PREND exchanged.

    Unless ``include_genotypes`` is true, only the first eight tab-separated
    columns of each record are written.

    Raises MissingProbabilitiesException when looking up PRPOS or PREND on a
    paired variant raises KeyError.
    """
    base_id = var.var_id

    def emit():
        # Serialise the variant in its current state and write it out.
        text = var.get_var_string(use_cached_gt_string=True)
        if not include_genotypes:
            text = '\t'.join(text.split('\t', 10)[:8])
        vcf_out.write(text + '\n')

    precise = (var.get_info('CIPOS95') == '0,0'
               and var.get_info('CIEND95') == '0,0')
    var.set_info('IMPRECISE', not precise)

    if var.get_info('SVTYPE') == 'INV':
        strands = var.get_info('STRANDS')
        if '--:0' in strands or '++:0' in strands:
            invtobnd(var)

    if var.alt in ['<DEL>', '<DUP>', '<INV>', '<INS>']:
        emit()
        return

    first_id = str(base_id) + '_1'
    second_id = str(base_id) + '_2'

    # First record of the pair.
    var.var_id = first_id
    var.set_info('EVENT', base_id)
    var.set_info('MATEID', second_id)
    var.info.pop('END', None)
    var.info.pop('SVLEN', None)
    emit()

    # Pick the text that surrounds "<chrom>:<pos>" in the mate's ALT from the
    # bracket found at the start, or failing that the end, of this ALT.
    first_alt = var.alt
    if first_alt[0] == '[':
        wrap = ('[', '[N')
    elif first_alt[0] == ']':
        wrap = ('N[', '[')
    elif first_alt[-1] == '[':
        wrap = (']', ']N')
    elif first_alt[-1] == ']':
        wrap = ('N]', ']')
    else:
        wrap = None

    if wrap is None:
        mate_alt = ''
    else:
        mate_alt = wrap[0] + var.chrom + ':' + str(var.pos) + wrap[1]

    # Second record of the pair, located at the mate position.
    _, mate_chrom, mate_pos = parse_bnd_alt_string(first_alt)
    var.chrom = mate_chrom
    var.pos = int(mate_pos)
    var.var_id = second_id
    var.set_info('MATEID', first_id)
    var.set_info('SECONDARY', True)
    var.alt = mate_alt

    saved_cipos = var.get_info('CIPOS')
    saved_cipos95 = var.get_info('CIPOS95')
    try:
        saved_prpos = var.get_info('PRPOS')
    except KeyError:
        raise MissingProbabilitiesException(
            'Required tag PRPOS not found.')
    var.set_info('CIPOS', var.get_info('CIEND'))
    var.set_info('CIEND', saved_cipos)
    var.set_info('CIPOS95', var.get_info('CIEND95'))
    var.set_info('CIEND95', saved_cipos95)
    try:
        var.set_info('PRPOS', var.get_info('PREND'))
    except KeyError:
        raise MissingProbabilitiesException(
            'Required tag PREND not found.')
    var.set_info('PREND', saved_prpos)
    emit()
Example #12
0
def write_var(var, vcf_out, include_genotypes=False):
    """Write a variant to ``vcf_out`` as one record, or as a breakend pair.

    IMPRECISE is set to True unless both CIPOS95 and CIEND95 equal '0,0'.
    An INV whose STRANDS contains '--:0' or '++:0' is first passed to
    invtobnd(). Variants with a symbolic ALT ('<DEL>', '<DUP>', '<INV>',
    '<INS>') are written as a single line.

    Anything else is treated as a breakend and written as two mated records
    sharing an EVENT equal to the original id: '<id>_1' (END and SVLEN removed
    from info), followed by '<id>_2', which is placed at the mate locus parsed
    from ALT, flagged SECONDARY, given an ALT that points back at the first
    record's locus, and has CIPOS/CIEND, CIPOS95/CIEND95 and PRPOS/PREND
    exchanged.

    Args:
        var: Variant object to serialise; it is modified in place.
        vcf_out: Writable text stream.
        include_genotypes: When false, only the first eight tab-separated
            columns of each record are written.

    Raises:
        MissingProbabilitiesException: Looking up PRPOS or PREND on a paired
            variant raised KeyError.
    """
    v_id = var.var_id

    def write_current():
        # Serialise the variant in its current state and write it out.
        varstring = var.get_var_string(use_cached_gt_string=True)
        if not include_genotypes:
            varstring = '\t'.join(varstring.split('\t', 10)[:8])
        vcf_out.write(varstring + '\n')

    if var.get_info('CIPOS95') != '0,0' or var.get_info('CIEND95') != '0,0':
        var.set_info('IMPRECISE', True)
    else:
        var.set_info('IMPRECISE', False)

    if var.get_info('SVTYPE') == 'INV' and (
            '--:0' in var.get_info('STRANDS')
            or '++:0' in var.get_info('STRANDS')):
        invtobnd(var)

    # '<INS>' is a symbolic allele like the others, not a breakend string;
    # without it an insertion would be split into a bogus breakend pair.
    if var.alt in ('<DEL>', '<DUP>', '<INV>', '<INS>'):
        write_current()
        return

    # First record of the pair.
    var.var_id = str(v_id) + '_1'
    var.set_info('EVENT', v_id)
    var.set_info('MATEID', str(v_id) + '_2')
    var.info.pop('END', None)
    var.info.pop('SVLEN', None)
    write_current()

    # Build the mate's ALT so that it points back at this record's locus; the
    # layout is picked from the bracket at the start or end of this ALT.
    new_alt = ''
    if var.alt[0] == '[':
        new_alt = '[' + var.chrom + ':' + str(var.pos) + '[N'
    elif var.alt[0] == ']':
        new_alt = 'N[' + var.chrom + ':' + str(var.pos) + '['
    elif var.alt[-1] == '[':
        new_alt = ']' + var.chrom + ':' + str(var.pos) + ']N'
    elif var.alt[-1] == ']':
        new_alt = 'N]' + var.chrom + ':' + str(var.pos) + ']'

    # Second record of the pair, located at the mate position.
    sep, chrom, pos = parse_bnd_alt_string(var.alt)
    var.chrom = chrom
    var.pos = int(pos)
    var.var_id = str(v_id) + '_2'
    var.set_info('MATEID', str(v_id) + '_1')
    var.set_info('SECONDARY', True)
    var.alt = new_alt

    # The two ends trade places, so the POS/END interval tags are exchanged.
    tempci = var.get_info('CIPOS')
    temp95 = var.get_info('CIPOS95')
    try:
        temppr = var.get_info('PRPOS')
    except KeyError:
        raise MissingProbabilitiesException('Required tag PRPOS not found.')
    var.set_info('CIPOS', var.get_info('CIEND'))
    var.set_info('CIEND', tempci)
    var.set_info('CIPOS95', var.get_info('CIEND95'))
    var.set_info('CIEND95', temp95)
    try:
        var.set_info('PRPOS', var.get_info('PREND'))
    except KeyError:
        raise MissingProbabilitiesException('Required tag PREND not found.')
    var.set_info('PREND', temppr)
    write_current()