Esempio n. 1
0
def _write_bedpe_header(vcf, header_lines, sample_list, bedpe_out):
    """Write the BEDPE meta lines and column header line to ``bedpe_out``.

    ``header_lines`` (the collected '#'-prefixed input lines) are handed to
    ``vcf.add_header``. If ``vcf.info_list`` then contains an entry whose id
    is SVTYPE, a POS INFO entry is declared through ``vcf.add_info_after``.
    The text returned by ``vcf.get_header()`` is written up to its last
    newline, followed by the tab-separated BEDPE column names (plus FORMAT
    and the sample names when ``sample_list`` is non-empty).
    """
    vcf.add_header(header_lines)
    if "SVTYPE" in [info.id for info in vcf.info_list]:
        vcf.add_info_after(
            "SVTYPE", "POS", 1, 'Integer',
            'Position of the variant described in this record')
    rendered = vcf.get_header()
    # Everything after the last newline is dropped (presumably the VCF
    # column line -- confirm against Vcf.get_header) and replaced below.
    bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
    columns = [
        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
        'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
        'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B', 'ALT_B',
        'INFO_A', 'INFO_B',
    ]
    if sample_list:
        columns = columns + ['FORMAT'] + [str(name) for name in sample_list]
    bedpe_out.write('\t'.join(columns) + '\n')


def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE on ``bedpe_out``.

    Header lines are collected first; ``##fileformat`` is rewritten to BEDPE
    and ``##fileDate`` to the current date. The header is written once,
    before the first record, or at end of input when the file holds header
    lines but no records.

    Records whose SVTYPE is not BND are converted and written immediately.
    A BND record is paired with its mate when the record named by its MATEID
    has already been seen (the SECONDARY flag is not consulted, because some
    callers such as Manta do not set it). Otherwise the record is held,
    keyed by its ID. Records still held at end of input are written on their
    own, each preceded by a warning on stderr.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable text stream; it is closed before returning.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # BND records still waiting for their mate, keyed by record ID.
    bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                meta_key = line.split('=')[0]
                if meta_key == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif meta_key == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: anything past the ninth column is a sample.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            # First record reached: flush the header before converting it.
            in_header = False
            _write_bedpe_header(vcf, header, sample_list, bedpe_out)

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        # The EVENT ID is not unique, so records are keyed by the ID column.
        if 'MATEID' in var.info and var.info['MATEID'] in bnds:
            # The mate was seen earlier: emit the pair, earlier record first.
            first = bnds.pop(var.info['MATEID'])
            bedpe_out.write(str(converter.convert(first, var)) + '\n')
        else:
            bnds[var.var_id] = var

    if in_header and header:
        # The input held header lines only; still emit a BEDPE header.
        _write_bedpe_header(vcf, header, sample_list, bedpe_out)

    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')

    # close the files
    bedpe_out.close()
Esempio n. 2
0
def _write_bedpe_header(vcf, header_lines, sample_list, bedpe_out):
    """Write the BEDPE meta lines and column header line to ``bedpe_out``.

    ``header_lines`` (the collected '#'-prefixed input lines) are handed to
    ``vcf.add_header``. If ``vcf.info_list`` then contains an entry whose id
    is SVTYPE, a POS INFO entry is declared through ``vcf.add_info_after``.
    The text returned by ``vcf.get_header()`` is written up to its last
    newline, followed by the tab-separated BEDPE column names (plus FORMAT
    and the sample names when ``sample_list`` is non-empty).
    """
    vcf.add_header(header_lines)
    if "SVTYPE" in [info.id for info in vcf.info_list]:
        vcf.add_info_after(
            "SVTYPE", "POS", 1, 'Integer',
            'Position of the variant described in this record')
    rendered = vcf.get_header()
    # Everything after the last newline is dropped (presumably the VCF
    # column line -- confirm against Vcf.get_header) and replaced below.
    bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
    columns = [
        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
        'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
        'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B', 'ALT_B',
        'INFO_A', 'INFO_B',
    ]
    if sample_list:
        columns = columns + ['FORMAT'] + [str(name) for name in sample_list]
    bedpe_out.write('\t'.join(columns) + '\n')


def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE on ``bedpe_out``.

    Header lines are collected first; ``##fileformat`` is rewritten to BEDPE
    and ``##fileDate`` to the current date. The header is written once,
    before the first record, or at end of input when the file holds header
    lines but no records.

    Records that are not SVTYPE=BND, and BND records without a MATEID
    (treated as single-ended), are converted and written immediately. Other
    BND records are paired through MATEID:

    * a record carrying SECONDARY is paired with an already-seen primary,
      otherwise it is held in the secondary table;
    * a record without SECONDARY is paired with an already-seen secondary,
      or with an already-seen record that also lacks SECONDARY, otherwise it
      is held in the primary table.

    Records still held at end of input are written on their own, each
    preceded by a warning on stderr.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable text stream; it is closed before returning.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # Unpaired BND records keyed by record ID: those without the SECONDARY
    # flag go to ``bnds``, those with it go to ``sec_bnds``.
    bnds = dict()
    sec_bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                meta_key = line.split('=')[0]
                if meta_key == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif meta_key == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: anything past the ninth column is a sample.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            # First record reached: flush the header before converting it.
            in_header = False
            _write_bedpe_header(vcf, header, sample_list, bedpe_out)

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        # If there is no MATEID then assume this is a single-ended BND and
        # simply output it.
        if var.info['SVTYPE'] != 'BND' or 'MATEID' not in var.info:
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        mate_id = var.info['MATEID']
        if 'SECONDARY' in var.info:
            if mate_id in bnds:
                # The primary mate was seen earlier.
                primary = bnds.pop(mate_id)
                bedpe_out.write(str(converter.convert(primary, var)) + '\n')
            else:
                sec_bnds[var.var_id] = var
        elif mate_id in sec_bnds:
            # The secondary mate was seen earlier.
            secondary = sec_bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var, secondary)) + '\n')
        elif mate_id in bnds:
            # Neither mate carries SECONDARY; the earlier record goes first.
            earlier = bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(earlier, var)) + '\n')
        else:
            bnds[var.var_id] = var

    if in_header and header:
        # The input held header lines only; still emit a BEDPE header.
        _write_bedpe_header(vcf, header, sample_list, bedpe_out)

    for bnd in bnds:
        sys.stderr.write(
            'Warning: missing secondary multiline variant at ID:' + bnd +
            '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write(
            'Warning: missing primary multiline variant at ID:' + bnd +
            '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()
Esempio n. 3
0
def _write_bedpe_header(vcf, header_lines, sample_list, bedpe_out):
    """Write the BEDPE meta lines and column header line to ``bedpe_out``.

    ``header_lines`` (the collected '#'-prefixed input lines) are handed to
    ``vcf.add_header``. If ``vcf.info_list`` then contains an entry whose id
    is SVTYPE, a POS INFO entry is declared through ``vcf.add_info_after``.
    The text returned by ``vcf.get_header()`` is written up to its last
    newline, followed by the tab-separated BEDPE column names (plus FORMAT
    and the sample names when ``sample_list`` is non-empty).
    """
    vcf.add_header(header_lines)
    if "SVTYPE" in [info.id for info in vcf.info_list]:
        vcf.add_info_after(
            "SVTYPE", "POS", 1, 'Integer',
            'Position of the variant described in this record')
    rendered = vcf.get_header()
    # Everything after the last newline is dropped (presumably the VCF
    # column line -- confirm against Vcf.get_header) and replaced below.
    bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
    columns = [
        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
        'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
        'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B', 'ALT_B',
        'INFO_A', 'INFO_B',
    ]
    if sample_list:
        columns = columns + ['FORMAT'] + [str(name) for name in sample_list]
    bedpe_out.write('\t'.join(columns) + '\n')


def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE on ``bedpe_out``.

    Header lines are collected first; ``##fileformat`` is rewritten to BEDPE
    and ``##fileDate`` to the current date. The header is written once,
    before the first record, or at end of input when the file holds header
    lines but no records.

    Records that are not SVTYPE=BND, and BND records without a MATEID
    (treated as single-ended), are converted and written immediately. Other
    BND records are paired through MATEID:

    * a record carrying SECONDARY is paired with an already-seen primary,
      otherwise it is held in the secondary table;
    * a record without SECONDARY is paired with an already-seen secondary,
      or with an already-seen record that also lacks SECONDARY (so input
      that never sets SECONDARY is still paired instead of being reported
      as orphaned), otherwise it is held in the primary table.

    Records still held at end of input are written on their own, each
    preceded by a warning on stderr.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable text stream; it is closed before returning.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # Unpaired BND records keyed by record ID: those without the SECONDARY
    # flag go to ``bnds``, those with it go to ``sec_bnds``.
    bnds = dict()
    sec_bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                meta_key = line.split('=')[0]
                if meta_key == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif meta_key == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: anything past the ninth column is a sample.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            # First record reached: flush the header before converting it.
            in_header = False
            _write_bedpe_header(vcf, header, sample_list, bedpe_out)

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        # If there is no MATEID then assume this is a single-ended BND and
        # simply output it.
        if var.info['SVTYPE'] != 'BND' or 'MATEID' not in var.info:
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        mate_id = var.info['MATEID']
        if 'SECONDARY' in var.info:
            if mate_id in bnds:
                # The primary mate was seen earlier.
                primary = bnds.pop(mate_id)
                bedpe_out.write(str(converter.convert(primary, var)) + '\n')
            else:
                sec_bnds[var.var_id] = var
        elif mate_id in sec_bnds:
            # The secondary mate was seen earlier.
            secondary = sec_bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(var, secondary)) + '\n')
        elif mate_id in bnds:
            # Neither mate carries SECONDARY; pair them, earlier record
            # first, rather than leaving both to be reported as orphans.
            earlier = bnds.pop(mate_id)
            bedpe_out.write(str(converter.convert(earlier, var)) + '\n')
        else:
            bnds[var.var_id] = var

    if in_header and header:
        # The input held header lines only; still emit a BEDPE header.
        _write_bedpe_header(vcf, header, sample_list, bedpe_out)

    for bnd in bnds:
        sys.stderr.write('Warning: missing secondary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')
    for bnd in sec_bnds:
        sys.stderr.write('Warning: missing primary multiline variant at ID:' + bnd + '\n')
        bedpe_out.write(str(converter.convert(None, sec_bnds[bnd])) + '\n')

    # close the files
    bedpe_out.close()
Esempio n. 4
0
def _write_bedpe_header(vcf, header_lines, sample_list, bedpe_out):
    """Write the BEDPE meta lines and column header line to ``bedpe_out``.

    ``header_lines`` (the collected '#'-prefixed input lines) are handed to
    ``vcf.add_header``. If ``vcf.info_list`` then contains an entry whose id
    is SVTYPE, a POS INFO entry is declared through ``vcf.add_info_after``.
    The text returned by ``vcf.get_header()`` is written up to its last
    newline, followed by the tab-separated BEDPE column names (plus FORMAT
    and the sample names when ``sample_list`` is non-empty).
    """
    vcf.add_header(header_lines)
    if "SVTYPE" in [info.id for info in vcf.info_list]:
        vcf.add_info_after(
            "SVTYPE", "POS", 1, 'Integer',
            'Position of the variant described in this record')
    rendered = vcf.get_header()
    # Everything after the last newline is dropped (presumably the VCF
    # column line -- confirm against Vcf.get_header) and replaced below.
    bedpe_out.write(rendered[:rendered.rfind('\n')] + '\n')
    columns = [
        '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
        'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
        'NAME_A', 'REF_A', 'ALT_A', 'NAME_B', 'REF_B', 'ALT_B',
        'INFO_A', 'INFO_B',
    ]
    if sample_list:
        columns = columns + ['FORMAT'] + [str(name) for name in sample_list]
    bedpe_out.write('\t'.join(columns) + '\n')


def vcfToBedpe(vcf_file, bedpe_out):
    """Convert the VCF lines in ``vcf_file`` to BEDPE on ``bedpe_out``.

    Header lines are collected first; ``##fileformat`` is rewritten to BEDPE
    and ``##fileDate`` to the current date. The header is written once,
    before the first record, or at end of input when the file holds header
    lines but no records.

    Records whose SVTYPE is not BND are converted and written immediately.
    A BND record is paired with its mate when the record named by its MATEID
    has already been seen (the SECONDARY flag is not consulted, because some
    callers such as Manta do not set it). Otherwise the record is held,
    keyed by its ID. Records still held at end of input are written on their
    own, each preceded by a warning on stderr.

    Args:
        vcf_file: iterable yielding the lines of a VCF file.
        bedpe_out: writable text stream; it is closed before returning.
    """
    converter = VcfToBedpeConverter()
    vcf = svtools.vcf.file.Vcf()
    in_header = True
    header = []
    sample_list = []
    # BND records still waiting for their mate, keyed by record ID.
    bnds = dict()
    for line in vcf_file:
        if in_header:
            if line.startswith('##'):
                meta_key = line.split('=')[0]
                if meta_key == '##fileformat':
                    line = '##fileformat=' + "BEDPE" + '\n'
                elif meta_key == '##fileDate':
                    line = '##fileDate=' + time.strftime('%Y%m%d') + '\n'
                header.append(line)
                continue
            if line.startswith('#'):
                # Column line: anything past the ninth column is a sample.
                sample_list = line.rstrip().split('\t')[9:]
                header.append(line)
                continue
            # First record reached: flush the header before converting it.
            in_header = False
            _write_bedpe_header(vcf, header, sample_list, bedpe_out)

        var = svtools.vcf.variant.Variant(line.rstrip().split('\t'), vcf)
        var.set_info("POS", var.pos)
        if var.info['SVTYPE'] != 'BND':
            bedpe_out.write(str(converter.convert(var)) + '\n')
            continue

        # The EVENT ID is not unique, so records are keyed by the ID column.
        if 'MATEID' in var.info and var.info['MATEID'] in bnds:
            # The mate was seen earlier: emit the pair, earlier record first.
            first = bnds.pop(var.info['MATEID'])
            bedpe_out.write(str(converter.convert(first, var)) + '\n')
        else:
            bnds[var.var_id] = var

    if in_header and header:
        # The input held header lines only; still emit a BEDPE header.
        _write_bedpe_header(vcf, header, sample_list, bedpe_out)

    for bnd in bnds:
        sys.stderr.write(
            'Warning: missing secondary multiline variant at ID:' + bnd +
            '\n')
        bedpe_out.write(str(converter.convert(bnds[bnd], None)) + '\n')

    # close the files
    bedpe_out.close()