Esempio n. 1
0
def main():
    """Annotate every record of the input VCF with frequency/depth fields.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. For each
    record, the FORMAT field ``FREQ`` of every sample is set to
    ``norm_freq(call.aaf)``, and the INFO fields ``SFREQ`` and ``SDP`` are
    set to the maximum allele frequency and the maximum depth found among
    the samples of that record.

    An exception raised while aggregating the per-sample values is reported
    with ``print`` and then re-raised unchanged.
    """
    global options, args

    with open(options.input_vcf, 'r') as input_handle:
        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        # Header definitions that already exist in the input must be used
        # again when parsing its records, so remember them before they are
        # overwritten for the output header. The keys looked up here are
        # the same keys that are overwritten and restored below.
        former_vcfformat_freq = (input_vcf.formats['FREQ']
                                 if 'FREQ' in input_vcf.formats else None)
        former_vcfinfo_sfreq = (input_vcf.infos['SFREQ']
                                if 'SFREQ' in input_vcf.infos else None)
        former_vcfinfo_sdp = (input_vcf.infos['SDP']
                              if 'SDP' in input_vcf.infos else None)

        input_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String',
                                              'Variant allele frequency')
        input_vcf.infos['SFREQ'] = VcfInfo(
            'SFREQ', 1, 'Float',
            'Maximum variant allele frequency of all samples')
        input_vcf.infos['SDP'] = VcfInfo(
            'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

        with open(options.output_vcf, 'w') as output_handle:
            # The writer is built while the new definitions are in place.
            output_vcf = vcf.Writer(output_handle,
                                    input_vcf,
                                    lineterminator='\n')

            # Put back the original definitions for reading the records.
            if former_vcfformat_freq is not None:
                input_vcf.formats['FREQ'] = former_vcfformat_freq
            if former_vcfinfo_sfreq is not None:
                input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
            if former_vcfinfo_sdp is not None:
                input_vcf.infos['SDP'] = former_vcfinfo_sdp

            for record in input_vcf:
                if 'FREQ' not in record.FORMAT.split(':'):
                    record.add_format('FREQ')

                # Default values for added INFO fields
                site_freq = None
                site_depth = 0

                # iterate over all call objects of record
                for call in record.samples:
                    # Allele frequency and Depth evaluation among samples
                    try:
                        if call.aaf is not None:
                            site_freq = max(site_freq, max(call.aaf))
                        if call.depth is not None:
                            site_depth = max(call.depth, site_depth)
                    except Exception:
                        print "ERROR: unforeseen exception when normalizing record:", record
                        raise
                    call.add_format('FREQ', norm_freq(call.aaf))
                    # TODO: unfortunately GATK filtering doesn't yet deal correctly with "None" (.) values
                    # This reset stays inside the loop on purpose: a
                    # missing value must not outrank the frequency of a
                    # later sample in the max() above.
                    if site_freq is None or site_freq == '.':
                        site_freq = 0

                # The site-level values are final only after every sample
                # has been seen, so they are stored once per record.
                if site_freq is None:
                    site_freq = 0
                record.add_info('SFREQ', site_freq)
                record.add_info('SDP', site_depth)
                output_vcf.write_record(record)
def main():
    """Annotate every record of the input VCF with frequency/depth fields.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. For each
    record, the FORMAT field ``FREQ`` of every sample is set to
    ``norm_freq(call.aaf)``, and the INFO fields ``SFREQ`` and ``SDP`` are
    set to the maximum allele frequency and the maximum depth found among
    the samples of that record.

    An exception raised while aggregating the per-sample values is reported
    with ``print`` and then re-raised unchanged.
    """
    global options, args

    with open(options.input_vcf, 'r') as input_handle:
        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        # Header definitions that already exist in the input must be used
        # again when parsing its records, so remember them before they are
        # overwritten for the output header. The keys looked up here are
        # the same keys that are overwritten and restored below.
        former_vcfformat_freq = input_vcf.formats['FREQ'] if 'FREQ' in input_vcf.formats else None
        former_vcfinfo_sfreq = input_vcf.infos['SFREQ'] if 'SFREQ' in input_vcf.infos else None
        former_vcfinfo_sdp = input_vcf.infos['SDP'] if 'SDP' in input_vcf.infos else None

        input_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String', 'Variant allele frequency')
        input_vcf.infos['SFREQ'] = VcfInfo('SFREQ', 1, 'Float', 'Maximum variant allele frequency of all samples')
        input_vcf.infos['SDP'] = VcfInfo('SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

        with open(options.output_vcf, 'w') as output_handle:
            # The writer is built while the new definitions are in place.
            output_vcf = vcf.Writer(output_handle, input_vcf, lineterminator='\n')

            # Put back the original definitions for reading the records.
            if former_vcfformat_freq is not None:
                input_vcf.formats['FREQ'] = former_vcfformat_freq
            if former_vcfinfo_sfreq is not None:
                input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
            if former_vcfinfo_sdp is not None:
                input_vcf.infos['SDP'] = former_vcfinfo_sdp

            for record in input_vcf:
                if 'FREQ' not in record.FORMAT.split(':'):
                    record.add_format('FREQ')

                # Default values for added INFO fields
                site_freq = None
                site_depth = 0

                # iterate over all call objects of record
                for call in record.samples:
                    # Allele frequency and Depth evaluation among samples
                    try:
                        if call.aaf is not None:
                            site_freq = max(site_freq, max(call.aaf))
                        if call.depth is not None:
                            site_depth = max(call.depth, site_depth)
                    except Exception:
                        print "ERROR: unforeseen exception when normalizing record:", record
                        raise
                    call.add_format('FREQ', norm_freq(call.aaf))
                    # TODO: unfortunately GATK filtering doesn't yet deal correctly with "None" (.) values
                    # This reset stays inside the loop on purpose: a
                    # missing value must not outrank the frequency of a
                    # later sample in the max() above.
                    if site_freq is None or site_freq == '.':
                        site_freq = 0

                # The site-level values are final only after every sample
                # has been seen, so they are stored once per record.
                if site_freq is None:
                    site_freq = 0
                record.add_info('SFREQ', site_freq)
                record.add_info('SDP', site_depth)
                output_vcf.write_record(record)
Esempio n. 3
0
def _is_snv_like(alt, ref):
    """Tell whether *alt* has the length of *ref* and the same tail after the first base."""
    return len(alt) == len(ref) and alt.sequence[1:] == ref[1:]


def _recode_genotype(call, alternatives):
    """Rewrite the GT of *call* so its allele numbers index into *alternatives*.

    Alleles not found in *alternatives* are encoded as '0'. Calls whose
    ``gt_nums`` is None are left untouched.
    """
    if call.gt_nums is None:
        return
    phase_char = call.gt_phase_char()
    gt_alleles = []
    for allele in call.gt_bases.split(phase_char):
        try:
            gt_alleles.append(str(alternatives.index(allele) + 1))
        except ValueError:
            gt_alleles.append('0')
    new_gt = phase_char.join(gt_alleles)
    call.add_format('GT', new_gt)
    call.gt_nums = new_gt


def _dedup_alternatives(record):
    """Drop repeated alternative alleles of *record* and renumber the genotypes."""
    new_alt = []
    for alternative in record.ALT:
        if alternative not in new_alt:
            new_alt.append(alternative)
    for call in record.samples:
        _recode_genotype(call, new_alt)
    record.ALT = new_alt
    record.alleles = [record.REF] + new_alt


def _assign_allele_freqs(record):
    """Store one FREQ entry per alternative allele on every call of *record*.

    Raises ValueError for a genotype that none of the handled cases covers.
    """
    for call in record.samples:
        # Assign allele frequency to those alleles predicted by variant caller
        freqs = []
        for index in range(1, len(record.ALT) + 1):
            if call.gt_nums == '0/0' or call.gt_nums == '0|0':
                freqs.append(norm_freq(call.aaf[0]))
            elif call.gt_alleles is None:
                freqs.append(None)
            elif str(index) in call.gt_alleles and (
                    '0' in call.gt_alleles or call.gt_type == 2):
                freqs.append(norm_freq(call.aaf[0]))
            elif str(index) not in call.gt_alleles:
                freqs.append(None)
            else:
                print "ERROR: unforeseen case when obtaining freqs!!"
                print "record:", record
                # A bare ``raise`` has no active exception to re-raise
                # here, so raise an explicit error instead.
                raise ValueError("unforeseen case when obtaining freqs")
        call.add_format('FREQ', freqs)


def _single_alt_record(record, ref, alt, alt_index, alternatives):
    """Build a copy of *record* that carries only *alt* (with reference *ref*).

    Each copied call keeps the FREQ entry at *alt_index* and gets its
    genotype renumbered against *alternatives*.
    """
    samples = [Call(call.site, call.sample, call.data.copy())
               for call in record.samples]
    part = Record(record.CHROM, record.POS, record.ID, ref, [alt],
                  record.QUAL, record.FILTER, record.INFO, record.FORMAT,
                  record._sample_indexes, samples)
    for call in part.samples:
        call.add_format('FREQ', call.get_format('FREQ')[alt_index])
        _recode_genotype(call, alternatives)
    return part


def _split_hybrid_record(record):
    """Split *record* into an SNV part and an indel part.

    Returns ``(snv_record, indel_record)``; either item is None when the
    record has no alternative of that kind. Raises ValueError when two
    partial records cannot be merged.
    """
    # Creation of lists containing indels and snvs
    snv_alternatives = []
    indel_alternatives = []
    for alt in record.ALT:
        if _is_snv_like(alt, record.REF):
            bucket = snv_alternatives
        else:
            bucket = indel_alternatives
        if alt not in bucket:
            bucket.append(alt)

    snv_record = None
    indel_record = None
    for index, alt in enumerate(record.ALT):
        # The same predicate as above decides the branch, so an
        # equal-length alternative that is not SNV-like is emitted with the
        # indels instead of being dropped.
        if _is_snv_like(alt, record.REF):
            print "INFO: splitting hybrid record (containing both SNP and indel)", record
            part = _single_alt_record(record, record.REF[0],
                                      Substitution(alt.sequence[0]), index,
                                      snv_alternatives)
            if snv_record is None:
                snv_record = part
            elif not snv_record.merge(part):
                print "ERROR: Impossible to split record", record
                raise ValueError("Impossible to split record")
        else:
            # Indel
            part = _single_alt_record(record, record.REF, alt, index,
                                      indel_alternatives)
            if indel_record is None:
                indel_record = part
            elif not indel_record.merge(part):
                print "ERROR: Impossible to split record", record
                raise ValueError("Impossible to split record")
    return snv_record, indel_record


def main():
    """Repair known problems of VarScan VCF files and write the result.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. Per record:
    repeated alternative alleles are removed, a FREQ value is stored for
    every alternative allele, and records mixing SNV and indel alternatives
    are written as separate records.

    Raises ValueError when a genotype cannot be mapped to FREQ values or
    when a hybrid record cannot be split.
    """
    global options, args

    # Create template for outputting from input file
    with open(options.input_vcf, 'r') as template_handle:
        template_vcf = vcf.Reader(template_handle)
        template_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String',
                                                 'Variant allele frequency')

        # Open and parse each line of the vcf file
        with open(options.input_vcf, 'r') as input_handle:
            input_vcf = vcf.Reader(input_handle)
            with open(options.output_vcf, 'w') as output_handle:
                repaired_vcf = vcf.Writer(output_handle,
                                          template_vcf,
                                          lineterminator='\n')
                for record in input_vcf:
                    if len(record.ALT) > 1:
                        # Problem 1 in VarScan files: alternative allele repeated. Output unique alternative alleles
                        _dedup_alternatives(record)
                        # Problem 2 in VarScan files: only a FREQ value is stored, even in case of multiple alternative alleles
                        _assign_allele_freqs(record)
                    else:
                        for call in record.samples:
                            call.add_format('FREQ', norm_freq(call.aaf))

                    # Problem 3: hybrid records (both SNP and Indel in a single record)
                    snv_record = None
                    indel_record = None
                    if record.is_indel and len(record.ALT) > 1:
                        snv_record, indel_record = _split_hybrid_record(record)

                    if snv_record is not None:
                        repaired_vcf.write_record(snv_record)
                        # There is no indel part when every alternative
                        # was SNV-like.
                        if indel_record is not None:
                            repaired_vcf.write_record(indel_record)
                    else:
                        repaired_vcf.write_record(record)
def _is_snv_like(alt, ref):
    """Tell whether *alt* has the length of *ref* and the same tail after the first base."""
    return len(alt) == len(ref) and alt.sequence[1:] == ref[1:]


def _recode_genotype(call, alternatives):
    """Rewrite the GT of *call* so its allele numbers index into *alternatives*.

    Alleles not found in *alternatives* are encoded as '0'. Calls whose
    ``gt_nums`` is None are left untouched.
    """
    if call.gt_nums is None:
        return
    phase_char = call.gt_phase_char()
    gt_alleles = []
    for allele in call.gt_bases.split(phase_char):
        try:
            gt_alleles.append(str(alternatives.index(allele)+1))
        except ValueError:
            gt_alleles.append('0')
    new_gt = phase_char.join(gt_alleles)
    call.add_format('GT', new_gt)
    call.gt_nums = new_gt


def _dedup_alternatives(record):
    """Drop repeated alternative alleles of *record* and renumber the genotypes."""
    new_alt = []
    for alternative in record.ALT:
        if alternative not in new_alt:
            new_alt.append(alternative)
    for call in record.samples:
        _recode_genotype(call, new_alt)
    record.ALT = new_alt
    record.alleles = [record.REF] + new_alt


def _assign_allele_freqs(record):
    """Store one FREQ entry per alternative allele on every call of *record*.

    Raises ValueError for a genotype that none of the handled cases covers.
    """
    for call in record.samples:
        # Assign allele frequency to those alleles predicted by variant caller
        freqs = []
        for index in range(1, len(record.ALT) + 1):
            if call.gt_nums == '0/0' or call.gt_nums == '0|0':
                freqs.append(norm_freq(call.aaf[0]))
            elif call.gt_alleles is None:
                freqs.append(None)
            elif str(index) in call.gt_alleles and ('0' in call.gt_alleles or call.gt_type == 2):
                freqs.append(norm_freq(call.aaf[0]))
            elif str(index) not in call.gt_alleles:
                freqs.append(None)
            else:
                print "ERROR: unforeseen case when obtaining freqs!!"
                print "record:", record
                # A bare ``raise`` has no active exception to re-raise
                # here, so raise an explicit error instead.
                raise ValueError("unforeseen case when obtaining freqs")
        call.add_format('FREQ', freqs)


def _single_alt_record(record, ref, alt, alt_index, alternatives):
    """Build a copy of *record* that carries only *alt* (with reference *ref*).

    Each copied call keeps the FREQ entry at *alt_index* and gets its
    genotype renumbered against *alternatives*.
    """
    samples = [Call(call.site, call.sample, call.data.copy()) for call in record.samples]
    part = Record(record.CHROM, record.POS, record.ID, ref, [alt],
                  record.QUAL, record.FILTER, record.INFO, record.FORMAT,
                  record._sample_indexes, samples)
    for call in part.samples:
        call.add_format('FREQ', call.get_format('FREQ')[alt_index])
        _recode_genotype(call, alternatives)
    return part


def _split_hybrid_record(record):
    """Split *record* into an SNV part and an indel part.

    Returns ``(snv_record, indel_record)``; either item is None when the
    record has no alternative of that kind. Raises ValueError when two
    partial records cannot be merged.
    """
    # Creation of lists containing indels and snvs
    snv_alternatives = []
    indel_alternatives = []
    for alt in record.ALT:
        if _is_snv_like(alt, record.REF):
            bucket = snv_alternatives
        else:
            bucket = indel_alternatives
        if alt not in bucket:
            bucket.append(alt)

    snv_record = None
    indel_record = None
    for index, alt in enumerate(record.ALT):
        # The same predicate as above decides the branch, so an
        # equal-length alternative that is not SNV-like is emitted with the
        # indels instead of being dropped.
        if _is_snv_like(alt, record.REF):
            print "INFO: splitting hybrid record (containing both SNP and indel)", record
            part = _single_alt_record(record, record.REF[0], Substitution(alt.sequence[0]),
                                      index, snv_alternatives)
            if snv_record is None:
                snv_record = part
            elif not snv_record.merge(part):
                print "ERROR: Impossible to split record", record
                raise ValueError("Impossible to split record")
        else:
            # Indel
            part = _single_alt_record(record, record.REF, alt, index, indel_alternatives)
            if indel_record is None:
                indel_record = part
            elif not indel_record.merge(part):
                print "ERROR: Impossible to split record", record
                raise ValueError("Impossible to split record")
    return snv_record, indel_record


def main():
    """Repair known problems of VarScan VCF files and write the result.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. Per record:
    repeated alternative alleles are removed, a FREQ value is stored for
    every alternative allele, and records mixing SNV and indel alternatives
    are written as separate records.

    Raises ValueError when a genotype cannot be mapped to FREQ values or
    when a hybrid record cannot be split.
    """
    global options, args

    # Create template for outputting from input file
    with open(options.input_vcf, 'r') as template_handle:
        template_vcf = vcf.Reader(template_handle)
        template_vcf.formats['FREQ'] = VcfFormat('FREQ', None, 'String', 'Variant allele frequency')

        # Open and parse each line of the vcf file
        with open(options.input_vcf, 'r') as input_handle:
            input_vcf = vcf.Reader(input_handle)
            with open(options.output_vcf, 'w') as output_handle:
                repaired_vcf = vcf.Writer(output_handle, template_vcf, lineterminator='\n')
                for record in input_vcf:
                    if len(record.ALT) > 1:
                        # Problem 1 in VarScan files: alternative allele repeated. Output unique alternative alleles
                        _dedup_alternatives(record)
                        # Problem 2 in VarScan files: only a FREQ value is stored, even in case of multiple alternative alleles
                        _assign_allele_freqs(record)
                    else:
                        for call in record.samples:
                            call.add_format('FREQ', norm_freq(call.aaf))

                    # Problem 3: hybrid records (both SNP and Indel in a single record)
                    snv_record = None
                    indel_record = None
                    if record.is_indel and len(record.ALT) > 1:
                        snv_record, indel_record = _split_hybrid_record(record)

                    if snv_record is not None:
                        repaired_vcf.write_record(snv_record)
                        # There is no indel part when every alternative
                        # was SNV-like.
                        if indel_record is not None:
                            repaired_vcf.write_record(indel_record)
                    else:
                        repaired_vcf.write_record(record)