def main():
    """Annotate a VCF with per-sample FREQ and per-site SFREQ / SDP values.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. For each
    record, every sample receives a ``FREQ`` FORMAT value computed by
    ``norm_freq(call.aaf)``; the record receives ``SFREQ`` (the largest
    allele frequency seen across its samples, 0 when none is available) and
    ``SDP`` (the largest ``call.depth`` across its samples, 0 by default).

    Any exception raised while aggregating a record is reported on stdout
    and re-raised unchanged.
    """
    global options, args

    with open(options.input_vcf, 'r') as input_handle:
        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        # If FREQ (FORMAT) or SFREQ / SDP (INFO) are already declared in the
        # input, remember those declarations: they are put back once the
        # writer has been created so the input is read with its own header.
        # The keys looked up here must be the same keys that are restored
        # below.
        former_vcfformat_freq = (input_vcf.formats['FREQ']
                                 if 'FREQ' in input_vcf.formats else None)
        former_vcfinfo_sfreq = (input_vcf.infos['SFREQ']
                                if 'SFREQ' in input_vcf.infos else None)
        former_vcfinfo_sdp = (input_vcf.infos['SDP']
                              if 'SDP' in input_vcf.infos else None)

        input_vcf.formats['FREQ'] = VcfFormat(
            'FREQ', None, 'String', 'Variant allele frequency')
        input_vcf.infos['SFREQ'] = VcfInfo(
            'SFREQ', 1, 'Float',
            'Maximum variant allele frequency of all samples')
        input_vcf.infos['SDP'] = VcfInfo(
            'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

        with open(options.output_vcf, 'w') as output_handle:
            # The reader doubles as the writer's template while the new
            # declarations are installed (presumably the writer captures the
            # header at construction time -- confirm against vcf.Writer).
            output_vcf = vcf.Writer(output_handle, input_vcf,
                                    lineterminator='\n')

            if former_vcfformat_freq is not None:
                input_vcf.formats['FREQ'] = former_vcfformat_freq
            if former_vcfinfo_sfreq is not None:
                input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
            if former_vcfinfo_sdp is not None:
                input_vcf.infos['SDP'] = former_vcfinfo_sdp

            for record in input_vcf:
                if 'FREQ' not in record.FORMAT.split(':'):
                    record.add_format('FREQ')

                # Default values for added INFO fields
                site_freq = None
                site_depth = 0

                # iterate over all call objects of record
                for call in record.samples:
                    # Allele frequency and depth evaluation among samples
                    try:
                        if call.aaf is not None:
                            call_freq = max(call.aaf)
                            # Explicit None check instead of max(None, x),
                            # which relies on Python 2 mixed-type ordering.
                            if site_freq is None or call_freq > site_freq:
                                site_freq = call_freq
                        if call.depth is not None:
                            site_depth = max(call.depth, site_depth)
                    except Exception:
                        print("ERROR: unforeseen exception when normalizing "
                              "record: %s" % (record,))
                        raise

                    call.add_format('FREQ', norm_freq(call.aaf))

                # TODO: unfortunately GATK filtering doesn't yet deal
                # correctly with "None" (.) values
                if site_freq is None or site_freq == '.':
                    site_freq = 0

                record.add_info('SFREQ', site_freq)
                record.add_info('SDP', site_depth)

                output_vcf.write_record(record)
def main():
    """Annotate a VCF with per-sample FREQ and per-site SFREQ / SDP values.

    Reads ``options.input_vcf`` and writes ``options.output_vcf``. For each
    record, every sample receives a ``FREQ`` FORMAT value computed by
    ``norm_freq(call.aaf)``; the record receives ``SFREQ`` (the largest
    allele frequency seen across its samples, 0 when none is available) and
    ``SDP`` (the largest ``call.depth`` across its samples, 0 by default).

    Any exception raised while aggregating a record is reported on stdout
    and re-raised unchanged.
    """
    global options, args

    with open(options.input_vcf, 'r') as input_handle:
        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        # If FREQ (FORMAT) or SFREQ / SDP (INFO) are already declared in the
        # input, remember those declarations: they are put back once the
        # writer has been created so the input is read with its own header.
        # The keys looked up here must be the same keys that are restored
        # below.
        former_vcfformat_freq = (input_vcf.formats['FREQ']
                                 if 'FREQ' in input_vcf.formats else None)
        former_vcfinfo_sfreq = (input_vcf.infos['SFREQ']
                                if 'SFREQ' in input_vcf.infos else None)
        former_vcfinfo_sdp = (input_vcf.infos['SDP']
                              if 'SDP' in input_vcf.infos else None)

        input_vcf.formats['FREQ'] = VcfFormat(
            'FREQ', None, 'String', 'Variant allele frequency')
        input_vcf.infos['SFREQ'] = VcfInfo(
            'SFREQ', 1, 'Float',
            'Maximum variant allele frequency of all samples')
        input_vcf.infos['SDP'] = VcfInfo(
            'SDP', 1, 'Integer', 'Maximum sequencing depth of all samples')

        with open(options.output_vcf, 'w') as output_handle:
            # The reader doubles as the writer's template while the new
            # declarations are installed (presumably the writer captures the
            # header at construction time -- confirm against vcf.Writer).
            output_vcf = vcf.Writer(output_handle, input_vcf,
                                    lineterminator='\n')

            if former_vcfformat_freq is not None:
                input_vcf.formats['FREQ'] = former_vcfformat_freq
            if former_vcfinfo_sfreq is not None:
                input_vcf.infos['SFREQ'] = former_vcfinfo_sfreq
            if former_vcfinfo_sdp is not None:
                input_vcf.infos['SDP'] = former_vcfinfo_sdp

            for record in input_vcf:
                if 'FREQ' not in record.FORMAT.split(':'):
                    record.add_format('FREQ')

                # Default values for added INFO fields
                site_freq = None
                site_depth = 0

                # iterate over all call objects of record
                for call in record.samples:
                    # Allele frequency and depth evaluation among samples
                    try:
                        if call.aaf is not None:
                            call_freq = max(call.aaf)
                            # Explicit None check instead of max(None, x),
                            # which relies on Python 2 mixed-type ordering.
                            if site_freq is None or call_freq > site_freq:
                                site_freq = call_freq
                        if call.depth is not None:
                            site_depth = max(call.depth, site_depth)
                    except Exception:
                        print("ERROR: unforeseen exception when normalizing "
                              "record: %s" % (record,))
                        raise

                    call.add_format('FREQ', norm_freq(call.aaf))

                # TODO: unfortunately GATK filtering doesn't yet deal
                # correctly with "None" (.) values
                if site_freq is None or site_freq == '.':
                    site_freq = 0

                record.add_info('SFREQ', site_freq)
                record.add_info('SDP', site_depth)

                output_vcf.write_record(record)
def _renumber_genotype(call, alternatives):
    """Rewrite the GT of *call* as 1-based positions within *alternatives*.

    Alleles that do not occur in *alternatives* are written as '0'. Calls
    whose ``gt_nums`` is None are left untouched.
    """
    if call.gt_nums is None:
        return
    phase_char = call.gt_phase_char()
    numbers = []
    for allele in call.gt_bases.split(phase_char):
        try:
            numbers.append(str(alternatives.index(allele) + 1))
        except ValueError:
            numbers.append('0')
    genotype = phase_char.join(numbers)
    call.add_format('GT', genotype)
    call.gt_nums = genotype


def _deduplicate_alt(record):
    """Problem 1 in VarScan files: drop repeated ALT alleles, renumber GTs."""
    unique_alt = []
    for alternative in record.ALT:
        if alternative not in unique_alt:
            unique_alt.append(alternative)
    for call in record.samples:
        _renumber_genotype(call, unique_alt)
    record.ALT = unique_alt
    record.alleles = [record.REF] + unique_alt


def _assign_multiallelic_freqs(record):
    """Problem 2 in VarScan files: store one FREQ entry per ALT allele.

    Only a single frequency is available per call, so it is assigned to the
    alleles named by the genotype and None to the others.

    Raises:
        RuntimeError: for a genotype none of the handled cases covers.
    """
    for call in record.samples:
        freqs = []
        for index in range(1, len(record.ALT) + 1):
            allele_number = str(index)
            if call.gt_nums == '0/0' or call.gt_nums == '0|0':
                freqs.append(norm_freq(call.aaf[0]))
            elif call.gt_alleles is None:
                freqs.append(None)
            elif allele_number in call.gt_alleles and (
                    '0' in call.gt_alleles or call.gt_type == 2):
                freqs.append(norm_freq(call.aaf[0]))
            elif allele_number not in call.gt_alleles:
                freqs.append(None)
            else:
                print("ERROR: unforeseen case when obtaining freqs!!")
                print("record: %s" % (record,))
                # A bare ``raise`` here had no active exception to re-raise.
                raise RuntimeError(
                    "unforeseen case when obtaining freqs for record "
                    "%s" % (record,))
        call.add_format('FREQ', freqs)


def _single_alt_record(record, index, ref, alt, alternatives):
    """Copy *record* as a one-ALT record for the ALT at position *index*."""
    calls = [Call(call.site, call.sample, call.data.copy())
             for call in record.samples]
    single = Record(record.CHROM, record.POS, record.ID, ref, [alt],
                    record.QUAL, record.FILTER, record.INFO, record.FORMAT,
                    record._sample_indexes, calls)
    for call in single.samples:
        # Keep only the frequency that belongs to this ALT allele.
        call.add_format('FREQ', call.get_format('FREQ')[index])
        _renumber_genotype(call, alternatives)
    return single


def _merge_or_fail(merged, part, record):
    """Fold *part* into *merged* (or start from it); abort if merge fails."""
    if merged is None:
        return part
    if not merged.merge(part):
        print("ERROR: Impossible to split record %s" % (record,))
        # A bare ``raise`` here had no active exception to re-raise.
        raise RuntimeError("Impossible to split record %s" % (record,))
    return merged


def _split_hybrid(record):
    """Problem 3: split a record mixing SNV-like and indel ALT alleles.

    An ALT is SNV-like when it has the length of REF and differs from it at
    most in the first base. Every other ALT goes to the indel group, which
    keeps the written ALT list in step with the list used to renumber GTs.

    Returns:
        (snv_record, indel_record); either one is None when no ALT allele
        falls into that group.
    """
    def is_snv(alt):
        return (len(alt) == len(record.REF)
                and alt.sequence[1:] == record.REF[1:])

    # Creation of lists containing indels and snvs
    snv_alternatives = []
    indel_alternatives = []
    for alt in record.ALT:
        group = snv_alternatives if is_snv(alt) else indel_alternatives
        if alt not in group:
            group.append(alt)

    snv_record = None
    indel_record = None
    for index, alt in enumerate(record.ALT):
        if is_snv(alt):
            print("INFO: splitting hybrid record (containing both SNP and "
                  "indel) %s" % (record,))
            part = _single_alt_record(record, index, record.REF[0],
                                      Substitution(alt.sequence[0]),
                                      snv_alternatives)
            snv_record = _merge_or_fail(snv_record, part, record)
        else:
            part = _single_alt_record(record, index, record.REF, alt,
                                      indel_alternatives)
            indel_record = _merge_or_fail(indel_record, part, record)
    return snv_record, indel_record


def main():
    """Repair VarScan-specific problems in a VCF file.

    Reads ``options.input_vcf`` and writes ``options.output_vcf`` after
    (1) removing repeated ALT alleles, (2) storing a ``FREQ`` FORMAT value
    per ALT allele, and (3) splitting records that combine SNV-like and
    indel ALT alleles into separate records.

    Raises:
        RuntimeError: when a genotype cannot be mapped to frequencies or a
            hybrid record cannot be split.
    """
    global options, args

    # The input is opened twice: once as the (modified) header template for
    # the writer, once to iterate over the records.
    with open(options.input_vcf, 'r') as template_handle, \
            open(options.input_vcf, 'r') as input_handle:
        # Create template for outputting from input file
        template_vcf = vcf.Reader(template_handle)
        template_vcf.formats['FREQ'] = VcfFormat(
            'FREQ', None, 'String', 'Variant allele frequency')

        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        with open(options.output_vcf, 'w') as output_handle:
            repaired_vcf = vcf.Writer(output_handle, template_vcf,
                                      lineterminator='\n')

            for record in input_vcf:
                if len(record.ALT) > 1:
                    _deduplicate_alt(record)
                    _assign_multiallelic_freqs(record)
                else:
                    for call in record.samples:
                        call.add_format('FREQ', norm_freq(call.aaf))

                snv_record = None
                indel_record = None
                if record.is_indel and len(record.ALT) > 1:
                    snv_record, indel_record = _split_hybrid(record)

                if snv_record is not None:
                    repaired_vcf.write_record(snv_record)
                    # Every ALT may have been SNV-like, leaving no indel
                    # part to write.
                    if indel_record is not None:
                        repaired_vcf.write_record(indel_record)
                else:
                    repaired_vcf.write_record(record)
def _renumber_genotype(call, alternatives):
    """Rewrite the GT of *call* as 1-based positions within *alternatives*.

    Alleles that do not occur in *alternatives* are written as '0'. Calls
    whose ``gt_nums`` is None are left untouched.
    """
    if call.gt_nums is None:
        return
    phase_char = call.gt_phase_char()
    numbers = []
    for allele in call.gt_bases.split(phase_char):
        try:
            numbers.append(str(alternatives.index(allele) + 1))
        except ValueError:
            numbers.append('0')
    genotype = phase_char.join(numbers)
    call.add_format('GT', genotype)
    call.gt_nums = genotype


def _deduplicate_alt(record):
    """Problem 1 in VarScan files: drop repeated ALT alleles, renumber GTs."""
    unique_alt = []
    for alternative in record.ALT:
        if alternative not in unique_alt:
            unique_alt.append(alternative)
    for call in record.samples:
        _renumber_genotype(call, unique_alt)
    record.ALT = unique_alt
    record.alleles = [record.REF] + unique_alt


def _assign_multiallelic_freqs(record):
    """Problem 2 in VarScan files: store one FREQ entry per ALT allele.

    Only a single frequency is available per call, so it is assigned to the
    alleles named by the genotype and None to the others.

    Raises:
        RuntimeError: for a genotype none of the handled cases covers.
    """
    for call in record.samples:
        freqs = []
        for index in range(1, len(record.ALT) + 1):
            allele_number = str(index)
            if call.gt_nums == '0/0' or call.gt_nums == '0|0':
                freqs.append(norm_freq(call.aaf[0]))
            elif call.gt_alleles is None:
                freqs.append(None)
            elif allele_number in call.gt_alleles and (
                    '0' in call.gt_alleles or call.gt_type == 2):
                freqs.append(norm_freq(call.aaf[0]))
            elif allele_number not in call.gt_alleles:
                freqs.append(None)
            else:
                print("ERROR: unforeseen case when obtaining freqs!!")
                print("record: %s" % (record,))
                # A bare ``raise`` here had no active exception to re-raise.
                raise RuntimeError(
                    "unforeseen case when obtaining freqs for record "
                    "%s" % (record,))
        call.add_format('FREQ', freqs)


def _single_alt_record(record, index, ref, alt, alternatives):
    """Copy *record* as a one-ALT record for the ALT at position *index*."""
    calls = [Call(call.site, call.sample, call.data.copy())
             for call in record.samples]
    single = Record(record.CHROM, record.POS, record.ID, ref, [alt],
                    record.QUAL, record.FILTER, record.INFO, record.FORMAT,
                    record._sample_indexes, calls)
    for call in single.samples:
        # Keep only the frequency that belongs to this ALT allele.
        call.add_format('FREQ', call.get_format('FREQ')[index])
        _renumber_genotype(call, alternatives)
    return single


def _merge_or_fail(merged, part, record):
    """Fold *part* into *merged* (or start from it); abort if merge fails."""
    if merged is None:
        return part
    if not merged.merge(part):
        print("ERROR: Impossible to split record %s" % (record,))
        # A bare ``raise`` here had no active exception to re-raise.
        raise RuntimeError("Impossible to split record %s" % (record,))
    return merged


def _split_hybrid(record):
    """Problem 3: split a record mixing SNV-like and indel ALT alleles.

    An ALT is SNV-like when it has the length of REF and differs from it at
    most in the first base. Every other ALT goes to the indel group, which
    keeps the written ALT list in step with the list used to renumber GTs.

    Returns:
        (snv_record, indel_record); either one is None when no ALT allele
        falls into that group.
    """
    def is_snv(alt):
        return (len(alt) == len(record.REF)
                and alt.sequence[1:] == record.REF[1:])

    # Creation of lists containing indels and snvs
    snv_alternatives = []
    indel_alternatives = []
    for alt in record.ALT:
        group = snv_alternatives if is_snv(alt) else indel_alternatives
        if alt not in group:
            group.append(alt)

    snv_record = None
    indel_record = None
    for index, alt in enumerate(record.ALT):
        if is_snv(alt):
            print("INFO: splitting hybrid record (containing both SNP and "
                  "indel) %s" % (record,))
            part = _single_alt_record(record, index, record.REF[0],
                                      Substitution(alt.sequence[0]),
                                      snv_alternatives)
            snv_record = _merge_or_fail(snv_record, part, record)
        else:
            part = _single_alt_record(record, index, record.REF, alt,
                                      indel_alternatives)
            indel_record = _merge_or_fail(indel_record, part, record)
    return snv_record, indel_record


def main():
    """Repair VarScan-specific problems in a VCF file.

    Reads ``options.input_vcf`` and writes ``options.output_vcf`` after
    (1) removing repeated ALT alleles, (2) storing a ``FREQ`` FORMAT value
    per ALT allele, and (3) splitting records that combine SNV-like and
    indel ALT alleles into separate records.

    Raises:
        RuntimeError: when a genotype cannot be mapped to frequencies or a
            hybrid record cannot be split.
    """
    global options, args

    # The input is opened twice: once as the (modified) header template for
    # the writer, once to iterate over the records.
    with open(options.input_vcf, 'r') as template_handle, \
            open(options.input_vcf, 'r') as input_handle:
        # Create template for outputting from input file
        template_vcf = vcf.Reader(template_handle)
        template_vcf.formats['FREQ'] = VcfFormat(
            'FREQ', None, 'String', 'Variant allele frequency')

        # Open and parse each line of the vcf file
        input_vcf = vcf.Reader(input_handle)

        with open(options.output_vcf, 'w') as output_handle:
            repaired_vcf = vcf.Writer(output_handle, template_vcf,
                                      lineterminator='\n')

            for record in input_vcf:
                if len(record.ALT) > 1:
                    _deduplicate_alt(record)
                    _assign_multiallelic_freqs(record)
                else:
                    for call in record.samples:
                        call.add_format('FREQ', norm_freq(call.aaf))

                snv_record = None
                indel_record = None
                if record.is_indel and len(record.ALT) > 1:
                    snv_record, indel_record = _split_hybrid(record)

                if snv_record is not None:
                    repaired_vcf.write_record(snv_record)
                    # Every ALT may have been SNV-like, leaving no indel
                    # part to write.
                    if indel_record is not None:
                        repaired_vcf.write_record(indel_record)
                else:
                    repaired_vcf.write_record(record)