def __init__(self, **kwargs):
    """Read converter options, validate the inputs and open all file handles.

    The keyword arguments are passed to ``InputFileConverter.__init__`` first;
    the options specific to this converter are then popped from ``kwargs``.
    ``self.input_file`` and ``self.output_file`` are read here but never
    assigned -- presumably the parent initializer sets them (TODO confirm).

    The program is terminated through ``sys.exit`` with an explanatory
    message when the proximal variants configuration is incomplete, when a
    bgzip/tabix prerequisite is missing, or when the configured sample names
    cannot be matched against the samples of the VCF.
    """
    InputFileConverter.__init__(self, **kwargs)

    # Options that simply default to None when they are not supplied.
    for option_name in (
        'gene_expn_file',
        'transcript_expn_file',
        'normal_snvs_coverage_file',
        'normal_indels_coverage_file',
        'tdna_snvs_coverage_file',
        'tdna_indels_coverage_file',
        'trna_snvs_coverage_file',
        'trna_indels_coverage_file',
        'sample_name',
        'normal_sample_name',
        'proximal_variants_vcf',
        'proximal_variants_tsv',
        'peptide_length',
    ):
        setattr(self, option_name, kwargs.pop(option_name, None))
    self.pass_only = kwargs.pop('pass_only', False)

    proximal_vcf = self.proximal_variants_vcf
    if proximal_vcf:
        # The checks run in a fixed order; the first failure ends the program.
        if not self.proximal_variants_tsv or not self.peptide_length:
            sys.exit("A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided.")
        if not lib.utils.is_gz_file(self.input_file):
            sys.exit("Input VCF {} needs to be bgzipped when running with a proximal variants VCF.".format(self.input_file))
        if not lib.utils.is_gz_file(proximal_vcf):
            sys.exit("Proximal variants VCF {} needs to be bgzipped.".format(proximal_vcf))
        if not os.path.exists(proximal_vcf + '.tbi'):
            sys.exit('No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'.format(proximal_vcf))
        if not os.path.exists(self.input_file + '.tbi'):
            sys.exit('No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'.format(self.input_file))

    # Gzipped input is opened in binary mode, anything else in text mode.
    mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

    if proximal_vcf:
        proximal_columns = [
            'chromosome_name', 'start', 'stop', 'reference', 'variant',
            'amino_acid_change', 'codon_change', 'protein_position', 'type',
            'main_somatic_variant',
        ]
        self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv, 'w')
        self.proximal_variants_writer = csv.DictWriter(
            self.proximal_variants_tsv_fh,
            delimiter='\t',
            fieldnames=proximal_columns,
        )
        self.proximal_variants_writer.writeheader()
        self.proximal_variant_parser = ProximalVariant(proximal_vcf, self.pass_only)
        # A second, independent reader over the input VCF.
        self.somatic_vcf_fh = open(self.input_file, mode)
        self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

    self.reader = open(self.input_file, mode)
    self.vcf_reader = vcf.Reader(self.reader)

    samples = self.vcf_reader.samples
    sample_count = len(samples)
    if sample_count == 0:
        sys.exit("VCF doesn't contain any sample genotype information.")
    elif sample_count == 1:
        # With a single sample there is nothing to choose from.
        self.sample_name = samples[0]
    else:
        if not self.sample_name:
            sys.exit("VCF contains more than one sample but sample_name is not set.")
        elif self.sample_name not in samples:
            sys.exit("sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.sample_name, self.input_file))
        if self.normal_sample_name is not None and self.normal_sample_name not in samples:
            sys.exit("normal_sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.normal_sample_name, self.input_file))

    self.writer = open(self.output_file, 'w')
    self.tsv_writer = csv.DictWriter(
        self.writer,
        delimiter='\t',
        fieldnames=self.output_headers(),
        restval='NA',
    )
    self.tsv_writer.writeheader()
    self.csq_parser = self.create_csq_parser()
def add_proximal_variants(self, somatic_variant_index, wildtype_subsequence, mutation_position, original_position, germline_variants_only):
    """Return ``wildtype_subsequence`` with proximal missense variants applied.

    Args:
        somatic_variant_index: Key into ``self.proximal_variants``; its value
            maps a protein position to the list of proximal variant lines
            recorded for that position.
        wildtype_subsequence: The amino acid string to modify.
        mutation_position: Position of the main mutation inside
            ``wildtype_subsequence``.
        original_position: Position of the main mutation in the coordinates
            used by the proximal variant protein positions.
        germline_variants_only: When true, only lines whose ``'type'`` is
            ``'germline'`` are applied.

    Returns:
        The subsequence with every applicable proximal variant substituted.
        Entries at the main mutation's own position, outside the subsequence,
        or that are not same-length (missense) changes are skipped.

    Exits via ``sys.exit`` when the wildtype amino acid of a proximal variant
    does not match the residue found in ``wildtype_subsequence``.
    """
    mutation_offset = original_position - mutation_position
    wildtype_subsequence_with_proximal_variants = wildtype_subsequence
    if somatic_variant_index not in self.proximal_variants:
        return wildtype_subsequence_with_proximal_variants

    for (protein_position, lines) in self.proximal_variants[somatic_variant_index].items():
        # The main somatic variant itself is not a proximal variant.
        if protein_position == original_position:
            continue

        if germline_variants_only:
            filtered_lines = [line for line in lines if line['type'] == 'germline']
        else:
            filtered_lines = lines

        if len(filtered_lines) == 0:
            continue
        elif len(filtered_lines) == 1:
            line = filtered_lines[0]
            proximal_variant_wildtype_amino_acid, proximal_variant_mutant_amino_acid = line['amino_acid_change'].split('/')
        else:
            # Several variants hit the same residue: resolve them through
            # their codon changes.
            line = filtered_lines[0]
            proximal_variant_wildtype_amino_acid = line['amino_acid_change'].split('/')[0]
            codon_changes = [item['codon_change'] for item in filtered_lines]
            proximal_variant_mutant_amino_acid = ProximalVariant.combine_conflicting_variants(codon_changes)

        # Translate the protein position into a 0-based index of the
        # subsequence.
        proximal_variant_position = int(protein_position) - 1 - mutation_offset
        # Index 0 is the first residue of the subsequence and is valid; only
        # negative indexes and indexes past the end fall outside of it.
        if proximal_variant_position < 0 or proximal_variant_position >= len(wildtype_subsequence):
            continue
        if len(proximal_variant_wildtype_amino_acid) != len(proximal_variant_mutant_amino_acid):
            print("Nearby variant is not a missense mutation. Skipping.")
            continue
        if wildtype_subsequence[proximal_variant_position] != proximal_variant_wildtype_amino_acid:
            sys.exit(
                "Error when processing proximal variant.\n" +
                "The wildtype amino acid for variant %s with substring %s is different than expected.\n" % (somatic_variant_index, wildtype_subsequence) +
                "Actual wildtype amino acid: %s\n" % wildtype_subsequence[proximal_variant_position] +
                "Wildtype amino acid of the proximal_variant: %s" % proximal_variant_wildtype_amino_acid
            )
        wildtype_subsequence_with_proximal_variants = (
            wildtype_subsequence_with_proximal_variants[:proximal_variant_position]
            + proximal_variant_mutant_amino_acid
            + wildtype_subsequence_with_proximal_variants[proximal_variant_position + 1:]
        )
    return wildtype_subsequence_with_proximal_variants
class VcfConverter(InputFileConverter):
    """Convert a VEP-annotated VCF into a tab-separated table of variants.

    One output row is written per (VCF entry, alternate allele, transcript)
    combination whose consequence resolves to a supported variant type.
    Depth, VAF and expression values are read from the genotype fields of the
    VCF. When a proximal variants VCF is configured, variants close to each
    reported variant are written to a second TSV.
    """

    # Percent-encoded byte (for example "%3D") that decode_hex replaces in the
    # HGVSc / HGVSp fields. Inside a character class "|" is a literal, so it
    # must not be used to separate the digit and letter ranges.
    _PERCENT_ESCAPE = r'%[0-9A-F]{2}'

    def __init__(self, **kwargs):
        """Read options, validate the inputs and open readers and writers.

        ``self.input_file`` and ``self.output_file`` are read but not assigned
        here; presumably ``InputFileConverter.__init__`` sets them.

        Exits via ``sys.exit`` when the proximal variants configuration is
        incomplete, a bgzip/tabix prerequisite is missing, the sample names
        do not match the VCF, or the CSQ annotation lacks the
        ``DownstreamProtein`` / ``WildtypeProtein`` fields.
        """
        InputFileConverter.__init__(self, **kwargs)
        self.pass_only = kwargs.pop('pass_only', False)
        self.sample_name = kwargs.pop('sample_name', None)
        self.normal_sample_name = kwargs.pop('normal_sample_name', None)
        self.proximal_variants_vcf = kwargs.pop('proximal_variants_vcf', None)
        self.proximal_variants_tsv = kwargs.pop('proximal_variants_tsv', None)
        self.peptide_length = kwargs.pop('peptide_length', None)

        if self.proximal_variants_vcf:
            # The checks run in order; the first failure ends the program.
            if not (self.proximal_variants_tsv and self.peptide_length):
                sys.exit("A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided.")
            if not lib.utils.is_gz_file(self.input_file):
                sys.exit("Input VCF {} needs to be bgzipped when running with a proximal variants VCF.".format(self.input_file))
            if not lib.utils.is_gz_file(self.proximal_variants_vcf):
                sys.exit("Proximal variants VCF {} needs to be bgzipped.".format(self.proximal_variants_vcf))
            if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
                sys.exit('No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'.format(self.proximal_variants_vcf))
            if not os.path.exists(self.input_file + '.tbi'):
                sys.exit('No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'.format(self.input_file))

        # Gzipped input is opened in binary mode, anything else as text.
        if lib.utils.is_gz_file(self.input_file):
            mode = 'rb'
        else:
            mode = 'r'

        if self.proximal_variants_vcf:
            self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv, 'w')
            self.proximal_variants_writer = csv.DictWriter(
                self.proximal_variants_tsv_fh,
                delimiter='\t',
                fieldnames=[
                    'chromosome_name', 'start', 'stop', 'reference', 'variant',
                    'amino_acid_change', 'codon_change', 'protein_position',
                    'type', 'main_somatic_variant',
                ],
            )
            self.proximal_variants_writer.writeheader()
            self.proximal_variant_parser = ProximalVariant(self.proximal_variants_vcf, self.pass_only)
            # Second reader over the input VCF, used for region fetches in
            # write_proximal_variant_entries.
            self.somatic_vcf_fh = open(self.input_file, mode)
            self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

        self.reader = open(self.input_file, mode)
        self.vcf_reader = vcf.Reader(self.reader)

        if len(self.vcf_reader.samples) > 1:
            if not self.sample_name:
                sys.exit("VCF contains more than one sample but sample_name is not set.")
            elif self.sample_name not in self.vcf_reader.samples:
                sys.exit("sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.sample_name, self.input_file))
            if self.normal_sample_name is not None and self.normal_sample_name not in self.vcf_reader.samples:
                sys.exit("normal_sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.normal_sample_name, self.input_file))
        elif len(self.vcf_reader.samples) == 0:
            sys.exit("VCF doesn't contain any sample genotype information.")
        else:
            # A single-sample VCF leaves no choice.
            self.sample_name = self.vcf_reader.samples[0]

        self.writer = open(self.output_file, 'w')
        self.tsv_writer = csv.DictWriter(self.writer, delimiter='\t', fieldnames=self.output_headers(), restval='NA')
        self.tsv_writer.writeheader()
        self.csq_parser = self.create_csq_parser()
        if 'DownstreamProtein' not in self.csq_parser.csq_format:
            sys.exit("VCF doesn't contain VEP DownstreamProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins.")
        if 'WildtypeProtein' not in self.csq_parser.csq_format:
            sys.exit("VCF doesn't contain VEP WildtypeProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins.")

    def is_insertion(self, ref, alt):
        """Return True when ``alt`` is longer than ``ref``."""
        return len(alt) > len(ref)

    def is_deletion(self, ref, alt):
        """Return True when ``alt`` is shorter than ``ref``."""
        return len(alt) < len(ref)

    def create_csq_parser(self):
        """Build a ``CsqParser`` from the description of the CSQ INFO header.

        Exits via ``sys.exit`` when the VCF has no CSQ header or the header
        entry is ``None``.
        """
        info_fields = self.vcf_reader.infos
        if 'CSQ' not in info_fields:
            sys.exit('Input VCF does not contain a CSQ header. Please annotate the VCF with VEP before running it.')
        csq_header = info_fields['CSQ']
        if csq_header is None:
            sys.exit('Failed to extract format string from info description for tag (CSQ)')
        return CsqParser(csq_header.desc)

    def resolve_consequence(self, consequence_string, ref, alt):
        """Map a consequence string to the variant type used in the output.

        ``consequence_string`` may hold several terms joined by ``&`` or
        ``.``. Returns ``'FS'``, ``'missense'``, ``'inframe_ins'``,
        ``'inframe_del'`` or ``None`` for unsupported consequences. A
        ``start_lost`` term always yields ``None``. A
        ``protein_altering_variant`` is classified by the length difference
        of ``ref`` and ``alt`` when that difference is a multiple of three.
        """
        if '&' in consequence_string:
            consequences = {consequence.lower() for consequence in consequence_string.split('&')}
        elif '.' in consequence_string:
            consequences = {consequence.lower() for consequence in consequence_string.split('.')}
        else:
            consequences = {consequence_string.lower()}

        # The order of the checks defines the priority between terms.
        if 'start_lost' in consequences:
            consequence = None
        elif 'frameshift_variant' in consequences:
            consequence = 'FS'
        elif 'missense_variant' in consequences:
            consequence = 'missense'
        elif 'inframe_insertion' in consequences:
            consequence = 'inframe_ins'
        elif 'inframe_deletion' in consequences:
            consequence = 'inframe_del'
        elif 'protein_altering_variant' in consequences:
            if len(ref) > len(alt) and (len(ref) - len(alt)) % 3 == 0:
                consequence = 'inframe_del'
            elif len(alt) > len(ref) and (len(alt) - len(ref)) % 3 == 0:
                consequence = 'inframe_ins'
            else:
                consequence = None
        else:
            consequence = None
        return consequence

    def calculate_vaf(self, var_count, depth):
        """Return ``var_count / depth``, or ``'NA'`` when ``depth`` is zero."""
        if depth == 0:
            return 'NA'
        else:
            return (var_count / depth)

    def get_depth_from_vcf_genotype(self, genotype, tag):
        """Return the value of ``tag`` in ``genotype``.

        ``'NA'`` is returned when the value is ``None`` or empty, or when the
        lookup raises ``AttributeError``.
        """
        try:
            depth = genotype[tag]
            if depth is None or depth == "":
                depth = 'NA'
        except AttributeError:
            depth = 'NA'
        return depth

    def get_vaf_from_vcf_genotype(self, genotype, alts, alt, af_tag, ad_tag, dp_tag):
        """Return the variant allele frequency of ``alt`` for ``genotype``.

        The value of ``af_tag`` is used when it can be read and compared. If
        that raises ``AttributeError`` or ``TypeError`` the frequency is
        computed from the ``ad_tag`` count and the ``dp_tag`` depth instead.
        Returns ``'NA'`` when the required values are missing or empty.
        """
        try:
            allele_frequencies = genotype[af_tag]
            if isinstance(allele_frequencies, list):
                vaf = allele_frequencies[alts.index(alt)]
            else:
                vaf = allele_frequencies
            # Comparing None raises TypeError and triggers the fallback below.
            if vaf > 1:
                print("Warning: VAF is expected to be a fraction, but is larger than 1. If VAFs are encoded as percentages, please adjust the coverage cutoffs accordingly.")
        except (AttributeError, TypeError):
            try:
                allele_depths = genotype[ad_tag]
                if isinstance(allele_depths, list):
                    # The depth list holds either one value per alternate
                    # allele, or one extra leading value before them.
                    if len(allele_depths) == len(alts):
                        var_count = allele_depths[alts.index(alt)]
                    elif len(allele_depths) == len(alts) + 1:
                        var_count = allele_depths[alts.index(alt) + 1]
                    else:
                        print("Warning: Mismatch between the number of alternate alleles and number of values in the AD field for genotype {}".format(genotype))
                        return 'NA'
                else:
                    var_count = allele_depths
                if var_count is None or var_count == "":
                    return 'NA'
                depth = genotype[dp_tag]
                if depth is None or depth == "":
                    return 'NA'
                vaf = self.calculate_vaf(int(var_count), int(depth))
            except AttributeError:
                vaf = 'NA'
        return vaf

    def calculate_coverage_for_entry(self, entry, reference, alt, start, chromosome, genotype):
        """Collect depth and VAF values for ``alt`` from the genotype fields.

        Returns a dict with ``tdna_*`` values (``DP``/``AF``/``AD`` tags) and
        ``trna_*`` values (``RDP``/``RAF``/``RAD`` tags). ``normal_*`` values
        are added when ``self.normal_sample_name`` is set. ``reference``,
        ``start`` and ``chromosome`` are accepted but not used.
        """
        coverage_for_entry = {}
        coverage_for_entry['tdna_depth'] = self.get_depth_from_vcf_genotype(genotype, 'DP')
        coverage_for_entry['trna_depth'] = self.get_depth_from_vcf_genotype(genotype, 'RDP')
        # Alleles are compared as strings because ``alt`` is a string.
        alts = [str(allele) for allele in entry.ALT]
        coverage_for_entry['tdna_vaf'] = self.get_vaf_from_vcf_genotype(genotype, alts, alt, 'AF', 'AD', 'DP')
        coverage_for_entry['trna_vaf'] = self.get_vaf_from_vcf_genotype(genotype, alts, alt, 'RAF', 'RAD', 'RDP')
        if self.normal_sample_name is not None:
            normal_genotype = entry.genotype(self.normal_sample_name)
            coverage_for_entry['normal_depth'] = self.get_depth_from_vcf_genotype(normal_genotype, 'DP')
            coverage_for_entry['normal_vaf'] = self.get_vaf_from_vcf_genotype(normal_genotype, alts, alt, 'AF', 'AD', 'DP')
        return coverage_for_entry

    def write_proximal_variant_entries(self, entry, alt, transcript_name, index):
        """Write one proximal variants TSV row per variant found near ``entry``.

        A proximal variant is labelled ``'somatic'`` when the input VCF has a
        record at its position and ``'germline'`` otherwise. ``index`` is
        stored as ``main_somatic_variant`` to link the row to the main table.
        """
        proximal_variants = self.proximal_variant_parser.extract(entry, alt, transcript_name, self.peptide_length)
        for (proximal_variant, csq_entry) in proximal_variants:
            overlapping_records = list(
                self.somatic_vcf_reader.fetch(proximal_variant.CHROM, proximal_variant.POS - 1, proximal_variant.POS)
            )
            if len(overlapping_records) > 0:
                proximal_variant_type = 'somatic'
            else:
                proximal_variant_type = 'germline'

            # Keep only the part before any "/" in the position.
            protein_position = csq_entry['Protein_position'].split('/')[0]

            proximal_variant_entry = {
                'chromosome_name': proximal_variant.CHROM,
                'start': proximal_variant.affected_start,
                'stop': proximal_variant.affected_end,
                'reference': proximal_variant.REF,
                'variant': proximal_variant.ALT[0],
                'amino_acid_change': csq_entry['Amino_acids'],
                'codon_change': csq_entry['Codons'],
                'protein_position': protein_position,
                'type': proximal_variant_type,
                'main_somatic_variant': index,
            }
            self.proximal_variants_writer.writerow(proximal_variant_entry)

    def close_filehandles(self):
        """Close every file handle opened by ``__init__``."""
        self.writer.close()
        self.reader.close()
        if self.proximal_variants_vcf:
            self.proximal_variant_parser.fh.close()
            self.proximal_variants_tsv_fh.close()
            self.somatic_vcf_fh.close()

    def decode_hex(self, string):
        """Return the character for a matched percent escape.

        ``string`` is a regex match object whose matched text is ``%`` plus
        two hexadecimal digits.
        """
        hex_string = string.group(0).replace('%', '')
        return binascii.unhexlify(hex_string).decode('utf-8')

    def _read_entries(self):
        """Yield VCF records, turning reader failures into clearer errors."""
        while True:
            try:
                entry = next(self.vcf_reader)
            except StopIteration:
                return
            except ValueError as e:
                # Report the quoted fragment of the message when there is one;
                # otherwise fall back to the whole message instead of failing
                # with an IndexError that would hide the real problem.
                message_parts = str(e).split("'")
                near = message_parts[1] if len(message_parts) > 1 else str(e)
                raise Exception("VCF is truncated in the middle of an entry near string '{}'".format(near)) from e
            except IndexError as e:
                raise Exception("VCF is truncated at the end of the file") from e
            except Exception as e:
                raise Exception("Error while reading VCF entry: {}".format(str(e))) from e
            yield entry

    def _add_expression_values(self, output_row, genotype, transcript_name, ensembl_gene_id, gene_name):
        """Copy matching ``TX`` / ``GX`` genotype values into ``output_row``.

        Each value has the form ``identifier|value``. A ``TX`` value is used
        when its identifier equals the transcript name, a ``GX`` value when it
        equals the Ensembl gene id or the gene name.
        """
        lookups = (
            ('TX', 'transcript_expression', [transcript_name]),
            ('GX', 'gene_expression', [ensembl_gene_id, gene_name]),
        )
        for (tag, key, comparison_fields) in lookups:
            if tag not in self.vcf_reader.formats:
                continue
            if tag not in genotype.data._asdict():
                continue
            expressions = genotype[tag]
            if expressions is None:
                continue
            if not isinstance(expressions, list):
                expressions = [expressions]
            for expression in expressions:
                (item, value) = expression.split('|')
                if item in comparison_fields:
                    output_row[key] = value

    def execute(self):
        """Convert every eligible VCF entry and write the output TSV.

        Entries are skipped when the sample genotype is uncalled or
        homozygous reference, when ``pass_only`` is set and the entry has a
        filter, or when the entry has no CSQ annotation. All file handles are
        closed when the conversion finishes or aborts.
        """
        indexes = set()  # only used for membership tests
        count = 1
        try:
            for entry in self._read_entries():
                chromosome = entry.CHROM
                start = entry.affected_start
                reference = entry.REF
                alts = entry.ALT
                genotype = entry.genotype(self.sample_name)
                if genotype.gt_type is None or genotype.gt_type == 0:
                    # The genotype is uncalled or hom_ref
                    continue

                filt = entry.FILTER
                if self.pass_only and not (filt is None or len(filt) == 0):
                    continue

                if 'CSQ' not in entry.INFO:
                    continue

                alleles_dict = self.csq_parser.resolve_alleles(entry)
                # Called bases are joined by "/" (unphased) or "|" (phased).
                called_bases = re.split(r'[/|]', genotype.gt_bases) if genotype.gt_bases else None
                for alt in alts:
                    alt = str(alt)
                    if called_bases and alt not in called_bases:
                        continue

                    coverage_for_entry = self.calculate_coverage_for_entry(entry, reference, alt, start, chromosome, genotype)

                    # Look the allele up as written first, then under the
                    # name resolved by the CSQ parser, then as 'deletion'.
                    transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], alt)
                    if len(transcripts) == 0:
                        csq_allele = alleles_dict[alt]
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], csq_allele)
                    if len(transcripts) == 0 and self.is_deletion(reference, alt):
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], 'deletion')

                    for transcript in transcripts:
                        # Keep only the part before any "/" in the position.
                        protein_position = transcript['Protein_position'].split('/')[0]
                        transcript_name = transcript['Feature']
                        consequence = self.resolve_consequence(transcript['Consequence'], reference, alt)
                        if consequence is None:
                            continue
                        elif consequence == 'FS':
                            if transcript['DownstreamProtein'] == '':
                                print("frameshift_variant transcript does not contain a DownstreamProtein sequence. Skipping.\n{} {} {} {} {}".format(entry.CHROM, entry.POS, entry.REF, alt, transcript['Feature']))
                                continue
                            else:
                                amino_acid_change_position = "%s%s/%s" % (protein_position, entry.REF, alt)
                        else:
                            if transcript['Amino_acids'] == '':
                                print("Transcript does not contain Amino_acids change information. Skipping.\n{} {} {} {} {}".format(entry.CHROM, entry.POS, entry.REF, alt, transcript['Feature']))
                                continue
                            else:
                                amino_acid_change_position = protein_position + transcript['Amino_acids']

                        gene_name = transcript['SYMBOL']
                        index = '%s.%s.%s.%s.%s' % (count, gene_name, transcript_name, consequence, amino_acid_change_position)
                        if index in indexes:
                            sys.exit("Warning: TSV index already exists: {}".format(index))
                        else:
                            indexes.add(index)
                            count += 1

                        if self.proximal_variants_vcf:
                            self.write_proximal_variant_entries(entry, alt, transcript_name, index)

                        ensembl_gene_id = transcript['Gene']
                        hgvsc = re.sub(self._PERCENT_ESCAPE, self.decode_hex, transcript['HGVSc']) if 'HGVSc' in transcript else 'NA'
                        hgvsp = re.sub(self._PERCENT_ESCAPE, self.decode_hex, transcript['HGVSp']) if 'HGVSp' in transcript else 'NA'
                        if 'TSL' in transcript and transcript['TSL'] is not None and transcript['TSL'] != '':
                            tsl = transcript['TSL']
                        else:
                            tsl = 'NA'

                        output_row = {
                            'chromosome_name': entry.CHROM,
                            'start': entry.affected_start,
                            'stop': entry.affected_end,
                            'reference': entry.REF,
                            'variant': alt,
                            'gene_name': gene_name,
                            'transcript_name': transcript_name,
                            'transcript_support_level': tsl,
                            'ensembl_gene_id': ensembl_gene_id,
                            'hgvsc': hgvsc,
                            'hgvsp': hgvsp,
                            'wildtype_amino_acid_sequence': transcript['WildtypeProtein'],
                            'downstream_amino_acid_sequence': transcript['DownstreamProtein'],
                            'fusion_amino_acid_sequence': '',
                            'variant_type': consequence,
                            'protein_position': protein_position,
                            'index': index,
                            'protein_length_change': transcript['ProteinLengthChange'],
                        }
                        # A missing amino_acid_change is filled by the
                        # writer's restval ('NA').
                        if transcript['Amino_acids']:
                            output_row['amino_acid_change'] = transcript['Amino_acids']
                        if transcript['Codons']:
                            output_row['codon_change'] = transcript['Codons']
                        else:
                            output_row['codon_change'] = 'NA'

                        self._add_expression_values(output_row, genotype, transcript_name, ensembl_gene_id, gene_name)
                        output_row.update(coverage_for_entry)
                        self.tsv_writer.writerow(output_row)
        finally:
            # Release the handles even when the conversion aborts.
            self.close_filehandles()
class VcfConverter(InputFileConverter):
    """Convert a VEP-annotated VCF into a tab-separated table of variants.

    One output row is written per (VCF entry, alternate allele, transcript)
    combination whose consequence resolves to a supported variant type.
    Coverage comes from bam-readcount files when they are configured for the
    variant type and from the VCF genotype fields otherwise. Expression
    comes from the expression files when they contain the transcript or gene
    and from the ``TX`` / ``GX`` genotype fields otherwise.
    """

    # Percent-encoded byte (for example "%3D") that decode_hex replaces in the
    # HGVSc / HGVSp fields. Inside a character class "|" is a literal, so it
    # must not be used to separate the digit and letter ranges.
    _PERCENT_ESCAPE = r'%[0-9A-F]{2}'

    def __init__(self, **kwargs):
        """Read options, validate the inputs and open readers and writers.

        ``self.input_file`` and ``self.output_file`` are read but not assigned
        here; presumably ``InputFileConverter.__init__`` sets them.

        Exits via ``sys.exit`` when the proximal variants configuration is
        incomplete, a tabix index is missing, or the sample names do not
        match the VCF.
        """
        InputFileConverter.__init__(self, **kwargs)
        self.gene_expn_file = kwargs.pop('gene_expn_file', None)
        self.transcript_expn_file = kwargs.pop('transcript_expn_file', None)
        self.normal_snvs_coverage_file = kwargs.pop('normal_snvs_coverage_file', None)
        self.normal_indels_coverage_file = kwargs.pop('normal_indels_coverage_file', None)
        self.tdna_snvs_coverage_file = kwargs.pop('tdna_snvs_coverage_file', None)
        self.tdna_indels_coverage_file = kwargs.pop('tdna_indels_coverage_file', None)
        self.trna_snvs_coverage_file = kwargs.pop('trna_snvs_coverage_file', None)
        self.trna_indels_coverage_file = kwargs.pop('trna_indels_coverage_file', None)
        self.pass_only = kwargs.pop('pass_only', False)
        self.sample_name = kwargs.pop('sample_name', None)
        self.normal_sample_name = kwargs.pop('normal_sample_name', None)
        self.proximal_variants_vcf = kwargs.pop('proximal_variants_vcf', None)
        self.proximal_variants_tsv = kwargs.pop('proximal_variants_tsv', None)
        self.peptide_length = kwargs.pop('peptide_length', None)

        if self.proximal_variants_vcf:
            # The checks run in order; the first failure ends the program.
            if not (self.proximal_variants_tsv and self.peptide_length):
                sys.exit("A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided")
            if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
                sys.exit('No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed'.format(self.proximal_variants_vcf))
            if not os.path.exists(self.input_file + '.tbi'):
                sys.exit('No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'.format(self.input_file))

        # Gzipped input is opened in binary mode, anything else as text.
        if lib.utils.is_gz_file(self.input_file):
            mode = 'rb'
        else:
            mode = 'r'

        if self.proximal_variants_vcf:
            self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv, 'w')
            self.proximal_variants_writer = csv.DictWriter(
                self.proximal_variants_tsv_fh,
                delimiter='\t',
                fieldnames=[
                    'chromosome_name', 'start', 'stop', 'reference', 'variant',
                    'amino_acid_change', 'codon_change', 'protein_position',
                    'type', 'main_somatic_variant',
                ],
            )
            self.proximal_variants_writer.writeheader()
            self.proximal_variant_parser = ProximalVariant(self.proximal_variants_vcf, self.pass_only)
            # Second reader over the input VCF, used for region fetches in
            # write_proximal_variant_entries.
            self.somatic_vcf_fh = open(self.input_file, mode)
            self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

        self.reader = open(self.input_file, mode)
        self.vcf_reader = vcf.Reader(self.reader)

        if len(self.vcf_reader.samples) > 1:
            if not self.sample_name:
                sys.exit("VCF contains more than one sample but sample_name is not set.")
            elif self.sample_name not in self.vcf_reader.samples:
                sys.exit("sample_name {} not in VCF {}".format(self.sample_name, self.input_file))
            if self.normal_sample_name is not None and self.normal_sample_name not in self.vcf_reader.samples:
                sys.exit("normal_sample_name {} not in VCF {}".format(self.normal_sample_name, self.input_file))
        elif len(self.vcf_reader.samples) == 0:
            sys.exit("VCF doesn't contain any sample genotype information.")
        else:
            # A single-sample VCF leaves no choice.
            self.sample_name = self.vcf_reader.samples[0]

        self.writer = open(self.output_file, 'w')
        self.tsv_writer = csv.DictWriter(self.writer, delimiter='\t', fieldnames=self.output_headers(), restval='NA')
        self.tsv_writer.writeheader()
        self.csq_parser = self.create_csq_parser()

    def parse_bam_readcount_file(self, bam_readcount_file):
        """Parse a tab-separated read count file into nested dicts.

        Each row provides chromosome, position, reference base and depth in
        its first four columns, followed by per-base fields. The result is
        indexed as ``coverage[chromosome][position][reference_base]`` and
        holds the parsed per-base counts plus a ``'depth'`` entry. Positions,
        depths and counts are kept as strings.
        """
        coverage = {}
        with open(bam_readcount_file, 'r') as reader:
            coverage_tsv_reader = csv.reader(reader, delimiter='\t')
            for row in coverage_tsv_reader:
                chromosome = row[0]
                position = row[1]
                reference_base = row[2].upper()
                depth = row[3]
                brct = row[4:]
                counts = self.parse_brct_field(brct)
                counts['depth'] = depth
                coverage.setdefault(chromosome, {}).setdefault(position, {})[reference_base] = counts
        return coverage

    def parse_brct_field(self, brct_entry):
        """Map each upper-cased base to its count.

        ``brct_entry`` is a list of ``base:count:rest`` strings; only the
        first two components are used.
        """
        parsed_brct = {}
        for brct in brct_entry:
            (base, count, _rest) = brct.split(':', 2)
            parsed_brct[base.upper()] = count
        return parsed_brct

    def is_insertion(self, ref, alt):
        """Return True when ``alt`` is longer than ``ref``."""
        return len(alt) > len(ref)

    def is_deletion(self, ref, alt):
        """Return True when ``alt`` is shorter than ``ref``."""
        return len(alt) < len(ref)

    def simplify_indel_allele(self, ref, alt):
        """Strip the common suffix, then the common prefix, of both alleles.

        Returns the ``(ref, alt)`` pair that remains.
        """
        while len(ref) > 0 and len(alt) > 0 and ref[-1] == alt[-1]:
            ref = ref[0:-1]
            alt = alt[0:-1]
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
        return ref, alt

    def create_csq_parser(self):
        """Build a ``CsqParser`` from the description of the CSQ INFO header.

        Exits via ``sys.exit`` when the VCF has no CSQ header or the header
        entry is ``None``.
        """
        info_fields = self.vcf_reader.infos
        if 'CSQ' not in info_fields:
            sys.exit('Input VCF does not contain a CSQ header. Please annotate the VCF with VEP before running it.')
        csq_header = info_fields['CSQ']
        if csq_header is None:
            sys.exit('Failed to extract format string from info description for tag (CSQ)')
        return CsqParser(csq_header.desc)

    def resolve_consequence(self, consequence_string):
        """Map a consequence string to the variant type used in the output.

        ``consequence_string`` may hold several terms joined by ``&`` or
        ``.``. Returns ``'FS'``, ``'missense'``, ``'inframe_ins'``,
        ``'inframe_del'`` or ``None`` for unsupported consequences. A
        ``start_lost`` term always yields ``None``.
        """
        if '&' in consequence_string:
            consequences = {consequence.lower() for consequence in consequence_string.split('&')}
        elif '.' in consequence_string:
            consequences = {consequence.lower() for consequence in consequence_string.split('.')}
        else:
            consequences = {consequence_string.lower()}

        # The order of the checks defines the priority between terms.
        if 'start_lost' in consequences:
            consequence = None
        elif 'frameshift_variant' in consequences:
            consequence = 'FS'
        elif 'missense_variant' in consequences:
            consequence = 'missense'
        elif 'inframe_insertion' in consequences:
            consequence = 'inframe_ins'
        elif 'inframe_deletion' in consequences:
            consequence = 'inframe_del'
        else:
            consequence = None
        return consequence

    def calculate_vaf(self, var_count, depth):
        """Return ``var_count / depth``, or ``'NA'`` when ``depth`` is zero."""
        if depth == 0:
            return 'NA'
        else:
            return (var_count / depth)

    def parse_gene_expns_file(self):
        """Read the gene expression file, if one is configured.

        Returns ``{tracking_id: {locus: row}}``; the dict is empty when
        ``self.gene_expn_file`` is ``None``.
        """
        gene_expns = {}
        if self.gene_expn_file is not None:
            with open(self.gene_expn_file, 'r') as reader:
                genes_tsv_reader = csv.DictReader(reader, delimiter='\t')
                for row in genes_tsv_reader:
                    gene_expns.setdefault(row['tracking_id'], {})[row['locus']] = row
        return gene_expns

    def parse_transcript_expns_file(self):
        """Read the transcript expression file, if one is configured.

        Returns ``{tracking_id: row}``; the dict is empty when
        ``self.transcript_expn_file`` is ``None``. A later row replaces an
        earlier row with the same ``tracking_id``.
        """
        transcript_expns = {}
        if self.transcript_expn_file is not None:
            with open(self.transcript_expn_file, 'r') as reader:
                isoforms_tsv_reader = csv.DictReader(reader, delimiter='\t')
                for row in isoforms_tsv_reader:
                    transcript_expns[row['tracking_id']] = row
        return transcript_expns

    def parse_coverage_files(self):
        """Parse every configured read count file.

        Returns ``coverage[variant_type][data_type]`` with ``variant_type``
        in ``('snvs', 'indels')`` and ``data_type`` in
        ``('normal', 'tdna', 'trna')``; unconfigured files are left out.
        """
        coverage = {}
        for variant_type in ['snvs', 'indels']:
            for data_type in ['normal', 'tdna', 'trna']:
                # The attribute names follow "<data>_<variant>_coverage_file".
                coverage_file_name = '_'.join([data_type, variant_type, 'coverage_file'])
                coverage_file = getattr(self, coverage_file_name)
                if coverage_file is not None:
                    coverage.setdefault(variant_type, {})[data_type] = self.parse_bam_readcount_file(coverage_file)
        return coverage

    def determine_bam_readcount_bases(self, entry, reference, alt, start):
        """Work out how a variant is keyed in the parsed read count data.

        Returns ``(position, ref_base, var_base, variant_type)``. Alleles of
        equal length are looked up as ``'snvs'`` at ``entry.POS``. Deletions
        use ``start + 1`` and ``'-'`` plus the simplified reference;
        insertions use ``start`` and ``'+'`` plus the simplified alternate.
        """
        if len(reference) == len(alt):
            bam_readcount_position = entry.POS
            variant_type = 'snvs'
            ref_base = reference
            var_base = alt
        else:
            if self.is_deletion(reference, alt):
                bam_readcount_position = start + 1
                (simplified_reference, _) = self.simplify_indel_allele(reference, alt)
                ref_base = reference[1:2]
                var_base = '-' + simplified_reference
            elif self.is_insertion(reference, alt):
                bam_readcount_position = start
                (_, simplified_alt) = self.simplify_indel_allele(reference, alt)
                ref_base = reference
                var_base = '+' + simplified_alt
            variant_type = 'indels'
        return (bam_readcount_position, ref_base, var_base, variant_type)

    def get_depth_from_vcf_genotype(self, genotype, tag):
        """Return the value of ``tag`` in ``genotype``.

        ``'NA'`` is returned when the value is ``None`` or empty, or when the
        lookup raises ``AttributeError``.
        """
        try:
            depth = genotype[tag]
            if depth is None or depth == "":
                depth = 'NA'
        except AttributeError:
            depth = 'NA'
        return depth

    def get_vaf_from_vcf_genotype(self, genotype, alts, alt, af_tag, ad_tag, dp_tag):
        """Return the variant allele frequency of ``alt`` for ``genotype``.

        The value of ``af_tag`` is used when it can be read and compared. If
        that raises ``AttributeError`` or ``TypeError`` the frequency is
        computed from the ``ad_tag`` count and the ``dp_tag`` depth instead.
        Returns ``'NA'`` when the required values are missing or empty.
        """
        try:
            allele_frequencies = genotype[af_tag]
            if isinstance(allele_frequencies, list):
                vaf = allele_frequencies[alts.index(alt)]
            else:
                vaf = allele_frequencies
            # Comparing None raises TypeError and triggers the fallback below.
            if vaf > 1:
                print("Warning: VAF is expected to be a fraction, but is larger than 1. If VAFs are encoded as percentages, please adjust the coverage cutoffs accordingly.")
        except (AttributeError, TypeError):
            try:
                allele_depths = genotype[ad_tag]
                if isinstance(allele_depths, list):
                    # The depth list holds either one value per alternate
                    # allele, or one extra leading value before them.
                    if len(allele_depths) == len(alts):
                        var_count = allele_depths[alts.index(alt)]
                    elif len(allele_depths) == len(alts) + 1:
                        var_count = allele_depths[alts.index(alt) + 1]
                    else:
                        print("Warning: Mismatch between the number of alternate alleles and number of values in the AD field for genotype {}".format(genotype))
                        return 'NA'
                else:
                    var_count = allele_depths
                if var_count is None or var_count == "":
                    return 'NA'
                depth = genotype[dp_tag]
                if depth is None or depth == "":
                    return 'NA'
                vaf = self.calculate_vaf(int(var_count), int(depth))
            except AttributeError:
                vaf = 'NA'
        return vaf

    def calculate_coverage_for_entry(self, coverage, entry, reference, alt, start, chromosome, genotype):
        """Collect depth and VAF values for ``alt``.

        When ``coverage`` has data for the variant type, values are taken
        from it for every coverage type that has the position, reference
        base and variant base. Otherwise the VCF genotype fields are used
        (``DP``/``AF``/``AD`` for tumor DNA, ``RDP``/``RAF``/``RAD`` for
        tumor RNA, and the normal sample when one is configured).
        """
        (bam_readcount_position, ref_base, var_base, variant_type) = self.determine_bam_readcount_bases(entry, reference, alt, start)
        coverage_for_entry = {}
        if variant_type in coverage:
            # Positions are stored as strings by parse_bam_readcount_file.
            position_key = str(bam_readcount_position)
            for coverage_type in coverage[variant_type]:
                by_chromosome = coverage[variant_type][coverage_type]
                if (
                    chromosome in by_chromosome
                    and position_key in by_chromosome[chromosome]
                    and ref_base in by_chromosome[chromosome][position_key]
                ):
                    brct = by_chromosome[chromosome][position_key][ref_base]
                    if 'depth' in brct and var_base in brct:
                        coverage_for_entry[coverage_type + '_depth'] = int(brct['depth'])
                        coverage_for_entry[coverage_type + '_vaf'] = self.calculate_vaf(int(brct[var_base]), int(brct['depth']))
        else:
            # Alleles are compared as strings because ``alt`` is a string.
            alts = [str(allele) for allele in entry.ALT]
            coverage_for_entry['tdna_depth'] = self.get_depth_from_vcf_genotype(genotype, 'DP')
            coverage_for_entry['trna_depth'] = self.get_depth_from_vcf_genotype(genotype, 'RDP')
            coverage_for_entry['tdna_vaf'] = self.get_vaf_from_vcf_genotype(genotype, alts, alt, 'AF', 'AD', 'DP')
            coverage_for_entry['trna_vaf'] = self.get_vaf_from_vcf_genotype(genotype, alts, alt, 'RAF', 'RAD', 'RDP')
            if self.normal_sample_name is not None:
                normal_genotype = entry.genotype(self.normal_sample_name)
                coverage_for_entry['normal_depth'] = self.get_depth_from_vcf_genotype(normal_genotype, 'DP')
                coverage_for_entry['normal_vaf'] = self.get_vaf_from_vcf_genotype(normal_genotype, alts, alt, 'AF', 'AD', 'DP')
        return coverage_for_entry

    def write_proximal_variant_entries(self, entry, alt, transcript_name, index):
        """Write one proximal variants TSV row per variant found near ``entry``.

        A proximal variant is labelled ``'somatic'`` when the input VCF has a
        record at its position and ``'germline'`` otherwise. ``index`` is
        stored as ``main_somatic_variant`` to link the row to the main table.
        """
        proximal_variants = self.proximal_variant_parser.extract(entry, alt, transcript_name, self.peptide_length)
        for (proximal_variant, csq_entry) in proximal_variants:
            overlapping_records = list(
                self.somatic_vcf_reader.fetch(proximal_variant.CHROM, proximal_variant.POS - 1, proximal_variant.POS)
            )
            if len(overlapping_records) > 0:
                proximal_variant_type = 'somatic'
            else:
                proximal_variant_type = 'germline'

            proximal_variant_entry = {
                'chromosome_name': proximal_variant.CHROM,
                'start': proximal_variant.affected_start,
                'stop': proximal_variant.affected_end,
                'reference': proximal_variant.REF,
                'variant': proximal_variant.ALT[0],
                'amino_acid_change': csq_entry['Amino_acids'],
                'codon_change': csq_entry['Codons'],
                'protein_position': csq_entry['Protein_position'],
                'type': proximal_variant_type,
                'main_somatic_variant': index,
            }
            self.proximal_variants_writer.writerow(proximal_variant_entry)

    def close_filehandles(self):
        """Close every file handle opened by ``__init__``."""
        self.writer.close()
        self.reader.close()
        if self.proximal_variants_vcf:
            self.proximal_variant_parser.fh.close()
            self.proximal_variants_tsv_fh.close()
            self.somatic_vcf_fh.close()

    def decode_hex(self, string):
        """Return the character for a matched percent escape.

        ``string`` is a regex match object whose matched text is ``%`` plus
        two hexadecimal digits.
        """
        hex_string = string.group(0).replace('%', '')
        return binascii.unhexlify(hex_string).decode('utf-8')

    @staticmethod
    def _split_expressions(expressions):
        """Split ``identifier|value`` genotype values into their two parts.

        Accepts a single string, a list of strings or ``None`` (no value) and
        always returns a list of split results.
        """
        if expressions is None:
            return []
        if not isinstance(expressions, list):
            expressions = [expressions]
        return [expression.split('|') for expression in expressions]

    def execute(self):
        """Convert every eligible VCF entry and write the output TSV.

        Entries are skipped when the sample genotype is uncalled or
        homozygous reference, when ``pass_only`` is set and the entry has a
        filter, or when the entry has no CSQ annotation. All file handles are
        closed when the conversion finishes or aborts.
        """
        indexes = set()  # only used for membership tests
        count = 1
        try:
            gene_expns = self.parse_gene_expns_file()
            transcript_expns = self.parse_transcript_expns_file()
            coverage = self.parse_coverage_files()
            for entry in self.vcf_reader:
                chromosome = entry.CHROM
                start = entry.affected_start
                reference = entry.REF
                alts = entry.ALT
                genotype = entry.genotype(self.sample_name)
                if genotype.gt_type is None or genotype.gt_type == 0:
                    # The genotype is uncalled or hom_ref
                    continue

                filt = entry.FILTER
                if self.pass_only and not (filt is None or len(filt) == 0):
                    continue

                if 'CSQ' not in entry.INFO:
                    continue

                alleles_dict = self.csq_parser.resolve_alleles(entry)
                # Called bases are joined by "/" (unphased) or "|" (phased).
                called_bases = re.split(r'[/|]', genotype.gt_bases) if genotype.gt_bases else None
                for alt in alts:
                    alt = str(alt)
                    if called_bases and alt not in called_bases:
                        continue

                    coverage_for_entry = self.calculate_coverage_for_entry(coverage, entry, reference, alt, start, chromosome, genotype)

                    # Look the allele up as written first, then under the
                    # name resolved by the CSQ parser, then as 'deletion'.
                    transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], alt)
                    if len(transcripts) == 0:
                        csq_allele = alleles_dict[alt]
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], csq_allele)
                    if len(transcripts) == 0 and self.is_deletion(reference, alt):
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(entry.INFO['CSQ'], 'deletion')

                    for transcript in transcripts:
                        transcript_name = transcript['Feature']
                        consequence = self.resolve_consequence(transcript['Consequence'])
                        if consequence is None:
                            continue
                        elif consequence == 'FS':
                            if transcript['DownstreamProtein'] == '':
                                print("frameshift_variant transcript does not contain a DownstreamProtein sequence. Skipping.\n{} {} {} {} {}".format(entry.CHROM, entry.POS, entry.REF, alt, transcript['Feature']))
                                continue
                            else:
                                amino_acid_change_position = "%s%s/%s" % (transcript['Protein_position'], entry.REF, alt)
                        else:
                            if transcript['Amino_acids'] == '':
                                print("Transcript does not contain Amino_acids change information. Skipping.\n{} {} {} {} {}".format(entry.CHROM, entry.POS, entry.REF, alt, transcript['Feature']))
                                continue
                            else:
                                amino_acid_change_position = transcript['Protein_position'] + transcript['Amino_acids']

                        gene_name = transcript['SYMBOL']
                        index = '%s.%s.%s.%s.%s' % (count, gene_name, transcript_name, consequence, amino_acid_change_position)
                        if index in indexes:
                            sys.exit("Warning: TSV index already exists: {}".format(index))
                        else:
                            indexes.add(index)
                            count += 1

                        if self.proximal_variants_vcf:
                            self.write_proximal_variant_entries(entry, alt, transcript_name, index)

                        ensembl_gene_id = transcript['Gene']
                        hgvsc = re.sub(self._PERCENT_ESCAPE, self.decode_hex, transcript['HGVSc']) if 'HGVSc' in transcript else 'NA'
                        hgvsp = re.sub(self._PERCENT_ESCAPE, self.decode_hex, transcript['HGVSp']) if 'HGVSp' in transcript else 'NA'
                        if 'TSL' in transcript and transcript['TSL'] is not None and transcript['TSL'] != '':
                            tsl = transcript['TSL']
                        else:
                            tsl = 'NA'

                        output_row = {
                            'chromosome_name': entry.CHROM,
                            'start': entry.affected_start,
                            'stop': entry.affected_end,
                            'reference': entry.REF,
                            'variant': alt,
                            'gene_name': gene_name,
                            'transcript_name': transcript_name,
                            'transcript_support_level': tsl,
                            'ensembl_gene_id': ensembl_gene_id,
                            'hgvsc': hgvsc,
                            'hgvsp': hgvsp,
                            'wildtype_amino_acid_sequence': transcript['WildtypeProtein'],
                            'downstream_amino_acid_sequence': transcript['DownstreamProtein'],
                            'fusion_amino_acid_sequence': '',
                            'variant_type': consequence,
                            'protein_position': transcript['Protein_position'],
                            'index': index,
                            'protein_length_change': transcript['ProteinLengthChange'],
                        }
                        # A missing amino_acid_change is filled by the
                        # writer's restval ('NA').
                        if transcript['Amino_acids']:
                            output_row['amino_acid_change'] = transcript['Amino_acids']
                        if transcript['Codons']:
                            output_row['codon_change'] = transcript['Codons']
                        else:
                            output_row['codon_change'] = 'NA'

                        # The expression file wins over the TX genotype field.
                        if transcript_name in transcript_expns:
                            output_row['transcript_expression'] = transcript_expns[transcript_name]['FPKM']
                        elif 'TX' in self.vcf_reader.formats:
                            if 'TX' in genotype.data._asdict():
                                for (expressed_transcript, value) in self._split_expressions(genotype['TX']):
                                    if expressed_transcript == transcript_name:
                                        output_row['transcript_expression'] = value

                        # The expression file wins over the GX genotype field;
                        # the FPKM values of all loci of the gene are summed.
                        if ensembl_gene_id in gene_expns:
                            gene_fpkm = 0
                            for gene_expn_entry in gene_expns[ensembl_gene_id].values():
                                gene_fpkm += float(gene_expn_entry['FPKM'])
                            output_row['gene_expression'] = gene_fpkm
                        elif 'GX' in self.vcf_reader.formats:
                            if 'GX' in genotype.data._asdict():
                                for (expressed_gene, value) in self._split_expressions(genotype['GX']):
                                    if ensembl_gene_id == expressed_gene or gene_name == expressed_gene:
                                        output_row['gene_expression'] = value

                        output_row.update(coverage_for_entry)
                        self.tsv_writer.writerow(output_row)
        finally:
            # Release the handles even when the conversion aborts.
            self.close_filehandles()
def __init__(self, **kwargs):
    """Validate the options and open every reader and writer that is needed.

    Recognised keyword arguments (beyond those consumed by
    ``InputFileConverter.__init__``): ``pass_only``, ``sample_name``,
    ``normal_sample_name``, ``proximal_variants_vcf``,
    ``proximal_variants_tsv`` and ``peptide_length``.

    The process is terminated through ``sys.exit`` when the proximal
    variants options are incomplete, when a required file is not gzipped or
    lacks a ``.tbi`` index, or when the CSQ annotation lacks the
    ``DownstreamProtein`` or ``WildtypeProtein`` fields.
    """
    InputFileConverter.__init__(self, **kwargs)
    self.pass_only = kwargs.pop('pass_only', False)
    self.sample_name = kwargs.pop('sample_name', None)
    self.normal_sample_name = kwargs.pop('normal_sample_name', None)
    self.proximal_variants_vcf = kwargs.pop('proximal_variants_vcf', None)
    self.proximal_variants_tsv = kwargs.pop('proximal_variants_tsv', None)
    self.peptide_length = kwargs.pop('peptide_length', None)

    # Every requirement below only applies when proximal variants are used.
    if self.proximal_variants_vcf:
        if not self.proximal_variants_tsv or not self.peptide_length:
            sys.exit(
                "A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided."
            )
        if not lib.utils.is_gz_file(self.input_file):
            sys.exit(
                "Input VCF {} needs to be bgzipped when running with a proximal variants VCF."
                .format(self.input_file))
        if not lib.utils.is_gz_file(self.proximal_variants_vcf):
            sys.exit("Proximal variants VCF {} needs to be bgzipped.".format(
                self.proximal_variants_vcf))
        if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
            sys.exit(
                'No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'
                .format(self.proximal_variants_vcf))
        if not os.path.exists(self.input_file + '.tbi'):
            sys.exit(
                'No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'
                .format(self.input_file))

    # Gzipped input is opened in binary mode, plain text input in text mode.
    mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

    if self.proximal_variants_vcf:
        proximal_columns = [
            'chromosome_name', 'start', 'stop', 'reference', 'variant',
            'amino_acid_change', 'codon_change', 'protein_position', 'type',
            'main_somatic_variant'
        ]
        self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv,
                                             'w')
        self.proximal_variants_writer = csv.DictWriter(
            self.proximal_variants_tsv_fh,
            delimiter='\t',
            fieldnames=proximal_columns)
        self.proximal_variants_writer.writeheader()
        self.proximal_variant_parser = ProximalVariant(
            self.proximal_variants_vcf, self.pass_only)
        # Second, independent reader over the input VCF.
        self.somatic_vcf_fh = open(self.input_file, mode)
        self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

    self.reader = open(self.input_file, mode)
    self.vcf_reader = vcf.Reader(self.reader)

    self.writer = open(self.output_file, 'w')
    self.tsv_writer = csv.DictWriter(self.writer,
                                     delimiter='\t',
                                     fieldnames=self.output_headers(),
                                     restval='NA')
    self.tsv_writer.writeheader()

    self.csq_parser = self.create_csq_parser()
    csq_fields = self.csq_parser.csq_format
    if 'DownstreamProtein' not in csq_fields:
        sys.exit(
            "VCF doesn't contain VEP DownstreamProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
        )
    if 'WildtypeProtein' not in csq_fields:
        sys.exit(
            "VCF doesn't contain VEP WildtypeProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
        )
def execute(self):
    """Build the peptide FASTA file and its key file from the variant TSV.

    Reads ``self.input_file`` as a tab-delimited file with a header row.
    Rows whose ``variant_type`` is ``FS``, ``missense``, ``inframe_ins`` or
    ``inframe_del`` are turned into a wildtype and a mutant peptide
    subsequence; every other variant type is ignored. A row is also
    dropped when its position is out of bounds, when either subsequence
    contains ``*``, ``X`` or ``U``, when the mutant subsequence occurs
    inside the wildtype one, or when either subsequence is shorter than
    ``self.epitope_length``.

    Identical subsequences are written once. ``self.output_file`` receives
    numbered FASTA records and ``self.output_key_file`` receives, per
    record number, the list of ``WT.<index>`` / ``MT.<index>`` keys that
    share that sequence (as YAML). Both output files are only opened after
    the whole input was processed successfully.

    Raises:
        Exception: A frameshift row whose downstream sequence does not
            start with the wildtype amino acid at the variant position.
        SystemExit: The wildtype amino acid(s) of a row do not match the
            wildtype protein sequence.
    """

    def truncate_at_stop(amino_acids):
        # Keep what precedes the first stop marker; '*' is checked before
        # 'X'. The flag tells whether a marker was present.
        for stop_marker in ('*', 'X'):
            if stop_marker in amino_acids:
                return amino_acids.split(stop_marker)[0], True
        return amino_acids, False

    peptide_sequence_length = self.peptide_sequence_length
    fasta_sequences = OrderedDict()
    # The context manager closes the input even when a row aborts the run.
    with open(self.input_file, 'r') as reader:
        tsvin = csv.DictReader(reader, delimiter='\t')
        for line in tsvin:
            variant_type = line['variant_type']
            full_wildtype_sequence = line['wildtype_amino_acid_sequence']
            if variant_type == 'FS':
                position = int(line['protein_position'].split('-', 1)[0]) - 1
                amino_acid_change = line['amino_acid_change']
                if (amino_acid_change is not None
                        and amino_acid_change.split('/')[0] == '-'):
                    downstream = line['downstream_amino_acid_sequence']
                    if full_wildtype_sequence[position] != downstream[0]:
                        raise Exception(
                            "Leading amino acid of the Downstream protein sequence ({}) expected to match the wildtype amino acid at postion {} ({}). "
                            "You may need to reannotate your VCF with a newer version of VEP."
                            .format(downstream, position,
                                    full_wildtype_sequence[position]))
            elif variant_type in ('missense', 'inframe_ins'):
                if '/' not in line['amino_acid_change']:
                    continue
                wildtype_amino_acid, mutant_amino_acid = line[
                    'amino_acid_change'].split('/')
                wildtype_amino_acid, _ = truncate_at_stop(
                    wildtype_amino_acid)
                mutant_amino_acid, stop_codon_added = truncate_at_stop(
                    mutant_amino_acid)
                first_position = int(
                    line['protein_position'].split('-', 1)[0])
                if wildtype_amino_acid == '-':
                    # Pure insertion: nothing of the wildtype is replaced.
                    position = first_position
                    wildtype_amino_acid_length = 0
                else:
                    position = first_position - 1
                    wildtype_amino_acid_length = len(wildtype_amino_acid)
            elif variant_type == 'inframe_del':
                wildtype_amino_acid, mutant_amino_acid = line[
                    'amino_acid_change'].split('/')
                wildtype_amino_acid, _ = truncate_at_stop(
                    wildtype_amino_acid)
                mutant_amino_acid, stop_codon_added = truncate_at_stop(
                    mutant_amino_acid)
                position = int(line['protein_position'].split('-', 1)[0]) - 1
                wildtype_amino_acid_length = len(wildtype_amino_acid)
                if mutant_amino_acid == '-':
                    mutant_amino_acid = ''
            else:
                continue

            if self.position_out_of_bounds(position, full_wildtype_sequence):
                continue

            if (variant_type == 'missense'
                    and line['index'] in self.proximal_variants
                    and line['protein_position'] in
                    self.proximal_variants[line['index']]):
                # Proximal variants hit the same codon: combine all codon
                # changes into a single resulting amino acid.
                codon_changes = [
                    item['codon_change'] for item in self.proximal_variants[
                        line['index']][line['protein_position']]
                ]
                codon_changes.append(line['codon_change'])
                mutant_amino_acid_with_proximal_variants = ProximalVariant.combine_conflicting_variants(
                    codon_changes)
            elif variant_type != 'FS':
                mutant_amino_acid_with_proximal_variants = mutant_amino_acid

            if variant_type == 'FS':
                wildtype_subsequence, left_flanking_subsequence = self.get_frameshift_subsequences(
                    position, full_wildtype_sequence,
                    peptide_sequence_length, line)
                downstream_sequence = line['downstream_amino_acid_sequence']
                if (self.downstream_sequence_length
                        and len(downstream_sequence) >
                        self.downstream_sequence_length):
                    downstream_sequence = downstream_sequence[
                        0:self.downstream_sequence_length]
                mutation_start_position = len(left_flanking_subsequence)
                wildtype_subsequence = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, True)
                left_flanking_subsequence_with_proximal_variants = self.add_proximal_variants(
                    line['index'], left_flanking_subsequence,
                    mutation_start_position, position, False)
                #The caveat here is that if a nearby variant is in the downstream sequence, the protein sequence would be further altered, which we aren't taking into account.
                #we would need to recalculate the downstream protein sequence taking all downstream variants into account.
                mutant_subsequence = left_flanking_subsequence_with_proximal_variants + downstream_sequence
            else:
                mutation_start_position, wildtype_subsequence = self.get_wildtype_subsequence(
                    position, full_wildtype_sequence,
                    wildtype_amino_acid_length, peptide_sequence_length,
                    line)
                mutation_end_position = mutation_start_position + wildtype_amino_acid_length
                observed_wildtype = wildtype_subsequence[
                    mutation_start_position:mutation_end_position]
                if (wildtype_amino_acid != '-'
                        and wildtype_amino_acid != observed_wildtype):
                    if line['amino_acid_change'].split('/')[0].count(
                            '*') > 1:
                        print(
                            "Warning: Amino acid change is not sane - contains multiple stops. Skipping entry {}"
                            .format(line['index']))
                        continue
                    sys.exit(
                        "ERROR: There was a mismatch between the actual wildtype amino acid sequence ({}) and the expected amino acid sequence ({}). Did you use the same reference build version for VEP that you used for creating the VCF?\n{}"
                        .format(observed_wildtype, wildtype_amino_acid,
                                line))
                wildtype_subsequence_with_proximal_variants = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, False)
                wildtype_subsequence = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, True)
                mutant_subsequence = wildtype_subsequence_with_proximal_variants[:mutation_start_position] + mutant_amino_acid_with_proximal_variants
                if not stop_codon_added:
                    # Without a new stop the protein continues after the
                    # replaced wildtype residues.
                    mutant_subsequence += wildtype_subsequence_with_proximal_variants[
                        mutation_end_position:]

            if '*' in wildtype_subsequence or '*' in mutant_subsequence:
                continue

            if 'X' in wildtype_subsequence or 'X' in mutant_subsequence:
                continue

            if 'U' in wildtype_subsequence or 'U' in mutant_subsequence:
                print(
                    "Warning. Sequence contains unsupported amino acid U. Skipping entry {}"
                    .format(line['index']))
                continue

            if mutant_subsequence in wildtype_subsequence:
                #This is not a novel peptide
                continue

            if (len(wildtype_subsequence) < self.epitope_length
                    or len(mutant_subsequence) < self.epitope_length):
                continue

            variant_id = line['index']
            for designation, subsequence in zip(
                ['WT', 'MT'], [wildtype_subsequence, mutant_subsequence]):
                key = '%s.%s' % (designation, variant_id)
                fasta_sequences.setdefault(subsequence, []).append(key)

    with open(self.output_file, 'w') as writer:
        with open(self.output_key_file, 'w') as key_writer:
            for count, (subsequence, keys) in enumerate(
                    fasta_sequences.items(), start=1):
                writer.write('>%s\n' % count)
                writer.write('%s\n' % subsequence)
                yaml.dump({count: keys},
                          key_writer,
                          default_flow_style=False)