Esempio n. 1
0
 def __init__(self, **kwargs):
     """Open the input VCF and the output TSV and validate the sample setup.

     Options are popped from ``kwargs`` after ``InputFileConverter.__init__``
     has run. ``self.input_file`` and ``self.output_file`` are read here but
     not assigned here. Every failed validation ends the program through
     ``sys.exit`` with an explanatory message.
     """
     InputFileConverter.__init__(self, **kwargs)

     # Optional auxiliary file paths; each one is None unless supplied.
     for path_option in (
             'gene_expn_file',
             'transcript_expn_file',
             'normal_snvs_coverage_file',
             'normal_indels_coverage_file',
             'tdna_snvs_coverage_file',
             'tdna_indels_coverage_file',
             'trna_snvs_coverage_file',
             'trna_indels_coverage_file',
     ):
         setattr(self, path_option, kwargs.pop(path_option, None))
     self.pass_only = kwargs.pop('pass_only', False)
     self.sample_name = kwargs.pop('sample_name', None)
     self.normal_sample_name = kwargs.pop('normal_sample_name', None)
     self.proximal_variants_vcf = kwargs.pop('proximal_variants_vcf', None)
     self.proximal_variants_tsv = kwargs.pop('proximal_variants_tsv', None)
     self.peptide_length = kwargs.pop('peptide_length', None)

     # All of these requirements only apply when a proximal variants VCF
     # was provided; they are checked in a fixed order.
     if self.proximal_variants_vcf:
         if not (self.proximal_variants_tsv and self.peptide_length):
             sys.exit("A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided.")
         if not lib.utils.is_gz_file(self.input_file):
             sys.exit("Input VCF {} needs to be bgzipped when running with a proximal variants VCF.".format(self.input_file))
         if not lib.utils.is_gz_file(self.proximal_variants_vcf):
             sys.exit("Proximal variants VCF {} needs to be bgzipped.".format(self.proximal_variants_vcf))
         if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
             sys.exit('No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'.format(self.proximal_variants_vcf))
         if not os.path.exists(self.input_file + '.tbi'):
             sys.exit('No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'.format(self.input_file))

     # Gzipped input is opened in binary mode, anything else as text.
     mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

     if self.proximal_variants_vcf:
         self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv, 'w')
         proximal_columns = [
             'chromosome_name', 'start', 'stop', 'reference', 'variant',
             'amino_acid_change', 'codon_change', 'protein_position',
             'type', 'main_somatic_variant',
         ]
         self.proximal_variants_writer = csv.DictWriter(
             self.proximal_variants_tsv_fh,
             delimiter='\t',
             fieldnames=proximal_columns)
         self.proximal_variants_writer.writeheader()
         self.proximal_variant_parser = ProximalVariant(self.proximal_variants_vcf, self.pass_only)
         # A second, independent handle on the input VCF.
         self.somatic_vcf_fh = open(self.input_file, mode)
         self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

     self.reader = open(self.input_file, mode)
     self.vcf_reader = vcf.Reader(self.reader)

     samples = self.vcf_reader.samples
     sample_count = len(samples)
     if sample_count == 0:
         sys.exit("VCF doesn't contain any sample genotype information.")
     elif sample_count == 1:
         # With a single sample the VCF's own sample ID always wins.
         self.sample_name = samples[0]
     else:
         if not self.sample_name:
             sys.exit("VCF contains more than one sample but sample_name is not set.")
         elif self.sample_name not in samples:
             sys.exit("sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.sample_name, self.input_file))
         if self.normal_sample_name is not None and self.normal_sample_name not in samples:
             sys.exit("normal_sample_name {} not a sample ID in the #CHROM header of VCF {}".format(self.normal_sample_name, self.input_file))

     self.writer = open(self.output_file, 'w')
     self.tsv_writer = csv.DictWriter(
         self.writer,
         delimiter='\t',
         fieldnames=self.output_headers(),
         restval='NA')
     self.tsv_writer.writeheader()
     self.csq_parser = self.create_csq_parser()
Esempio n. 2
0
    def add_proximal_variants(self, somatic_variant_index,
                              wildtype_subsequence, mutation_position,
                              original_position, germline_variants_only):
        """Return ``wildtype_subsequence`` with proximal substitutions applied.

        Proximal variants are looked up in
        ``self.proximal_variants[somatic_variant_index]``, a mapping of
        protein position to a list of row dicts with the keys ``type``,
        ``amino_acid_change`` and ``codon_change``. When
        ``germline_variants_only`` is true only rows whose ``type`` is
        ``'germline'`` are used. Several rows at one position are merged via
        ``ProximalVariant.combine_conflicting_variants``.

        Exits the program via ``sys.exit`` if the wildtype residue recorded
        for a proximal variant differs from the residue in
        ``wildtype_subsequence`` at the computed index.
        """
        # Shift between full-protein coordinates and subsequence coordinates.
        offset = original_position - mutation_position
        if somatic_variant_index not in self.proximal_variants:
            return wildtype_subsequence

        patched_subsequence = wildtype_subsequence
        variants_by_position = self.proximal_variants[somatic_variant_index]
        for protein_position, lines in variants_by_position.items():
            if protein_position == original_position:
                continue

            if germline_variants_only:
                candidates = [row for row in lines if row['type'] == 'germline']
            else:
                candidates = lines

            candidate_count = len(candidates)
            if candidate_count == 0:
                continue
            first = candidates[0]
            if candidate_count == 1:
                wildtype_aa, mutant_aa = first['amino_acid_change'].split('/')
            else:
                # Several variants hit the same residue: take the wildtype
                # residue from the first row and merge the codon changes.
                wildtype_aa = first['amino_acid_change'].split('/')[0]
                mutant_aa = ProximalVariant.combine_conflicting_variants(
                    [row['codon_change'] for row in candidates])

            index = int(protein_position) - 1 - offset
            # NOTE(review): index 0 is excluded as well as out-of-range
            # indices - confirm whether skipping the first residue is intended.
            if not 0 < index < len(wildtype_subsequence):
                continue
            if len(wildtype_aa) != len(mutant_aa):
                print("Nearby variant is not a missense mutation. Skipping.")
                continue
            if wildtype_subsequence[index] != wildtype_aa:
                message = (
                    "Error when processing proximal variant.\n" +
                    "The wildtype amino acid for variant %s with substring %s is different than expected.\n"
                    % (somatic_variant_index, wildtype_subsequence) +
                    "Actual wildtype amino acid: %s\n" %
                    wildtype_subsequence[index] +
                    "Wildtype amino acid of the proximal_variant: %s" %
                    wildtype_aa)
                sys.exit(message)
            patched_subsequence = (patched_subsequence[:index] + mutant_aa +
                                   patched_subsequence[index + 1:])
        return patched_subsequence
Esempio n. 3
0
class VcfConverter(InputFileConverter):
    def __init__(self, **kwargs):
        """Open the input VCF and output TSV and validate their contents.

        Options are popped from ``kwargs`` after
        ``InputFileConverter.__init__`` has run. ``self.input_file`` and
        ``self.output_file`` are read here but not assigned here. Every
        failed validation ends the program through ``sys.exit``.
        """
        InputFileConverter.__init__(self, **kwargs)
        self.pass_only = kwargs.pop('pass_only', False)
        self.sample_name = kwargs.pop('sample_name', None)
        self.normal_sample_name = kwargs.pop('normal_sample_name', None)
        self.proximal_variants_vcf = kwargs.pop('proximal_variants_vcf', None)
        self.proximal_variants_tsv = kwargs.pop('proximal_variants_tsv', None)
        self.peptide_length = kwargs.pop('peptide_length', None)

        # These requirements only apply when a proximal variants VCF was
        # provided; they are checked in a fixed order.
        if self.proximal_variants_vcf:
            if not (self.proximal_variants_tsv and self.peptide_length):
                sys.exit(
                    "A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided."
                )
            if not lib.utils.is_gz_file(self.input_file):
                sys.exit(
                    "Input VCF {} needs to be bgzipped when running with a proximal variants VCF."
                    .format(self.input_file))
            if not lib.utils.is_gz_file(self.proximal_variants_vcf):
                sys.exit(
                    "Proximal variants VCF {} needs to be bgzipped.".format(
                        self.proximal_variants_vcf))
            if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
                sys.exit(
                    'No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'
                    .format(self.proximal_variants_vcf))
            if not os.path.exists(self.input_file + '.tbi'):
                sys.exit(
                    'No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'
                    .format(self.input_file))

        # Gzipped input is opened in binary mode, anything else as text.
        mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

        if self.proximal_variants_vcf:
            self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv,
                                                 'w')
            proximal_columns = [
                'chromosome_name', 'start', 'stop', 'reference', 'variant',
                'amino_acid_change', 'codon_change', 'protein_position',
                'type', 'main_somatic_variant'
            ]
            self.proximal_variants_writer = csv.DictWriter(
                self.proximal_variants_tsv_fh,
                delimiter='\t',
                fieldnames=proximal_columns)
            self.proximal_variants_writer.writeheader()
            self.proximal_variant_parser = ProximalVariant(
                self.proximal_variants_vcf, self.pass_only)
            # A second, independent handle on the input VCF.
            self.somatic_vcf_fh = open(self.input_file, mode)
            self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

        self.reader = open(self.input_file, mode)
        self.vcf_reader = vcf.Reader(self.reader)

        samples = self.vcf_reader.samples
        sample_count = len(samples)
        if sample_count == 0:
            sys.exit("VCF doesn't contain any sample genotype information.")
        elif sample_count == 1:
            # With a single sample the VCF's own sample ID always wins.
            self.sample_name = samples[0]
        else:
            if not self.sample_name:
                sys.exit(
                    "VCF contains more than one sample but sample_name is not set."
                )
            elif self.sample_name not in samples:
                sys.exit(
                    "sample_name {} not a sample ID in the #CHROM header of VCF {}"
                    .format(self.sample_name, self.input_file))
            if (self.normal_sample_name is not None
                    and self.normal_sample_name not in samples):
                sys.exit(
                    "normal_sample_name {} not a sample ID in the #CHROM header of VCF {}"
                    .format(self.normal_sample_name, self.input_file))

        self.writer = open(self.output_file, 'w')
        self.tsv_writer = csv.DictWriter(self.writer,
                                         delimiter='\t',
                                         fieldnames=self.output_headers(),
                                         restval='NA')
        self.tsv_writer.writeheader()

        # Both annotation fields must be present in the CSQ format.
        self.csq_parser = self.create_csq_parser()
        csq_format = self.csq_parser.csq_format
        if 'DownstreamProtein' not in csq_format:
            sys.exit(
                "VCF doesn't contain VEP DownstreamProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
            )
        if 'WildtypeProtein' not in csq_format:
            sys.exit(
                "VCF doesn't contain VEP WildtypeProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
            )

    def is_insertion(self, ref, alt):
        """Return True when ``alt`` is strictly longer than ``ref``."""
        return len(ref) < len(alt)

    def is_deletion(self, ref, alt):
        """Return True when ``alt`` is strictly shorter than ``ref``."""
        return len(ref) > len(alt)

    def create_csq_parser(self):
        """Build a ``CsqParser`` from the CSQ INFO header of the input VCF.

        Reads ``self.vcf_reader.infos``. Exits the program via ``sys.exit``
        when there is no CSQ entry or when that entry is None; otherwise the
        parser is constructed from the entry's ``desc`` attribute.
        """
        infos = self.vcf_reader.infos
        if 'CSQ' not in infos:
            sys.exit(
                'Input VCF does not contain a CSQ header. Please annotate the VCF with VEP before running it.'
            )
        csq_header = infos['CSQ']
        if csq_header is None:
            sys.exit(
                'Failed to extract format string from info description for tag (CSQ)'
            )
        return CsqParser(csq_header.desc)

    def resolve_consequence(self, consequence_string, ref, alt):
        """Map a consequence string to the short variant type used here.

        ``consequence_string`` may hold several terms joined by ``&`` or, if
        it contains no ``&``, by ``.``; terms are compared case-insensitively.
        Returns ``'FS'``, ``'missense'``, ``'inframe_ins'``,
        ``'inframe_del'`` or ``None`` for anything unsupported. ``ref`` and
        ``alt`` are only consulted for ``protein_altering_variant``.
        """
        if '&' in consequence_string:
            separator = '&'
        elif '.' in consequence_string:
            separator = '.'
        else:
            separator = None

        if separator is None:
            terms = {consequence_string.lower()}
        else:
            terms = {
                term.lower() for term in consequence_string.split(separator)
            }

        # A lost start codon overrides every other term.
        if 'start_lost' in terms:
            return None

        # Checked in priority order; the first term present wins.
        prioritized = (
            ('frameshift_variant', 'FS'),
            ('missense_variant', 'missense'),
            ('inframe_insertion', 'inframe_ins'),
            ('inframe_deletion', 'inframe_del'),
        )
        for term, label in prioritized:
            if term in terms:
                return label

        if 'protein_altering_variant' in terms:
            # Classify by length change; only multiples of three count.
            length_change = len(alt) - len(ref)
            if length_change != 0 and length_change % 3 == 0:
                return 'inframe_del' if length_change < 0 else 'inframe_ins'
        return None

    def calculate_vaf(self, var_count, depth):
        """Return ``var_count / depth``, or ``'NA'`` when ``depth`` is 0."""
        return 'NA' if depth == 0 else (var_count / depth)

    def get_depth_from_vcf_genotype(self, genotype, tag):
        """Return ``genotype[tag]``, or ``'NA'`` when it is unavailable.

        ``'NA'`` is returned when the lookup raises ``AttributeError`` or
        yields ``None`` or an empty string.
        """
        try:
            value = genotype[tag]
            if value is None or value == "":
                return 'NA'
            return value
        except AttributeError:
            return 'NA'

    def get_vaf_from_vcf_genotype(self, genotype, alts, alt, af_tag, ad_tag,
                                  dp_tag):
        """Return the variant allele fraction of ``alt`` for ``genotype``.

        The value stored under ``af_tag`` is preferred. If reading or
        comparing it raises ``AttributeError`` or ``TypeError``, the fraction
        is computed from the allele depth (``ad_tag``) and total depth
        (``dp_tag``) instead. ``'NA'`` is returned when the required values
        are missing, empty, or inconsistent with ``alts``.
        """
        try:
            frequencies = genotype[af_tag]
            if isinstance(frequencies, list):
                vaf = frequencies[alts.index(alt)]
            else:
                vaf = frequencies
            # Comparing a missing (None) value raises TypeError and so
            # triggers the depth-based fallback below.
            if vaf > 1:
                print(
                    "Warning: VAF is expected to be a fraction, but is larger than 1. If VAFs are encoded as percentages, please adjust the coverage cutoffs accordingly."
                )
            return vaf
        except (AttributeError, TypeError):
            try:
                depths = genotype[ad_tag]
                if not isinstance(depths, list):
                    var_count = depths
                elif len(depths) == len(alts):
                    # One AD value per alternate allele.
                    var_count = depths[alts.index(alt)]
                elif len(depths) == len(alts) + 1:
                    # One extra leading AD value before the alternates.
                    var_count = depths[alts.index(alt) + 1]
                else:
                    print(
                        "Warning: Mismatch between the number of alternate alleles and number of values in the AD field for genotype {}"
                        .format(genotype))
                    return 'NA'
                if var_count is None or var_count == "":
                    return 'NA'
                depth = genotype[dp_tag]
                if depth is None or depth == "":
                    return 'NA'
                return self.calculate_vaf(int(var_count), int(depth))
            except AttributeError:
                return 'NA'

    def calculate_coverage_for_entry(self, entry, reference, alt, start,
                                     chromosome, genotype):
        """Collect depth and VAF values for one alternate allele.

        Returns a dict with ``tdna_depth``, ``trna_depth``, ``tdna_vaf`` and
        ``trna_vaf`` read from ``genotype`` (tags DP/RDP, AF/AD/DP and
        RAF/RAD/RDP). When ``self.normal_sample_name`` is set,
        ``normal_depth`` and ``normal_vaf`` are added from that sample's
        genotype. ``reference``, ``start`` and ``chromosome`` are accepted
        but not used.
        """
        coverage = {
            'tdna_depth': self.get_depth_from_vcf_genotype(genotype, 'DP'),
            'trna_depth': self.get_depth_from_vcf_genotype(genotype, 'RDP'),
        }
        alt_strings = [str(allele) for allele in entry.ALT]
        coverage['tdna_vaf'] = self.get_vaf_from_vcf_genotype(
            genotype, alt_strings, alt, 'AF', 'AD', 'DP')
        coverage['trna_vaf'] = self.get_vaf_from_vcf_genotype(
            genotype, alt_strings, alt, 'RAF', 'RAD', 'RDP')

        if self.normal_sample_name is None:
            return coverage

        normal_call = entry.genotype(self.normal_sample_name)
        coverage['normal_depth'] = self.get_depth_from_vcf_genotype(
            normal_call, 'DP')
        coverage['normal_vaf'] = self.get_vaf_from_vcf_genotype(
            normal_call, alt_strings, alt, 'AF', 'AD', 'DP')
        return coverage

    def write_proximal_variant_entries(self, entry, alt, transcript_name,
                                       index):
        """Write one proximal-variants TSV row per extracted proximal variant.

        Variants come from ``self.proximal_variant_parser.extract``. A
        variant is labelled ``'somatic'`` when ``self.somatic_vcf_reader``
        returns at least one record for its position and ``'germline'``
        otherwise. ``index`` is stored as the row's ``main_somatic_variant``.
        """
        extracted = self.proximal_variant_parser.extract(
            entry, alt, transcript_name, self.peptide_length)
        for variant, csq_entry in extracted:
            overlapping = list(
                self.somatic_vcf_reader.fetch(variant.CHROM, variant.POS - 1,
                                              variant.POS))
            variant_type = 'somatic' if len(overlapping) > 0 else 'germline'

            # Keep only the part before the first "/" when one is present.
            raw_position = csq_entry['Protein_position']
            if '/' in raw_position:
                protein_position = raw_position.split('/')[0]
            else:
                protein_position = raw_position

            self.proximal_variants_writer.writerow({
                'chromosome_name': variant.CHROM,
                'start': variant.affected_start,
                'stop': variant.affected_end,
                'reference': variant.REF,
                'variant': variant.ALT[0],
                'amino_acid_change': csq_entry['Amino_acids'],
                'codon_change': csq_entry['Codons'],
                'protein_position': protein_position,
                'type': variant_type,
                'main_somatic_variant': index,
            })

    def close_filehandles(self):
        """Close the output and input handles opened by this converter.

        ``self.writer`` and ``self.reader`` are always closed, in that order.
        The proximal-variant parser handle, the proximal TSV handle and the
        extra somatic VCF handle are closed only when
        ``self.proximal_variants_vcf`` is set.
        """
        for handle_name in ('writer', 'reader'):
            getattr(self, handle_name).close()
        if not self.proximal_variants_vcf:
            return
        self.proximal_variant_parser.fh.close()
        self.proximal_variants_tsv_fh.close()
        self.somatic_vcf_fh.close()

    def decode_hex(self, string):
        """Decode the hex digits of a regex match into UTF-8 text.

        ``string`` is a match object; every ``%`` is removed from its full
        matched text and the remaining hex digits are decoded.
        """
        matched_text = string.group(0)
        hex_digits = matched_text.replace('%', '')
        raw_bytes = binascii.unhexlify(hex_digits)
        return raw_bytes.decode('utf-8')

    def execute(self):
        """Write one TSV row per reportable (record, alt allele, transcript).

        Records are pulled from ``self.vcf_reader`` until it is exhausted. A
        record is skipped when the genotype of ``self.sample_name`` is
        uncalled or hom-ref (``gt_type`` is None or 0), when
        ``self.pass_only`` is set and FILTER is non-empty, or when it has no
        CSQ INFO entry. An alternate allele is skipped when the genotype has
        called bases and the allele is not among them. A transcript is
        skipped when ``resolve_consequence`` returns None, or when the
        sequence/amino-acid annotation needed for its type is empty.

        All file handles are closed through ``close_filehandles`` when the
        method finishes, including when it is interrupted by an error.

        Raises:
            Exception: if the VCF reader fails while parsing a record.
            SystemExit: if a generated row index has already been used.
        """
        # Percent-encoded bytes such as "%3D" in HGVS strings. The class is
        # plain hex; a literal "|" must not match because it cannot be
        # unhexlified.
        hex_escape = re.compile(r'%[0-9A-F][0-9A-F]')
        # Called bases are joined by "/" (unphased) or "|" (phased).
        genotype_separator = re.compile(r'[/|]')
        seen_indexes = set()
        count = 1
        try:
            while True:
                try:
                    entry = next(self.vcf_reader)
                except StopIteration:
                    break
                except ValueError as e:
                    # Report the quoted token from the parser message when
                    # there is one, otherwise the whole message.
                    quoted = str(e).split("'")
                    near = quoted[1] if len(quoted) > 1 else str(e)
                    raise Exception(
                        "VCF is truncated in the middle of an entry near string '{}'"
                        .format(near))
                except IndexError:
                    raise Exception("VCF is truncated at the end of the file")
                except Exception as e:
                    raise Exception("Error while reading VCF entry: {}".format(
                        str(e)))
                chromosome = entry.CHROM
                start = entry.affected_start
                reference = entry.REF
                alts = entry.ALT

                genotype = entry.genotype(self.sample_name)
                if genotype.gt_type is None or genotype.gt_type == 0:
                    # The genotype is uncalled or hom_ref
                    continue

                filt = entry.FILTER
                if self.pass_only and not (filt is None or len(filt) == 0):
                    continue

                if 'CSQ' not in entry.INFO:
                    continue

                alleles_dict = self.csq_parser.resolve_alleles(entry)
                gt_bases = genotype.gt_bases
                called_bases = (genotype_separator.split(gt_bases)
                                if gt_bases else None)
                for alt in alts:
                    alt = str(alt)
                    if called_bases and alt not in called_bases:
                        continue

                    coverage_for_entry = self.calculate_coverage_for_entry(
                        entry, reference, alt, start, chromosome, genotype)

                    # Look the allele up as written, then under its resolved
                    # CSQ allele, then (for deletions) under "deletion".
                    transcripts = self.csq_parser.parse_csq_entries_for_allele(
                        entry.INFO['CSQ'], alt)
                    if len(transcripts) == 0:
                        csq_allele = alleles_dict[alt]
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(
                            entry.INFO['CSQ'], csq_allele)
                    if len(transcripts) == 0 and self.is_deletion(
                            reference, alt):
                        transcripts = self.csq_parser.parse_csq_entries_for_allele(
                            entry.INFO['CSQ'], 'deletion')

                    for transcript in transcripts:
                        if '/' in transcript['Protein_position']:
                            protein_position = transcript[
                                'Protein_position'].split('/')[0]
                        else:
                            protein_position = transcript['Protein_position']
                        transcript_name = transcript['Feature']
                        consequence = self.resolve_consequence(
                            transcript['Consequence'], reference, alt)
                        if consequence is None:
                            continue
                        elif consequence == 'FS':
                            if transcript['DownstreamProtein'] == '':
                                print(
                                    "frameshift_variant transcript does not contain a DownstreamProtein sequence. Skipping.\n{} {} {} {} {}"
                                    .format(entry.CHROM, entry.POS, entry.REF,
                                            alt, transcript['Feature']))
                                continue
                            amino_acid_change_position = "%s%s/%s" % (
                                protein_position, entry.REF, alt)
                        else:
                            if transcript['Amino_acids'] == '':
                                print(
                                    "Transcript does not contain Amino_acids change information. Skipping.\n{} {} {} {} {}"
                                    .format(entry.CHROM, entry.POS, entry.REF,
                                            alt, transcript['Feature']))
                                continue
                            amino_acid_change_position = (
                                protein_position + transcript['Amino_acids'])
                        gene_name = transcript['SYMBOL']
                        index = '%s.%s.%s.%s.%s' % (
                            count, gene_name, transcript_name, consequence,
                            amino_acid_change_position)
                        if index in seen_indexes:
                            sys.exit(
                                "Warning: TSV index already exists: {}".format(
                                    index))
                        seen_indexes.add(index)
                        count += 1

                        if self.proximal_variants_vcf:
                            self.write_proximal_variant_entries(
                                entry, alt, transcript_name, index)

                        ensembl_gene_id = transcript['Gene']
                        if 'HGVSc' in transcript:
                            hgvsc = hex_escape.sub(self.decode_hex,
                                                   transcript['HGVSc'])
                        else:
                            hgvsc = 'NA'
                        if 'HGVSp' in transcript:
                            hgvsp = hex_escape.sub(self.decode_hex,
                                                   transcript['HGVSp'])
                        else:
                            hgvsp = 'NA'
                        if 'TSL' in transcript and transcript[
                                'TSL'] is not None and transcript['TSL'] != '':
                            tsl = transcript['TSL']
                        else:
                            tsl = 'NA'
                        output_row = {
                            'chromosome_name': entry.CHROM,
                            'start': entry.affected_start,
                            'stop': entry.affected_end,
                            'reference': entry.REF,
                            'variant': alt,
                            'gene_name': gene_name,
                            'transcript_name': transcript_name,
                            'transcript_support_level': tsl,
                            'ensembl_gene_id': ensembl_gene_id,
                            'hgvsc': hgvsc,
                            'hgvsp': hgvsp,
                            'wildtype_amino_acid_sequence':
                            transcript['WildtypeProtein'],
                            'downstream_amino_acid_sequence':
                            transcript['DownstreamProtein'],
                            'fusion_amino_acid_sequence': '',
                            'variant_type': consequence,
                            'protein_position': protein_position,
                            'index': index,
                            'protein_length_change':
                            transcript['ProteinLengthChange'],
                        }
                        # A missing amino_acid_change is left out of the row.
                        if transcript['Amino_acids']:
                            output_row['amino_acid_change'] = transcript[
                                'Amino_acids']

                        if transcript['Codons']:
                            output_row['codon_change'] = transcript['Codons']
                        else:
                            output_row['codon_change'] = 'NA'

                        # Expression values are "identifier|value" strings,
                        # stored either singly or as a list under TX / GX.
                        expression_sources = (
                            ('TX', 'transcript_expression',
                             (transcript_name, )),
                            ('GX', 'gene_expression',
                             (ensembl_gene_id, gene_name)),
                        )
                        for tag, key, comparison_fields in expression_sources:
                            if tag not in self.vcf_reader.formats:
                                continue
                            if tag not in genotype.data._asdict():
                                continue
                            expressions = genotype[tag]
                            if expressions is None:
                                continue
                            if not isinstance(expressions, list):
                                expressions = [expressions]
                            for expression in expressions:
                                (item, value) = expression.split('|')
                                if item in comparison_fields:
                                    output_row[key] = value

                        output_row.update(coverage_for_entry)

                        self.tsv_writer.writerow(output_row)
        finally:
            # Release every handle even when parsing aborts part-way.
            self.close_filehandles()
Esempio n. 4
0
class VcfConverter(InputFileConverter):
    def __init__(self, **kwargs):
        """Validate the run options and open the input VCF and output TSV.

        All keyword arguments are first forwarded to
        ``InputFileConverter.__init__``; this class then reads the optional
        expression files, bam-readcount coverage files, ``pass_only``,
        ``sample_name``, ``normal_sample_name``, ``proximal_variants_vcf``,
        ``proximal_variants_tsv`` and ``peptide_length`` from them.

        The process is terminated through ``sys.exit`` when the proximal
        variant options are incomplete, a required tabix index is missing,
        or the sample names cannot be resolved against the VCF header.
        """
        InputFileConverter.__init__(self, **kwargs)
        for option in ('gene_expn_file', 'transcript_expn_file',
                       'normal_snvs_coverage_file',
                       'normal_indels_coverage_file',
                       'tdna_snvs_coverage_file', 'tdna_indels_coverage_file',
                       'trna_snvs_coverage_file',
                       'trna_indels_coverage_file'):
            setattr(self, option, kwargs.pop(option, None))
        self.pass_only = kwargs.pop('pass_only', False)
        for option in ('sample_name', 'normal_sample_name',
                       'proximal_variants_vcf', 'proximal_variants_tsv',
                       'peptide_length'):
            setattr(self, option, kwargs.pop(option, None))

        # All proximal-variant preconditions are only relevant when a
        # proximal variants VCF was supplied.
        if self.proximal_variants_vcf:
            if not (self.proximal_variants_tsv and self.peptide_length):
                sys.exit(
                    "A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided"
                )
            if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
                sys.exit(
                    'No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed'
                    .format(self.proximal_variants_vcf))
            if not os.path.exists(self.input_file + '.tbi'):
                sys.exit(
                    'No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'
                    .format(self.input_file))

        # Gzipped input has to be opened in binary mode.
        mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

        if self.proximal_variants_vcf:
            proximal_headers = [
                'chromosome_name', 'start', 'stop', 'reference', 'variant',
                'amino_acid_change', 'codon_change', 'protein_position',
                'type', 'main_somatic_variant'
            ]
            self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv,
                                                 'w')
            self.proximal_variants_writer = csv.DictWriter(
                self.proximal_variants_tsv_fh,
                delimiter='\t',
                fieldnames=proximal_headers)
            self.proximal_variants_writer.writeheader()
            self.proximal_variant_parser = ProximalVariant(
                self.proximal_variants_vcf, self.pass_only)
            # Second reader over the same input, used for region lookups.
            self.somatic_vcf_fh = open(self.input_file, mode)
            self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

        self.reader = open(self.input_file, mode)
        self.vcf_reader = vcf.Reader(self.reader)

        sample_count = len(self.vcf_reader.samples)
        if sample_count == 0:
            sys.exit("VCF doesn't contain any sample genotype information.")
        elif sample_count == 1:
            # A single-sample VCF always uses that sample.
            self.sample_name = self.vcf_reader.samples[0]
        else:
            if not self.sample_name:
                sys.exit(
                    "VCF contains more than one sample but sample_name is not set."
                )
            elif self.sample_name not in self.vcf_reader.samples:
                sys.exit("sample_name {} not in VCF {}".format(
                    self.sample_name, self.input_file))
            normal_is_unknown = (
                self.normal_sample_name is not None
                and self.normal_sample_name not in self.vcf_reader.samples)
            if normal_is_unknown:
                sys.exit("normal_sample_name {} not in VCF {}".format(
                    self.normal_sample_name, self.input_file))

        self.writer = open(self.output_file, 'w')
        self.tsv_writer = csv.DictWriter(self.writer,
                                         delimiter='\t',
                                         fieldnames=self.output_headers(),
                                         restval='NA')
        self.tsv_writer.writeheader()
        self.csq_parser = self.create_csq_parser()

    def parse_bam_readcount_file(self, bam_readcount_file):
        with open(bam_readcount_file, 'r') as reader:
            coverage_tsv_reader = csv.reader(reader, delimiter='\t')
            coverage = {}
            for row in coverage_tsv_reader:
                chromosome = row[0]
                position = row[1]
                reference_base = row[2].upper()
                depth = row[3]
                brct = row[4:]
                if chromosome not in coverage:
                    coverage[chromosome] = {}
                if position not in coverage[chromosome]:
                    coverage[chromosome][position] = {}
                coverage[chromosome][position][
                    reference_base] = self.parse_brct_field(brct)
                coverage[chromosome][position][reference_base]['depth'] = depth
        return coverage

    def parse_brct_field(self, brct_entry):
        """Map each upper-cased base to its count for a list of
        colon-separated per-base entries.

        Every entry must contain at least two colons; the first field is the
        base, the second the count (kept as a string), the rest is ignored.
        """
        counts_by_base = {}
        split_entries = (item.split(':', 2) for item in brct_entry)
        for base, count, _remainder in split_entries:
            counts_by_base[base.upper()] = count
        return counts_by_base

    def is_insertion(self, ref, alt):
        """Return True when the alternate allele is longer than the reference."""
        return len(ref) < len(alt)

    def is_deletion(self, ref, alt):
        """Return True when the alternate allele is shorter than the reference."""
        return len(ref) > len(alt)

    def simplify_indel_allele(self, ref, alt):
        """Strip the bases shared by both alleles.

        The common suffix is removed first, then the common prefix of what
        remains. Returns the trimmed ``(ref, alt)`` pair.
        """
        # Length of the shared suffix.
        shared = 0
        limit = min(len(ref), len(alt))
        while shared < limit and ref[-1 - shared] == alt[-1 - shared]:
            shared += 1
        ref = ref[:len(ref) - shared]
        alt = alt[:len(alt) - shared]

        # Length of the shared prefix of the suffix-trimmed alleles.
        shared = 0
        limit = min(len(ref), len(alt))
        while shared < limit and ref[shared] == alt[shared]:
            shared += 1
        return ref[shared:], alt[shared:]

    def create_csq_parser(self):
        info_fields = self.vcf_reader.infos

        if 'CSQ' not in info_fields:
            sys.exit(
                'Input VCF does not contain a CSQ header. Please annotate the VCF with VEP before running it.'
            )
        if info_fields['CSQ'] is None:
            sys.exit(
                'Failed to extract format string from info description for tag (CSQ)'
            )
        else:
            csq_header = info_fields['CSQ']
            return CsqParser(csq_header.desc)

    def resolve_consequence(self, consequence_string):
        """Reduce a consequence string to the variant type label used in the
        TSV, or ``None`` when the variant is not handled.

        The string is split on ``&`` (or, failing that, on ``.``) and
        compared case-insensitively. ``start_lost`` always yields ``None``;
        otherwise the first match in the order frameshift, missense, inframe
        insertion, inframe deletion decides the label.
        """
        separator = next(
            (char for char in ('&', '.') if char in consequence_string), None)
        if separator is None:
            terms = {consequence_string.lower()}
        else:
            terms = {
                term.lower()
                for term in consequence_string.split(separator)
            }

        if 'start_lost' in terms:
            return None

        labels_by_priority = (
            ('frameshift_variant', 'FS'),
            ('missense_variant', 'missense'),
            ('inframe_insertion', 'inframe_ins'),
            ('inframe_deletion', 'inframe_del'),
        )
        for term, label in labels_by_priority:
            if term in terms:
                return label
        return None

    def calculate_vaf(self, var_count, depth):
        """Return ``var_count / depth``, or the string ``'NA'`` when the
        depth is zero."""
        return 'NA' if depth == 0 else (var_count / depth)

    def parse_gene_expns_file(self):
        gene_expns = {}
        if self.gene_expn_file is not None:
            with open(self.gene_expn_file, 'r') as reader:
                genes_tsv_reader = csv.DictReader(reader, delimiter='\t')
                for row in genes_tsv_reader:
                    if row['tracking_id'] not in gene_expns.keys():
                        gene_expns[row['tracking_id']] = {}
                    gene_expns[row['tracking_id']][row['locus']] = row
        return gene_expns

    def parse_transcript_expns_file(self):
        transcript_expns = {}
        if self.transcript_expn_file is not None:
            with open(self.transcript_expn_file, 'r') as reader:
                isoforms_tsv_reader = csv.DictReader(reader, delimiter='\t')
                for row in isoforms_tsv_reader:
                    transcript_expns[row['tracking_id']] = row
        return transcript_expns

    def parse_coverage_files(self):
        coverage = {}
        for variant_type in ['snvs', 'indels']:
            for data_type in ['normal', 'tdna', 'trna']:
                coverage_file_name = '_'.join(
                    [data_type, variant_type, 'coverage_file'])
                coverage_file = getattr(self, coverage_file_name)
                if coverage_file is not None:
                    if variant_type not in coverage:
                        coverage[variant_type] = {}
                    coverage[variant_type][
                        data_type] = self.parse_bam_readcount_file(
                            coverage_file)
        return coverage

    def determine_bam_readcount_bases(self, entry, reference, alt, start):
        if len(reference) == len(alt):
            bam_readcount_position = entry.POS
            variant_type = 'snvs'
            ref_base = reference
            var_base = alt
        else:
            if self.is_deletion(reference, alt):
                bam_readcount_position = start + 1
                (simplified_reference,
                 simplified_alt) = self.simplify_indel_allele(reference, alt)
                ref_base = reference[1:2]
                var_base = '-' + simplified_reference
            elif self.is_insertion(reference, alt):
                bam_readcount_position = start
                (simplified_reference,
                 simplified_alt) = self.simplify_indel_allele(reference, alt)
                ref_base = reference
                var_base = '+' + simplified_alt
            variant_type = 'indels'
        return (bam_readcount_position, ref_base, var_base, variant_type)

    def get_depth_from_vcf_genotype(self, genotype, tag):
        """Return the genotype's value for ``tag``, or ``'NA'`` when the
        lookup raises ``AttributeError`` or the value is ``None`` or empty."""
        try:
            value = genotype[tag]
            unavailable = value is None or value == ""
        except AttributeError:
            unavailable = True
        return 'NA' if unavailable else value

    def get_vaf_from_vcf_genotype(self, genotype, alts, alt, af_tag, ad_tag,
                                  dp_tag):
        """Return the variant allele frequency of ``alt`` for a genotype.

        The value stored under ``af_tag`` is used when it can be read. If
        reading or checking it raises ``AttributeError`` or ``TypeError``,
        the VAF is instead computed from the allele depth (``ad_tag``) and
        total depth (``dp_tag``) via ``calculate_vaf``.

        Args:
            genotype: object indexable by FORMAT tag name.
            alts: the alternate alleles of the VCF entry; ``alt`` is located
                in it with ``alts.index(alt)``.
            alt: the alternate allele of interest.
            af_tag, ad_tag, dp_tag: FORMAT tag names for allele frequency,
                allele depth and total depth.

        Returns:
            The VAF, or the string ``'NA'`` when it cannot be determined.
        """
        try:
            allele_frequencies = genotype[af_tag]
            if isinstance(allele_frequencies, list):
                vaf = allele_frequencies[alts.index(alt)]
            else:
                vaf = allele_frequencies
            # Only warns; the value is still returned unchanged.
            # NOTE(review): a None value presumably lands in the TypeError
            # handler below through this comparison -- confirm.
            if vaf > 1:
                print(
                    "Warning: VAF is expected to be a fraction, but is larger than 1. If VAFs are encoded as percentages, please adjust the coverage cutoffs accordingly."
                )
        except (AttributeError, TypeError):
            # Fall back to computing the VAF from allele depth / total depth.
            try:
                allele_depths = genotype[ad_tag]
                if isinstance(allele_depths, list):
                    # The AD list either has one value per alternate allele,
                    # or one extra leading value (presumably the reference
                    # depth), in which case the index is shifted by one.
                    if len(allele_depths) == len(alts):
                        var_count = allele_depths[alts.index(alt)]
                    elif len(allele_depths) == len(alts) + 1:
                        var_count = allele_depths[alts.index(alt) + 1]
                    else:
                        print(
                            "Warning: Mismatch between the number of alternate alleles and number of values in the AD field for genotype {}"
                            .format(genotype))
                        return 'NA'
                else:
                    var_count = allele_depths
                if var_count is None or var_count == "":
                    return 'NA'
                depth = genotype[dp_tag]
                if depth is None or depth == "":
                    return 'NA'
                vaf = self.calculate_vaf(int(var_count), int(depth))
            except AttributeError:
                # Raised by the genotype lookups when a tag is unavailable.
                vaf = 'NA'
        return vaf

    def calculate_coverage_for_entry(self, coverage, entry, reference, alt,
                                     start, chromosome, genotype):
        (bam_readcount_position, ref_base, var_base,
         variant_type) = self.determine_bam_readcount_bases(
             entry, reference, alt, start)
        coverage_for_entry = {}
        if variant_type in coverage:
            for coverage_type in coverage[variant_type]:
                if (chromosome in coverage[variant_type][coverage_type]
                        and str(bam_readcount_position)
                        in coverage[variant_type][coverage_type][chromosome]
                        and ref_base in coverage[variant_type][coverage_type]
                    [chromosome][str(bam_readcount_position)]):
                    brct = coverage[variant_type][coverage_type][chromosome][
                        str(bam_readcount_position)][ref_base]
                    if 'depth' in brct and var_base in brct:
                        coverage_for_entry[coverage_type + '_depth'] = int(
                            brct['depth'])
                        coverage_for_entry[coverage_type +
                                           '_vaf'] = self.calculate_vaf(
                                               int(brct[var_base]),
                                               int(brct['depth']))
        else:
            coverage_for_entry[
                'tdna_depth'] = self.get_depth_from_vcf_genotype(
                    genotype, 'DP')
            coverage_for_entry[
                'trna_depth'] = self.get_depth_from_vcf_genotype(
                    genotype, 'RDP')
            coverage_for_entry['tdna_vaf'] = self.get_vaf_from_vcf_genotype(
                genotype, entry.ALT, alt, 'AF', 'AD', 'DP')
            coverage_for_entry['trna_vaf'] = self.get_vaf_from_vcf_genotype(
                genotype, entry.ALT, alt, 'RAF', 'RAD', 'RDP')
            if self.normal_sample_name is not None:
                normal_genotype = entry.genotype(self.normal_sample_name)
                coverage_for_entry[
                    'normal_depth'] = self.get_depth_from_vcf_genotype(
                        normal_genotype, 'DP')
                coverage_for_entry[
                    'normal_vaf'] = self.get_vaf_from_vcf_genotype(
                        normal_genotype, entry.ALT, alt, 'AF', 'AD', 'DP')
        return coverage_for_entry

    def write_proximal_variant_entries(self, entry, alt, transcript_name,
                                       index):
        proximal_variants = self.proximal_variant_parser.extract(
            entry, alt, transcript_name, self.peptide_length)
        for (proximal_variant, csq_entry) in proximal_variants:
            if len(
                    list(
                        self.somatic_vcf_reader.fetch(
                            proximal_variant.CHROM, proximal_variant.POS - 1,
                            proximal_variant.POS))) > 0:
                proximal_variant_type = 'somatic'
            else:
                proximal_variant_type = 'germline'
            proximal_variant_entry = {
                'chromosome_name': proximal_variant.CHROM,
                'start': proximal_variant.affected_start,
                'stop': proximal_variant.affected_end,
                'reference': proximal_variant.REF,
                'variant': proximal_variant.ALT[0],
                'amino_acid_change': csq_entry['Amino_acids'],
                'codon_change': csq_entry['Codons'],
                'protein_position': csq_entry['Protein_position'],
                'type': proximal_variant_type,
                'main_somatic_variant': index,
            }
            self.proximal_variants_writer.writerow(proximal_variant_entry)

    def close_filehandles(self):
        self.writer.close()
        self.reader.close()
        if self.proximal_variants_vcf:
            self.proximal_variant_parser.fh.close()
            self.proximal_variants_tsv_fh.close()
            self.somatic_vcf_fh.close()

    def decode_hex(self, string):
        """Decode the percent-encoded text of a regex match.

        ``string`` is a match object; every ``%`` is removed from the matched
        text and the remaining hex digits are decoded as UTF-8.
        """
        matched_text = string.group(0)
        raw_bytes = binascii.unhexlify(matched_text.replace('%', ''))
        return raw_bytes.decode('utf-8')

    def execute(self):
        gene_expns = self.parse_gene_expns_file()
        transcript_expns = self.parse_transcript_expns_file()
        coverage = self.parse_coverage_files()

        indexes = []
        count = 1
        for entry in self.vcf_reader:
            chromosome = entry.CHROM
            start = entry.affected_start
            stop = entry.affected_end
            reference = entry.REF
            alts = entry.ALT

            genotype = entry.genotype(self.sample_name)
            if genotype.gt_type is None or genotype.gt_type == 0:
                #The genotype is uncalled or hom_ref
                continue

            filt = entry.FILTER
            if self.pass_only and not (filt is None or len(filt) == 0):
                continue

            if 'CSQ' not in entry.INFO:
                continue

            alleles_dict = self.csq_parser.resolve_alleles(entry)
            for alt in alts:
                alt = str(alt)
                if genotype.gt_bases and alt not in genotype.gt_bases.split(
                        '/'):
                    continue

                coverage_for_entry = self.calculate_coverage_for_entry(
                    coverage, entry, reference, alt, start, chromosome,
                    genotype)

                transcripts = self.csq_parser.parse_csq_entries_for_allele(
                    entry.INFO['CSQ'], alt)
                if len(transcripts) == 0:
                    csq_allele = alleles_dict[alt]
                    transcripts = self.csq_parser.parse_csq_entries_for_allele(
                        entry.INFO['CSQ'], csq_allele)
                if len(transcripts) == 0 and self.is_deletion(reference, alt):
                    transcripts = self.csq_parser.parse_csq_entries_for_allele(
                        entry.INFO['CSQ'], 'deletion')

                for transcript in transcripts:
                    transcript_name = transcript['Feature']
                    consequence = self.resolve_consequence(
                        transcript['Consequence'])
                    if consequence is None:
                        continue
                    elif consequence == 'FS':
                        if transcript['DownstreamProtein'] == '':
                            print(
                                "frameshift_variant transcript does not contain a DownstreamProtein sequence. Skipping.\n{} {} {} {} {}"
                                .format(entry.CHROM, entry.POS, entry.REF, alt,
                                        transcript['Feature']))
                            continue
                        else:
                            amino_acid_change_position = "%s%s/%s" % (
                                transcript['Protein_position'], entry.REF, alt)
                    else:
                        if transcript['Amino_acids'] == '':
                            print(
                                "Transcript does not contain Amino_acids change information. Skipping.\n{} {} {} {} {}"
                                .format(entry.CHROM, entry.POS, entry.REF, alt,
                                        transcript['Feature']))
                            continue
                        else:
                            amino_acid_change_position = transcript[
                                'Protein_position'] + transcript['Amino_acids']
                    gene_name = transcript['SYMBOL']
                    index = '%s.%s.%s.%s.%s' % (count, gene_name,
                                                transcript_name, consequence,
                                                amino_acid_change_position)
                    if index in indexes:
                        sys.exit(
                            "Warning: TSV index already exists: {}".format(
                                index))
                    else:
                        indexes.append(index)
                        count += 1

                    if self.proximal_variants_vcf:
                        self.write_proximal_variant_entries(
                            entry, alt, transcript_name, index)

                    ensembl_gene_id = transcript['Gene']
                    hgvsc = re.sub(
                        r'%[0-9|A-F][0-9|A-F]', self.decode_hex,
                        transcript['HGVSc']) if 'HGVSc' in transcript else 'NA'
                    hgvsp = re.sub(
                        r'%[0-9|A-F][0-9|A-F]', self.decode_hex,
                        transcript['HGVSp']) if 'HGVSp' in transcript else 'NA'
                    if 'TSL' in transcript and transcript[
                            'TSL'] is not None and transcript['TSL'] != '':
                        tsl = transcript['TSL']
                    else:
                        tsl = 'NA'
                    output_row = {
                        'chromosome_name':
                        entry.CHROM,
                        'start':
                        entry.affected_start,
                        'stop':
                        entry.affected_end,
                        'reference':
                        entry.REF,
                        'variant':
                        alt,
                        'gene_name':
                        gene_name,
                        'transcript_name':
                        transcript_name,
                        'transcript_support_level':
                        tsl,
                        'ensembl_gene_id':
                        ensembl_gene_id,
                        'hgvsc':
                        hgvsc,
                        'hgvsp':
                        hgvsp,
                        'wildtype_amino_acid_sequence':
                        transcript['WildtypeProtein'],
                        'downstream_amino_acid_sequence':
                        transcript['DownstreamProtein'],
                        'fusion_amino_acid_sequence':
                        '',
                        'variant_type':
                        consequence,
                        'protein_position':
                        transcript['Protein_position'],
                        'index':
                        index,
                        'protein_length_change':
                        transcript['ProteinLengthChange'],
                    }
                    if transcript['Amino_acids']:
                        output_row['amino_acid_change'] = transcript[
                            'Amino_acids']

                    if transcript['Codons']:
                        output_row['codon_change'] = transcript['Codons']
                    else:
                        output_row['codon_change'] = 'NA'

                    if transcript_name in transcript_expns.keys():
                        transcript_expn_entry = transcript_expns[
                            transcript_name]
                        output_row[
                            'transcript_expression'] = transcript_expn_entry[
                                'FPKM']
                    elif 'TX' in self.vcf_reader.formats:
                        if 'TX' in genotype.data._asdict():
                            transcript_expressions = genotype['TX']
                            if isinstance(transcript_expressions, list):
                                for transcript_expression in transcript_expressions:
                                    (transcript,
                                     value) = transcript_expression.split('|')
                                    if transcript == transcript_name:
                                        output_row[
                                            'transcript_expression'] = value
                            else:
                                (transcript,
                                 value) = transcript_expressions.split('|')
                                if transcript == transcript_name:
                                    output_row['transcript_expression'] = value

                    if ensembl_gene_id in gene_expns.keys():
                        gene_expn_entries = gene_expns[ensembl_gene_id]
                        gene_fpkm = 0
                        for locus, gene_expn_entry in gene_expn_entries.items(
                        ):
                            gene_fpkm += float(gene_expn_entry['FPKM'])
                        output_row['gene_expression'] = gene_fpkm
                    elif 'GX' in self.vcf_reader.formats:
                        if 'GX' in genotype.data._asdict():
                            gene_expressions = genotype['GX']
                            if isinstance(gene_expressions, list):
                                for gene_expression in gene_expressions:
                                    (gene, value) = gene_expression.split('|')
                                    if ensembl_gene_id == gene or gene_name == gene:
                                        output_row['gene_expression'] = value
                            else:
                                (gene, value) = gene_expressions.split('|')
                                if ensembl_gene_id == gene or gene_name == gene:
                                    output_row['gene_expression'] = value

                    output_row.update(coverage_for_entry)

                    self.tsv_writer.writerow(output_row)

        self.close_filehandles()
Esempio n. 5
0
 def __init__(self, **kwargs):
     """Validate the run options and open the input VCF and output TSV.

     All keyword arguments are first forwarded to
     ``InputFileConverter.__init__``; ``pass_only``, ``sample_name``,
     ``normal_sample_name``, ``proximal_variants_vcf``,
     ``proximal_variants_tsv`` and ``peptide_length`` are then read from
     them.

     The process is terminated through ``sys.exit`` when the proximal
     variant options are incomplete, a VCF is not gzipped or not tabix
     indexed as required, or the CSQ annotation lacks the
     ``DownstreamProtein`` / ``WildtypeProtein`` fields.
     """
     InputFileConverter.__init__(self, **kwargs)
     self.pass_only = kwargs.pop('pass_only', False)
     for option in ('sample_name', 'normal_sample_name',
                    'proximal_variants_vcf', 'proximal_variants_tsv',
                    'peptide_length'):
         setattr(self, option, kwargs.pop(option, None))

     # All proximal-variant preconditions are only relevant when a proximal
     # variants VCF was supplied.
     if self.proximal_variants_vcf:
         if not (self.proximal_variants_tsv and self.peptide_length):
             sys.exit(
                 "A proximal variants TSV output path and peptide length need to be specified if a proximal variants input VCF is provided."
             )
         if not lib.utils.is_gz_file(self.input_file):
             sys.exit(
                 "Input VCF {} needs to be bgzipped when running with a proximal variants VCF."
                 .format(self.input_file))
         if not lib.utils.is_gz_file(self.proximal_variants_vcf):
             sys.exit("Proximal variants VCF {} needs to be bgzipped.".format(
                 self.proximal_variants_vcf))
         if not os.path.exists(self.proximal_variants_vcf + '.tbi'):
             sys.exit(
                 'No .tbi file found for proximal variants VCF {}. Proximal variants VCF needs to be tabix indexed.'
                 .format(self.proximal_variants_vcf))
         if not os.path.exists(self.input_file + '.tbi'):
             sys.exit(
                 'No .tbi file found for input VCF {}. Input VCF needs to be tabix indexed if processing with proximal variants.'
                 .format(self.input_file))

     # Gzipped input has to be opened in binary mode.
     mode = 'rb' if lib.utils.is_gz_file(self.input_file) else 'r'

     if self.proximal_variants_vcf:
         proximal_headers = [
             'chromosome_name', 'start', 'stop', 'reference', 'variant',
             'amino_acid_change', 'codon_change', 'protein_position',
             'type', 'main_somatic_variant'
         ]
         self.proximal_variants_tsv_fh = open(self.proximal_variants_tsv,
                                              'w')
         self.proximal_variants_writer = csv.DictWriter(
             self.proximal_variants_tsv_fh,
             delimiter='\t',
             fieldnames=proximal_headers)
         self.proximal_variants_writer.writeheader()
         self.proximal_variant_parser = ProximalVariant(
             self.proximal_variants_vcf, self.pass_only)
         # Second reader over the same input, used for region lookups.
         self.somatic_vcf_fh = open(self.input_file, mode)
         self.somatic_vcf_reader = vcf.Reader(self.somatic_vcf_fh)

     self.reader = open(self.input_file, mode)
     self.vcf_reader = vcf.Reader(self.reader)
     self.writer = open(self.output_file, 'w')
     self.tsv_writer = csv.DictWriter(self.writer,
                                      delimiter='\t',
                                      fieldnames=self.output_headers(),
                                      restval='NA')
     self.tsv_writer.writeheader()
     self.csq_parser = self.create_csq_parser()

     # Both protein sequence plugins must have been run during annotation.
     csq_format = self.csq_parser.csq_format
     if 'DownstreamProtein' not in csq_format:
         sys.exit(
             "VCF doesn't contain VEP DownstreamProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
         )
     if 'WildtypeProtein' not in csq_format:
         sys.exit(
             "VCF doesn't contain VEP WildtypeProtein annotations. Please re-annotate the VCF with VEP and the Wildtype and Downstream plugins."
         )
Esempio n. 6
0
    def execute(self):
        peptide_sequence_length = self.peptide_sequence_length
        reader = open(self.input_file, 'r')
        tsvin = csv.DictReader(reader, delimiter='\t')
        fasta_sequences = OrderedDict()
        for line in tsvin:
            variant_type = line['variant_type']
            full_wildtype_sequence = line['wildtype_amino_acid_sequence']
            if variant_type == 'FS':
                position = int(line['protein_position'].split('-', 1)[0]) - 1
                if line['amino_acid_change'] is not None and line[
                        'amino_acid_change'].split('/')[0] == '-':
                    if line['wildtype_amino_acid_sequence'][position] != line[
                            'downstream_amino_acid_sequence'][0]:
                        raise Exception(
                            "Leading amino acid of the Downstream protein sequence ({}) expected to match the wildtype amino acid at postion {} ({}). " \
                            "You may need to reannotate your VCF with a newer version of VEP." \
                            .format(line['downstream_amino_acid_sequence'], position, line['wildtype_amino_acid_sequence'][position])
                        )
            elif variant_type == 'missense' or variant_type == 'inframe_ins':
                if '/' not in line['amino_acid_change']:
                    continue
                wildtype_amino_acid, mutant_amino_acid = line[
                    'amino_acid_change'].split('/')
                if '*' in wildtype_amino_acid:
                    wildtype_amino_acid = wildtype_amino_acid.split('*')[0]
                elif 'X' in wildtype_amino_acid:
                    wildtype_amino_acid = wildtype_amino_acid.split('X')[0]
                if '*' in mutant_amino_acid:
                    mutant_amino_acid = mutant_amino_acid.split('*')[0]
                    stop_codon_added = True
                elif 'X' in mutant_amino_acid:
                    mutant_amino_acid = mutant_amino_acid.split('X')[0]
                    stop_codon_added = True
                else:
                    stop_codon_added = False
                if wildtype_amino_acid == '-':
                    position = int(line['protein_position'].split('-', 1)[0])
                    wildtype_amino_acid_length = 0
                else:
                    if '-' in line['protein_position']:
                        position = int(line['protein_position'].split(
                            '-', 1)[0]) - 1
                        wildtype_amino_acid_length = len(wildtype_amino_acid)
                    else:
                        position = int(line['protein_position']) - 1
                        wildtype_amino_acid_length = len(wildtype_amino_acid)
            elif variant_type == 'inframe_del':
                variant_type = 'inframe_del'
                wildtype_amino_acid, mutant_amino_acid = line[
                    'amino_acid_change'].split('/')
                if '*' in wildtype_amino_acid:
                    wildtype_amino_acid = wildtype_amino_acid.split('*')[0]
                elif 'X' in wildtype_amino_acid:
                    wildtype_amino_acid = wildtype_amino_acid.split('X')[0]
                if '*' in mutant_amino_acid:
                    mutant_amino_acid = mutant_amino_acid.split('*')[0]
                    stop_codon_added = True
                elif 'X' in mutant_amino_acid:
                    mutant_amino_acid = mutant_amino_acid.split('X')[0]
                    stop_codon_added = True
                else:
                    stop_codon_added = False
                position = int(line['protein_position'].split('-', 1)[0]) - 1
                wildtype_amino_acid_length = len(wildtype_amino_acid)
                if mutant_amino_acid == '-':
                    mutant_amino_acid = ''
            else:
                continue

            if self.position_out_of_bounds(position, full_wildtype_sequence):
                continue

            if variant_type == 'missense' and line[
                    'index'] in self.proximal_variants and line[
                        'protein_position'] in self.proximal_variants[
                            line['index']]:
                codon_changes = [
                    item['codon_change'] for item in self.proximal_variants[
                        line['index']][line['protein_position']]
                ]
                codon_changes.append(line['codon_change'])
                mutant_amino_acid_with_proximal_variants = ProximalVariant.combine_conflicting_variants(
                    codon_changes)
            elif variant_type != 'FS':
                mutant_amino_acid_with_proximal_variants = mutant_amino_acid

            if variant_type == 'FS':
                wildtype_subsequence, left_flanking_subsequence = self.get_frameshift_subsequences(
                    position, full_wildtype_sequence, peptide_sequence_length,
                    line)
                downstream_sequence = line['downstream_amino_acid_sequence']
                if self.downstream_sequence_length and len(
                        downstream_sequence) > self.downstream_sequence_length:
                    downstream_sequence = downstream_sequence[
                        0:self.downstream_sequence_length]
                mutation_start_position = len(left_flanking_subsequence)
                wildtype_subsequence = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, True)
                left_flanking_subsequence_with_proximal_variants = self.add_proximal_variants(
                    line['index'], left_flanking_subsequence,
                    mutation_start_position, position, False)
                #The caveat here is that if a nearby variant is in the downstream sequence, the protein sequence would be further altered, which we aren't taking into account.
                #we would need to recalculate the downstream protein sequence taking all downstream variants into account.
                mutant_subsequence = left_flanking_subsequence_with_proximal_variants + downstream_sequence
            else:
                mutation_start_position, wildtype_subsequence = self.get_wildtype_subsequence(
                    position, full_wildtype_sequence,
                    wildtype_amino_acid_length, peptide_sequence_length, line)
                mutation_end_position = mutation_start_position + wildtype_amino_acid_length
                if wildtype_amino_acid != '-' and wildtype_amino_acid != wildtype_subsequence[
                        mutation_start_position:mutation_end_position]:
                    if line['amino_acid_change'].split('/')[0].count('*') > 1:
                        print(
                            "Warning: Amino acid change is not sane - contains multiple stops. Skipping entry {}"
                            .format(line['index']))
                        continue
                    else:
                        sys.exit(
                            "ERROR: There was a mismatch between the actual wildtype amino acid sequence ({}) and the expected amino acid sequence ({}). Did you use the same reference build version for VEP that you used for creating the VCF?\n{}"
                            .format(
                                wildtype_subsequence[mutation_start_position:
                                                     mutation_end_position],
                                wildtype_amino_acid, line))
                wildtype_subsequence_with_proximal_variants = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, False)
                wildtype_subsequence = self.add_proximal_variants(
                    line['index'], wildtype_subsequence,
                    mutation_start_position, position, True)
                if stop_codon_added:
                    mutant_subsequence = wildtype_subsequence_with_proximal_variants[:
                                                                                     mutation_start_position] + mutant_amino_acid_with_proximal_variants
                else:
                    mutant_subsequence = wildtype_subsequence_with_proximal_variants[:mutation_start_position] + mutant_amino_acid_with_proximal_variants + wildtype_subsequence_with_proximal_variants[
                        mutation_end_position:]

            if '*' in wildtype_subsequence or '*' in mutant_subsequence:
                continue

            if 'X' in wildtype_subsequence or 'X' in mutant_subsequence:
                continue

            if 'U' in wildtype_subsequence or 'U' in mutant_subsequence:
                print(
                    "Warning. Sequence contains unsupported amino acid U. Skipping entry {}"
                    .format(line['index']))
                continue

            if mutant_subsequence in wildtype_subsequence:
                #This is not a novel peptide
                continue

            if len(wildtype_subsequence) < self.epitope_length or len(
                    mutant_subsequence) < self.epitope_length:
                continue

            variant_id = line['index']
            for designation, subsequence in zip(
                ['WT', 'MT'], [wildtype_subsequence, mutant_subsequence]):
                key = '%s.%s' % (designation, variant_id)
                fasta_sequences.setdefault(subsequence, []).append(key)

        writer = open(self.output_file, 'w')
        key_writer = open(self.output_key_file, 'w')
        count = 1
        for (subsequence, keys) in fasta_sequences.items():
            writer.writelines('>%s\n' % count)
            writer.writelines('%s\n' % subsequence)
            yaml.dump({count: keys}, key_writer, default_flow_style=False)
            count += 1

        reader.close()
        writer.close()
        key_writer.close()