Esempio n. 1
0
    def add_proximal_variants(self, somatic_variant_index,
                              wildtype_subsequence, mutation_position,
                              original_position, germline_variants_only):
        """Return ``wildtype_subsequence`` with nearby variants substituted in.

        Looks up the proximal variants recorded in ``self.proximal_variants``
        for ``somatic_variant_index`` and, for each one that lands inside the
        subsequence, replaces the residue at that index with the proximal
        variant's mutant amino acid.

        Args:
            somatic_variant_index: Key into ``self.proximal_variants``.
            wildtype_subsequence: Sequence to patch; it is indexed and sliced.
            mutation_position: Index of the main mutation in the subsequence.
            original_position: Position of the main mutation that
                ``mutation_position`` corresponds to; the difference between
                the two is used to translate proximal positions.
            germline_variants_only: When true, only entries whose ``'type'``
                is ``'germline'`` are considered.

        Returns:
            The patched subsequence, or the input unchanged when nothing
            applies.

        Exits the process through ``sys.exit`` when the residue found in the
        subsequence differs from the proximal variant's wildtype amino acid.
        """
        if somatic_variant_index not in self.proximal_variants:
            return wildtype_subsequence

        # Shift between the proximal variants' positions and subsequence indices.
        offset = original_position - mutation_position
        patched = wildtype_subsequence
        nearby_variants = self.proximal_variants[somatic_variant_index]

        for protein_position, entries in nearby_variants.items():
            # NOTE(review): the key is cast with int() below while
            # original_position is used arithmetically; confirm that both
            # sides of this comparison have the same type.
            if protein_position == original_position:
                continue

            candidates = entries
            if germline_variants_only:
                candidates = [
                    entry for entry in entries if entry['type'] == 'germline'
                ]

            candidate_count = len(candidates)
            if candidate_count == 0:
                continue

            first_entry = candidates[0]
            if candidate_count == 1:
                expected_wildtype, replacement = first_entry[
                    'amino_acid_change'].split('/')
            else:
                # Several variants hit the same position: take the wildtype
                # residue from the first and merge all the codon changes.
                expected_wildtype = first_entry['amino_acid_change'].split(
                    '/')[0]
                replacement = ProximalVariant.combine_conflicting_variants(
                    [entry['codon_change'] for entry in candidates])

            index = int(protein_position) - 1 - offset
            # NOTE(review): index 0 is treated as out of range here; confirm
            # that skipping the first residue is intended.
            if not 0 < index < len(wildtype_subsequence):
                continue

            if len(expected_wildtype) != len(replacement):
                print("Nearby variant is not a missense mutation. Skipping.")
                continue

            actual_wildtype = wildtype_subsequence[index]
            if actual_wildtype != expected_wildtype:
                sys.exit("".join([
                    "Error when processing proximal variant.\n",
                    "The wildtype amino acid for variant %s with substring %s is different than expected.\n"
                    % (somatic_variant_index, wildtype_subsequence),
                    "Actual wildtype amino acid: %s\n" % actual_wildtype,
                    "Wildtype amino acid of the proximal_variant: %s" %
                    expected_wildtype,
                ]))

            patched = patched[:index] + replacement + patched[index + 1:]

        return patched
Esempio n. 2
0
    def execute(self):
        """Build the wildtype/mutant peptide FASTA file and its key file.

        Reads ``self.input_file`` as a tab-delimited file with a header row.
        For every row whose ``variant_type`` is ``FS``, ``missense``,
        ``inframe_ins`` or ``inframe_del`` it derives a wildtype and a mutant
        subsequence around the variant, applying the proximal variants known
        for the row's ``index``. Rows of any other type are skipped, as are
        rows whose subsequences contain ``*``, ``X`` or ``U``, whose mutant
        subsequence already occurs in the wildtype one, or whose subsequences
        are shorter than ``self.epitope_length``.

        Each distinct subsequence is written once to ``self.output_file``
        under a running number starting at 1; ``self.output_key_file``
        receives a YAML mapping from that number to the ``WT.<index>`` /
        ``MT.<index>`` keys that produced the subsequence.

        The output files are only opened once the whole input has been
        processed, and every file is closed even when processing aborts.

        Raises:
            Exception: For a frameshift whose downstream sequence does not
                start with the wildtype residue at the variant position.
            SystemExit: Via ``sys.exit`` when the annotated wildtype amino
                acid does not match the wildtype sequence.
        """
        peptide_sequence_length = self.peptide_sequence_length

        def split_amino_acid_change(amino_acid_change):
            # Split "WT/MT", cut each side at its first stop marker ('*' is
            # checked before 'X') and report whether the mutant side had one.
            wildtype, mutant = amino_acid_change.split('/')
            for stop_marker in ('*', 'X'):
                if stop_marker in wildtype:
                    wildtype = wildtype.split(stop_marker)[0]
                    break
            stop_added = False
            for stop_marker in ('*', 'X'):
                if stop_marker in mutant:
                    mutant = mutant.split(stop_marker)[0]
                    stop_added = True
                    break
            return wildtype, mutant, stop_added

        fasta_sequences = OrderedDict()
        with open(self.input_file, 'r') as reader:
            tsvin = csv.DictReader(reader, delimiter='\t')
            for line in tsvin:
                variant_type = line['variant_type']
                full_wildtype_sequence = line['wildtype_amino_acid_sequence']
                if variant_type == 'FS':
                    position = int(
                        line['protein_position'].split('-', 1)[0]) - 1
                    if line['amino_acid_change'] is not None and line[
                            'amino_acid_change'].split('/')[0] == '-':
                        if line['wildtype_amino_acid_sequence'][
                                position] != line[
                                    'downstream_amino_acid_sequence'][0]:
                            raise Exception(
                                (
                                    "Leading amino acid of the Downstream protein sequence ({}) expected to match the wildtype amino acid at postion {} ({}). "
                                    "You may need to reannotate your VCF with a newer version of VEP."
                                ).format(
                                    line['downstream_amino_acid_sequence'],
                                    position,
                                    line['wildtype_amino_acid_sequence']
                                    [position]))
                elif variant_type in ('missense', 'inframe_ins'):
                    if '/' not in line['amino_acid_change']:
                        continue
                    (wildtype_amino_acid, mutant_amino_acid,
                     stop_codon_added) = split_amino_acid_change(
                         line['amino_acid_change'])
                    if wildtype_amino_acid == '-':
                        # Nothing is replaced on the wildtype side, so the
                        # position is not shifted down by one here.
                        position = int(
                            line['protein_position'].split('-', 1)[0])
                        wildtype_amino_acid_length = 0
                    else:
                        # Taking the part before any '-' also covers a plain
                        # single position.
                        position = int(
                            line['protein_position'].split('-', 1)[0]) - 1
                        wildtype_amino_acid_length = len(wildtype_amino_acid)
                elif variant_type == 'inframe_del':
                    (wildtype_amino_acid, mutant_amino_acid,
                     stop_codon_added) = split_amino_acid_change(
                         line['amino_acid_change'])
                    position = int(
                        line['protein_position'].split('-', 1)[0]) - 1
                    wildtype_amino_acid_length = len(wildtype_amino_acid)
                    if mutant_amino_acid == '-':
                        mutant_amino_acid = ''
                else:
                    continue

                if self.position_out_of_bounds(position,
                                               full_wildtype_sequence):
                    continue

                if variant_type == 'missense' and line[
                        'index'] in self.proximal_variants and line[
                            'protein_position'] in self.proximal_variants[
                                line['index']]:
                    # Proximal variants share this variant's position: merge
                    # all codon changes to get the mutant residue.
                    codon_changes = [
                        item['codon_change']
                        for item in self.proximal_variants[line['index']][
                            line['protein_position']]
                    ]
                    codon_changes.append(line['codon_change'])
                    mutant_amino_acid_with_proximal_variants = ProximalVariant.combine_conflicting_variants(
                        codon_changes)
                elif variant_type != 'FS':
                    mutant_amino_acid_with_proximal_variants = mutant_amino_acid

                if variant_type == 'FS':
                    wildtype_subsequence, left_flanking_subsequence = self.get_frameshift_subsequences(
                        position, full_wildtype_sequence,
                        peptide_sequence_length, line)
                    downstream_sequence = line[
                        'downstream_amino_acid_sequence']
                    if self.downstream_sequence_length and len(
                            downstream_sequence
                    ) > self.downstream_sequence_length:
                        downstream_sequence = downstream_sequence[
                            0:self.downstream_sequence_length]
                    mutation_start_position = len(left_flanking_subsequence)
                    wildtype_subsequence = self.add_proximal_variants(
                        line['index'], wildtype_subsequence,
                        mutation_start_position, position, True)
                    left_flanking_subsequence_with_proximal_variants = self.add_proximal_variants(
                        line['index'], left_flanking_subsequence,
                        mutation_start_position, position, False)
                    #The caveat here is that if a nearby variant is in the downstream sequence, the protein sequence would be further altered, which we aren't taking into account.
                    #we would need to recalculate the downstream protein sequence taking all downstream variants into account.
                    mutant_subsequence = left_flanking_subsequence_with_proximal_variants + downstream_sequence
                else:
                    mutation_start_position, wildtype_subsequence = self.get_wildtype_subsequence(
                        position, full_wildtype_sequence,
                        wildtype_amino_acid_length, peptide_sequence_length,
                        line)
                    mutation_end_position = mutation_start_position + wildtype_amino_acid_length
                    if wildtype_amino_acid != '-' and wildtype_amino_acid != wildtype_subsequence[
                            mutation_start_position:mutation_end_position]:
                        if line['amino_acid_change'].split('/')[0].count(
                                '*') > 1:
                            print(
                                "Warning: Amino acid change is not sane - contains multiple stops. Skipping entry {}"
                                .format(line['index']))
                            continue
                        else:
                            sys.exit(
                                "ERROR: There was a mismatch between the actual wildtype amino acid sequence ({}) and the expected amino acid sequence ({}). Did you use the same reference build version for VEP that you used for creating the VCF?\n{}"
                                .format(
                                    wildtype_subsequence[
                                        mutation_start_position:
                                        mutation_end_position],
                                    wildtype_amino_acid, line))
                    # The mutant is built on top of all proximal variants
                    # (flag False); the wildtype only receives germline ones
                    # (flag True).
                    wildtype_subsequence_with_proximal_variants = self.add_proximal_variants(
                        line['index'], wildtype_subsequence,
                        mutation_start_position, position, False)
                    wildtype_subsequence = self.add_proximal_variants(
                        line['index'], wildtype_subsequence,
                        mutation_start_position, position, True)
                    if stop_codon_added:
                        # A stop was introduced: nothing follows the mutant
                        # residues.
                        mutant_subsequence = wildtype_subsequence_with_proximal_variants[:mutation_start_position] + mutant_amino_acid_with_proximal_variants
                    else:
                        mutant_subsequence = wildtype_subsequence_with_proximal_variants[:mutation_start_position] + mutant_amino_acid_with_proximal_variants + wildtype_subsequence_with_proximal_variants[
                            mutation_end_position:]

                if '*' in wildtype_subsequence or '*' in mutant_subsequence:
                    continue

                if 'X' in wildtype_subsequence or 'X' in mutant_subsequence:
                    continue

                if 'U' in wildtype_subsequence or 'U' in mutant_subsequence:
                    print(
                        "Warning. Sequence contains unsupported amino acid U. Skipping entry {}"
                        .format(line['index']))
                    continue

                if mutant_subsequence in wildtype_subsequence:
                    #This is not a novel peptide
                    continue

                if len(wildtype_subsequence) < self.epitope_length or len(
                        mutant_subsequence) < self.epitope_length:
                    continue

                variant_id = line['index']
                for designation, subsequence in zip(
                    ['WT', 'MT'], [wildtype_subsequence, mutant_subsequence]):
                    key = '%s.%s' % (designation, variant_id)
                    # Identical peptides are stored once and collect all keys.
                    fasta_sequences.setdefault(subsequence, []).append(key)

        with open(self.output_file, 'w') as writer, \
                open(self.output_key_file, 'w') as key_writer:
            for count, (subsequence, keys) in enumerate(
                    fasta_sequences.items(), start=1):
                writer.write('>%s\n' % count)
                writer.write('%s\n' % subsequence)
                yaml.dump({count: keys}, key_writer, default_flow_style=False)