def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
    """Render the transcript-change annotation for a classified variant.

    :param tx: transcript the variant is being rendered against
    :param variant_type: variant type, handed unchanged to the renderer
    :param vc: variant classification object for this variant
    :param start_genomic_space: variant start in genomic space (coerced with int())
    :param end_genomic_space: variant end in genomic space (coerced with int())
    :param ref_allele: reference allele, stranded via self._get_stranded_alleles
    :param alt_allele: alternate allele, stranded via self._get_stranded_alleles
    :return: rendered transcript change, or "" for intronic splice sites and
        for classifications whose CDS start is "" or negative
    """
    is_intronic_splice_site = (
        vc.get_vc() == VariantClassification.SPLICE_SITE
        and vc.get_secondary_vc() == VariantClassification.INTRON
    )
    if is_intronic_splice_site:
        # Nothing is rendered for this case; an earlier splice-site renderer
        # (render_splice_site_transcript_change) is no longer invoked here.
        return ""

    cds_start_in_exon_space = vc.get_cds_start_in_exon_space()
    if cds_start_in_exon_space == "" or cds_start_in_exon_space < 0:
        return ""

    exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        int(start_genomic_space), int(end_genomic_space), tx)

    # Negative-strand transcripts carry an extra +1 relative to the CDS start.
    strand_shift = 1 if tx.get_strand() == "-" else 0
    cds_offset = int(cds_start_in_exon_space)
    cds_start = exon_start - cds_offset + strand_shift
    cds_end = exon_end - cds_offset + strand_shift

    alt_stranded, ref_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    return TranscriptProviderUtils.render_transcript_change(
        variant_type, vc.get_vc(), cds_start, cds_end,
        ref_stranded, alt_stranded, vc.get_secondary_vc())
def test_seq(self, start, end, gt):
    """Check that the bases between two genomic positions on the test transcript match the expected sequence.

    :param start: genomic start position
    :param end: genomic end position
    :param gt: expected (ground truth) sequence
    """
    transcript = self.retrieve_test_transcript_MAPK1()
    first, last = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, transcript)

    # Both converted positions are included in the slice.
    observed = transcript.get_seq()[first:last + 1]

    failure_message = "Incorrect seq found guess,gt (%s, %s)" % (observed, gt)
    self.assertTrue(observed == gt, failure_message)
def test_codon_single_base(self, start, end, ref_base_stranded, gt_codon):
    """Check that the three codon bases are retrieved for an arbitrary single-base location.

    :param start: genomic start position
    :param end: genomic end position
    :param ref_base_stranded: supplied with the test data; not read by this test
    :param gt_codon: expected (ground truth) codon sequence
    """
    transcript = self.retrieve_test_transcript_MAPK1()

    # genomic space -> exon space -> protein positions -> codon bounds
    exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, transcript)
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(transcript)
    protein_start, protein_end = TranscriptProviderUtils.get_protein_positions(exon_start, exon_end, cds_start)
    codon_first, codon_last = TranscriptProviderUtils.get_cds_codon_positions(protein_start, protein_end, cds_start)

    observed_codon = transcript.get_seq()[codon_first:codon_last + 1]
    failure_message = "Did not get correct codon (%s): %s loc: %s-%s" % (gt_codon, observed_codon, start, end)
    self.assertTrue(observed_codon == gt_codon, failure_message)
def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
    """Test genomic --> exon transform on real data.

    Rebuilds the transcript indices from the MAPK1 test GTF/FASTA, converts
    the genomic interval ``loc`` to exon space on the first overlapping
    transcript, and checks both the interval length and the start position.

    :param loc: pair (start, end) of genomic positions
    :param gt_d: expected (ground truth) exon-space start position
    """
    gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
    gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
    base_output_filename = "out/test_variant_classification"

    # Remove any indices left over from a previous run before rebuilding them.
    for index_suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + index_suffix, ignore_errors=True)

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
    ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")
    tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

    start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])
    loc_length = int(loc[1]) - int(loc[0])
    self.assertTrue((end - start) == loc_length,
                    str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))
    # Report the expected value (gt_d) in the failure message, not the
    # converted end position.
    self.assertTrue(start == gt_d,
                    "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")"
                    + " exons: " + str(tx[0].get_exons()))
def test_querying_transcripts_by_region(self):
    """Test web api backend call /transcripts/....

    Retrieves the transcripts overlapping a region of contig 4 and, for each
    one, exercises the accessors needed to populate the transcript JSON and
    checks that the refseq cross-reference annotations are present.
    """
    datasource_list = DatasourceFactory.createDatasources(
        self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for ds in datasource_list:
        annotator.addDatasource(ds)
    txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
    self.assertTranscriptsFound(txs)

    ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
    # None of these values are validated; most locals below exist only to
    # exercise the accessor calls.
    for tx in txs:
        transcript_id = tx.get_transcript_id()
        tx_start = tx.determine_transcript_start()
        tx_end = tx.determine_transcript_stop()
        gene = tx.get_gene()
        contig = tx.get_contig()  # named to avoid shadowing the builtin chr()
        n_exons = len(tx.get_exons())
        strand = tx.get_strand()
        footprint_start, footprint_end = tx.determine_cds_footprint()
        klass = tx.get_gene_type()
        cds_start = tx.determine_cds_start()
        cds_end = tx.determine_cds_stop()
        gene_id = tx.get_gene_id()  # named to avoid shadowing the builtin id()
        genomic_coords = [[exon[0], exon[1]] for exon in tx.get_exons()]
        transcript_coords = [
            [TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)]
            for exon in tx.get_exons()
        ]
        code_len = int(cds_end) - int(cds_start) + 1

        # If refseq datasources are not available, this will fail.
        # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
        dummy_mut = annotator.annotate_transcript(tx)
        refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
        refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

        # Description is unavailable right now
        description = ""

        self.assertIsNotNone(refseq_mRNA_id)
        self.assertIsNotNone(refseq_prot_id)
        self.assertEqual(len(transcript_coords), n_exons)
def test_querying_transcripts_by_region(self):
    """Test web api backend call /transcripts/....

    Retrieves the transcripts overlapping a region of contig 4 and, for each
    one, exercises the accessors needed to populate the transcript JSON and
    checks that the refseq cross-reference annotations are present.
    """
    datasource_list = DatasourceFactory.createDatasources(self._determine_db_dir(), "hg19", isMulticore=False)
    annotator = Annotator()
    for ds in datasource_list:
        annotator.addDatasource(ds)
    txs = annotator.retrieve_transcripts_by_region("4", 50164411, 60164411)
    self.assertTranscriptsFound(txs)

    ## Here is an example of getting enough data to populate the json in doc/transcript_json_commented.json.txt
    # None of these values are validated; most locals below exist only to
    # exercise the accessor calls.
    for tx in txs:
        transcript_id = tx.get_transcript_id()
        tx_start = tx.determine_transcript_start()
        tx_end = tx.determine_transcript_stop()
        gene = tx.get_gene()
        contig = tx.get_contig()  # named to avoid shadowing the builtin chr()
        n_exons = len(tx.get_exons())
        strand = tx.get_strand()
        footprint_start, footprint_end = tx.determine_cds_footprint()
        klass = tx.get_gene_type()
        cds_start = tx.determine_cds_start()
        cds_end = tx.determine_cds_stop()
        gene_id = tx.get_gene_id()  # named to avoid shadowing the builtin id()
        genomic_coords = [[exon[0], exon[1]] for exon in tx.get_exons()]
        transcript_coords = [
            [TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0] + 1, exon[1], tx)]
            for exon in tx.get_exons()
        ]
        code_len = int(cds_end) - int(cds_start) + 1

        # If refseq datasources are not available, this will fail.
        # Step 2 annotate the transcript, which produces a dummy mutation with the refseq annotations.
        dummy_mut = annotator.annotate_transcript(tx)
        refseq_mRNA_id = dummy_mut["gencode_xref_refseq_mRNA_id"]
        refseq_prot_id = dummy_mut["gencode_xref_refseq_prot_acc"]

        # Description is unavailable right now
        description = ""

        self.assertIsNotNone(refseq_mRNA_id)
        self.assertIsNotNone(refseq_prot_id)
        self.assertEqual(len(transcript_coords), n_exons)
def generate_transcript_change_from_tx(self, tx, variant_type, vc, start_genomic_space, end_genomic_space, ref_allele, alt_allele):
    """Render the transcript-change annotation for a classified variant.

    :param tx: transcript the variant is being rendered against
    :param variant_type: variant type, handed unchanged to the renderer
    :param vc: variant classification object for this variant
    :param start_genomic_space: variant start in genomic space (coerced with int())
    :param end_genomic_space: variant end in genomic space (coerced with int())
    :param ref_allele: reference allele, stranded via self._get_stranded_alleles
    :param alt_allele: alternate allele, stranded via self._get_stranded_alleles
    :return: rendered transcript change, or "" for intronic splice sites and
        for classifications whose CDS start is "" or negative
    """
    is_intronic_splice_site = (
        vc.get_vc() == VariantClassification.SPLICE_SITE
        and vc.get_secondary_vc() == VariantClassification.INTRON
    )
    if is_intronic_splice_site:
        # Nothing is rendered for this case; an earlier splice-site renderer
        # (render_splice_site_transcript_change) is no longer invoked here.
        return ""

    cds_start_in_exon_space = vc.get_cds_start_in_exon_space()
    if cds_start_in_exon_space == "" or cds_start_in_exon_space < 0:
        return ""

    exon_start, exon_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        int(start_genomic_space), int(end_genomic_space), tx)

    # Negative-strand transcripts carry an extra +1 relative to the CDS start.
    strand_shift = 1 if tx.get_strand() == "-" else 0
    cds_offset = int(cds_start_in_exon_space)
    cds_start = exon_start - cds_offset + strand_shift
    cds_end = exon_end - cds_offset + strand_shift

    alt_stranded, ref_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    return TranscriptProviderUtils.render_transcript_change(
        variant_type, vc.get_vc(), cds_start, cds_end,
        ref_stranded, alt_stranded, vc.get_secondary_vc())
def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
    """Perform classifications.  Everything handled in genomic space.

    Branches, in the order they are tried:
        *RNA* / lincRNA      (gene type is not "protein_coding")
        Splice_Site (Intron)
        Intron
        x'Flank
        IGR
        Start_Codon_* / Stop_Codon_*  (variant types not ending in "NP")
        x'UTR, possibly De_novo_Start (via self._determine_de_novo)
        CDS overlap, delegated to self._determine_vc_for_cds_overlap
            (Splice_Site (Exon), {Missense, Silent}, {Nonsense, Silent},
             {Nonstop, Silent}, ...)

    :param tx: transcript to classify against
    :param ref_allele: reference allele; "-" is treated as the empty string
    :param alt_allele: alternate allele; "-" is treated as the empty string
    :param start: genomic start position (coerced with int() where needed)
    :param end: genomic end position (coerced with int() where needed)
    :param variant_type: variant type string
    :param dist: passed to self._determine_if_splice_site_overlap
    :return: a VariantClassification instance
    :raises ValueError: if none of the branches applies
    """
    gene_type = tx.get_gene_type()
    if gene_type != "protein_coding":
        if gene_type == VariantClassification.LINCRNA:
            return VariantClassification(VariantClassification.LINCRNA, variant_type, tx.get_transcript_id())
        return VariantClassification(VariantClassification.RNA, variant_type, tx.get_transcript_id())

    if ref_allele == "-":
        ref_allele = ""
    if alt_allele == "-":
        alt_allele = ""

    s = int(start)
    e = int(end)
    is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(s, e, tx, variant_type)
    is_splice_site_tuple = self._determine_if_splice_site_overlap(s, e, tx, variant_type, dist)
    is_splice_site = is_splice_site_tuple[0]

    is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(start, end, tx, variant_type)

    if not is_exon_overlap and not is_beyond_exons:
        exon_i = TranscriptProviderUtils.determine_closest_exon(tx, s, e)
        if is_splice_site:
            # Intron Splice Site
            return VariantClassification(VariantClassification.SPLICE_SITE, variant_type, tx.get_transcript_id(),
                                         vc_secondary=VariantClassification.INTRON, exon_i=exon_i)
        return VariantClassification(VariantClassification.INTRON, variant_type, tx.get_transcript_id(),
                                     exon_i=exon_i)

    if not is_exon_overlap and is_beyond_exons:
        if is_flank:
            # Flanks
            if side.startswith("3"):
                return VariantClassification(VariantClassification.THREE_PRIME_PRIME_FLANK, variant_type,
                                             transcript_id=tx.get_transcript_id())
            return VariantClassification(VariantClassification.FIVE_PRIME_PRIME_FLANK, variant_type,
                                         transcript_id=tx.get_transcript_id())
        # IGR
        return VariantClassification(VariantClassification.IGR, variant_type)

    is_start_codon_overlap = self._determine_codon_overlap(s, e, tx.get_start_codon(), variant_type)
    is_stop_codon_overlap = self._determine_codon_overlap(s, e, tx.get_stop_codon(), variant_type)

    # Variant types ending in "NP" fall through to the CDS handling below
    # instead of getting a Start_Codon_*/Stop_Codon_* classification.
    if is_start_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Start_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())
    if is_stop_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Stop_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())

    is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
    if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
        # UTR
        if side.startswith("3"):
            vc_tmp = VariantClassification.THREE_PRIME_UTR
        else:
            vc_tmp = VariantClassification.FIVE_PRIME_UTR
        transcript_position_exon_space_start, transcript_position_exon_space_end = \
            TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
        vc = self._determine_de_novo(vc_tmp, transcript_position_exon_space_start, ref_allele, alt_allele, tx,
                                     variant_type)
        return VariantClassification(vc, variant_type, transcript_id=tx.get_transcript_id())

    # We have a clean overlap in the CDS.  Includes start codon or stop codon.
    if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
        is_frameshift_indel = self.is_frameshift_indel(variant_type, s, e, alt_allele)
        return self._determine_vc_for_cds_overlap(start, end, ref_allele, alt_allele, is_frameshift_indel,
                                                  is_splice_site, tx, variant_type, is_start_codon_overlap)

    # Uses get_transcript_id (the accessor used everywhere else in this
    # method) so that the intended ValueError is what gets raised.
    raise ValueError("Could not determine variant classification: " + tx.get_transcript_id() + " " +
                     str([ref_allele, alt_allele, start, end]))
def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
    """Build the VariantClassification for a variant overlapping the CDS.

    Note: This method can also handle start and stop codons.

    :param start: genomic start position
    :param end: genomic end position
    :param ref_allele: reference allele (unstranded)
    :param alt_allele: alternate allele (unstranded)
    :param is_frameshift_indel: forwarded to self.infer_variant_classification
    :param is_splice_site: forwarded to self.infer_variant_classification
    :param tx: transcript being classified against
    :param variant_type: variant type string
    :param is_start_codon: forwarded to self.infer_variant_classification
    :return: VariantClassification populated with codon, amino-acid and
        protein-position details
    """
    observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        start, end, tx)

    # NOTE(review): positive-strand non-insertions are shifted down by one
    # here; presumably an indexing adjustment -- confirm against
    # convert_genomic_space_to_exon_space.
    if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
        transcript_position_start -= 1
        transcript_position_end -= 1

    transcript_seq = tx.get_seq()
    protein_seq = tx.get_protein_seq()
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
        transcript_position_start, transcript_position_end, cds_start)

    # If the transcript sequence disagrees with the supplied reference allele
    # (insertions excluded), splice the supplied allele into a copy of the
    # transcript and remember that the reference had to be altered.
    new_ref_transcript_seq = transcript_seq
    if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded) \
            and variant_type != VariantClassification.VT_INS:
        new_ref_transcript_seq = list(transcript_seq)
        new_ref_transcript_seq[transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
        new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
        ref_tx_seq_has_been_changed = True
    else:
        ref_tx_seq_has_been_changed = False

    cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
        protein_position_start, protein_position_end, cds_start)

    if variant_type == "DEL":
        reference_codon_seq = new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower()
    else:
        reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(), cds_codon_start,
            transcript_position_start, transcript_position_end, reference_allele_stranded, variant_type)

    # Negative-strand insertions are mutated relative to one position before
    # the codon start.
    if variant_type == "INS" and tx.get_strand() == "-":
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start - 1, transcript_position_start,
            transcript_position_end, observed_allele_stranded, variant_type)
    else:
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start, transcript_position_start,
            transcript_position_end, observed_allele_stranded, variant_type)

    observed_aa = Bio.Seq.translate(mutated_codon_seq)
    if ref_tx_seq_has_been_changed:
        # The stored protein sequence no longer reflects the altered reference.
        reference_aa = Bio.Seq.translate(reference_codon_seq)
    else:
        reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

    if variant_type != VariantClassification.VT_SNP:
        try:
            reference_aa, observed_aa, protein_position_start, protein_position_end = \
                self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                                                          protein_position_end, reference_aa, observed_aa)
        except InvalidVariantException as ive:
            # Best effort: report the problem and continue with the
            # unadjusted values.
            log = logging.getLogger(__name__)
            log.error("Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                      tx.get_contig(), start, end, ref_allele, alt_allele, variant_type)
            log.error(str(ive))
            log.warning("Above error may not have exact start and end positions if this is a VCF input.")
            log.warning("Variant type is likely incorrect. This can happen with some GATK VCFs")
            log.warning(TranscriptProviderUtils.is_valid_xNP(variant_type, ref_allele, alt_allele))
            log.warning("The protein_change annotation may not be properly rendered.")

    vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
        variant_type, reference_aa, observed_aa, ref_allele, alt_allele,
        is_frameshift_indel=is_frameshift_indel, is_splice_site=is_splice_site, is_start_codon=is_start_codon)

    cds_start_exon_space, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    exon_i = TranscriptProviderUtils.determine_exon_index(int(start), int(end), tx, variant_type)
    final_vc = VariantClassification(vc_tmp, variant_type, transcript_id=tx.get_transcript_id(),
                                     alt_codon=mutated_codon_seq, ref_codon=reference_codon_seq,
                                     ref_aa=reference_aa, ref_protein_start=protein_position_start,
                                     ref_protein_end=protein_position_end, alt_aa=observed_aa,
                                     alt_codon_start_in_exon=cds_codon_start,
                                     alt_codon_end_in_exon=cds_codon_end,
                                     ref_codon_start_in_exon=cds_codon_start,
                                     ref_codon_end_in_exon=cds_codon_end,
                                     cds_start_in_exon_space=cds_start_exon_space,
                                     ref_allele_stranded=reference_allele_stranded,
                                     alt_allele_stranded=observed_allele_stranded,
                                     exon_i=exon_i, vc_secondary=vc_tmp_secondary)
    return final_vc
def variant_classify(self, tx, ref_allele, alt_allele, start, end, variant_type, dist=2):
    """Perform classifications.  Everything handled in genomic space.

    Branches, in the order they are tried:
        *RNA* / lincRNA      (gene type is not "protein_coding")
        Splice_Site (Intron)
        Intron
        x'Flank
        IGR
        Start_Codon_* / Stop_Codon_*  (variant types not ending in "NP")
        x'UTR, possibly De_novo_Start (via self._determine_de_novo)
        CDS overlap, delegated to self._determine_vc_for_cds_overlap
            (Splice_Site (Exon), {Missense, Silent}, {Nonsense, Silent},
             {Nonstop, Silent}, ...)

    :param tx: transcript to classify against
    :param ref_allele: reference allele; "-" is treated as the empty string
    :param alt_allele: alternate allele; "-" is treated as the empty string
    :param start: genomic start position (coerced with int() where needed)
    :param end: genomic end position (coerced with int() where needed)
    :param variant_type: variant type string
    :param dist: passed to self._determine_if_splice_site_overlap
    :return: a VariantClassification instance
    :raises ValueError: if none of the branches applies
    """
    gene_type = tx.get_gene_type()
    if gene_type != "protein_coding":
        if gene_type == VariantClassification.LINCRNA:
            return VariantClassification(VariantClassification.LINCRNA, variant_type, tx.get_transcript_id())
        return VariantClassification(VariantClassification.RNA, variant_type, tx.get_transcript_id())

    if ref_allele == "-":
        ref_allele = ""
    if alt_allele == "-":
        alt_allele = ""

    s = int(start)
    e = int(end)
    is_exon_overlap = TranscriptProviderUtils.determine_if_exon_overlap(s, e, tx, variant_type)
    is_splice_site_tuple = self._determine_if_splice_site_overlap(s, e, tx, variant_type, dist)
    is_splice_site = is_splice_site_tuple[0]

    is_beyond_exons, side, is_flank = self._determine_beyond_exon_info_vt(start, end, tx, variant_type)

    if not is_exon_overlap and not is_beyond_exons:
        exon_i = TranscriptProviderUtils.determine_closest_exon(tx, s, e)
        if is_splice_site:
            # Intron Splice Site
            return VariantClassification(VariantClassification.SPLICE_SITE, variant_type, tx.get_transcript_id(),
                                         vc_secondary=VariantClassification.INTRON, exon_i=exon_i)
        return VariantClassification(VariantClassification.INTRON, variant_type, tx.get_transcript_id(),
                                     exon_i=exon_i)

    if not is_exon_overlap and is_beyond_exons:
        if is_flank:
            # Flanks
            if side.startswith("3"):
                return VariantClassification(VariantClassification.THREE_PRIME_PRIME_FLANK, variant_type,
                                             transcript_id=tx.get_transcript_id())
            return VariantClassification(VariantClassification.FIVE_PRIME_PRIME_FLANK, variant_type,
                                         transcript_id=tx.get_transcript_id())
        # IGR
        return VariantClassification(VariantClassification.IGR, variant_type)

    is_start_codon_overlap = self._determine_codon_overlap(s, e, tx.get_start_codon(), variant_type)
    is_stop_codon_overlap = self._determine_codon_overlap(s, e, tx.get_stop_codon(), variant_type)

    # Variant types ending in "NP" fall through to the CDS handling below
    # instead of getting a Start_Codon_*/Stop_Codon_* classification.
    if is_start_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Start_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())
    if is_stop_codon_overlap and not variant_type.endswith("NP"):
        return VariantClassification('Stop_Codon_' + variant_type.capitalize(), variant_type,
                                     transcript_id=tx.get_transcript_id())

    is_cds_overlap = self._determine_if_cds_overlap(s, e, tx, variant_type)
    if is_exon_overlap and not is_cds_overlap and not is_start_codon_overlap and not is_stop_codon_overlap:
        # UTR
        if side.startswith("3"):
            vc_tmp = VariantClassification.THREE_PRIME_UTR
        else:
            vc_tmp = VariantClassification.FIVE_PRIME_UTR
        transcript_position_exon_space_start, transcript_position_exon_space_end = \
            TranscriptProviderUtils.convert_genomic_space_to_exon_space(start, end, tx)
        vc = self._determine_de_novo(vc_tmp, transcript_position_exon_space_start, ref_allele, alt_allele, tx,
                                     variant_type)
        return VariantClassification(vc, variant_type, transcript_id=tx.get_transcript_id())

    # We have a clean overlap in the CDS.  Includes start codon or stop codon.
    if is_cds_overlap or is_stop_codon_overlap or is_start_codon_overlap:
        is_frameshift_indel = self.is_frameshift_indel(variant_type, s, e, alt_allele)
        return self._determine_vc_for_cds_overlap(start, end, ref_allele, alt_allele, is_frameshift_indel,
                                                  is_splice_site, tx, variant_type, is_start_codon_overlap)

    # Uses get_transcript_id (the accessor used everywhere else in this
    # method) so that the intended ValueError is what gets raised.
    raise ValueError("Could not determine variant classification: " + tx.get_transcript_id() + " " +
                     str([ref_allele, alt_allele, start, end]))
def _determine_vc_for_cds_overlap(self, start, end, ref_allele, alt_allele, is_frameshift_indel, is_splice_site, tx, variant_type, is_start_codon):
    """Build the VariantClassification for a variant overlapping the CDS.

    Note: This method can also handle start and stop codons.

    :param start: genomic start position
    :param end: genomic end position
    :param ref_allele: reference allele (unstranded)
    :param alt_allele: alternate allele (unstranded)
    :param is_frameshift_indel: forwarded to self.infer_variant_classification
    :param is_splice_site: forwarded to self.infer_variant_classification
    :param tx: transcript being classified against
    :param variant_type: variant type string
    :param is_start_codon: forwarded to self.infer_variant_classification
    :return: VariantClassification populated with codon, amino-acid and
        protein-position details
    """
    observed_allele_stranded, reference_allele_stranded = self._get_stranded_alleles(ref_allele, alt_allele, tx)
    transcript_position_start, transcript_position_end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(
        start, end, tx)

    # NOTE(review): positive-strand non-insertions are shifted down by one
    # here; presumably an indexing adjustment -- confirm against
    # convert_genomic_space_to_exon_space.
    if tx.get_strand() == "+" and not variant_type == VariantClassification.VT_INS:
        transcript_position_start -= 1
        transcript_position_end -= 1

    transcript_seq = tx.get_seq()
    protein_seq = tx.get_protein_seq()
    cds_start, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    protein_position_start, protein_position_end = TranscriptProviderUtils.get_protein_positions(
        transcript_position_start, transcript_position_end, cds_start)

    # If the transcript sequence disagrees with the supplied reference allele
    # (insertions excluded), splice the supplied allele into a copy of the
    # transcript and remember that the reference had to be altered.
    new_ref_transcript_seq = transcript_seq
    if (transcript_seq[transcript_position_start:transcript_position_end + 1] != reference_allele_stranded) \
            and variant_type != VariantClassification.VT_INS:
        new_ref_transcript_seq = list(transcript_seq)
        new_ref_transcript_seq[transcript_position_start:transcript_position_end + 1] = reference_allele_stranded
        new_ref_transcript_seq = ''.join(new_ref_transcript_seq)
        ref_tx_seq_has_been_changed = True
    else:
        ref_tx_seq_has_been_changed = False

    cds_codon_start, cds_codon_end = TranscriptProviderUtils.get_cds_codon_positions(
        protein_position_start, protein_position_end, cds_start)

    if variant_type == "DEL":
        reference_codon_seq = new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower()
    else:
        reference_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            new_ref_transcript_seq[cds_codon_start:cds_codon_end + 1].lower(), cds_codon_start,
            transcript_position_start, transcript_position_end, reference_allele_stranded, variant_type)

    # Negative-strand insertions are mutated relative to one position before
    # the codon start.
    if variant_type == "INS" and tx.get_strand() == "-":
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start - 1, transcript_position_start,
            transcript_position_end, observed_allele_stranded, variant_type)
    else:
        mutated_codon_seq = TranscriptProviderUtils.mutate_reference_sequence(
            reference_codon_seq.lower(), cds_codon_start, transcript_position_start,
            transcript_position_end, observed_allele_stranded, variant_type)

    observed_aa = MutUtils.translate_sequence(mutated_codon_seq)
    if ref_tx_seq_has_been_changed:
        # The stored protein sequence no longer reflects the altered reference.
        reference_aa = MutUtils.translate_sequence(reference_codon_seq)
    else:
        reference_aa = protein_seq[protein_position_start - 1:protein_position_end]

    if variant_type != VariantClassification.VT_SNP:
        try:
            reference_aa, observed_aa, protein_position_start, protein_position_end = \
                self._adjust_protein_position_and_alleles(protein_seq, protein_position_start,
                                                          protein_position_end, reference_aa, observed_aa)
        except InvalidVariantException as ive:
            # Best effort: report the problem and continue with the
            # unadjusted values.
            log = logging.getLogger(__name__)
            log.error("Could not properly adjust protein position for variant: %s, %s, %s, %s, %s VT: %s",
                      tx.get_contig(), start, end, ref_allele, alt_allele, variant_type)
            log.error(str(ive))
            log.warning("Above error may not have exact start and end positions if this is a VCF input.")
            log.warning("Variant type is likely incorrect. This can happen with some GATK VCFs")
            log.warning(TranscriptProviderUtils.is_valid_xNP(variant_type, ref_allele, alt_allele))
            log.warning("The protein_change annotation may not be properly rendered.")

    vc_tmp, vc_tmp_secondary = self.infer_variant_classification(
        variant_type, reference_aa, observed_aa, ref_allele, alt_allele,
        is_frameshift_indel=is_frameshift_indel, is_splice_site=is_splice_site, is_start_codon=is_start_codon)

    cds_start_exon_space, _ = TranscriptProviderUtils.determine_cds_in_exon_space(tx)
    exon_i = TranscriptProviderUtils.determine_exon_index(int(start), int(end), tx, variant_type)
    final_vc = VariantClassification(vc_tmp, variant_type, transcript_id=tx.get_transcript_id(),
                                     alt_codon=mutated_codon_seq, ref_codon=reference_codon_seq,
                                     ref_aa=reference_aa, ref_protein_start=protein_position_start,
                                     ref_protein_end=protein_position_end, alt_aa=observed_aa,
                                     alt_codon_start_in_exon=cds_codon_start,
                                     alt_codon_end_in_exon=cds_codon_end,
                                     ref_codon_start_in_exon=cds_codon_start,
                                     ref_codon_end_in_exon=cds_codon_end,
                                     cds_start_in_exon_space=cds_start_exon_space,
                                     ref_allele_stranded=reference_allele_stranded,
                                     alt_allele_stranded=observed_allele_stranded,
                                     exon_i=exon_i, vc_secondary=vc_tmp_secondary)
    return final_vc