def test_del_annotation():
    """Deleting two whole codons is annotated as an in-frame deletion."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Drop the six bases "GCTATT" (the codons for A and I) starting at index 3.
    result = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCTATT", "", padding=8)
    print(result)

    assert result.annot == "AI2del", result
def test_mutate_protein_from_transcript_snp():
    """A single-base substitution in the first codon yields a T->N change."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Reference answer: the same transcript with index 1 changed from C to A.
    expected_mutant = Seq("AATGCTATTCGTAGT").translate()

    result = mutate.mutate_protein_from_transcript(
        transcript, 1, "C", "A", padding=8)
    print(result)

    assert result.seq[0] == "N"
    assert str(expected_mutant) == str(result.seq)
    assert len(result.seq) == 5
    assert result.annot == "T1N"
def test_stop_codon_annotation():
    """Introducing a stop codon truncates the peptide and is annotated 'A2*'."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Sanity check on the hand-built mutant: TAG at bases 4-6 is a stop codon.
    with_stop = Seq("ACTTAGCCCATTCGTAGT")
    assert str(with_stop.translate()) == str(Seq("T*PIRS"))

    # Replace the second codon (GCT) with a stop codon TAG followed by CCC.
    result = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCT", "TAGCCC", padding=8)
    print(result)

    assert result.seq == "T", result.seq
    assert result.annot == "A2*", result
def test_mutate_protein_prefix_stop_codon():
    """A SNP after a leading stop codon: the result omits the leading '*'."""
    transcript = Seq("TAGGCTATTCGTAGT")
    expected_protein = Seq("*AIRS")

    # Sanity check: the transcript really does start with a stop codon.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Reference answer: index 4 changed from C to A (codon GCT -> GAT).
    expected_mutant = Seq("TAGGATATTCGTAGT").translate()
    print(expected_mutant)

    result = mutate.mutate_protein_from_transcript(
        transcript, 4, "C", "A", padding=8)
    print(result)

    assert result.seq[0] == "D"
    assert result.seq[1] == "I"
    # The returned peptide matches the full translation minus its first symbol.
    assert str(result.seq) == str(expected_mutant[1:])
    assert result.annot == "A2D"
def test_mutate_protein_from_transcript_indel():
    """Replacing one base with two shifts the frame and is annotated 'T1fs'."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Reference answer: index 1 changed from "C" to "AA" (one extra base).
    expected_mutant = Seq("AAATGCTATTCGTAGT").translate()
    print(expected_mutant)

    result = mutate.mutate_protein_from_transcript(
        transcript, 1, "C", "AA", padding=8)
    print(result)

    assert result.seq[0] == "K"
    assert result.seq[1] == "C"
    assert result.seq[2] == "Y"
    # The returned peptide is the reference translation without its last symbol.
    assert str(result.seq) == str(expected_mutant[:-1])
    assert result.annot == "T1fs"
def test_stop_codon_after_subst_annotation():
    """A substitution immediately followed by a stop codon gives 'A2P*'."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Sanity check on the hand-built mutant: CCC (P) then the stop codon TAG.
    with_stop = Seq("ACTCCCTAGATTCGTAGT")
    assert str(with_stop.translate()) == str(Seq("TP*IRS"))

    # Replace the second codon (GCT) with CCC followed by the stop codon TAG.
    result = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCT", "CCCTAG", padding=8)
    print(result)

    assert result.n_removed == 1
    assert result.n_inserted == 1
    assert result.seq == "TP", result.seq
    assert result.annot == "A2P*", result
def test_mutate_protein_from_transcript_snp_coordinates():
    """A first-codon SNP reports mutation_start 0 with one residue swapped."""
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the untouched transcript translates to the expected protein.
    translated = str(transcript.translate())
    assert translated == str(expected_protein)

    # Reference answer: the same transcript with index 1 changed from C to A.
    expected_mutant = Seq("AATGCTATTCGTAGT").translate()

    result = mutate.mutate_protein_from_transcript(
        transcript, 1, "C", "A", padding=8)
    print(result)

    # Peptide content.
    assert result.seq[0] == "N"
    assert str(expected_mutant) == str(result.seq)
    assert len(result.seq) == 5

    # Coordinates of the change within the returned peptide.
    assert result.mutation_start == 0
    assert result.n_removed == 1
    assert result.n_inserted == 1
    assert result.annot == "T1N"
def test_get_transcript_and_mutate_vcf():
    """Annotate a one-variant VCF, locate it on a transcript, and mutate it."""
    variant = {
        'chr': '10',
        'pos': 43617416,
        'ref': 'T',
        'alt': 'C',
    }
    position = variant['pos']
    ref_base = variant['ref']

    # Annotating a single-row VCF should report both expected transcripts.
    vcf = pd.DataFrame.from_records([variant])
    annotated = ensembl.annotate_vcf_transcripts(vcf)
    transcript_ids = set(annotated['stable_id_transcript'])
    transcript_id = "ENST00000355710"
    assert transcript_id in transcript_ids
    assert "ENST00000340058" in transcript_ids

    # Index into the cDNA (untranslated regions included).
    cdna_idx = ensembl.get_transcript_index_from_pos(
        position, transcript_id, skip_untranslated_region=False)
    assert cdna_idx is not None
    # NOTE(review): 5569 is presumably the cDNA length of this transcript;
    # confirm against the reference data.
    assert cdna_idx < 5569
    cdna_transcript = ref_data.get_cdna(transcript_id)
    assert cdna_transcript[cdna_idx] == ref_base

    # Index into the coding sequence only (untranslated regions skipped).
    cds_idx = ensembl.get_transcript_index_from_pos(
        position, transcript_id, skip_untranslated_region=True)
    assert cds_idx is not None
    cds_transcript = ref_data.get_cds(transcript_id)
    assert cds_transcript[cds_idx] == ref_base

    # Apply the variant to the coding sequence and check the padded peptide.
    region = mutate_protein_from_transcript(
        cds_transcript, cds_idx, ref_base, variant['alt'], padding=10)
    assert region is not None
    assert len(region.seq) == 21, (region.seq, len(region.seq))
    assert region.seq == 'RSQGRIPVKWTAIESLFDHIY'
def peptide_from_transcript_variant(transcript_id, pos, ref, alt,
                                    padding=None, max_length=None):
    """
    Apply a nucleotide variant to a transcript's coding sequence and return
    the resulting peptide region.

    Args:
        transcript_id: ID used to look up the strand and the coding sequence.
        pos: Position of the variant, converted to a transcript index via
            ``annotation.get_transcript_index_from_pos``.
        ref: Reference bases reported by the VCF/MAF ("." means empty).
        alt: Alternate bases reported by the VCF/MAF ("." means empty).
        padding: Passed through to ``mutate_protein_from_transcript``.
        max_length: If truthy, the peptide is truncated to this many
            characters and ``stop`` is clipped to it.

    Returns:
        On success, a tuple ``(seq, start, stop, annot)`` where ``start`` is
        the region's ``mutation_start`` and ``stop`` is
        ``start + n_inserted`` (clipped to ``max_length`` on truncation).
        On failure, ``(None, -1, -1, message)``; the message is also logged
        as a warning.

    NOTE(review): a second definition of this function appears later in this
    file and would shadow this one -- confirm and remove the duplicate.
    """
    # sometimes empty strings get represented with a '.'
    if ref == ".":
        ref = ""
    if alt == ".":
        alt = ""

    # Reverse-strand transcripts need the alleles expressed on their strand.
    forward = annotation.is_forward_strand(transcript_id)
    if not forward:
        ref = annotation.reverse_complement(ref)
        alt = annotation.reverse_complement(alt)
    transcript = _ensembl.get_cds(transcript_id)

    def error_result(msg, *args):
        # Log the failure and return it in the same 4-tuple shape as success.
        logging.warning(msg, *args)
        return None, -1, -1, msg % args

    if not transcript:
        return error_result(
            "Couldn't find transcript for ID %s", transcript_id)

    idx = annotation.get_transcript_index_from_pos(
        pos, transcript_id, skip_untranslated_region=True)
    if idx is None:
        return error_result(
            "Couldn't translate gene position %s into transcript index for %s",
            pos, transcript_id)
    if idx >= len(transcript):
        return error_result(
            "Index %d longer than sequence (len %d) for transcript %s (%s)",
            idx, len(transcript), transcript_id,
            gene_mutation_description(pos, ref, alt))

    # On the reverse strand shift the index back by len(ref) - 1 so the slice
    # below covers the reverse-complemented reference bases.
    if not forward:
        idx = idx - len(ref) + 1

    # 'ref' represents what the VCF file thought were the reference bases
    # at this position, now we actually check to make sure the transcript
    # agrees
    transcript_ref = str(transcript[idx:idx + len(ref)])
    if transcript_ref != ref:
        # Pass the format string and its arguments separately: pre-formatting
        # here would make error_result apply '%' a second time, which raises
        # if any interpolated value happens to contain a '%' character.
        return error_result(
            "VCF/MAF expected %s at idx %d of transcript %s, found %s (%s)",
            ref, idx, transcript_id, transcript_ref,
            gene_mutation_description(pos, ref, alt))

    region = mutate_protein_from_transcript(
        transcript, idx, ref, alt, padding=padding)
    start = region.mutation_start
    stop = start + region.n_inserted
    if max_length and len(region.seq) > max_length:
        seq = region.seq[:max_length]
        stop = min(stop, max_length)
    else:
        seq = region.seq
    return seq, start, stop, region.annot
def peptide_from_transcript_variant(
        transcript_id,
        pos,
        ref,
        alt,
        padding=None,
        max_length=None):
    """
    Apply a nucleotide variant to a transcript's coding sequence and return
    the resulting peptide region.

    Args:
        transcript_id: ID used to look up the strand and the coding sequence.
        pos: Position of the variant, converted to a transcript index via
            ``annotation.get_transcript_index_from_pos``.
        ref: Reference bases reported by the VCF/MAF ("." means empty).
        alt: Alternate bases reported by the VCF/MAF ("." means empty).
        padding: Passed through to ``mutate_protein_from_transcript``.
        max_length: If truthy, the peptide is truncated to this many
            characters and ``stop`` is clipped to it.

    Returns:
        On success, a tuple ``(seq, start, stop, annot)`` where ``start`` is
        the region's ``mutation_start`` and ``stop`` is
        ``start + n_inserted`` (clipped to ``max_length`` on truncation).
        On failure, ``(None, -1, -1, message)``; the message is also logged
        as a warning.

    NOTE(review): an earlier definition of this function appears in this
    file and is shadowed by this one -- confirm and remove the duplicate.
    """
    # sometimes empty strings get represented with a '.'
    if ref == ".":
        ref = ""
    if alt == ".":
        alt = ""

    # Reverse-strand transcripts need the alleles expressed on their strand.
    forward = annotation.is_forward_strand(transcript_id)
    if not forward:
        ref = annotation.reverse_complement(ref)
        alt = annotation.reverse_complement(alt)
    transcript = _ensembl.get_cds(transcript_id)

    def error_result(msg, *args):
        # Log the failure and return it in the same 4-tuple shape as success.
        logging.warning(msg, *args)
        return None, -1, -1, msg % args

    if not transcript:
        return error_result(
            "Couldn't find transcript for ID %s", transcript_id)

    idx = annotation.get_transcript_index_from_pos(
        pos, transcript_id, skip_untranslated_region=True)
    if idx is None:
        return error_result(
            "Couldn't translate gene position %s into transcript index for %s",
            pos, transcript_id)
    if idx >= len(transcript):
        return error_result(
            "Index %d longer than sequence (len %d) for transcript %s (%s)",
            idx, len(transcript), transcript_id,
            gene_mutation_description(pos, ref, alt))

    # On the reverse strand shift the index back by len(ref) - 1 so the slice
    # below covers the reverse-complemented reference bases.
    if not forward:
        idx = idx - len(ref) + 1

    # 'ref' represents what the VCF file thought were the reference bases
    # at this position, now we actually check to make sure the transcript
    # agrees
    transcript_ref = str(transcript[idx:idx + len(ref)])
    if transcript_ref != ref:
        # Pass the format string and its arguments separately: pre-formatting
        # here would make error_result apply '%' a second time, which raises
        # if any interpolated value happens to contain a '%' character.
        return error_result(
            "VCF/MAF expected %s at idx %d of transcript %s, found %s (%s)",
            ref, idx, transcript_id, transcript_ref,
            gene_mutation_description(pos, ref, alt))

    region = mutate_protein_from_transcript(
        transcript, idx, ref, alt, padding=padding)
    start = region.mutation_start
    stop = start + region.n_inserted
    if max_length and len(region.seq) > max_length:
        seq = region.seq[:max_length]
        stop = min(stop, max_length)
    else:
        seq = region.seq
    return seq, start, stop, region.annot