def get_protein_seq(self, transcript_id: str):
    """
    Return the amino acid sequence for a transcript.

    The DNA sequence is obtained from ``self.get_seq`` and handed to
    ``translate`` with ``hg38=True``.

    :param transcript_id: identifier passed through to ``self.get_seq``
    :return: amino acid sequence produced by ``translate``
    """
    dna_seq = self.get_seq(transcript_id)
    return translate(dna_seq, hg38=True)
def _prepare_seq(cls, *args, **kwargs):
    """
    Build the DNA sequence via the parent class and translate it.

    All positional and keyword arguments are forwarded unchanged to the
    parent ``_prepare_seq``; its result is passed to ``translate`` with
    ``hg38=True``.

    :param args: positional arguments for the parent ``_prepare_seq``
        (the earlier docstring named them ``seqs``, ``intervals`` and
        ``reverse_complement`` -- confirm against the parent class)
    :param kwargs: keyword arguments for the parent ``_prepare_seq``
    :return: amino acid sequence produced by ``translate``
    """
    dna_seq = super()._prepare_seq(*args, **kwargs)
    return translate(dna_seq, hg38=True)
def test_ensembl_uniprot_seq(tse):
    """
    Compare translated transcript sequences against a reference file.

    The file at ``uniprot_seq_ref`` is read as FASTA-like text: a line
    containing ``>`` starts a record and supplies its id, every other
    line contributes sequence text to the current record. For each
    record the DNA from ``tse.get_seq`` is translated and must equal
    the stored reference sequence.

    :param tse: fixture exposing ``get_seq(transcript_id)``
    """
    chunks_by_id = {}
    # Read-only is enough; 'r+' would demand write permission on the
    # reference fixture for no reason.
    with open(uniprot_seq_ref, 'r') as fasta:
        key = ""
        for line in fasta:
            if '>' in line:
                key = line.replace('>', '').rstrip()
            else:
                # Collect every sequence line of the record so wrapped
                # sequences and trailing blank lines do not overwrite
                # what was read before.
                chunks_by_id.setdefault(key, []).append(line.rstrip())
    id_and_seq = {
        record_id: "".join(chunks)
        for record_id, chunks in chunks_by_id.items()
    }

    for transcript_id, ref_seq in tqdm(id_and_seq.items()):
        test_seq = translate(tse.get_seq(transcript_id), True)
        assert test_seq == ref_seq, test_seq
def test_vcf_single_variant_synonymous_mutations(tse, svp):
    """
    Check single-variant extraction for one transcript and in total.

    Every sequence yielded by ``svp.extract`` for ``ENST00000356175``
    must equal the translation of the transcript's reference DNA, and
    exactly 337 sequences are expected. Across ``svp.extract_all`` the
    summed number of yielded items must be 825.

    :param tse: fixture exposing ``get_seq(transcript_id)``
    :param svp: fixture exposing ``extract(transcript_id)`` and
        ``extract_all()``
    """
    transcript_id = 'ENST00000356175'
    expected = translate(tse.get_seq(transcript_id), True)

    variant_seqs = list(svp.extract(transcript_id))
    for variant_seq in variant_seqs:
        assert variant_seq == expected, variant_seq
    assert len(variant_seqs) == 337, \
        'Number of sequences != number of variants'

    # Materialize the outer iterable first, then count the items of
    # each inner iterable.
    per_transcript = list(svp.extract_all())
    total = sum(len(list(seqs)) for seqs in per_transcript)
    assert total == 825
def test_hg38(tse):
    """
    Translate every transcript and log disagreements with the proteome.

    The reference proteins are loaded with ``read_pep_fa(protein_file)``
    and indexed by the transcript id without its version suffix. The
    test asserts that ``tse`` holds more than 100 transcripts and that
    all of them occur in the reference index. Each transcript is then
    translated with ``hg38=True``; problems are written, one entry per
    line, to the file ``err_transcripts`` in the working directory:

    * transcripts whose DNA is exactly ``"NNN"`` are skipped with a note,
    * transcripts whose DNA length is not a multiple of 3 are skipped
      and only their id is logged,
    * for sequence mismatches the id is logged, followed by one line per
      differing position (``reference translated index/length``).

    Mismatches are only logged; they do not fail the test.

    :param tse: fixture exposing ``transcripts``, ``get_seq`` and
        ``__len__``
    """
    # Write-only: the log is never read back here.
    with open('err_transcripts', 'w') as f:
        dfp = read_pep_fa(protein_file)
        dfp['transcript_id'] = dfp.transcript.str.split(
            ".", n=1, expand=True)[0]
        dfp = dfp.set_index("transcript_id")

        assert len(tse) > 100
        # make sure all ids can be found in the proteome
        assert tse.transcripts.isin(dfp.index).all()

        for transcript_id in tqdm(tse.transcripts):
            dna_seq = tse.get_seq(transcript_id)
            if dna_seq == "NNN":
                f.write(transcript_id
                        + ' has an ambiguous start and end.Skip!\n')
                continue

            if len(dna_seq) % 3 != 0:
                # Not a whole number of codons: log the id and skip.
                f.write(transcript_id + '\n')
                continue

            prot_seq = translate(dna_seq, hg38=True)
            # Look the reference up once instead of once per residue.
            ref_seq = dfp.loc[transcript_id].seq
            if ref_seq != prot_seq:
                f.write("seq.mismatch: {}\n".format(transcript_id))
                # zip stops at the shorter sequence, so a shorter
                # reference no longer raises IndexError.
                for i, (a, b) in enumerate(zip(ref_seq, prot_seq)):
                    if a != b:
                        f.write("{} {} {}/{}\n".format(
                            a, b, i, len(prot_seq)))