コード例 #1
0
ファイル: protein.py プロジェクト: kipoi/kipoiseq
 def get_protein_seq(self, transcript_id: str):
     """
     Return the translated (amino acid) sequence for one transcript.

     The sequence is fetched with ``self.get_seq`` and then handed to
     ``translate`` with ``hg38=True``.

     :param transcript_id: identifier forwarded to ``self.get_seq``
     :return: whatever ``translate`` produces for the fetched sequence
     """
     dna_seq = self.get_seq(transcript_id)
     return translate(dna_seq, hg38=True)
コード例 #2
0
ファイル: protein.py プロジェクト: kipoi/kipoiseq
 def _prepare_seq(cls, *args, **kwargs):
     """
     Build the sequence via the parent class and translate it.

     All positional and keyword arguments are forwarded unchanged to the
     parent implementation of ``_prepare_seq``; its result is then passed
     to ``translate`` with ``hg38=True``.

     :param args: positional arguments for the parent ``_prepare_seq``
     :param kwargs: keyword arguments for the parent ``_prepare_seq``
     :return: amino acid sequence returned by ``translate``
     """
     prepared_seq = super()._prepare_seq(*args, **kwargs)
     return translate(prepared_seq, hg38=True)
コード例 #3
0
def test_ensembl_uniprot_seq(tse):
    """
    Check translated transcript sequences against a reference FASTA file.

    The file at ``uniprot_seq_ref`` is parsed into a mapping from record
    header (with every ``>`` removed and trailing whitespace stripped) to
    sequence. Each header is then used as a transcript id: the sequence from
    ``tse.get_seq`` is translated and must equal the reference sequence.

    :param tse: fixture exposing ``get_seq(transcript_id)``
    """
    id_and_seq = {}
    # Read-only mode: the reference file is never written, so 'r+' would only
    # add a needless write-permission requirement on the fixture.
    with open(uniprot_seq_ref, 'r') as f:
        key = ""
        starts_record = False
        for line in f:
            if '>' in line:
                key = line.replace('>', '').rstrip()
                starts_record = True
            elif starts_record:
                # First sequence line of a record replaces any earlier
                # record stored under the same header.
                id_and_seq[key] = line.rstrip()
                starts_record = False
            else:
                # Continuation line: append instead of overwriting, so
                # wrapped sequences are kept whole and a trailing blank line
                # does not wipe the record.
                id_and_seq[key] = id_and_seq.get(key, "") + line.rstrip()

    for transcript_id, ref_seq in tqdm(id_and_seq.items()):
        test_seq = translate(tse.get_seq(transcript_id), True)
        assert test_seq == ref_seq, test_seq
コード例 #4
0
def test_vcf_single_variant_synonymous_mutations(tse, svp):
    """
    Check single-variant extraction against the unmodified translation.

    For transcript ``ENST00000356175`` every sequence yielded by
    ``svp.extract`` must equal the translated reference sequence, and there
    must be exactly 337 of them. Across ``svp.extract_all()`` the total
    number of yielded sequences must be 825.

    :param tse: fixture exposing ``get_seq(transcript_id)``
    :param svp: fixture exposing ``extract(transcript_id)`` and ``extract_all()``
    """
    transcript_id = 'ENST00000356175'
    expected = translate(tse.get_seq(transcript_id), True)

    variant_seqs = list(svp.extract(transcript_id))
    for variant_seq in variant_seqs:
        assert variant_seq == expected, variant_seq
    n_variant_seqs = len(variant_seqs)
    assert n_variant_seqs == 337, 'Number of sequences != number of variants'

    # Materialise the outer iterable first, then count the items of each
    # per-transcript iterable.
    per_transcript = list(svp.extract_all())
    total = sum(len(list(seqs)) for seqs in per_transcript)

    assert total == 825
コード例 #5
0
def test_hg38(tse):
    with open('err_transcripts', 'w+') as f:
        dfp = read_pep_fa(protein_file)
        dfp['transcript_id'] = dfp.transcript.str.split(".", n=1,
                                                        expand=True)[0]
        #assert not dfp['transcript_id'].duplicated().any()
        dfp = dfp.set_index("transcript_id")
        #dfp = dfp[~dfp.chromosome.isnull()]
        assert len(tse) > 100
        assert tse.transcripts.isin(dfp.index).all()
        div3_error = 0
        seq_mismatch_err = 0
        err_transcripts = []
        for transcript_id in tqdm(tse.transcripts):
            # make sure all ids can be found in the proteome
            dna_seq = tse.get_seq(transcript_id)
            if dna_seq == "NNN":
                f.write(transcript_id +
                        ' has an ambiguous start and end.Skip!')
                continue
            # dna_seq = dna_seq[:(len(dna_seq) // 3) * 3]
            # if len(dna_seq) % 3 != 0:
            #   div3_error += 1
            #  print("len(dna_seq) % 3 != 0: {}".format(transcript_id))
            # err_transcripts.append({"transcript_id": transcript_id, "div3_err": True})
            # continue
            if len(dna_seq) % 3 != 0:
                f.write(transcript_id)
                continue
            prot_seq = translate(dna_seq, hg38=True)
            if dfp.loc[transcript_id].seq != prot_seq:
                seq_mismatch_err += 1
                f.write("seq.mismatch: {}".format(transcript_id))
                n_mismatch = 0
                for i in range(len(prot_seq)):
                    a = dfp.loc[transcript_id].seq[i]
                    b = prot_seq[i]
                    if a != b:
                        n_mismatch += 1
                        f.write("{} {} {}/{}".format(a, b, i, len(prot_seq)))