def test_non_coding_exons_gprin1(gprin1):
    trx_data = transcript_info.read_transcript_info(gprin1['tsl'],
                                                    gprin1['exontable'],
                                                    gprin1['seqs'],
                                                    remove_na=False)

    # The last two exons of ENSMUST00000099506 are non-coding.
    # The unique coding exon has UTR at both ends.
    assert ''.join(
        str(exon) for exon in trx_data.loc[
            trx_data['TranscriptIDCluster'].
            map(lambda ids: 'ENSMUST00000099506' in ids.split('/')),
            'ExonProteinSequence']) == (
                'MRDCCSSPKAIPAPPRHALDQSLGMDPRHTSSSGAAEGASCSERPAGSLACPSPNCSPLP'
                'ETPRAHGALTSDNSGTTLFGKPEPMSSAEATPTASEIRNPVFSGKMDGNSLKQADSTSTR'
                'KEEAGSLRNEESMLKGKAEPMIYGKGEPGTVGRVDCTASGAENSGSLGKVDMPCSSKVDI'
                'VSPGGDNAGSLRKVETISSGKMDPKTENVMHSRRERPGSTGEGDLVSLRENDMKPPDNTD'
                'SASTKKTDPEFSGKLTPGSSGKTELVSSVTVAPVTSENVNPVCSGGAGPAAVGNSETLSS'
                'VKKDPQLLGKKEAVSSGEGGSVSVRMAETVSARQPEGMFPAKTDSTSSNSTGPSGRADPV'
                'SLRNSELVSPVKPERLSSGQAERVSLVKTETLSSGKEDPRSSRRVDHTTVTGNMQTSQKG'
                'NPESSGKTDLGSSSSGDTRSLGTWGSLSAAKAEVTEGKGDPQPWKKASLPASEKTDPLAS'
                'SKAGSASQGKAETVSPGEVDAMTLGKTVPTSSGKTALVSPGKVDLMTSERAEGIPELQAS'
                'EKGNPVNSTRVDTGATGSTEPKSGVKVITQIPGATSPGKVETPSLQKEQPQLSEKTDPSR'
                'KVDPPTTVEPVSLGKADSASPSPRKAESQTSAKTVPQAPDKATSSLRQSDGTPYSSAQPQ'
                'RDTRSIGSLPEREPSASTSQKDLAAAAAQKSPSAEAAAPPPGPRTRDNFTKAPSWDAGAP'
                'PPREDAGTQAGAQACVSVAVSPMSPQDGAGGPAFSFQAAPRAPSPAPRPPSRRDAGLQVS'
                'LGAAETRSVATGPMTPQAAAPPAVPPVFPEVRVRPGSVLAAALAPQEATEPVRDVSWDEK'
                'GMTWEVYGASMEVEVLGMAIQKHLERQIEEHGRQGAPAPAPPPAVRAGPGRAGSVRTAPA'
                'EGAAKRPPGLFRALLQSVRRPRCCSRAGPTAE*')
def test_keep_badquality_sequences(mapk8):
    trx_data = transcript_info.read_transcript_info(mapk8['tsl'],
                                                    mapk8['exontable'],
                                                    mapk8['seqs'],
                                                    remove_na=False,
                                                    remove_badquality=False)

    # ENSRNOT00000083933 has Xs in its sequence: ...VILGMGYKENGQXVXHVQRGLICC*
    assert sum(trx_data['TranscriptID'] == 'ENSRNOT00000083933') == 5
Example #3
0
def get_transcripts(input_folder, max_tsl_level=3.0, species_list=None):
    """Return a DataFrame with the transcript information."""
    ensembl_info = os.path.join(input_folder, 'Ensembl')
    return transcript_info.read_transcript_info(
        os.path.join(ensembl_info, 'tsl.csv'),
        os.path.join(ensembl_info, 'exonstable.tsv'),
        os.path.join(ensembl_info, 'sequences.fasta'),
        max_tsl_level=max_tsl_level,
        remove_na=False,
        species_list=species_list)
def test_species_list(mapk8):
    species_list = ['bos_taurus', 'homo_sapiens']

    trx_data = transcript_info.read_transcript_info(mapk8['tsl'],
                                                    mapk8['exontable'],
                                                    mapk8['seqs'],
                                                    remove_na=False,
                                                    species_list=species_list)

    assert sorted(trx_data.Species.unique()) == species_list
def test_read_transcript_info(mapk8):
    trx_data = transcript_info.read_transcript_info(mapk8['tsl'],
                                                    mapk8['exontable'],
                                                    mapk8['seqs'])

    assert trx_data.loc[
        trx_data['TranscriptID'] == 'ENST00000374179',
        'TSL'].unique()[0] == 1.0  # '1 (assigned to previous version 7)'

    # Only h. sapiens & m. musculus have TSL information:
    assert all(trx_data.Species.unique() == ['homo_sapiens', 'mus_musculus'])
Example #6
0
 def _get_subexon_data(folder):
     test_dir = os.path.dirname(filename)
     datadir = os.path.join(test_dir, 'data')
     folder_path = os.path.join(datadir, folder, 'Ensembl')
     trx_data = transcript_info.read_transcript_info(
         os.path.join(folder_path, 'tsl.csv'),
         os.path.join(folder_path, 'exonstable.tsv'),
         os.path.join(folder_path, 'sequences.fasta'),
         remove_na=False)
     clustered = transcript_info.exon_clustering(trx_data)
     return subexons.create_subexon_table(clustered)
Example #7
0
 def _get_clustered_trx_data(folder):
     test_dir = os.path.dirname(filename)
     datadir = os.path.join(test_dir, 'data')
     folder_path = os.path.join(datadir, folder, 'Ensembl')
     species_list = None
     if folder == "MAPK8_all":
         species_list = ['anser_brachyrhynchus']
     trx_data = transcript_info.read_transcript_info(
         os.path.join(folder_path, 'tsl.csv'),
         os.path.join(folder_path, 'exonstable.tsv'),
         os.path.join(folder_path, 'sequences.fasta'),
         remove_na=False,
         species_list=species_list)
     return transcript_info.exon_clustering(trx_data)
def test_non_coding_exons_camk2a(camk2a):
    trx_data = transcript_info.read_transcript_info(camk2a['tsl'],
                                                    camk2a['exontable'],
                                                    camk2a['seqs'],
                                                    remove_na=False)

    # The two first exons of ENSSSCT00000052397 are non-coding
    assert ''.join(
        str(exon) for exon in trx_data.loc[
            trx_data['TranscriptID'] == 'ENSSSCT00000052397',
            'ExonProteinSequence']) == (
                'MLLFLALWALVPCLVLLSLYFYSSAGGKSGGNKKNDGVKKRKSSSSVQLMESSESTNTTI'
                'EDEDTKVRKQEIIKVTEQLIEAISNGDFESYTKMCDPGMTAFEPEALGNLVEGLDFHRFY'
                'FENLWSRNSKPVHTTILNPHIHLMGDESACIAYIRITQYLDAGGIPRTAQSEETRVWHRR'
                'DGKWQIVHFHRSGAPSVLPH*')
def test_remove_na(mapk8):
    trx_data = transcript_info.read_transcript_info(mapk8['tsl'],
                                                    mapk8['exontable'],
                                                    mapk8['seqs'],
                                                    remove_na=False)

    # I keep other species, not only h. sapiens & m. musculus:
    assert len(trx_data.Species.unique()) > 2

    # remove_na doesn't interfere with selecting the correct biotype
    assert sum(value not in ['Protein coding', 'protein_coding']
               for value in trx_data.Biotype.unique()) == 0

    # ENSRNOT00000083933 has Xs in its sequence: ...VILGMGYKENGQXVXHVQRGLICC*
    assert sum(trx_data['TranscriptID'] == 'ENSRNOT00000083933') == 0
def test_exon_clustering(mapk8):
    trx_data = transcript_info.read_transcript_info(mapk8['tsl'],
                                                    mapk8['exontable'],
                                                    mapk8['seqs'])

    clustered = transcript_info.exon_clustering(trx_data)

    # Input order doesn't change
    assert all(clustered['ExonID'] == trx_data['ExonID'])

    # Exon can not have more than one cluster
    assert all(
        clustered.groupby('ExonID').apply(
            lambda df: len(df['Cluster'].unique()) == 1))

    # Non-clustered exons have Cluster == 0 and QueryExon == ''
    assert all(row.QueryExon == '' for row in clustered.itertuples()
               if row.Cluster == 0)
    assert all(row.Cluster == 0 for row in clustered.itertuples()
               if row.QueryExon == '')

    # Sequences with less than 4 residues are non-clustered by default
    assert all(clustered.loc[clustered['ExonProteinSequence'].map(len) < 4,
                             'Cluster'] == 0)

    for _, group in clustered.groupby('Cluster'):
        nans = np.isnan(group['PercentIdentity'])
        # There is a nan in PercentIdentity when a sequence initialize its own
        # cluster, so the QueryExon and the ExonID should be the same
        assert np.all(group.loc[nans, 'QueryExon'] == group.loc[nans,
                                                                'ExonID'])
        # Also, if there are more exons, the exons with nan should be the
        # QueryExon of other exon in the cluster. It can not be alone.
        if len(group) > 1:
            assert np.all([
                exon in group.loc[np.logical_not(nans), 'QueryExon'].values
                for exon in group.loc[nans, 'QueryExon'].unique()
            ])

            # The aligned seq should be in the exon
            subset = group[np.logical_not(nans)]
            for _, exon in subset.groupby('ExonIDCluster'):
                assert np.any([((row.AlignedTarget.replace('-', '')
                                 in row.ExonProteinSequence)
                                or (row.AlignedQuery.replace(
                                    '-', '')) in row.ExonProteinSequence)
                               for row in exon.itertuples()])