Beispiel #1
0
    def _get_aa_freq(self, fasta, analysis='amino', include_gaps=True):
        '''Return a dataframe of the frequencies of all 20 amino acids (AAs)
        in each site of an MSA. If include_gaps=True, gaps are treated as
        AAs and are included in the analysis. 
        If analysis == 'amino', frequencies of AAs are computed.
        if analysis == 'type', frequencies of AA types are computed'''

        if analysis == 'amino':
            fasta_df = bioinf.fasta_to_df(fasta)
            amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
        elif analysis == 'type':
            # Replace AA single letter with single letter describing
            # the AA type
            # Aliphatic (A), Aromatic (R), Polar (P), Positve (T),
            # and Negative (N)
            fasta_df = bioinf.residue_to_group(fasta)
            amino_acids = list('ARPTN')
        if include_gaps:
            amino_acids += ['-']

        # Determine frequency
        store = []
        length = len(fasta_df.index)
        for k in range(len(fasta_df.columns)):
            aa_list = list(fasta_df.iloc[:, k])
            aa_count = [aa_list.count(x) / length for x in amino_acids]
            store.append(aa_count)
        store = pd.DataFrame(store).transpose()
        store.index = amino_acids
        return store
Beispiel #2
0
# Prepare sequences and data
#=====================================================#
GH13_df = pd.read_csv('results_final/ncbi_subtypes.csv')
# Keep only the rows whose predicted class (ncbi_pred_class) is 0
GH13_SH = GH13_df[GH13_df.ncbi_pred_class == 0]
accession_SH = GH13_SH.Accession.tolist()
accession_all = bioinf.get_accession('fasta/initial_blast/nrblast_all.fasta')
# Build the lookup set once so each membership test below is O(1)
# instead of a linear scan of the accession_SH list.
accession_SH_lookup = set(accession_SH)
GH13 = [1 if x in accession_SH_lookup else 0 for x in accession_all]
# class labels: 1 if the accession is in accession_SH, otherwise 0
y = pd.Series(GH13)
GH13_not_SH = y[y == 0]
GH13_yes_SH = y[y == 1]

# Derive features for machine learning with one-hot encoding
#============================================================#
cat_domain_fasta = 'fasta/GH13_positions_only/GH13_cat.fasta'
sequence_df = bioinf.fasta_to_df(cat_domain_fasta)
X_features = pd.DataFrame()  # empty dataframe for storing features

# Process the alignment one column (position) at a time.
for i in range(len(sequence_df.columns)):
    # Convert amino acids to integers
    # A fresh LabelEncoder is fitted for every column, so the integer codes
    # are specific to the residues present in this column.
    X_resid = list(sequence_df.iloc[:, i])
    labelencoder = LabelEncoder()
    X_label = list(labelencoder.fit_transform(X_resid))
    # Distinct residues and distinct integer labels of this column, sorted.
    X_resid_unique = sorted(set(X_resid))
    X_label_unique = sorted(set(X_label))

    # Map integer labels to amino acids
    # Step 1: for each distinct label, the index of its first occurrence.
    label_resid = [X_label.index(num) for num in X_label_unique]
    # Step 2: the residue at that index, giving the residue for each label
    # in ascending label order.
    label_resid = [X_resid[num] for num in label_resid]

    # Convert labels to binary features (one-hot encoding)
Beispiel #3
0
                store1 = [len(select_i[select_i.iloc[:,1]==jjj]) for jjj in range2]
                storeall.append(store1)
            storeall = pd.DataFrame(storeall)
            sns.heatmap(storeall, cmap='Blues', linewidths=1, annot=True, 
                        annot_kws={'size':7}, fmt='.0f')
            plt.ylabel(loops[i])
            plt.xlabel(loops[k])
            plt.savefig(f'plots/loop_corr_plots/{loops[i]}{loops[k]}.pdf')
            plt.close()
            done.append(set((i, k)))
  





# Amino acid distribution at positions forming disulfide bonds in GH7 sequences
#====================================================================================#
df = bioinf.fasta_to_df('fasta/trecel7a_positions_only/cel7_cat.fasta')
# Relabel the columns 1..N so they can be addressed by 1-based position.
df.columns = range(1, df.shape[1] + 1)
# NOTE(review): positions are presumably listed as bonded pairs (4-72, 19-25,
# ...), which matches the paired bar layout in xindex below -- confirm.
cysbonds = [4, 72, 19, 25, 50, 71, 61, 67, 138, 397, 172, 210, 176, 209, 230, 256, 238, 243,
            261, 331]
# NOTE(review): hard-coded denominator; presumably the number of sequences in
# the alignment -- confirm it equals len(df).
n_sequences = 1748
# Percentage of 'C' residues at each listed position
cysfreq = [list(df[pos]).count('C') / n_sequences * 100 for pos in cysbonds]

plt.rcParams['figure.figsize'] = [6, 3]
# Bars are placed two at a time with a gap between consecutive pairs
xindex = [1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30, 33, 34, 37, 38]
plt.bar(xindex, cysfreq, color='dodgerblue', linewidth=1.25, edgecolor='black')
plt.xticks(xindex, [f'C-{i}' for i in cysbonds], rotation=90)
plt.ylabel('Frequency (%)')
plt.tight_layout()
plt.savefig('plots/disulfide_distribution.pdf')