def dist_pI(output, plot_suffix, EC):
    """
    Draw a histogram of protein pI for all sequences in a FASTA file.

    Values are taken from ``output.pI`` and counted in bins of width 0.5
    whose edges come from ``arange(0, 14.1, 0.5)``. The bars use the
    ``EC_descriptions()`` entry for ``str(EC)``: element 0 is used as the
    legend label and element 1 as the bar colour. Final formatting of the
    figure is delegated to ``dist_plot``, which also receives
    ``plot_suffix``.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    bin_width = 0.5
    requested_edges = arange(0, 14.1, bin_width)
    counts, edges = histogram(list(output.pI), bins=requested_edges)

    description = EC_descriptions()[str(EC)]
    ax.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=0.4,
        width=bin_width,
        color=description[1],
        edgecolor='k',
        label=description[0],
    )

    dist_plot(
        fig,
        ax,
        name='pI',
        xlim=(0, 14),
        xtitle='pI',
        plot_suffix=plot_suffix,
    )
def dist_GRAVY(output, plot_suffix, EC):
    """
    Draw a histogram of protein GRAVY for all sequences in a FASTA file.

    Values are taken from ``output.GRAVY`` and counted in bins of width
    0.05 whose edges come from ``arange(-2, 2.2, 0.05)``. The bars use the
    ``EC_descriptions()`` entry for ``str(EC)``: element 0 is used as the
    legend label and element 1 as the bar colour. A 'hydrophobic' label
    and a right-pointing arrow are placed at half of the y-axis maximum.
    Final formatting of the figure is delegated to ``dist_plot``.

    Reference lines that were previously drawn here and are currently
    disabled: average GRAVY -0.4, catalase -0.605, urease -0.1524.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    bin_width = 0.05
    requested_edges = arange(-2, 2.2, bin_width)
    counts, edges = histogram(list(output.GRAVY), bins=requested_edges)

    description = EC_descriptions()[str(EC)]
    ax.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=0.4,
        width=bin_width,
        color=description[1],
        edgecolor='k',
        label=description[0],
    )

    # GRAVY specific visuals, positioned relative to the y-axis limits
    # as they stand once the bars have been drawn.
    half_height = max(ax.get_ylim()) / 2
    ax.text(
        0.55,
        half_height + 0.05 * half_height,
        'hydrophobic',
        fontsize=16,
    )
    ax.arrow(
        0.5,
        half_height,
        0.7,
        0,
        head_width=0.05 * half_height,
        head_length=0.1,
        fc='k',
        ec='k',
    )

    dist_plot(
        fig,
        ax,
        name='GRAVY',
        xlim=(-1.5, 1.5),
        xtitle='GRAVY',
        plot_suffix=plot_suffix,
    )
def dist_Iindex(output, plot_suffix, EC):
    """
    Draw a histogram of protein instability index for all sequences in a
    FASTA file.

    Values are taken from ``output.I_index`` and counted in bins of width
    5 whose edges come from ``arange(0, 150, 5)``. The bars use the
    ``EC_descriptions()`` entry for ``str(EC)``: element 0 is used as the
    legend label and element 1 as the bar colour. A dashed vertical line
    is drawn at x = 40, and an 'unstable' label with a right-pointing
    arrow is placed at half of the y-axis maximum. Final formatting of
    the figure is delegated to ``dist_plot``.

    Reference lines that were previously drawn here and are currently
    disabled: catalase 27.010, urease 31.75.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    bin_width = 5
    requested_edges = arange(0, 150, bin_width)
    counts, edges = histogram(list(output.I_index), bins=requested_edges)

    description = EC_descriptions()[str(EC)]
    ax.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=0.4,
        width=bin_width,
        color=description[1],
        edgecolor='k',
        label=description[0],
    )

    # Instability specific visuals, positioned relative to the y-axis
    # limits as they stand once the bars have been drawn.
    half_height = max(ax.get_ylim()) / 2
    ax.text(
        51,
        half_height + 0.05 * half_height,
        'unstable',
        fontsize=16,
    )
    ax.arrow(
        50,
        half_height,
        30,
        0,
        head_width=0.05 * half_height,
        head_length=4,
        fc='k',
        ec='k',
    )
    instability_cutoff = 40
    ax.axvline(
        x=instability_cutoff,
        c='k',
        alpha=1.0,
        linestyle='--',
        lw=2,
    )

    dist_plot(
        fig,
        ax,
        name='Iindex',
        xlim=(0, 100),
        xtitle='instability index',
        plot_suffix=plot_suffix,
    )
def main_analysis(plot_suffix, fasta_file, output_file, EC):
    """Analyse all sequences in FASTA file from BRENDA.

    The user is asked on stdin whether the sequence properties should be
    (re)calculated; answering exactly ``t`` runs
    ``get_fasta_sequence_properties``. Afterwards one of two analyses is
    run, depending on whether ``EC`` is a key of ``EC_descriptions()``:

    * key present: ``fasta_plotting`` is called and the elapsed time is
      printed;
    * key absent: the data is loaded with ``read_seq_output`` and passed
      to ``dist_TMindex_specific``.

    Arguments:
        plot_suffix - forwarded to the plotting functions
        fasta_file - FASTA file to analyse
        output_file - file the calculated properties are written to and
            read back from
        EC - identifier looked up in ``EC_descriptions()``

    """
    print('---------------------------------------------------------')
    print('Analyse properties of all sequences in FASTA:', fasta_file)
    print('---------------------------------------------------------')

    run_calculations = input('run calculations? (t/f)') == 't'
    # Start the clock only once the prompt has been answered, so the
    # reported duration covers the calculations and plotting and not the
    # time spent waiting for the user to respond.
    start_time = time.time()
    if run_calculations:
        get_fasta_sequence_properties(output_file=output_file,
                                      fasta_file=fasta_file)

    # do plotting + analysis -- plots
    if EC in EC_descriptions():
        print('------------------------------------------------------')
        print('doing analysis...')
        # load existing data from this FASTA file
        fasta_plotting(output_file=output_file, plot_suffix=plot_suffix,
                       EC=EC)
        print('--- time taken =',
              '{0:.2f}'.format(time.time() - start_time), 's')
    else:
        print('------------------------------------------------------')
        print('doing specific sequence analysis...')
        output = read_seq_output(output_file)
        dist_TMindex_specific(output, plot_suffix, EC)
def screen_pIs(database_names, redo_pI, redo_pI_plots, pI_csv,
               pI_output_dir, cutoff_pi, descriptors):
    """
    Screen the pI of all sequences with chosen EC numbers.

    For every FASTA path in ``database_names`` the matching
    ``*_mod.fasta`` name is derived. When ``redo_pI`` is ``True`` the pIs
    are recalculated with ``calculate_pI_from_file``. When
    ``redo_pI_plots`` is ``True`` the rows of the pI CSV belonging to that
    file are plotted with ``plot_EC_pI_dist``, and after all files have
    been handled the whole CSV is plotted with ``plot_pI_dist``.

    Both flags are compared with ``is True``, so truthy values other than
    the boolean ``True`` do not enable a step.

    """
    # ``descriptors`` is normalised here but not read again in this
    # function.
    if descriptors is None:
        descriptors = {}

    for fasta_path in database_names:
        # Recover the EC number from the file name, e.g. strip the output
        # directory and suffix and turn underscores back into dots.
        ec_number = (
            fasta_path
            .replace(pI_output_dir, '')
            .replace('__BRENDA_sequences.fasta', '')
            .replace('_', '.')
        )
        top_level_ec = ec_number.split('.')[0]

        # The pI is calculated file by file rather than loading all
        # sequences at once, to avoid memory issues.
        print('doing:', fasta_path)
        modified_fasta = fasta_path.replace(".fasta", "_mod.fasta")

        if redo_pI is True:
            calculate_pI_from_file(
                modified_fasta,
                pI_output_dir,
                cutoff_pi,
                pI_csv,
            )

        if redo_pI_plots is True:
            print('plot distribution of pIs')
            all_pi = pd.read_csv(pI_output_dir + pI_csv, index_col=False)
            belongs_to_file = all_pi['fasta_file'] == modified_fasta
            plot_EC_pI_dist(
                all_pi[belongs_to_file],
                filename=modified_fasta.replace('.fasta', '.pdf'),
                title=EC_descriptions()[top_level_ec][0],
                cutoff_pi=cutoff_pi,
            )

        print('done')

    if redo_pI_plots is True:
        print('plot full distribution of pIs')
        all_pi = pd.read_csv(pI_output_dir + pI_csv, index_col=False)
        plot_pI_dist(
            all_pi,
            filename='full_pI_dist.pdf',
            output_dir=pI_output_dir,
            cutoff_pi=cutoff_pi,
        )
def dist_Aindex(output, plot_suffix, EC):
    """
    Draw a histogram of protein aliphatic index for all sequences in a
    FASTA file.

    Values are taken from ``output.A_index`` and counted in bins of width
    5 whose edges come from ``arange(0, 150, 5)``. The bars use the
    ``EC_descriptions()`` entry for ``str(EC)``: element 0 is used as the
    legend label and element 1 as the bar colour. A 'more stable' label
    and a right-pointing arrow are placed at half of the y-axis maximum.
    Final formatting of the figure is delegated to ``dist_plot``.

    Reference lines that were previously drawn here and are currently
    disabled: catalase 68, urease 90.476.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    bin_width = 5
    requested_edges = arange(0, 150, bin_width)
    counts, edges = histogram(list(output.A_index), bins=requested_edges)

    description = EC_descriptions()[str(EC)]
    ax.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=0.4,
        width=bin_width,
        color=description[1],
        edgecolor='k',
        label=description[0],
    )

    # Aliphatic index specific visuals, positioned relative to the y-axis
    # limits as they stand once the bars have been drawn.
    half_height = max(ax.get_ylim()) / 2
    ax.text(
        10,
        half_height + 0.05 * half_height,
        'more stable',
        fontsize=16,
    )
    ax.arrow(
        10,
        half_height,
        40,
        0,
        head_width=0.05 * half_height,
        head_length=5,
        fc='k',
        ec='k',
    )

    dist_plot(
        fig,
        ax,
        name='Aindex',
        xlim=(0, 150),
        xtitle='aliphatic index',
        plot_suffix=plot_suffix,
    )
def dist_TMindex(output, plot_suffix, EC):
    """
    Draw a histogram of protein TM index for all sequences in a FASTA
    file.

    Values are taken from ``output.TM_index`` and counted in bins of
    width 0.2 whose edges come from ``arange(-5, 5.1, 0.2)``. The bars use
    the ``EC_descriptions()`` entry for ``str(EC)``: element 0 is used as
    the legend label and element 1 as the bar colour. The x-range from 0
    to 1 is shaded grey. Final formatting of the figure is delegated to
    ``dist_plot``.

    Reference lines that were previously drawn here and are currently
    disabled: catalase 1.22, urease 0.62.

    """
    fig, ax = plt.subplots(figsize=(8, 5))

    bin_width = 0.2
    requested_edges = arange(-5, 5.1, bin_width)
    counts, edges = histogram(list(output.TM_index), bins=requested_edges)

    description = EC_descriptions()[str(EC)]
    ax.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=0.4,
        width=bin_width,
        color=description[1],
        edgecolor='k',
        label=description[0],
    )

    # Melting temperature index specific visuals: shade the cutoff band.
    band_start, band_end = (0, 1)
    ax.axvspan(
        xmin=band_start,
        xmax=band_end,
        facecolor='grey',
        alpha=0.2,
    )

    dist_plot(
        fig,
        ax,
        name='TMindex',
        xlim=(-5, 5),
        xtitle='thermostability index',
        plot_suffix=plot_suffix,
    )
def all_EC_violin_plot():
    """Do violin plots of all properties for all EC output files.

    One figure is produced per property (instability index, aliphatic
    index, TM index, pI and GRAVY). Each figure holds one violin per EC
    class '1' to '6', with the data loaded through ``read_seq_output``
    from ``<EC>__BRENDA_sequences_output.csv``. Violins are coloured with
    element 1 of the matching ``EC_descriptions()`` entry. Every figure
    is written to ``violin_<property>.pdf`` and then closed.

    """
    # (column name, y-axis label, y-axis limits) for each property; kept
    # together so the three values cannot drift out of step.
    plot_specs = [
        ('I_index', 'instability index', (0, 100)),
        ('A_index', 'aliphatic index', (0, 150)),
        ('TM_index', 'TM index', (-5, 5)),
        ('pI', 'pI', (0, 14)),
        ('GRAVY', 'GRAVY', (-1.5, 1.5)),
    ]
    ECs = ['1', '2', '3', '4', '5', '6']
    # Pair each EC class with its output file explicitly instead of
    # recovering the class from the first character of the file name.
    EC_files = [(EC, EC + '__BRENDA_sequences_output.csv') for EC in ECs]
    descriptions = EC_descriptions()

    for prop, label, limits in plot_specs:
        print('doing', prop, '....')
        fig, ax = plt.subplots(figsize=(8, 5))
        for EC, out_file in EC_files:
            print(out_file)
            print(EC)
            output = read_seq_output(out_file)
            parts = ax.violinplot(
                output[prop],
                [int(EC)],
                showmeans=False,
                showmedians=False,
                showextrema=False,
            )
            for pc in parts['bodies']:
                pc.set_facecolor(descriptions[EC][1])
                pc.set_edgecolor('black')
                pc.set_alpha(0.6)

        if prop == 'TM_index':
            # melting temperature index specific visuals
            TM_cutoff = (0, 1)
            ax.axhspan(ymin=TM_cutoff[0], ymax=TM_cutoff[1],
                       facecolor='grey', alpha=0.2)
        if prop == 'I_index':
            II_cutoff = 40
            ax.axhline(y=II_cutoff, c='k', alpha=1.0, linestyle='--', lw=2)
        if prop == 'A_index':
            ax.text(0.21, 60, 'more stable', fontsize=16,
                    ha='left', va='bottom', rotation=90)
            ax.arrow(0.5, 40, 0, 80, head_width=0.2, head_length=10,
                     fc='k', ec='k')

        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.set_xlabel('EC number', fontsize=16)
        ax.set_ylabel(label, fontsize=16)
        ax.set_xlim(0, 7)
        ax.set_ylim(limits)
        ax.set_xticks([int(EC) for EC in ECs])
        ax.set_xticklabels(ECs)
        fig.tight_layout()
        fig.savefig("violin_" + prop + ".pdf", dpi=720,
                    bbox_inches='tight')
        # pyplot keeps a reference to every figure it creates; close it
        # once saved so the figures do not accumulate in memory.
        plt.close(fig)