def main(input_dir, output_dir):
    """
    Make an interim PDB structure from a raw biological-unit file.

    The PDB code is read from the project config (``config['pdb']['id']``).
    The ``.pdb1`` file for that code is loaded from ``input_dir``, cleaned
    and written to ``output_dir`` via ``save_structure``.

    Cleaning steps, in order:

    * HETATM records named ``HOH`` (water) or ``HEZ`` are dropped.
    * ATOM records named ``BGLU`` are dropped (selects the "A" form) and the
      occupancy of every remaining ATOM record is set to 1.00.
    * Chain identifiers of ATOM and HETATM records are rewritten by
      ``rename_chains``.

    Parameters
    ----------
    input_dir : str
        Directory holding the raw structure file.
    output_dir : str
        Directory the cleaned structure is saved to.

    Returns
    -------
    None
    """
    config = utils.read_config()
    pdb_code = config['pdb']['id']

    # Data import
    pdb_struct = load_structure(pdb_code, input_dir, file_extension="pdb1")

    # Queried before any record is removed; passed to rename_chains below.
    pdb_models = get_models(pdb_struct)

    # Remove water and HEZ.  The explicit .copy() gives an independent frame,
    # so the column assignments further down do not write into a filtered
    # slice (which raises SettingWithCopyWarning in pandas).
    hetatm = pdb_struct.df['HETATM']
    hetatm = hetatm[~hetatm['residue_name'].isin(['HOH', 'HEZ'])].copy()

    # Select A form
    atom = pdb_struct.df['ATOM']
    atom = atom[atom['residue_name'] != 'BGLU'].copy()
    atom['occupancy'] = 1.00

    # Rename chains (ATOM first, then HETATM, as before)
    atom['chain_id'] = rename_chains(atom['chain_id'],
                                     no_protomers=pdb_models)
    hetatm['chain_id'] = rename_chains(hetatm['chain_id'],
                                       no_protomers=pdb_models)

    pdb_struct.df['ATOM'] = atom
    pdb_struct.df['HETATM'] = hetatm

    # Save data
    save_structure(pdb_struct, pdb_code, output_dir)
    return None
def clean_ons_time_series(key, dataset_id, timeseries_id):
    """
    Open raw ONS API data (JSON) and return it in a clean, tidy, monthly form.

    The raw file ``<rawFilePath>/<key>_data.txt`` is read, where
    ``rawFilePath`` comes from the project config.  Monthly observations are
    used when the ``months`` entry is non-empty; otherwise the ``quarters``
    entry is used and up-sampled to month-end frequency with time-weighted
    interpolation.  The series title is printed as a side effect.

    Parameters
    ----------
    key : str
        Name of the series; selects the raw file and fills ``value_name``.
    dataset_id : str
        Stored verbatim in the ``dataset_id`` column.
    timeseries_id : str
        Stored verbatim in the ``timeseries_id`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (period-end timestamps) with columns ``value``
        (float), ``timeseries_id``, ``dataset_id`` and ``value_name``.
    """
    config = utils.read_config()
    raw_file_name = os.path.join(config['data']['rawFilePath'],
                                 key + '_data.txt')
    with open(raw_file_name, encoding='utf-8') as json_file:
        data = json.load(json_file)

    title_text = data['description']['title']
    print("Code output:\n")
    print(title_text)

    # json_normalize lives at the top level in current pandas; the
    # pd.io.json location is the legacy path and is absent in newer
    # releases.  Fall back to it only for old installations.
    try:
        json_normalize = pd.json_normalize
    except AttributeError:
        json_normalize = pd.io.json.json_normalize

    # Check if monthly data exist; if not go on to quarterly
    is_monthly = bool(data['months'])
    if is_monthly:
        df = json_normalize(data['months'])
        df['date'] = pd.to_datetime(df['date']) + pd.offsets.MonthEnd(0)
    else:
        # Assume quarterly.  Spaces are stripped from the date strings
        # before they are parsed.
        df = json_normalize(data['quarters'])
        df['date'] = (pd.to_datetime(df['date'].str.replace(' ', ''))
                      + pd.offsets.QuarterEnd(0))

    df = df.set_index('date')
    df['value'] = df['value'].astype(float)

    if not is_monthly:
        # Upscale to monthly.
        # NOTE(review): recent pandas releases spell the month-end alias
        # 'ME' and warn on 'M'; 'M' is kept for older installations.
        df = df['value'].resample('M').interpolate(method='time').to_frame()

    # Keep only the observations, then attach the identifying columns
    cols_to_drop = [col for col in df.columns if col != 'value']
    df = df.drop(cols_to_drop, axis=1)
    df['timeseries_id'] = timeseries_id
    df['dataset_id'] = dataset_id
    df['value_name'] = key
    return df
def main(output_dir):
    """
    Download the raw PDB structure named in the project config.

    The PDB code is taken from ``config['pdb']['id']`` and handed to
    ``download_pdb``, which is asked for the uncompressed biological unit.

    Parameters
    ----------
    output_dir : str
        Directory passed to ``download_pdb`` as the download destination.

    Returns
    -------
    None
    """
    settings = utils.read_config()
    download_pdb(settings['pdb']['id'],
                 output_dir,
                 biounit=True,
                 compressed=False)
def download_raw_data():
    """
    Master script for downloading raw data from ONS.

    For every entry of ``config['timeSeries']`` the configured arguments are
    passed to ``grab_ons_time_series_data`` and the result is written as JSON
    to ``<rawFilePath>/<key>_data.txt``, where ``rawFilePath`` comes from
    ``config['data']``.

    Returns
    -------
    None
    """
    config = utils.read_config()
    raw_dir = config['data']['rawFilePath']

    # Retrieve all series and save to file with value name/key in title
    for key, series_args in config['timeSeries'].items():
        print('Downloading ' + key)
        data = grab_ons_time_series_data(*series_args)
        output_path = os.path.join(raw_dir, key + '_data.txt')
        with open(output_path, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile)
def create_clean_data():
    """
    Clean every configured raw series and write them to one flat file.

    Each key of ``config['timeSeries']`` is cleaned with
    ``clean_ons_time_series`` (the configured values are passed on as extra
    positional arguments).  The resulting frames are stacked row-wise and
    saved as ``ts_data.csv`` in ``config['data']['clnFilePath']``.

    Returns
    -------
    None
    """
    settings = utils.read_config()
    series_cfg = settings['timeSeries']

    # One cleaned frame per configured series, in config order
    frames = [
        clean_ons_time_series(name, *series_cfg[name])
        for name in list(series_cfg.keys())
    ]

    # Tidy format: stack all series on top of each other
    tidy = pd.concat(frames, axis=0)

    target = os.path.join(settings['data']['clnFilePath'], 'ts_data.csv')
    tidy.to_csv(target)
def main(input_dir, output_dir):
    """
    Plot the B-factor columns of ``bfactors.csv`` against residue number.

    Reads ``bfactors.csv`` from ``input_dir`` and draws the columns
    ``bfactor_md``, ``bfactor_enm`` and ``bfactor_exp`` (labelled "MD",
    "ENM" and "exp") on one axis.  The figure is written to ``output_dir``
    as ``bfactors.pdf`` and ``bfactors.png`` and then shown.

    Parameters
    ----------
    input_dir : str
        The location of the input directory
    output_dir : str
        The location of the output directory

    Returns
    -------
    None
    """
    # Only needed by the optional style line below; kept so that it can be
    # re-enabled without further changes.
    config = utils.read_config()
    # plt.style.use(config['viz']['default'])

    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    bfactors = pd.read_csv(join_paths(input_dir, "bfactors.csv"))

    fig, ax = plt.subplots(figsize=(4, 2.5), constrained_layout=True)

    series = (('bfactor_md', 'MD'),
              ('bfactor_enm', 'ENM'),
              ('bfactor_exp', 'exp'))
    for idx, (column, label) in enumerate(series):
        ax.plot(bfactors['residue_number'], bfactors[column],
                label=label, color=color_cycler[idx])

    ax.set_ylabel("B-factor")
    ax.set_xlabel("Residue number")
    ax.set_title("B-factor comparison")
    ax.legend(frameon=False)

    # Save BEFORE showing: with an interactive backend plt.show() blocks and
    # the figure is torn down when its window is closed, so a later savefig
    # would write an empty canvas.
    fig.savefig(join_paths(output_dir, "bfactors.pdf"), bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "bfactors.png"), bbox_inches='tight')

    plt.show()

    return None
def main(input_dir, output_dir):
    """
    Turn an interim PDB structure into the structural forms for simulations.

    The structure for ``config['pdb']['id']`` is loaded from ``input_dir``
    (``.pdb`` extension).  ``create_form`` is called for form indices 0, 1
    and 2, and each result is saved to ``output_dir`` under its form index.

    Parameters
    ----------
    input_dir : str
        Directory holding the interim structure.
    output_dir : str
        Directory the structural forms are saved to.

    Returns
    -------
    None
    """
    settings = utils.read_config()
    structure = load_structure(settings['pdb']['id'], input_dir,
                               file_extension="pdb")

    # Optional pre-processing step, currently disabled:
    # delete residues 1 and 216
    # structure.df['ATOM'] = structure.df['ATOM'][
    #     (structure.df['ATOM']['residue_number'] != 1)
    #     & (structure.df['ATOM']['residue_number'] != 216)]

    # All forms are created first and only then written, as before
    forms = [create_form(structure, form_idx=idx) for idx in range(3)]

    for idx, form in enumerate(forms):
        save_structure(form, idx, output_dir)
import os, sys, math, numpy as np, itertools
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from pylab import *
import src.utilities as utils

# Project configuration; its 'viz' entry is used as the matplotlib style.
config = utils.read_config()
# Reset rcParams to matplotlib's defaults before applying the project style
# (original note: keeps VS Code plots from turning black).
# NOTE(review): `mpl` is not imported explicitly -- presumably it is provided
# by the `from pylab import *` wildcard above; confirm.
mpl.rcParams.update(mpl.rcParamsDefault)
plt.style.use(config['viz'])

# Defaults; `infile` and `xlbl` can be overridden from the command line below.
infile = 'crosscor.dat'  # First input file
outname = 'crosscor'  # Name output files will take
xlbl = 'Amino Acid Number'
ylbl = 'Amino Acid Number'
ttl = ''
# Not used in the part of the script visible here.
mi = []
mj = []
ol = []
i = -1

#############################################################################
# Read arguments from the terminal and assign the input file and axis label.
# Each flag takes the argument that follows it: `-i <file>`, `-xlabel <text>`.
#############################################################################
# NOTE(review): sys.argv[x + 1] raises IndexError when a flag is the last
# argument on the command line -- consider bounding the loop or using argparse.
for x in range(1, len(sys.argv)):
    if sys.argv[x] == '-i':
        infile = sys.argv[x + 1]
    if sys.argv[x] == '-xlabel':
        xlbl = sys.argv[x + 1]
def main(input_dir, output_dir):
    """
    Plot eigenvalue, dissociation-constant and cooperativity data per cutoff.

    ``input_dir`` is expected to contain one sub-directory per cutoff flag,
    each holding one sub-directory per flag combination (example path from
    the original code: ``data/processed/-c09.50/-mass-ca-het/0``).  For every
    cutoff directory one figure ``allo.<cutoff_flag>.pdf`` is written to
    ``output_dir``: a grid with three rows (``eigenvals.csv``,
    ``diss_consts.csv``, ``coop.csv``) and one column per flag combination.

    Parameters
    ----------
    input_dir : str
        Directory containing the cutoff sub-directories.
    output_dir : str
        Directory the PDF figures are saved to.

    Returns
    -------
    None
    """
    config = utils.read_config()

    mpl.rcParams.update(mpl.rcParamsDefault)  # VS Code plots not black
    plt.style.use(config['viz']['jupyter'])

    # Get paths
    cutoff_paths = sorted(glob.glob(join_paths(input_dir, "*"),
                                    recursive=True))

    # Plot parameters
    coop_ylims = [0.9, 1.1]
    plot_no_modes = 300
    # Row window shared by all three panels: skip the first 7 rows, then take
    # `plot_no_modes` rows.
    # NOTE(review): presumably the skipped rows are the trivial modes; confirm.
    mode_rows = slice(7, plot_no_modes + 7)

    rows = 3
    fig_side = 3  # inches

    for cutoff_path in cutoff_paths:
        cutoff_flag = os.path.basename(cutoff_path)
        flag_paths = sorted(glob.glob(join_paths(cutoff_path, "*"),
                                      recursive=True))
        if not flag_paths:
            # Nothing to plot; plt.subplots rejects a grid with zero columns.
            continue

        figure_path = "allo.{}.pdf".format(cutoff_flag)
        cols = len(flag_paths)
        fig_len = fig_side * cols
        fig_wid = fig_side * rows

        # squeeze=False keeps `axs` two-dimensional even for a single column,
        # so axs[row][col] indexing below always works.
        fig1, axs = plt.subplots(rows, cols, figsize=(fig_len, fig_wid),
                                 squeeze=False)
        fig1.suptitle("{}".format(cutoff_flag))

        for idx, flag_path in enumerate(flag_paths):
            other_flags = os.path.basename(flag_path)

            # Load data
            eigenvals = pd.read_csv(join_paths(flag_path, "eigenvals.csv"),
                                    index_col='mode_number')
            diss_consts = pd.read_csv(join_paths(flag_path, "diss_consts.csv"),
                                      index_col='mode_number')
            coop = pd.read_csv(join_paths(flag_path, "coop.csv"),
                               index_col='mode_number')

            ttl = "{}".format(other_flags)
            ax1 = axs[0][idx]
            ax2 = axs[1][idx]
            ax3 = axs[2][idx]

            # EIGENVALS
            sns.scatterplot(data=eigenvals[mode_rows], ax=ax1)
            ax1.set_title(ttl, pad=15)
            ax1.set_xlabel("")
            ax1.set_xticklabels([])
            ax1.ticklabel_format(axis='y', style='sci', scilimits=(0, 0),
                                 useOffset=False)

            # DISS_CONSTS
            sns.scatterplot(data=diss_consts[mode_rows], ax=ax2)
            ax2.set_xlabel("")
            ax2.set_xticklabels([])
            ax2.ticklabel_format(axis='y', style='sci', scilimits=(0, 0),
                                 useOffset=False)

            # COOP
            sns.scatterplot(data=coop[mode_rows], ax=ax3)
            # Show non-cooperativity
            ax3.axhline(y=1.0, color='black', linestyle=':')
            ax3.get_legend().remove()
            ax3.set_ylim(coop_ylims)

            if idx == 0:
                # Only the first column carries axis labels and legends.
                # Raw strings: "\l" is not a valid Python escape sequence.
                ax1.set_ylabel(r"$\log(\lambda_{n})_{total}$")
                ax2.set_ylabel("$K$")
                ax3.set_xlabel("Mode number")
                ax3.set_ylabel("$K_{2}/K_{1}$")
            else:
                ax1.set_xlabel("")
                ax1.get_legend().remove()
                ax2.set_xlabel("")
                ax2.get_legend().remove()
                ax3.set_xlabel("")

            # Subplots' axes aspect ratio
            ax1.set_box_aspect(1)
            ax2.set_box_aspect(1)
            ax3.set_box_aspect(1)

        fig1.tight_layout(w_pad=1)
        fig1.savefig(join_paths(output_dir, figure_path), bbox_inches='tight')
        plt.close(fig1)
def main(input_dir, output_dir):
    """
    Compare MD and ENM mode eigenvalues in a 2x2 figure.

    The input files are read from fixed, hard-coded paths
    (``data/05-analysis/eigvals.dat`` and ``data/external/mode.frequencies``);
    ``input_dir`` is accepted for interface compatibility but not used.

    Panels: scatter of the first 106 rows of each data set, scatter of up to
    1203 rows, and one step histogram each for the MD and ENM values.  The
    figure is written to ``output_dir`` as ``eigvals.pdf`` and
    ``eigvals.png`` and then shown.

    Parameters
    ----------
    input_dir : str
        The location of the input directory (currently unused)
    output_dir : str
        The location of the output directory

    Returns
    -------
    None
    """
    # Only needed by the optional style line below; kept so that it can be
    # re-enabled without further changes.
    config = utils.read_config()

    mpl.rcParams.update(mpl.rcParamsDefault)
    # plt.style.use(config['viz']['default'])

    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    MD_EIGVALS_PATH = "data/05-analysis/eigvals.dat"
    ENM_FREQ_PATH = "data/external/mode.frequencies"

    # MD data: two fixed-width columns; mode numbers are shifted by 6.
    # NOTE(review): presumably so that they line up with the ENM numbering
    # once its first six rows are dropped below; confirm.
    eigvals_md = pd.read_fwf(MD_EIGVALS_PATH, infer_nrows=7000, header=None)
    eigvals_md.columns = ['mode_number', 'eigenvalue']
    eigvals_md['mode_number'] = eigvals_md['mode_number'] + 6
    eigvals_md.set_index('mode_number', drop=True, inplace=True)

    # ENM data: one value per line, numbered from 1; first six rows dropped.
    freq_enm = pd.read_csv(ENM_FREQ_PATH, comment='#', header=None)
    freq_enm.columns = ['eigenvalue']
    freq_enm['mode_number'] = np.arange(1, freq_enm.shape[0] + 1)
    freq_enm.set_index('mode_number', drop=True, inplace=True)
    freq_enm = freq_enm[6:]

    plot_modes = 1197 + 6

    # Raw string: "\l" is not a valid Python escape sequence.
    eigval_label = r"$\lambda$ [$cm^{-1}$]"

    # PLOT
    fig, axs = plt.subplots(2, 2, figsize=(6, 6), constrained_layout=True)

    ax = axs[0][0]
    ax.scatter(eigvals_md[:106].index, eigvals_md[:106]['eigenvalue'],
               label='MD', color=color_cycler[0], s=5)
    ax.scatter(freq_enm[:106].index, freq_enm[:106]['eigenvalue'],
               label='ENM', color=color_cycler[1], s=5)
    ax.set_ylabel(eigval_label)
    ax.set_xlabel("Mode number")
    ax.set_title("First real 100 modes")
    ax.legend(frameon=False)

    ax = axs[0][1]
    ax.scatter(eigvals_md[:plot_modes].index,
               eigvals_md[:plot_modes]['eigenvalue'],
               label='MD', color=color_cycler[0], s=5)
    ax.scatter(freq_enm[:plot_modes].index,
               freq_enm[:plot_modes]['eigenvalue'],
               label='ENM', color=color_cycler[1], s=5)
    ax.set_xlabel("Mode number")
    ax.set_title("All modes")

    no_bins = 25
    no_modes = len(eigvals_md.index)
    hist_kwargs = dict(histtype='step', alpha=1, bins=no_bins)

    ax = axs[1][0]
    ax.hist(eigvals_md['eigenvalue'].to_numpy(), **hist_kwargs,
            color=color_cycler[0], label="MD")
    ax.set_ylabel("# modes per bin")
    ax.set_xlabel(eigval_label)
    ax.set_title("# bins = {} | # modes = {}".format(no_bins, no_modes))
    ax.legend(frameon=False)

    ax = axs[1][1]
    ax.hist(freq_enm['eigenvalue'][:no_modes + 6].to_numpy(), **hist_kwargs,
            color=color_cycler[1], label="ENM")
    ax.set_xlabel(eigval_label)
    ax.legend(frameon=False)

    # Save BEFORE showing: with an interactive backend plt.show() blocks and
    # the figure is torn down when its window is closed, so a later savefig
    # would write an empty canvas.
    fig.savefig(join_paths(output_dir, "eigvals.pdf"), bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "eigvals.png"), bbox_inches='tight')

    plt.show()

    return None
def main(input_dir, output_dir):
    """
    Plot the time series stored in ``summary_mdout.csv``.

    Reads ``summary_mdout.csv`` from ``input_dir`` (indexed by its ``time``
    column).  The left half of the figure shows ``eptot``, ``ektot`` and
    ``etot`` in three stacked panels, with ``temp`` and ``pres`` below them.
    The right half shows ``volume``, ``density`` and ``rmsd`` for whichever
    of these columns are present.  The figure is written to ``output_dir``
    as ``summary_mdout.pdf`` and ``summary_mdout.png`` and then shown.

    Parameters
    ----------
    input_dir : str
        The location of the input directory
    output_dir : str
        The location of the output directory

    Returns
    -------
    None
    """
    # Only needed by the optional style line below; kept so that it can be
    # re-enabled without further changes.
    config = utils.read_config()
    # plt.style.use(config['viz']['default'])

    summary_data = pd.read_csv(join_paths(input_dir, "summary_mdout.csv"),
                               index_col='time')

    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    fig = plt.figure(constrained_layout=True, figsize=(10, 5))
    subfigs = fig.subfigures(1, 2)
    subfigsnest = subfigs[0].subfigures(2, 1)

    # Energies: three panels sharing the x axis with no vertical gap
    gs = subfigsnest[0].add_gridspec(3, hspace=0)
    energy_axs = gs.subplots(sharex=True)
    for idx, column in enumerate(('eptot', 'ektot', 'etot')):
        energy_axs[idx].plot(summary_data.index, summary_data[column],
                             color=color_cycler[idx])
    # subfigsnest[0].supylabel(r"Energy [$kcal\:mol^{-1}$]")

    # Temperature and pressure
    thermo_axs = subfigsnest[1].subplots(2, 1, sharex=True)

    ax = thermo_axs[0]
    ax.plot(summary_data.index, summary_data['temp'], color=color_cycler[4])
    ax.set_xlabel("")
    ax.set_ylabel("Temp [K]")

    ax = thermo_axs[1]
    ax.plot(summary_data.index, summary_data['pres'], color=color_cycler[5])
    ax.set_xlabel("Time [ps]")
    # "\n" must stay a real newline, while "\:" and "\A" are not valid Python
    # escapes and belong in a raw string -- hence the two literals.
    ax.set_ylabel("Pres\n" r"[$kcal\:mol^{-1}\:\AA^{-1}$]")

    # Right-hand panels.  The axes are created when ANY of the optional
    # columns is present; creating them only for 'volume' left `axsRight`
    # undefined for files that have 'density' or 'rmsd' but no 'volume'.
    optional_columns = ('volume', 'density', 'rmsd')
    if any(column in summary_data.columns for column in optional_columns):
        axsRight = subfigs[1].subplots(3, 1, sharex=True)

    if 'volume' in summary_data.columns:
        ax = axsRight[0]
        ax.plot(summary_data.index, summary_data['volume'],
                color=color_cycler[6])
        ax.set_xlabel("")
        # A volume has units of length cubed (was labelled with exponent -3).
        ax.set_ylabel(r"Volume [$\AA^{3}$]")

    if 'density' in summary_data.columns:
        ax = axsRight[1]
        ax.plot(summary_data.index, summary_data['density'],
                color=color_cycler[7])
        ax.set_xlabel("")
        ax.set_ylabel(r"Density [$g\:cm^{-3}$]")

    if 'rmsd' in summary_data.columns:
        ax = axsRight[2]
        ax.plot(summary_data.index, summary_data['rmsd'],
                color=color_cycler[8])
        ax.set_xlabel("Time [ps]")
        ax.set_ylabel(r"RMSD [$\AA$]")

    fig.savefig(join_paths(output_dir, "summary_mdout.pdf"),
                bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "summary_mdout.png"),
                bbox_inches='tight')

    plt.show()

    return None
def main(input_filepath, output_filepath):
    """
    Plot 1-point mutation scan results as heatmaps and on the structure.

    Every file in ``input_filepath`` matching
    ``*.1point.allostery.m???.csv`` is loaded with ``load_data``.  The first
    four characters of the file name are taken as the PDB code and the last
    three (before ``.csv``) as the number of modes.  For each file, heatmaps
    of ``G_0``..``G_2``, ``dG_1``, ``dG_2`` and ``allostery`` (spring strength
    vs. residue number) are saved as PNG files in ``output_filepath``.

    Afterwards the data set ``1m9a.1point.allostery.m025`` is mapped onto
    structures: ``G_0`` on ``pdb/processed/1m9a.0.pdb`` and ``allostery`` on
    ``pdb/processed/1m9a.2.pdb``, for spring strengths 0.25 and 4.00.

    The wild-type entropy / free-energy line plots are not produced by this
    function.

    Parameters
    ----------
    input_filepath : str
        Directory containing the processed data files.
    output_filepath : str
        Directory the plots are saved to.

    Raises
    ------
    KeyError
        If ``1m9a.1point.allostery.m025.csv`` is not among the loaded files.
    """
    logger = logging.getLogger(__name__)
    logger.info('making plots from processed data')

    config = utils.read_config()

    mpl.rcParams.update(mpl.rcParamsDefault)  # VS Code plots not black
    plt.style.use(config['viz'])

    # Load data, keyed by file name without the ".csv" suffix
    allostery_1point_filepaths = sorted(
        glob.glob(os.path.join(input_filepath,
                               "*.1point.allostery.m???.csv")))
    allostery_1point_data = {
        os.path.basename(filepath).replace(".csv", ""): load_data(filepath)
        for filepath in allostery_1point_filepaths
    }

    # Plot 1-point mut scan heatmap
    for filename, data in allostery_1point_data.items():
        pdb_code = filename[:4]
        # int() copes with leading zeros by itself; stripping them first
        # turned "000" into "" and raised ValueError.
        no_modes = int(filename[-3:])

        # Free energy
        for form_idx in range(3):
            column_name = "G_{}".format(form_idx)
            # Convert to wide format
            free_energy_1point_plot = data.pivot(index='spring_strength',
                                                 columns='residue_number',
                                                 values=column_name)
            _, ax = plt.subplots()
            ttl = "${}$ | {} | {} modes".format(column_name,
                                                pdb_code.upper(), no_modes)
            ax.set_title(ttl)
            viz_1point.plot_heatmap(free_energy_1point_plot,
                                    cbar_lbl="$G/(kT)$",
                                    axis=ax)
            plt.savefig(
                os.path.join(
                    output_filepath,
                    "{}.1point.free_energy_{}.m{:03d}.png".format(
                        pdb_code, form_idx, no_modes)))
            plt.close('all')

        # Free energy change
        for form_idx in range(1, 3):
            column_name = "dG_{}".format(form_idx)
            # Convert to wide format
            free_energy_1point_plot = data.pivot(index='spring_strength',
                                                 columns='residue_number',
                                                 values=column_name)
            _, ax = plt.subplots()
            # Raw strings: "\D" is not a valid Python escape sequence.
            ttl = "${}$ | {} | {} modes".format(
                column_name.replace("d", r"\Delta "), pdb_code.upper(),
                no_modes)
            ax.set_title(ttl)
            viz_1point.plot_heatmap(free_energy_1point_plot,
                                    cbar_lbl=r"$\Delta G/(kT)$",
                                    axis=ax)
            plt.savefig(
                os.path.join(
                    output_filepath,
                    "{}.1point.free_energy_change_{}.m{:03d}.png".format(
                        pdb_code, form_idx, no_modes)))
            plt.close('all')

        # Cooperativity
        # Convert to wide format
        allostery_1point_plot = data.pivot(index='spring_strength',
                                           columns='residue_number',
                                           values='allostery')
        _, ax = plt.subplots()
        ttl = "1-point scan | {} | {} modes".format(pdb_code.upper(),
                                                    no_modes)
        ax.set_title(ttl)
        viz_1point.plot_heatmap(allostery_1point_plot, axis=ax)
        plt.savefig(
            os.path.join(
                output_filepath,
                "{}.1point.allostery.m{:03d}.png".format(pdb_code, no_modes)))
        # Close here as well so the last figure of the run is not left open.
        plt.close('all')

    # Plot heatmap in real-space
    filename = "1m9a.1point.allostery.m025"
    data = allostery_1point_data[filename]
    pdb_code = filename[:4]
    no_modes = int(filename[-3:])

    # (data column, label used in the output name, structure to colour)
    real_space_targets = (
        # Free energy for apo structure
        ("G_0", "free_energy", "pdb/processed/1m9a.0.pdb"),
        # Cooperativity
        ("allostery", "allostery", "pdb/processed/1m9a.2.pdb"),
    )
    for column_name, label, structure_path in real_space_targets:
        # Convert to wide format
        selected_data = data.pivot(index='spring_strength',
                                   columns='residue_number',
                                   values=column_name)

        # Colour scale is shared by all spring strengths of this column; it
        # is centred on the first value of the spring_strength == 1.00 row.
        vmin = selected_data.min().min()
        vmax = selected_data.max().max()
        vcentre = selected_data.loc[1.00, :].iloc[0]

        for spring_strength in [0.25, 4.00]:
            selected_kcust_data = selected_data.loc[spring_strength, :]
            colour_data, _ = viz_1point.code_heatmap(selected_kcust_data,
                                                     vmin=vmin,
                                                     vmax=vmax,
                                                     vcenter=vcentre)
            path = os.path.join(
                output_filepath,
                "{}.1point.{}.m{:03d}.k{:06.3f}".format(
                    pdb_code, label, no_modes, spring_strength))

            cmd.delete('all')
            viz_1point.colour_by_heatmap(colour_data,
                                         structure_path=structure_path,
                                         molecule_name="1m9a",
                                         output_path=path)
def main(input_dir, output_dir):
    """
    Turn ``matrix.eigenfacs`` files into cumulative eigenvalue statistics.

    ``input_dir`` is scanned two levels deep (example path from the original
    code: ``data/raw/-c09.50/-mass-ca-het/0``).  Each second-level directory
    must contain the sub-directories ``0``, ``1`` and ``2``, each with a
    ``matrix.eigenfacs`` file.  For every such directory three CSV files are
    written to ``<output_dir>/<cutoff_flag>/<other_flags>/``:

    * ``eigenvals.csv``   - cumulative sum of the log eigenvalues per form
    * ``diss_consts.csv`` - cumulative product of ``K_1`` and ``K_2``
    * ``coop.csv``        - cumulative product of the cooperativity

    If any of the three ``matrix.eigenfacs`` files is missing, a message is
    printed and processing stops (directories handled earlier keep their
    output).

    Parameters
    ----------
    input_dir : str
        Directory containing the cutoff sub-directories.
    output_dir : str
        Root directory for the processed CSV files.

    Returns
    -------
    None
    """
    form_idxs = ["0", "1", "2"]

    # Get paths
    cutoff_paths = glob.glob(join_paths(input_dir, "*"), recursive=True)

    for cutoff_path in cutoff_paths:
        cutoff_flag = os.path.basename(cutoff_path)
        flag_paths = glob.glob(join_paths(cutoff_path, "*"), recursive=True)

        for flag_path in flag_paths:
            other_flags = os.path.basename(flag_path)

            # Read matrix.eigenfacs files.  The previous guard compared the
            # return values of list.sort(), which are always None, so it
            # could never detect a missing file.
            eigenfacs_paths = [join_paths(flag_path, idx, "matrix.eigenfacs")
                               for idx in form_idxs]
            if not all(os.path.isfile(path) for path in eigenfacs_paths):
                print("matrix.eigenfacs might be missing")
                return None

            eigenfacs_0, eigenfacs_1, eigenfacs_2 = [
                read_file(path) for path in eigenfacs_paths
            ]

            # Create DataFrames with eigenvalues
            eigenvals_0 = extract_eigenvals(eigenfacs_0)
            eigenvals_1 = extract_eigenvals(eigenfacs_1)
            eigenvals_2 = extract_eigenvals(eigenfacs_2)

            # Move all eigenvalues into one DataFrame, keeping only the
            # entries of forms 1 and 2 whose index also occurs in form 0
            eigenvals_all = eigenvals_0.rename(
                columns={"eigenvalue": "eigenvalue_0"})
            eigenvals_all['eigenvalue_1'] = eigenvals_1['eigenvalue'][
                eigenvals_1.index.isin(eigenvals_all.index)]
            eigenvals_all['eigenvalue_2'] = eigenvals_2['eigenvalue'][
                eigenvals_2.index.isin(eigenvals_all.index)]

            # Calculate dissociation constants and cooperativity
            mode_number = eigenvals_all.index

            diss_consts = pd.DataFrame(index=mode_number)
            diss_consts['K_1'] = (eigenvals_all['eigenvalue_1']
                                  / eigenvals_all['eigenvalue_0']).to_numpy()
            diss_consts['K_2'] = (eigenvals_all['eigenvalue_2']
                                  / eigenvals_all['eigenvalue_1'])

            coop = pd.DataFrame(index=mode_number)
            coop['coop'] = ((eigenvals_all['eigenvalue_2']
                             * eigenvals_all['eigenvalue_0'])
                            / (eigenvals_all['eigenvalue_1'] ** 2))

            # Calculate cumulative (total) values
            eigenvals_cum = np.log(eigenvals_all).cumsum()
            diss_consts_cum = diss_consts.cumprod()
            coop_cum = coop.cumprod()

            # Create results directory
            output_subdir = join_paths(output_dir, cutoff_flag, other_flags)
            os.makedirs(output_subdir, exist_ok=True)

            # Save data
            eigenvals_cum.to_csv(join_paths(output_subdir, "eigenvals.csv"))
            diss_consts_cum.to_csv(
                join_paths(output_subdir, "diss_consts.csv"))
            coop_cum.to_csv(join_paths(output_subdir, "coop.csv"))
def main_commandline(input_dir, output_dir):
    """
    Turn interim data into processed data ready for analysis.

    Commandline function with Click.

    Loads every ``*.eigenvalues`` and ``*.mode.energy`` file found in
    ``input_dir``.  For each PDB code in ``config['pdb']['codeList']`` the
    files of forms 0, 1 and 2 are collated into an entropy table and a
    cooperativity table, saved to ``output_dir`` as ``<code>.entropy`` and
    ``<code>.cooperativity``.  Finally all ``*.CAonly.pdb`` and
    ``*.draw_enm.pml`` files are copied from ``input_dir`` to ``output_dir``.

    Parameters
    ----------
    input_dir : str
        Directory containing the interim data files.
    output_dir : str
        Directory the processed data are written to.

    Raises
    ------
    KeyError
        If an expected ``<code>.<form>.eigenvalues`` or
        ``<code>.<form>.mode.energy`` file was not found in ``input_dir``.
    """
    logger = logging.getLogger(__name__)
    logger.info('making processed data set from interim data')

    pdb_codes = utils.read_config()['pdb']['codeList']

    # Locate the interim files first, then load them keyed by file name
    eigenvalue_files = sorted(
        glob.glob(os.path.join(input_dir, "*.eigenvalues")))
    energy_files = sorted(
        glob.glob(os.path.join(input_dir, "*.mode.energy")))

    eigenvalues = {
        os.path.basename(found): load_data(found)
        for found in eigenvalue_files
    }
    energy = {
        os.path.basename(found): load_data(found)
        for found in energy_files
    }

    def tables_by_form(tables, name_template, code):
        """Map form index (0-2) to the table stored under its file name."""
        return {
            form: tables[name_template.format(code, form)]
            for form in range(3)
        }

    # Regroup the flat file-name dictionaries as {pdb_code: {form: table}}
    restruct_eigenvalues = {
        code: tables_by_form(eigenvalues, "{}.{}.eigenvalues", code)
        for code in pdb_codes
    }
    restruct_energy = {
        code: tables_by_form(energy, "{}.{}.mode.energy", code)
        for code in pdb_codes
    }

    # Process data
    entropy = {}
    cooperativities = {}
    for code in pdb_codes:
        entropy[code] = collate_entropy(restruct_energy[code])
        cooperativities[code] = calculate_cooperativity(
            collate_eigenvalues(restruct_eigenvalues[code]))

    # Disabled alternative kept from the original for reference:
    # calculate cooperativity using the classical limit
    # (Thomas Rodgers algorithm from 2015 JBC study)
    # subprocess.call(['bash', 'src/data/calculate_cooperativity.sh', '1m9a'])

    # Save data
    for code in pdb_codes:
        save_data(cooperativities[code], "{}.cooperativity".format(code),
                  output_dir)
        save_data(entropy[code], "{}.entropy".format(code), output_dir)

    # Copy auxiliary files through unchanged
    for pattern in ("*.CAonly.pdb", "*.draw_enm.pml"):
        for found in glob.glob(os.path.join(input_dir, pattern)):
            copy(found, output_dir)