Ejemplo n.º 1
0
def main(input_dir, output_dir):
    """Clean the raw structure named in the config and save the result.

    The PDB code is read from ``config['pdb']['id']``. The structure is
    loaded from ``input_dir`` with the ``pdb1`` file extension, selected
    hetero residues are removed, one alternate form is kept, chains are
    renamed and the structure is written to ``output_dir``.

    Parameters
    ----------
    input_dir : str
        Directory the raw structure is loaded from.
    output_dir : str
        Directory the cleaned structure is saved to.

    Returns
    -------
    None
    """
    config = utils.read_config()
    pdb_code = config['pdb']['id']

    # Data import
    pdb_struct = load_structure(pdb_code, input_dir, file_extension="pdb1")

    # Data processing
    pdb_models = get_models(pdb_struct)

    # Remove water (HOH) and HEZ records in one pass. The explicit copy
    # detaches the filtered frame from its parent so that the column
    # assignments below do not raise pandas' SettingWithCopyWarning.
    hetatm = pdb_struct.df['HETATM']
    hetatm = hetatm[~hetatm['residue_name'].isin(['HOH', 'HEZ'])].copy()

    # Select A form: drop rows whose residue name is 'BGLU' and give every
    # remaining atom full occupancy.
    # NOTE(review): this only matches if the loader stores the alternate
    # location letter inside residue_name - confirm against load_structure.
    atom = pdb_struct.df['ATOM']
    atom = atom[atom['residue_name'] != 'BGLU'].copy()
    atom['occupancy'] = 1.00

    # Rename chains
    atom['chain_id'] = rename_chains(atom['chain_id'],
                                     no_protomers=pdb_models)
    hetatm['chain_id'] = rename_chains(hetatm['chain_id'],
                                       no_protomers=pdb_models)

    pdb_struct.df['ATOM'] = atom
    pdb_struct.df['HETATM'] = hetatm

    # Save data
    save_structure(pdb_struct, pdb_code, output_dir)

    return None
Ejemplo n.º 2
0
def clean_ons_time_series(key, dataset_id, timeseries_id):
    """Load one raw ONS JSON download and return it as a tidy monthly frame.

    The raw file ``<key>_data.txt`` is read from
    ``config['data']['rawFilePath']``. Monthly observations are used when
    present; otherwise quarterly observations are interpolated (in time)
    onto month ends.

    Parameters
    ----------
    key : str
        Name of the series; used for the file name and the ``value_name``
        column.
    dataset_id :
        Value stored in the ``dataset_id`` column.
    timeseries_id :
        Value stored in the ``timeseries_id`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with columns ``value``, ``timeseries_id``,
        ``dataset_id`` and ``value_name``.
    """
    config = utils.read_config()
    raw_file_name = os.path.join(config['data']['rawFilePath'],
                                 key + '_data.txt')
    with open(raw_file_name) as json_file:
        data = json.load(json_file)
    title_text = data['description']['title']
    print("Code output:\n")
    print(title_text)

    # ``pd.io.json.json_normalize`` is deprecated since pandas 1.0 in favour
    # of the top-level ``pd.json_normalize``; fall back only on old installs.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    # Check if monthly data exist; if not go on to quarterly
    monthly = data.get('months')
    if monthly:
        df = pd.DataFrame(normalize(monthly))
        dates = pd.to_datetime(df['date']) + pd.offsets.MonthEnd(0)
    else:
        # Assume quarterly; the space is stripped from the date label
        # before parsing.
        df = pd.DataFrame(normalize(data['quarters']))
        dates = (pd.to_datetime(df['date'].str.replace(' ', '')) +
                 pd.offsets.QuarterEnd(0))

    df['date'] = dates
    df = df.set_index('date')
    df['value'] = df['value'].astype(float)

    if not monthly:
        # Upscale to monthly
        df = pd.DataFrame(df['value'].resample('M').interpolate(method='time'))

    # Keep only the observation column before attaching the identifiers.
    cols_to_drop = [x for x in df.columns if x != 'value']
    df = df.drop(cols_to_drop, axis=1)
    df['timeseries_id'] = timeseries_id
    df['dataset_id'] = dataset_id
    df['value_name'] = key
    return df
Ejemplo n.º 3
0
def main(output_dir):
    """Download the raw PDB structure named in the project config.

    The PDB code is taken from ``config['pdb']['id']`` and passed to
    ``download_pdb`` together with ``output_dir``, requesting the
    uncompressed biological unit.

    Parameters
    ----------
    output_dir : str
        Directory handed to ``download_pdb`` as the download target.

    Returns
    -------
    None
    """
    pdb_code = utils.read_config()['pdb']['id']
    download_pdb(pdb_code, output_dir, biounit=True, compressed=False)
Ejemplo n.º 4
0
def download_raw_data():
    """Download every configured ONS series and dump each one as JSON.

    For each entry of ``config['timeSeries']`` the stored arguments are
    forwarded to ``grab_ons_time_series_data`` and the result is written to
    ``<key>_data.txt`` inside ``config['data']['rawFilePath']``.
    """
    config = utils.read_config()
    # One output file per series, named after the series key.
    for series_name, series_args in config['timeSeries'].items():
        print('Downloading ' + series_name)
        payload = grab_ons_time_series_data(*series_args)
        target_file = os.path.join(config['data']['rawFilePath'],
                                   series_name + '_data.txt')
        with open(target_file, 'w') as handle:
            json.dump(payload, handle)
Ejemplo n.º 5
0
def create_clean_data():
    """Clean every configured raw series and write them to one flat CSV.

    Each entry of ``config['timeSeries']`` is cleaned with
    ``clean_ons_time_series``; the resulting frames are stacked row-wise and
    saved as ``ts_data.csv`` in ``config['data']['clnFilePath']``.
    """
    config = utils.read_config()
    # One cleaned frame per configured series.
    cleaned_frames = [
        clean_ons_time_series(series_name, *series_args)
        for series_name, series_args in config['timeSeries'].items()
    ]
    # Stack into tidy (long) format and persist.
    tidy = pd.concat(cleaned_frames, axis=0)
    tidy.to_csv(os.path.join(config['data']['clnFilePath'], 'ts_data.csv'))
Ejemplo n.º 6
0
def main(input_dir, output_dir):
    """Plot MD, ENM and experimental B-factors against residue number.

    Reads ``bfactors.csv`` from ``input_dir``, draws the three B-factor
    columns on one axis, writes ``bfactors.pdf`` and ``bfactors.png`` to
    ``output_dir`` and then displays the figure.

    Parameters
    ----------
    input_dir : str
        The location of the input directory
    output_dir : str
        The location of the output directory

    Returns
    -------
    None

    """
    config = utils.read_config()
    # plt.style.use(config['viz']['default'])
    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    bfactors = pd.read_csv(join_paths(input_dir, "bfactors.csv"))

    fig, ax = plt.subplots(figsize=(4, 2.5), constrained_layout=True)

    # (column in the CSV, legend label); colours follow the cycle order.
    series = (('bfactor_md', 'MD'),
              ('bfactor_enm', 'ENM'),
              ('bfactor_exp', 'exp'))
    for idx, (column, label) in enumerate(series):
        ax.plot(bfactors['residue_number'],
                bfactors[column],
                label=label,
                color=color_cycler[idx])

    ax.set_ylabel("B-factor")
    ax.set_xlabel("Residue number")
    ax.set_title("B-factor comparison")
    ax.legend(frameon=False)

    # Save through the figure handle *before* showing: once an interactive
    # window has been shown and closed, pyplot's current figure is a new
    # empty one and the saved files would be blank.
    fig.savefig(join_paths(output_dir, "bfactors.pdf"), bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "bfactors.png"), bbox_inches='tight')

    plt.show()

    return None
Ejemplo n.º 7
0
def main(input_dir, output_dir):
    """Create the three structural forms of the interim structure and save them.

    The PDB code comes from ``config['pdb']['id']``; the structure is loaded
    from ``input_dir`` with the ``pdb`` extension. Forms 0, 1 and 2 are built
    with ``create_form`` and each is saved to ``output_dir`` under its form
    index.
    """
    pdb_code = utils.read_config()['pdb']['id']

    # Data import
    structure = load_structure(pdb_code, input_dir, file_extension="pdb")

    # A residue 1 / residue 216 deletion step used to run here; it is
    # currently disabled.

    # Build every form first, then save them, so the call order matches a
    # plain create-create-create / save-save-save sequence.
    form_indices = (0, 1, 2)
    forms = [create_form(structure, form_idx=idx) for idx in form_indices]

    for idx, form in zip(form_indices, forms):
        save_structure(form, idx, output_dir)
import os, sys, math, numpy as np, itertools
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
from pylab import *
import src.utilities as utils

# Script-level setup: load the project config and apply the plotting style.
config = utils.read_config()
# Reset rcParams to matplotlib defaults first (VS Code plots not black).
# NOTE(review): `mpl` is presumably provided by the pylab star import - confirm.
mpl.rcParams.update(mpl.rcParamsDefault)  # VS Code plots not black
plt.style.use(config['viz'])

# Defaults; `infile` and `xlbl` can be overridden from the command line below.
infile = 'crosscor.dat'  # First input file
outname = 'crosscor'  # Name output files will take
xlbl = 'Amino Acid Number'
ylbl = 'Amino Acid Number'
ttl = ''

# Accumulators and counter initialised here; they are not used in this part
# of the script.
mi = []
mj = []
ol = []
i = -1

#############################################################################
# Read arguments from terminal, and assign input files and a name that all output files will contain.
#############################################################################
# Each recognised flag takes its value from the following argument.
# NOTE(review): a flag given as the very last argument makes
# sys.argv[x + 1] raise IndexError.
for x in range(1, len(sys.argv)):
    if sys.argv[x] == '-i':
        infile = sys.argv[x + 1]

    if sys.argv[x] == '-xlabel':
        xlbl = sys.argv[x + 1]
def _remove_legend(ax):
    """Remove the legend of ``ax`` if one was drawn."""
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


def main(input_dir, output_dir):
    """Plot eigenvalues, dissociation constants and cooperativity per cutoff.

    Every entry of ``input_dir`` is treated as one cutoff directory and every
    entry inside it as one flag directory holding ``eigenvals.csv``,
    ``diss_consts.csv`` and ``coop.csv`` (each indexed by ``mode_number``).
    For each cutoff a 3 x N grid of scatter plots (one column per flag
    directory) is saved to ``output_dir`` as ``allo.<cutoff>.pdf``.

    Parameters
    ----------
    input_dir : str
        Directory containing the cutoff sub-directories,
        e.g. "data/processed/-c09.50/-mass-ca-het/0".
    output_dir : str
        Directory the PDF figures are written to.

    Returns
    -------
    None
    """
    config = utils.read_config()

    mpl.rcParams.update(mpl.rcParamsDefault)  # VS Code plots not black
    plt.style.use(config['viz']['jupyter'])

    # Get paths
    # Directory path example: "data/processed/-c09.50/-mass-ca-het/0"
    cutoff_paths = sorted(glob.glob(join_paths(input_dir, "*"), recursive=True))

    # Plot parameters
    coop_ylims = [0.9, 1.1]
    plot_no_modes = 300
    # The first 7 rows of every table are skipped when plotting.
    first_row = 7
    last_row = plot_no_modes + 7

    for cutoff_path in cutoff_paths:
        cutoff_flag = os.path.basename(cutoff_path)
        flag_paths = sorted(glob.glob(join_paths(cutoff_path, "*"), recursive=True))
        if not flag_paths:
            # A grid with zero columns cannot be created; nothing to plot.
            continue
        figure_path = "allo.{}.pdf".format(cutoff_flag)

        rows = 3
        cols = len(flag_paths)
        fig_side = 3  # inches
        fig_width = fig_side * cols
        fig_height = fig_side * rows

        # squeeze=False keeps ``axs`` two-dimensional even when there is a
        # single flag directory, so ``axs[row][idx]`` always works.
        fig1, axs = plt.subplots(rows, cols,
                                 figsize=(fig_width, fig_height),
                                 squeeze=False)

        fig1.suptitle("{}".format(cutoff_flag))

        for idx, flag_path in enumerate(flag_paths):
            other_flags = os.path.basename(flag_path)

            # Load data
            eigenvals = pd.read_csv(join_paths(flag_path, "eigenvals.csv"),
                                    index_col='mode_number')
            diss_consts = pd.read_csv(join_paths(flag_path, "diss_consts.csv"),
                                      index_col='mode_number')
            coop = pd.read_csv(join_paths(flag_path, "coop.csv"),
                               index_col='mode_number')

            # Plot data
            ttl = "{}".format(other_flags)
            ax1 = axs[0][idx]
            ax2 = axs[1][idx]
            ax3 = axs[2][idx]

            # EIGENVALS
            sns.scatterplot(data=eigenvals[first_row:last_row], ax=ax1)
            ax1.set_title(ttl, pad=15)
            ax1.set_xlabel("")
            ax1.set_xticklabels([])
            ax1.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), useOffset=False)

            # DISS_CONSTS
            sns.scatterplot(data=diss_consts[first_row:last_row], ax=ax2)
            ax2.set_xlabel("")
            ax2.set_xticklabels([])
            ax2.ticklabel_format(axis='y', style='sci', scilimits=(0, 0), useOffset=False)

            # COOP
            sns.scatterplot(data=coop[first_row:last_row], ax=ax3)
            # Show non-cooperativity
            ax3.axhline(y=1.0, color='black', linestyle=':')
            _remove_legend(ax3)
            ax3.set_ylim(coop_ylims)

            if idx == 0:
                # Only the left-most column carries axis labels and legends.
                ax1.set_ylabel(r"$\log(\lambda_{n})_{total}$")
                ax2.set_ylabel("$K$")
                ax3.set_xlabel("Mode number")
                ax3.set_ylabel("$K_{2}/K_{1}$")
            else:
                _remove_legend(ax1)
                _remove_legend(ax2)
                ax3.set_xlabel("")

            # Subplots' axes aspect ratio
            ax1.set_box_aspect(1)
            ax2.set_box_aspect(1)
            ax3.set_box_aspect(1)

        fig1.tight_layout(w_pad=1)

        fig1.savefig(join_paths(output_dir, figure_path), bbox_inches='tight')
        plt.close(fig1)
Ejemplo n.º 10
0
def main(input_dir, output_dir):
    """Compare MD and ENM mode spectra in a 2 x 2 figure.

    The top row shows scatter plots of the two spectra (first rows only, and
    up to ``plot_modes`` rows); the bottom row shows one histogram each.
    The figure is written to ``output_dir`` as ``eigvals.pdf`` and
    ``eigvals.png`` and then displayed.

    Parameters
    ----------
    input_dir : str
        The location of the input directory. Not used: the input files are
        read from the hard-coded paths below.
    output_dir : str
        The location of the output directory

    Returns
    -------
    None

    """
    config = utils.read_config()
    mpl.rcParams.update(mpl.rcParamsDefault)
    # plt.style.use(config['viz']['default'])
    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    MD_EIGVALS_PATH = "data/05-analysis/eigvals.dat"
    ENM_FREQ_PATH = "data/external/mode.frequencies"

    # MD: fixed-width file with two columns; mode numbers are shifted by 6.
    eigvals_md = pd.read_fwf(MD_EIGVALS_PATH, infer_nrows=7000, header=None)
    eigvals_md.columns = ['mode_number', 'eigenvalue']
    eigvals_md['mode_number'] = eigvals_md['mode_number'] + 6
    eigvals_md.set_index('mode_number', drop=True, inplace=True)

    # ENM: one value per line, numbered from 1; the first six rows are dropped.
    freq_enm = pd.read_csv(ENM_FREQ_PATH, comment='#', header=None)
    freq_enm.columns = ['eigenvalue']
    freq_enm['mode_number'] = np.arange(1, freq_enm.shape[0] + 1)
    freq_enm.set_index('mode_number', drop=True, inplace=True)
    freq_enm = freq_enm[6:]

    plot_modes = 1197
    plot_modes += 6

    # (table, legend label, colour) for the two spectra.
    spectra = ((eigvals_md, 'MD', color_cycler[0]),
               (freq_enm, 'ENM', color_cycler[1]))

    # PLOT
    fig, axs = plt.subplots(2, 2, figsize=(6, 6), constrained_layout=True)

    ax = axs[0][0]
    for table, label, colour in spectra:
        head = table[:106]
        ax.scatter(head.index, head['eigenvalue'],
                   label=label, color=colour, s=5)
    ax.set_ylabel(r"$\lambda$ [$cm^{-1}$]")
    ax.set_xlabel("Mode number")
    ax.set_title("First real 100 modes")
    ax.legend(frameon=False)

    ax = axs[0][1]
    for table, label, colour in spectra:
        head = table[:plot_modes]
        ax.scatter(head.index, head['eigenvalue'],
                   label=label, color=colour, s=5)
    ax.set_xlabel("Mode number")
    ax.set_title("All modes")

    no_bins = 25
    no_modes = len(eigvals_md.index)
    hist_kwargs = dict(histtype='step', alpha=1, bins=no_bins)

    ax = axs[1][0]
    ax.hist(eigvals_md['eigenvalue'].to_numpy(),
            **hist_kwargs,
            color=color_cycler[0],
            label="MD")
    ax.set_ylabel("# modes per bin")
    ax.set_xlabel(r"$\lambda$ [$cm^{-1}$]")
    ax.set_title("# bins = {} | # modes = {}".format(no_bins, no_modes))
    ax.legend(frameon=False)

    ax = axs[1][1]
    # NOTE(review): freq_enm already had its first six rows removed, so this
    # slice keeps six more rows than the MD histogram - confirm intent.
    ax.hist(freq_enm['eigenvalue'][:no_modes + 6].to_numpy(),
            **hist_kwargs,
            color=color_cycler[1],
            label="ENM")
    ax.set_xlabel(r"$\lambda$ [$cm^{-1}$]")
    ax.legend(frameon=False)

    # Save through the figure handle *before* showing: once an interactive
    # window has been shown and closed, pyplot's current figure is a new
    # empty one and the saved files would be blank.
    fig.savefig(join_paths(output_dir, "eigvals.pdf"), bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "eigvals.png"), bbox_inches='tight')

    plt.show()

    return None
Ejemplo n.º 11
0
def main(input_dir, output_dir):
    """Plot the time series stored in ``summary_mdout.csv``.

    The left half of the figure shows three energy columns (``eptot``,
    ``ektot``, ``etot``) above temperature and pressure. The right half shows
    ``volume``, ``density`` and ``rmsd`` for whichever of those columns exist
    in the file. The figure is saved to ``output_dir`` as
    ``summary_mdout.pdf`` / ``summary_mdout.png`` and then displayed.

    Parameters
    ----------
    input_dir : str
        The location of the input directory
    output_dir : str
        The location of the output directory

    Returns
    -------
    None

    """
    config = utils.read_config()
    # plt.style.use(config['viz']['default'])
    summary_data = pd.read_csv(join_paths(input_dir, "summary_mdout.csv"),
                               index_col='time')
    color_cycler = plt.rcParams['axes.prop_cycle'].by_key()['color']

    def colour(idx):
        # Wrap around so that a style with a short colour cycle still works.
        return color_cycler[idx % len(color_cycler)]

    time = summary_data.index

    fig = plt.figure(constrained_layout=True, figsize=(10, 5))
    subfigs = fig.subfigures(1, 2)
    subfigsnest = subfigs[0].subfigures(2, 1)

    # Energies: three stacked axes sharing x, no vertical gap.
    gs = subfigsnest[0].add_gridspec(3, hspace=0)
    energy_axs = gs.subplots(sharex=True)
    for idx, column in enumerate(('eptot', 'ektot', 'etot')):
        energy_axs[idx].plot(time, summary_data[column], color=colour(idx))

    # Temperature and pressure.
    state_axs = subfigsnest[1].subplots(2, 1, sharex=True)

    ax = state_axs[0]
    ax.plot(time, summary_data['temp'], color=colour(4))
    ax.set_xlabel("")
    ax.set_ylabel("Temp [K]")

    ax = state_axs[1]
    ax.plot(time, summary_data['pres'], color=colour(5))
    ax.set_xlabel("Time [ps]")
    # NOTE(review): the pressure unit in this label looks dimensionally odd
    # (energy per length) - confirm against the data source.
    ax.set_ylabel("Pres\n" r"[$kcal\:mol^{-1}\:\AA^{-1}$]")

    # Optional right-hand panels: (column, row, colour index, xlabel, ylabel).
    right_panels = (
        ('volume', 0, 6, "", r"Volume [$\AA^{3}$]"),
        ('density', 1, 7, "", r"Density [$g\:cm^{-3}$]"),
        ('rmsd', 2, 8, "Time [ps]", r"RMSD [$\AA$]"),
    )
    available = [panel for panel in right_panels
                 if panel[0] in summary_data.columns]
    if available:
        # Create the axes whenever *any* optional column exists; previously
        # they were only created for 'volume', so a file with 'density' or
        # 'rmsd' but no 'volume' raised NameError.
        axs_right = subfigs[1].subplots(3, 1, sharex=True)
        for column, row, colour_idx, xlabel, ylabel in available:
            ax = axs_right[row]
            ax.plot(time, summary_data[column], color=colour(colour_idx))
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)

    fig.savefig(join_paths(output_dir, "summary_mdout.pdf"),
                bbox_inches='tight')
    fig.savefig(join_paths(output_dir, "summary_mdout.png"),
                bbox_inches='tight')

    plt.show()

    return None
def _save_scan_heatmap(data, column_name, title, output_path, cbar_lbl=None):
    """Pivot one column of a 1-point scan to wide form and save it as a heatmap."""
    wide = data.pivot(index='spring_strength',
                      columns='residue_number',
                      values=column_name)

    _, ax = plt.subplots()
    ax.set_title(title)
    if cbar_lbl is None:
        viz_1point.plot_heatmap(wide, axis=ax)
    else:
        viz_1point.plot_heatmap(wide, cbar_lbl=cbar_lbl, axis=ax)

    plt.savefig(output_path)


def _colour_structure_by_scan(data, column_name, structure_path, output_stem):
    """Colour a structure by one scan column at spring strengths 0.25 and 4.00.

    ``output_stem`` is the output path without the ``.k<strength>`` suffix.
    """
    wide = data.pivot(index='spring_strength',
                      columns='residue_number',
                      values=column_name)

    # The colour scale is shared by all spring strengths, so compute it once.
    vmin = wide.min().min()
    vmax = wide.max().max()
    # Centre the scale on the first residue's value at spring strength 1.00.
    vcentre = wide.loc[1.00, :].iloc[0]

    for spring_strength in [0.25, 4.00]:
        colour_data, _ = viz_1point.code_heatmap(wide.loc[spring_strength, :],
                                                 vmin=vmin,
                                                 vmax=vmax,
                                                 vcenter=vcentre)
        path = "{}.k{:06.3f}".format(output_stem, spring_strength)
        cmd.delete('all')
        viz_1point.colour_by_heatmap(colour_data,
                                     structure_path=structure_path,
                                     molecule_name="1m9a",
                                     output_path=path)


def main(input_filepath, output_filepath):
    """Plot 1-point mutation scan results as heatmaps and on the structure.

    Every ``*.1point.allostery.m???.csv`` file in ``input_filepath`` is
    loaded. For each file, heatmaps of ``G_0``..``G_2``, ``dG_1``..``dG_2``
    and ``allostery`` (spring strength vs residue number) are saved as PNG
    files in ``output_filepath``. Afterwards the ``1m9a`` scan with 25 modes
    is mapped onto the processed structures (free energy of form 0 and
    cooperativity).

    The wild-type entropy / free-energy line plots are currently disabled,
    so their input files are not loaded.

    Parameters
    ----------
    input_filepath : str
        Directory containing the processed scan files.
    output_filepath : str
        Directory the plots are written to.

    Raises
    ------
    KeyError
        If ``1m9a.1point.allostery.m025.csv`` is not among the loaded files.
    """
    logger = logging.getLogger(__name__)
    logger.info('making plots from processed data')

    config = utils.read_config()

    mpl.rcParams.update(mpl.rcParamsDefault)  # VS Code plots not black
    plt.style.use(config['viz'])

    # Get filepaths
    allostery_1point_filepaths = sorted(
        glob.glob(os.path.join(input_filepath, "*.1point.allostery.m???.csv")))

    # Load data, keyed by file name without the ".csv" extension
    allostery_1point_data = {
        os.path.basename(filepath).replace(".csv", ""): load_data(filepath)
        for filepath in allostery_1point_filepaths
    }

    # Plot 1-point mut scan heatmap
    for filename, data in allostery_1point_data.items():
        pdb_code = filename[:4]
        # The key ends in a zero-padded mode count ("m025"); int() handles
        # the leading zeros, including an all-zero count.
        no_modes = int(filename[-3:])
        code_label = pdb_code.upper()

        # Free energy
        for form_idx in range(3):
            column_name = "G_{}".format(form_idx)
            _save_scan_heatmap(
                data,
                column_name,
                title="${}$ | {} | {} modes".format(column_name, code_label,
                                                    no_modes),
                output_path=os.path.join(
                    output_filepath,
                    "{}.1point.free_energy_{}.m{:03d}.png".format(
                        pdb_code, form_idx, no_modes)),
                cbar_lbl="$G/(kT)$")

        plt.close('all')
        # Free energy change
        for form_idx in range(1, 3):
            column_name = "dG_{}".format(form_idx)
            _save_scan_heatmap(
                data,
                column_name,
                title="${}$ | {} | {} modes".format(
                    column_name.replace("d", r"\Delta "), code_label,
                    no_modes),
                output_path=os.path.join(
                    output_filepath,
                    "{}.1point.free_energy_change_{}.m{:03d}.png".format(
                        pdb_code, form_idx, no_modes)),
                cbar_lbl=r"$\Delta G/(kT)$")

        plt.close('all')
        # Cooperativity
        _save_scan_heatmap(
            data,
            'allostery',
            title="1-point scan | {} | {} modes".format(code_label, no_modes),
            output_path=os.path.join(
                output_filepath,
                "{}.1point.allostery.m{:03d}.png".format(pdb_code, no_modes)))

        # Release the last figure too, so figures do not pile up per file.
        plt.close('all')

    # Plot heatmap in real-space
    filename = "1m9a.1point.allostery.m025"
    data = allostery_1point_data[filename]

    pdb_code = filename[:4]
    no_modes = int(filename[-3:])

    # Free energy for apo structure
    _colour_structure_by_scan(
        data,
        "G_0",
        structure_path="pdb/processed/1m9a.0.pdb",
        output_stem=os.path.join(
            output_filepath,
            "{}.1point.free_energy.m{:03d}".format(pdb_code, no_modes)))

    # Cooperativity
    _colour_structure_by_scan(
        data,
        "allostery",
        structure_path="pdb/processed/1m9a.2.pdb",
        output_stem=os.path.join(
            output_filepath,
            "{}.1point.allostery.m{:03d}".format(pdb_code, no_modes)))
def main(input_dir, output_dir):
    """Turn ``matrix.eigenfacs`` files into cumulative eigenvalue tables.

    ``input_dir`` is scanned two levels deep (cutoff directory, then flag
    directory; e.g. "data/raw/-c09.50/-mass-ca-het/0"). Each flag directory
    must contain ``0/``, ``1/`` and ``2/`` sub-directories with a
    ``matrix.eigenfacs`` file. For each flag directory three CSV files are
    written to ``<output_dir>/<cutoff>/<flags>/``:

    * ``eigenvals.csv``   - cumulative sum of log eigenvalues per form
    * ``diss_consts.csv`` - cumulative product of K_1 and K_2
    * ``coop.csv``        - cumulative product of the cooperativity ratio

    Processing stops at the first flag directory with a missing
    ``matrix.eigenfacs`` file.

    Parameters
    ----------
    input_dir : str
        Directory containing the cutoff sub-directories.
    output_dir : str
        Root directory for the result tables.

    Returns
    -------
    None
    """
    form_dirs = ("0", "1", "2")

    # Get paths
    # Directory path example: "data/raw/-c09.50/-mass-ca-het/0"
    cutoff_paths = glob.glob(join_paths(input_dir, "*"), recursive=True)
    for cutoff_path in cutoff_paths:
        cutoff_flag = os.path.basename(cutoff_path)
        flag_paths = glob.glob(join_paths(cutoff_path, "*"), recursive=True)
        for flag_path in flag_paths:
            other_flags = os.path.basename(flag_path)

            # Read matrix.eigenfacs files
            eigenfacs_paths = [join_paths(flag_path, form_dir, "matrix.eigenfacs")
                               for form_dir in form_dirs]

            # ``list.sort()`` returns None, so comparing two ``.sort()``
            # results is always True; test every file explicitly instead.
            if not all(os.path.isfile(path) for path in eigenfacs_paths):
                print("matrix.eigenfacs might be missing")
                return None

            eigenfacs = [read_file(path) for path in eigenfacs_paths]

            # Create DataFrames with eigenvalues
            eigenvals_0, eigenvals_1, eigenvals_2 = [
                extract_eigenvals(table) for table in eigenfacs
            ]

            # Move all eigenvalues into one DataFrame; forms 1 and 2 are
            # restricted to the mode numbers present in form 0.
            eigenvals_all = eigenvals_0.copy()
            eigenvals_all.rename(columns={"eigenvalue": "eigenvalue_0"}, inplace=True)

            eigenvals_all['eigenvalue_1'] = eigenvals_1['eigenvalue'][
                eigenvals_1.index.isin(eigenvals_all.index)]
            eigenvals_all['eigenvalue_2'] = eigenvals_2['eigenvalue'][
                eigenvals_2.index.isin(eigenvals_all.index)]

            # Calculate dissociation constants and cooperativity
            mode_number = eigenvals_all.index

            diss_consts = pd.DataFrame(index=mode_number)
            diss_consts['K_1'] = (eigenvals_all['eigenvalue_1'] /
                                  eigenvals_all['eigenvalue_0']).to_numpy()
            diss_consts['K_2'] = (eigenvals_all['eigenvalue_2'] /
                                  eigenvals_all['eigenvalue_1'])

            coop = pd.DataFrame(index=mode_number)
            coop['coop'] = (eigenvals_all['eigenvalue_2'] * eigenvals_all['eigenvalue_0']) / \
                (eigenvals_all['eigenvalue_1'] ** 2)

            # Calculate cumulative (total) values
            eigenvals_cum = eigenvals_all.copy()
            eigenvals_cum[:] = np.log(eigenvals_all[:]).cumsum()
            diss_consts_cum = diss_consts.copy()
            diss_consts_cum[:] = np.cumprod(diss_consts_cum[:])
            coop_cum = coop.copy()
            coop_cum[:] = np.cumprod(coop[:])

            # Create results directory
            output_subdir = join_paths(output_dir, cutoff_flag, other_flags)
            os.makedirs(output_subdir, exist_ok=True)

            # Save data
            eigenvals_cum.to_csv(join_paths(output_subdir, "eigenvals.csv"))
            diss_consts_cum.to_csv(join_paths(output_subdir, "diss_consts.csv"))
            coop_cum.to_csv(join_paths(output_subdir, "coop.csv"))
def main_commandline(input_dir, output_dir):
    """Process interim eigenvalue and energy files into per-structure results.

    For every PDB code in ``config['pdb']['codeList']`` the three
    ``<code>.<form>.eigenvalues`` and ``<code>.<form>.mode.energy`` tables
    found in ``input_dir`` are grouped by form, collated into entropy and
    cooperativity tables and saved to ``output_dir`` as
    ``<code>.cooperativity`` and ``<code>.entropy``. ``*.CAonly.pdb`` and
    ``*.draw_enm.pml`` files are copied over unchanged.
    Commandline function with Click.
    """
    logger = logging.getLogger(__name__)
    logger.info('making processed data set from interim data')

    pdb_codes = utils.read_config()['pdb']['codeList']

    # Locate the interim files
    eigenvalues_paths = sorted(glob.glob(os.path.join(input_dir, "*.eigenvalues")))
    energy_paths = sorted(glob.glob(os.path.join(input_dir, "*.mode.energy")))

    # Load them, keyed by bare file name
    eigenvalues = {}
    for found in eigenvalues_paths:
        eigenvalues[os.path.basename(found)] = load_data(found)
    energy = {}
    for found in energy_paths:
        energy[os.path.basename(found)] = load_data(found)

    def group_by_form(tables, suffix):
        # {pdb_code: {form index: table}} for file names "<code>.<form>.<suffix>"
        grouped = {}
        for code in pdb_codes:
            grouped[code] = {
                form: tables["{}.{}.{}".format(code, form, suffix)]
                for form in range(3)
            }
        return grouped

    restruct_eigenvalues = group_by_form(eigenvalues, "eigenvalues")
    restruct_energy = group_by_form(energy, "mode.energy")

    # Process data: entropy first, then cooperativity, per structure
    entropy = {}
    cooperativities = {}
    for code in pdb_codes:
        entropy[code] = collate_entropy(restruct_energy[code])
        collated = collate_eigenvalues(restruct_eigenvalues[code])
        cooperativities[code] = calculate_cooperativity(collated)

    # The classical-limit cooperativity calculation (external shell script)
    # is currently disabled.

    # Save data
    for code in pdb_codes:
        save_data(cooperativities[code], "{}.cooperativity".format(code), output_dir)
        save_data(entropy[code], "{}.entropy".format(code), output_dir)

    # Copy auxiliary files unchanged
    for pattern in ("*.CAonly.pdb", "*.draw_enm.pml"):
        for found in glob.glob(os.path.join(input_dir, pattern)):
            copy(found, output_dir)