def dist(X, xtitle, xlim, width):
    """
    Draw a histogram of `X` as a bar chart and return the figure.

    Arguments
    ---------
    X - values to bin.
    xtitle - x-axis label. When it equals 'purchasability class' the
        bars are centred on the left bin edges; otherwise they are
        left-aligned on them.
    xlim - (lower, upper) range used for both binning and the x axis.
        If None, the range of `X` padded by two bin widths is used.
    width - bin width, also used as the bar width.

    Returns
    -------
    The matplotlib figure and axes as a (fig, ax) tuple. Nothing is
    saved to disk.

    """
    figure, axes = plt.subplots(figsize=(8, 5))

    if xlim is None:
        # No range requested: pad the data range by two bin widths.
        xlim = (min(X) - 2 * width, max(X) + 2 * width)

    counts, edges = np.histogram(
        a=X,
        bins=np.arange(xlim[0], xlim[1], width),
    )

    bar_align = 'center' if xtitle == 'purchasability class' else 'edge'
    axes.bar(
        edges[:-1],
        counts,
        align=bar_align,
        alpha=1.0,
        width=width,
        color='#2980B9',
        edgecolor='k',
    )
    pfn.define_standard_plot(
        axes,
        xtitle=xtitle,
        ytitle='count',
        xlim=xlim,
        ylim=None,
    )
    return figure, axes
def cs_logPvsNHA(logPs, Xs, HlogPs, HXs):
    """
    Plot a 2D histogram of logP against number of heavy atoms.

    The binned background is built from `Xs` (x axis) and `logPs`
    (y axis). The points given by `HXs` and `HlogPs` are drawn on top of
    it as red markers.

    The figure is saved as 'chemical_space_logPNHA.pdf' in the current
    working directory. Nothing is returned.

    """
    xlim = (0, 40)
    ylim = (-9, 14)

    fig, ax = plt.subplots(figsize=(8, 5))

    # Ten-step colour map running from white to a dark blue-grey.
    CS = [(1.0, 1.0, 1.0), (44 / 255, 62 / 255, 80 / 255)]
    cm = colors.LinearSegmentedColormap.from_list('test', CS, N=10)
    fig, ax, hist = pfn.twoD_histogram(
        X_data=Xs,
        Y_data=logPs,
        xlim=xlim,
        ylim=ylim,
        cmap=cm,
        fig=fig,
        ax=ax,
    )

    # hist[3] is used as the colour bar's mappable.
    # NOTE(review): presumably the image from a hist2d-style call -
    # confirm against pfn.twoD_histogram.
    cbar = fig.colorbar(hist[3], ax=ax)
    cbar.ax.set_ylabel('count', fontsize=16)
    cbar.ax.tick_params(labelsize=16)

    # Highlighted points on top of the binned background.
    ax.scatter(
        HXs,
        HlogPs,
        c='#E74C3C',
        edgecolors='k',
        marker='o',
        alpha=1.0,
        s=120,
    )

    pfn.define_standard_plot(
        ax,
        ylim=ylim,
        xlim=xlim,
        ytitle=r'logP',
        xtitle=r'no. heavy atoms',
    )
    fig.tight_layout()
    fig.savefig('chemical_space_logPNHA.pdf', dpi=720, bbox_inches='tight')
def mid_plots(parameter_sets, molecules, test_mol, full_results, colours,
              markers):
    """
    Plot the average intermediate diameter against each test parameter.

    One figure is made per key of `parameter_sets` and saved as
    'mid_<key>.pdf'. Each molecule in `molecules` that is also in
    `test_mol` is drawn as a line of averages with a shaded band of
    plus/minus one standard deviation.

    `full_results[t][name][v]` must be a five-item sequence whose third
    and fourth items are the average and the standard deviation to
    plot. `colours` and `markers` map molecule names to matplotlib
    colours and marker styles.

    Keys without a preset axis use the key itself as the x-axis label
    and leave the x limits to the plotting helper.

    """
    # x-axis limits and label for every parameter with a preset.
    axis_presets = {
        'N_conformers': ((0, 1100), '$N$'),
        'spacing': ((0.2, 1.1), r'grid spacing [$\mathrm{\AA}$]'),
        'vdw': ((0.4, 1.1), 'vdW scale parameter'),
        'boxMargin': ((3, 9), r'box margin [$\mathrm{\AA}$]'),
    }

    for t in parameter_sets:
        fig, ax = plt.subplots()
        for name in molecules:
            if name not in test_mol:
                continue
            X = []
            Y = []
            Y_err = []
            for v in parameter_sets[t]:
                _, _, mid_diam_avg, mid_diam_std, _ = (
                    full_results[t][name][v]
                )
                X.append(float(v))
                Y.append(float(mid_diam_avg))
                Y_err.append(float(mid_diam_std))
            X = np.asarray(X)
            Y = np.asarray(Y)
            Y_err = np.asarray(Y_err)
            ax.plot(X, Y, c=colours[name], marker=markers[name],
                    label=name)
            ax.fill_between(X, Y - Y_err, Y + Y_err, alpha=0.2,
                            facecolor=colours[name])

        # Looking the settings up per iteration means an unrecognised
        # key can neither raise a NameError nor silently reuse the
        # limits and label of the previous parameter.
        t_lim, t_name = axis_presets.get(t, (None, str(t)))
        pfn.define_standard_plot(
            ax,
            xtitle=t_name,
            ytitle=r'avg. intermediate diameter [$\mathrm{\AA}$]',
            xlim=t_lim,
            ylim=(3.5, 9))
        fig.tight_layout()
        fig.savefig(f"mid_{t}.pdf", dpi=720, bbox_inches='tight')
def cs_purchCT(purch, not_purch):
    """
    Plot overlaid histograms of two data sets on a 'BertzCT' axis.

    `purch` is labelled 'purchasable' and `not_purch` is labelled
    'not purchasable'. Both are binned as raw counts into bins of
    width 50 between 0 and 2000.

    The figure is saved as 'chemical_space_purchCT.pdf' in the current
    working directory. Nothing is returned.

    """
    width = 50
    # Both data sets share the same bins, so build the edges once.
    X_bins = np.arange(0, 2000, width)

    # (data, bar colour, legend label), in drawing order.
    series = (
        (purch, '#FA7268', 'purchasable'),
        (not_purch, '#DAF7A6', 'not purchasable'),
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    for data, colour, label in series:
        hist, bin_edges = np.histogram(a=data, bins=X_bins, density=False)
        ax.bar(
            bin_edges[:-1],
            hist,
            align='edge',
            alpha=0.8,
            width=width,
            color=colour,
            edgecolor='k',
            label=label,
        )

    ax.legend(fontsize=16)
    pfn.define_standard_plot(
        ax,
        xtitle=r'BertzCT',
        ytitle='frequency',
    )
    fig.tight_layout()
    fig.savefig('chemical_space_purchCT.pdf', dpi=720, bbox_inches='tight')
def shapes_with_known(molecules, known_df, threshold, output_dir):
    """
    Plot molecule shapes considering experimental results.

    For each name in `molecules`, the file
    '<output_dir>/<name>_diam_result.csv' is read (spaces in the name
    become '_' and '/' becomes '__'). Molecules with a missing or empty
    file are skipped.

    The point is placed at the averages of the 'ratio_1' and 'ratio_2'
    columns. Its colour comes from the 'diffuse' flag of the matching
    'molecule' row in `known_df`: 't' is blue, 'f' is red, and any other
    flag is skipped. The marker is 'o' when the comparison of the
    minimum of 'diam2' against `threshold` agrees with the flag
    (at or below for 't', above for 'f') and 'X' when it disagrees.

    The figure is saved as 'shape.pdf'. Nothing is returned.

    """
    fig, ax = plt.subplots(figsize=(5, 5))
    for name in molecules:
        out_file = (
            f"{output_dir}/{name.replace(' ', '_').replace('/', '__')}"
            '_diam_result.csv'
        )
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            continue
        mid_diam = min(results['diam2'])

        known_rows = known_df[known_df['molecule'] == name]
        if len(known_rows) == 0:
            # Without this guard .iloc[0] raises IndexError for any
            # molecule that has results but no experimental entry.
            print(f'no known result for {name} - skipped')
            continue
        lit_d = known_rows['diffuse'].iloc[0]
        if lit_d not in ('t', 'f'):
            continue

        diffuses = lit_d == 't'
        fits = mid_diam <= threshold
        C = 'b' if diffuses else 'r'
        # Circle when the size test agrees with experiment, cross when
        # it does not.
        M = 'o' if fits == diffuses else 'X'
        ax.scatter(np.average(results['ratio_1']),
                   np.average(results['ratio_2']),
                   c=C, edgecolors='k', marker=M, alpha=1.0, s=80)

    # Triangle outline with its three corners labelled.
    ax.plot([0, 0.5, 1, 0], [1, 0.5, 1, 1], c='k', lw=2)
    ax.text(0.75, 1.03, 'sphere', fontsize=20)
    ax.text(0.4, 0.45, 'oblate', fontsize=20)
    ax.text(-0.05, 1.03, 'prolate', fontsize=20)
    pfn.define_standard_plot(ax,
                             xtitle='$I_1$ / $I_3$',
                             ytitle='$I_2$ / $I_3$',
                             xlim=(-0.1, 1.1),
                             ylim=(0.4, 1.1))
    fig.tight_layout()
    fig.savefig("shape.pdf", dpi=720, bbox_inches='tight')
def mol_parity(propx, propy, file, xtitle, ytitle, mol_file=None):
    """
    Plot a parity of two molecular properties.

    Arguments
    ---------
    propx, propy - keys read from each molecule's '<name>_prop.json'
        file for the x and y values.
    file - suffix of the output file name, 'parity_<file>.pdf'.
    xtitle, ytitle - axis labels.
    mol_file - optional file passed to IO.read_molecule_list to obtain
        the molecule list. If None, every '*_unopt.mol' file in the
        current working directory is used.

    Molecules without a property file are skipped. An axis showing
    'Synth_score' is limited to (0, 10); other axes are left to the
    plotting helper. Nothing is returned.

    """
    if mol_file is None:
        molecule_list = glob.glob('*_unopt.mol')
    else:
        molecule_list = IO.read_molecule_list(mol_file)

    # Collect the two properties of every molecule that has them.
    Xs = []
    Ys = []
    for mol in molecule_list:
        name = mol.replace('_unopt.mol', '')
        prop_file = name + '_prop.json'
        if not exists(prop_file):
            continue
        with open(prop_file, 'r') as f:
            prop_dict = json.load(f)
        Xs.append(prop_dict[propx])
        Ys.append(prop_dict[propy])

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(Xs, Ys, c='#FA7268', edgecolors='k', marker='o',
               alpha=1.0, s=80)

    # The two axes are limited independently. The previous if/elif
    # left ylim unset whenever propx was also 'Synth_score'.
    xlim = (0, 10) if propx == 'Synth_score' else None
    ylim = (0, 10) if propy == 'Synth_score' else None

    pfn.define_standard_plot(ax, xtitle=xtitle, ytitle=ytitle,
                             xlim=xlim, ylim=ylim)
    fig.tight_layout()
    fig.savefig(f'parity_{file}.pdf', dpi=720, bbox_inches='tight')
def biomin_known(molecules, output_dir, plot_suffix):
    """
    Plot a histogram of the sizes of all molecules in `molecules`.

    For each name, the minimum of the 'diam2' column of
    '<output_dir>/<name>_diam_result.csv' is printed and collected
    (spaces in the name become '_' and '/' becomes '__'). Molecules
    with a missing or empty results file are skipped.

    The values are binned with a width of 0.5 between 0.1 and 21. A
    vertical line at 3.4 and a shaded band from 4.0 to 6.6 are added.
    The figure is saved as 'molecule_size_<plot_suffix>.pdf'. Nothing
    is returned.

    """
    fig, ax = plt.subplots(figsize=(8, 5))
    m_diams = []
    for name in molecules:
        out_file = (f"{output_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            # An empty file would make min() raise; skip it as the
            # other readers of these files do.
            continue
        mid_diam = min(results['diam2'])
        print('-----', name, mid_diam, '-----')
        m_diams.append(mid_diam)

    m_diams = np.asarray(m_diams)
    X_bins = np.arange(0.1, 21, 0.5)
    hist, bin_edges = np.histogram(a=m_diams, bins=X_bins)
    ax.bar(bin_edges[:-1], hist, align='edge', width=0.5,
           color='#2C3E50', edgecolor='k', alpha=0.8)

    ax.axvline(x=3.4, c='k')
    ax.axvspan(xmin=4.0, xmax=6.6, facecolor='k', alpha=0.25)

    pfn.define_standard_plot(
        ax,
        xtitle=r'$d$ [$\mathrm{\AA}$]',
        ytitle='count',
        xlim=(0, 15),
        ylim=(0, 15))
    fig.tight_layout()
    fig.savefig(f"molecule_size_{plot_suffix}.pdf", dpi=720,
                bbox_inches='tight')
def min_of_mid_plots(parameter_sets, molecules, test_mol, full_results,
                     colours, markers):
    """
    Plot the minimum intermediate diameter against each test parameter.

    One figure is made per key of `parameter_sets` and saved as
    'min_of_mid_<key>.pdf'. Each molecule in `molecules` that is also in
    `test_mol` is drawn as a line, and the spread of its values
    (largest minus smallest) is printed.

    `full_results[t][name][v]` must be a five-item sequence whose last
    item is the value to plot. `colours` and `markers` map molecule
    names to matplotlib colours and marker styles.

    Keys without a preset axis use the key itself as the x-axis label
    and leave the x limits to the plotting helper.

    The function waits for the user to press enter after every figure
    so the printed spreads can be read.

    """
    # x-axis limits and label for every parameter with a preset.
    axis_presets = {
        'N_conformers': ((0, 1100), '$N$'),
        'spacing': ((0.2, 0.7), r'grid spacing [$\mathrm{\AA}$]'),
        'vdw': ((0.4, 1.1), 'vdW scale parameter'),
        'boxMargin': ((3, 9), r'box margin [$\mathrm{\AA}$]'),
    }

    for t in parameter_sets:
        fig, ax = plt.subplots()
        for name in molecules:
            if name not in test_mol:
                continue
            X = []
            Y = []
            for v in parameter_sets[t]:
                _, _, _, _, min_mid = full_results[t][name][v]
                X.append(float(v))
                Y.append(min_mid)
            X = np.asarray(X)
            Y = np.asarray(Y)
            # The largest deviation from the minimum is max - min;
            # min(Y) no longer has to be recomputed for every element.
            print(name, '--', max(Y) - min(Y))
            ax.plot(X, Y, c=colours[name], marker=markers[name],
                    label=name)

        # Looking the settings up per iteration means an unrecognised
        # key can neither raise a NameError nor silently reuse the
        # limits and label of the previous parameter.
        t_lim, t_name = axis_presets.get(t, (None, str(t)))
        pfn.define_standard_plot(ax,
                                 xtitle=t_name,
                                 ytitle=r'$d$ [$\mathrm{\AA}$]',
                                 xlim=t_lim,
                                 ylim=(3.5, 8))
        fig.tight_layout()
        fig.savefig(f"min_of_mid_{t}.pdf", dpi=720, bbox_inches='tight')
        input(f'^^ is max dev from min_of_mid for {t}')
def cs_NHA(Xs, Ys):
    """
    Plot a 2D histogram of intermediate diameter against heavy atoms.

    `Xs` holds the x values (labelled 'no. heavy atoms') and `Ys` the
    y values (labelled 'intermediate diameter'). A horizontal shaded
    band is drawn between y = 4.0 and y = 6.6.

    The figure is saved as 'chemical_space_NHA.pdf' in the current
    working directory. Nothing is returned.

    """
    ylim = (0, 17)
    xlim = (0, 40)

    fig, ax = plt.subplots(figsize=(8, 5))

    # Ten-step colour map running from white to a dark blue-grey.
    CS = [(1.0, 1.0, 1.0), (44 / 255, 62 / 255, 80 / 255)]
    cm = colors.LinearSegmentedColormap.from_list('test', CS, N=10)
    fig, ax, hist = pfn.twoD_histogram(
        X_data=Xs,
        Y_data=Ys,
        xlim=xlim,
        ylim=ylim,
        cmap=cm,
        fig=fig,
        ax=ax,
    )

    # hist[3] is used as the colour bar's mappable.
    # NOTE(review): presumably the image from a hist2d-style call -
    # confirm against pfn.twoD_histogram.
    cbar = fig.colorbar(hist[3], ax=ax)
    cbar.ax.set_ylabel('count', fontsize=16)
    cbar.ax.tick_params(labelsize=16)

    # Horizontal band for different materials.
    ax.axhspan(ymin=4.0, ymax=6.6, facecolor='k', alpha=0.2)

    pfn.define_standard_plot(
        ax,
        ylim=ylim,
        xlim=xlim,
        ytitle=r'intermediate diameter [$\mathrm{\AA}$]',
        xtitle=r'no. heavy atoms',
    )
    fig.tight_layout()
    fig.savefig('chemical_space_NHA.pdf', dpi=720, bbox_inches='tight')
def min_plots(parameter_sets, molecules, test_mol, full_results, colours,
              markers):
    """
    Plot the average minimum diameter against each test parameter.

    One figure is made per key of `parameter_sets` and saved as
    'min_<key>.pdf'. Each molecule in `molecules` that is also in
    `test_mol` is drawn as a line and listed in the legend.

    `full_results[t][name][v]` must be a five-item sequence whose first
    item is the average to plot. `colours` and `markers` map molecule
    names to matplotlib colours and marker styles.

    Keys without a preset axis use the key itself as the x-axis label
    and leave the x limits to the plotting helper.

    """
    # x-axis limits and label for every parameter with a preset.
    axis_presets = {
        'N_conformers': ((0, 1100), '$N$'),
        'spacing': ((0, 1.2), r'grid spacing [$\mathrm{\AA}$]'),
        'vdw': ((0.4, 1.2), r'vdW scale parameter'),
        'boxMargin': ((2, 10), r'box margin [$\mathrm{\AA}$]'),
    }

    for t in parameter_sets:
        fig, ax = plt.subplots()
        for name in molecules:
            if name not in test_mol:
                continue
            X = []
            Y = []
            for v in parameter_sets[t]:
                # Only the average is drawn. The standard deviation
                # (second item) used to be collected but was never
                # plotted.
                min_diam_avg, _, _, _, _ = full_results[t][name][v]
                X.append(float(v))
                Y.append(float(min_diam_avg))
            ax.plot(X, Y, c=colours[name], marker=markers[name],
                    label=name)

        # Looking the settings up per iteration means an unrecognised
        # key can neither raise a NameError nor silently reuse the
        # limits and label of the previous parameter.
        t_lim, t_name = axis_presets.get(t, (None, str(t)))
        pfn.define_standard_plot(
            ax,
            xtitle=t_name,
            ytitle=r'avg. minimum diameter [$\mathrm{\AA}$]',
            xlim=t_lim,
            ylim=(0, 10))
        ax.legend(loc=1, fontsize=16)
        fig.tight_layout()
        fig.savefig(f"min_{t}.pdf", dpi=720, bbox_inches='tight')
def shapes(molecules, threshold, output_dir, plot_suffix):
    """
    Plot molecule shapes of all molecules in dictionary.

    For each name in `molecules`, the file
    '<output_dir>/<name>_diam_result.csv' is read (spaces in the name
    become '_' and '/' becomes '__'). Molecules with a missing or empty
    file are skipped.

    The point is placed at the averages of the 'ratio_1' and 'ratio_2'
    columns. It is blue when the minimum of 'diam2' is at or below
    `threshold` and red otherwise.

    The figure is saved as 'shape_<plot_suffix>.pdf'. Nothing is
    returned.

    """
    fig, ax = plt.subplots(figsize=(5, 5))
    for name in molecules:
        out_file = (f"{output_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            # An empty file would make min() raise; skip it as the
            # other readers of these files do.
            continue
        mid_diam = min(results['diam2'])

        # Only the colour depends on the size test.
        C = 'b' if mid_diam <= threshold else 'r'
        ax.scatter(np.average(results['ratio_1']),
                   np.average(results['ratio_2']),
                   c=C, edgecolors='k', marker='o', alpha=1.0, s=80)

    # Triangle outline with its three corners labelled.
    ax.plot([0, 0.5, 1, 0], [1, 0.5, 1, 1], c='k', lw=2)
    ax.text(0.75, 1.03, 'sphere', fontsize=20)
    ax.text(0.4, 0.45, 'oblate', fontsize=20)
    ax.text(-0.05, 1.03, 'prolate', fontsize=20)
    pfn.define_standard_plot(ax,
                             title='',
                             xtitle='$I_1$ / $I_3$',
                             ytitle='$I_2$ / $I_3$',
                             xlim=(-0.1, 1.1),
                             ylim=(0.4, 1.1))
    fig.tight_layout()
    fig.savefig(f"shape_{plot_suffix}.pdf", dpi=720, bbox_inches='tight')
def parity_with_known_min2(molecules, diameters, output_dir):
    """
    Draw a parity plot of calculated diameters against known diameters.

    The x value of each molecule is `diameters[name]` converted to a
    float; molecules whose entry cannot be converted are reported and
    skipped. The y value is the minimum of the 'diam2' column of
    '<output_dir>/<name>_diam_result.csv' (spaces in the name become
    '_' and '/' becomes '__'); molecules with a missing or empty file
    are skipped.

    The figure is saved as 'parity_min2.pdf'. Nothing is returned.

    """
    figure, axes = plt.subplots(figsize=(5, 5))

    for mol_name in molecules:
        try:
            known_diam = float(diameters[mol_name])
        except ValueError:
            print('no radius given for this molecule - skipped')
            continue

        file_stem = mol_name.replace(' ', '_').replace('/', '__')
        csv_path = f"{output_dir}/{file_stem}" + '_diam_result.csv'
        if not os.path.exists(csv_path):
            continue

        table = pd.read_csv(csv_path)
        if len(table) < 1:
            continue

        calc_diam = min(table['diam2'])
        print(mol_name, known_diam, calc_diam)
        axes.scatter(
            known_diam,
            calc_diam,
            c='#E74C3C',
            edgecolors='k',
            marker='o',
            alpha=1.0,
            s=120,
        )

    # y = x guide line.
    axes.plot(
        np.linspace(-1, 12, 2),
        np.linspace(-1, 12, 2),
        c='k',
        alpha=0.4,
    )
    pfn.define_standard_plot(
        axes,
        xtitle=r'critical diameter [$\mathrm{\AA}$]',
        ytitle=r'$d$ [$\mathrm{\AA}$]',
        xlim=(1, 10),
        ylim=(1, 10),
    )
    figure.tight_layout()
    figure.savefig("parity_min2.pdf", dpi=720, bbox_inches='tight')
def HOF_examples(output_dir):
    """
    Prepare figure showing the value of d for all molecules used in the
    BioHOFs from: 10.1021/jacs.9b06589

    For each molecule, the minimum of the 'diam2' column of
    '<output_dir>/<name>_diam_result.csv' is plotted against the
    molecular weight computed from its SMILES with explicit hydrogens
    added. Molecules with a missing or empty results file are skipped.

    The figure is saved as 'HOF_examples.pdf'. Nothing is returned.

    """
    # (name, SMILES) of every molecule; keeping each pair together
    # means the two can never drift out of step.
    hof_molecules = (
        ('fluorescein',
         'C1=CC=C2C(=C1)C(=O)OC23C4=C(C=C(C=C4)O)OC5=C3C=CC(=C5)O'),
        ('hydrogen_peroxide', 'OO'),
        ('methanol', 'CO'),
        ('formaldehyde', 'C=O'),
        ('urea', 'C(=O)(N)N'),
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    for name, smiles in hof_molecules:
        out_file = (f"{output_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            # An empty file would make min() raise; skip it as the
            # other readers of these files do.
            continue
        mid_diam = min(results['diam2'])
        mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
        MW = Descriptors.MolWt(mol)
        # The name is printed twice to keep the log format unchanged.
        print(name, name, MW, mid_diam)
        ax.scatter(MW, mid_diam, c='#5499C7',
                   edgecolors='k', marker='o', alpha=1.0,
                   s=140)

    pfn.define_standard_plot(ax,
                             xtitle='molecular weight [g/mol]',
                             ytitle=r'$d$ [$\mathrm{\AA}$]',
                             xlim=(10, 500),
                             ylim=(2.5, 15))
    fig.tight_layout()
    fig.savefig("HOF_examples.pdf", dpi=720, bbox_inches='tight')
def cyt_C_perox_assay(output_dir):
    """
    Prepare figure showing the change in intermediate diameter for 3
    peroxide molecules degraded by Cyt-C in ZIF-8 (One-Pot Synthesis of
    Protein-Embedded Metal-Organic Frameworks with Enhanced Biological
    Activities, DOI:10.1021/nl5026419)

    For each molecule, the minimum of the 'diam2' column of
    '<output_dir>/<name>_diam_result.csv' (spaces in the name become
    '_') is plotted against the molecular weight computed from its
    SMILES with explicit hydrogens added. Molecules with a missing or
    empty results file are skipped. A hatched horizontal band is drawn
    between 4.0 and 6.6.

    The figure is saved as 'cytC_comp.pdf'. Nothing is returned.

    """
    # (name, SMILES) of every peroxide; keeping each pair together
    # means the two can never drift out of step.
    peroxides = (
        ('hydrogen peroxide', 'OO'),
        ('methyl ethyl ketone peroxide', 'CCC(C)(OO)OOC(C)(CC)OO'),
        ('tert-butyl hydroperoxide', 'CC(C)(C)OO'),
    )

    fig, ax = plt.subplots()
    for name, smiles in peroxides:
        out_file = (f"{output_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            # An empty file would make min() raise; skip it as the
            # other readers of these files do.
            continue
        mid_diam = min(results['diam2'])
        mol = Chem.AddHs(Chem.MolFromSmiles(smiles))
        MW = Descriptors.MolWt(mol)
        # The name is printed twice to keep the log format unchanged.
        print(name, name, MW, mid_diam)
        ax.scatter(MW, mid_diam, c='k',
                   edgecolors='k', marker='o', alpha=1.0,
                   s=100)

    ax.axhspan(ymin=4.0, ymax=6.6, facecolor='k', alpha=0.2, hatch="/")
    pfn.define_standard_plot(ax,
                             xtitle='molecular weight [g/mol]',
                             ytitle=r'$d$ [$\mathrm{\AA}$]',
                             xlim=(10, 250),
                             ylim=(2.5, 8))
    fig.tight_layout()
    fig.savefig("cytC_comp.pdf", dpi=720, bbox_inches='tight')
def mol_dist(data_dict):
    """
    Plot the distribution of one molecular property as a histogram.

    `data_dict` must provide these keys:
        'd' - the values to bin,
        'width' - bin width, also used as the bar width,
        'xlim' - (lower, upper) range for binning and for the x axis,
        'c' - bar colour,
        'xtitle' - x-axis label,
        'file' - suffix of the output file name.

    The figure is saved as 'hist_<file>.pdf'. Nothing is returned.

    """
    figure, axes = plt.subplots(figsize=(8, 5))

    bin_width = data_dict['width']
    edges_in = np.arange(
        data_dict['xlim'][0],
        data_dict['xlim'][1],
        bin_width,
    )
    counts, edges = np.histogram(a=data_dict['d'], bins=edges_in)

    axes.bar(
        edges[:-1],
        counts,
        align='edge',
        alpha=1.0,
        width=bin_width,
        color=data_dict['c'],
        edgecolor='k',
    )
    pfn.define_standard_plot(
        axes,
        xtitle=data_dict['xtitle'],
        ytitle='count',
        xlim=data_dict['xlim'],
        ylim=None,
    )
    figure.tight_layout()
    figure.savefig(
        f"hist_{data_dict['file']}.pdf",
        dpi=720,
        bbox_inches='tight',
    )
def no_rxns_vs_size(data, params, plot_suffix):
    """
    Plot the number of reactions as a function of size threshold.

    `data['max_mid_diam']` is binned with a width of 0.5 between 0 and
    20.5. The per-bin counts are drawn as bars against the right-hand
    y axis and their running total as a red line against the left-hand
    y axis. A shaded band marks x = 4.0 to 6.6.

    `params` is accepted but not used. The figure is saved as
    '<plot_suffix>/size_threshold_<plot_suffix>.pdf'. Nothing is
    returned.

    """
    figure, left_ax = plt.subplots(figsize=(8, 5))

    # Per-bin counts of the largest-component size.
    bin_width = 0.5
    counts, edges = np.histogram(
        a=data['max_mid_diam'],
        bins=np.arange(0, 20.5, bin_width),
    )
    left_edges = edges[:-1]

    # Bars on a twin axis so the two curves keep separate scales.
    right_ax = left_ax.twinx()
    right_ax.bar(
        left_edges,
        counts,
        align='edge',
        alpha=0.9,
        width=bin_width,
        color='#2C3E50',
        edgecolor='k',
    )

    # Running total on the primary axis.
    running_total = np.cumsum(counts)
    left_ax.plot(
        left_edges,
        running_total,
        alpha=1.0,
        label='max component < threshold',
        color='r',
        marker='o',
    )
    left_ax.axvspan(xmin=4.0, xmax=6.6, facecolor='k', alpha=0.2)

    # Leave 10 % headroom above the line and 20 % above the bars.
    total_top = max(running_total)
    pfn.define_standard_plot(
        left_ax,
        xtitle=r'$d$ of largest component [$\mathrm{\AA}$]',
        ytitle='cumulative # reactions',
        xlim=(0, 17),
        ylim=(0, int(total_top + total_top * 0.1)),
    )
    count_top = max(counts)
    right_ax.set_ylim(0, int(count_top + count_top * 0.2))
    right_ax.set_ylabel('# reactions', fontsize=16)

    # Integer ticks on both y axes.
    left_ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    right_ax.yaxis.set_major_locator(MaxNLocator(integer=True))

    # Red left spine to match the cumulative line.
    left_ax.spines['left'].set_color('red')
    right_ax.spines['left'].set_color('red')
    right_ax.tick_params(axis='both', which='major', labelsize=16)

    figure.tight_layout()
    figure.savefig(
        f"{plot_suffix}/size_threshold_{plot_suffix}.pdf",
        dpi=720,
        bbox_inches='tight',
    )
def n_phenyl_assay(output_dir):
    """
    Prepare figure showing the change in intermediate diameter for
    molecules commonly used in n-phenyl ester hydrolysis assays.

    For chain lengths of 2 to 12 carbons, the minimum of the 'diam2'
    column of '<output_dir>/<name>_diam_result.csv' (spaces in the name
    become '_') is plotted for the p-nitrophenyl ester (red) and the
    corresponding acid (blue). The value for p-nitrophenol is drawn as
    a horizontal purple line, and a hatched band marks 4.0 to 6.6.

    Esters and acids with a missing or empty results file are skipped.
    If the p-nitrophenol result is missing or empty the program exits
    with the message 'calc molecule diameters!'.

    The figure is saved as 'ester_comp.pdf'. Nothing is returned.

    """
    def min_mid_diameter(name):
        """
        Return the minimum 'diam2' for `name`, or None if the results
        file is missing or has no rows.

        """
        out_file = (f"{output_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            return None
        results = pd.read_csv(out_file)
        if len(results) == 0:
            return None
        return min(results['diam2'])

    # (no. carbons, ester, product acid) for every chain length.
    assay_sets = (
        (2, 'p-nitrophenyl acetate', 'acetic acid'),
        (4, 'p-nitrophenyl butyrate', 'butyric acid'),
        (6, 'p-nitrophenyl hexanoate', 'hexanoic acid'),
        (8, 'p-nitrophenyl octanoate', 'octanoic acid'),
        (10, 'p-nitrophenyl decanoate', 'decanoic acid'),
        (12, 'p-nitrophenyl dodecanoate', 'dodecanoic acid'),
    )

    fig, ax = plt.subplots()
    for C, ester, acid in assay_sets:
        # ester
        mid_diam = min_mid_diameter(ester)
        if mid_diam is None:
            # NOTE(review): as before, a missing ester result also
            # skips the matching acid - confirm this is intended.
            continue
        print(C, ester, mid_diam)
        ax.scatter(C, mid_diam, c='r',
                   edgecolors='k', marker='o', alpha=1.0,
                   s=100)
        # acid
        mid_diam = min_mid_diameter(acid)
        if mid_diam is None:
            continue
        print(C, acid, mid_diam)
        ax.scatter(C, mid_diam, c='b',
                   edgecolors='k', marker='o', alpha=1.0,
                   s=120)

    ax.axhspan(ymin=4.0, ymax=6.6, facecolor='k', alpha=0.2, hatch="/")

    # n-phenol: required reference line.
    mid_diam = min_mid_diameter('p-nitrophenol')
    if mid_diam is None:
        import sys
        sys.exit('calc molecule diameters!')
    ax.axhline(y=mid_diam, c='purple', alpha=1)

    pfn.define_standard_plot(ax,
                             xtitle='no. carbons',
                             ytitle=r'$d$ [$\mathrm{\AA}$]',
                             xlim=(1, 14),
                             ylim=(2.5, 8))

    # decoy legend: off-axis points that only create legend entries.
    ax.scatter(-100, -100, c='r',
               edgecolors='k', marker='o', alpha=1.0,
               s=100, label='ester')
    ax.scatter(-100, -100, c='b',
               edgecolors='k', marker='o', alpha=1.0,
               s=100, label='acid')
    ax.legend(fontsize=16)
    fig.tight_layout()
    fig.savefig("ester_comp.pdf", dpi=720, bbox_inches='tight')
def rxn_space(data, filename):
    """
    Plot a histogram of largest-component size for class III reactions.

    Rows of `data` whose 'PC_class' equals 3 are selected and their
    'max_mid_diam' values are binned with a width of 0.5 between 0 and
    20.5. Styles for classes 1, 2 and 4 are defined but not drawn.

    A hatched band marks x = 4.0 to 6.6 and a dashed vertical line
    marks x = 13.1. The figure is saved to `filename`. Nothing is
    returned.

    """
    # Bar style of each class.
    plot_prop = {
        1: {
            'c': '#FA7268',
            'e': 'none',
            'a': 0.5,
            'm': 'o',
            's': 50,
            'label': 'class I'
        },
        2: {
            'c': '#DAF7A6',
            'e': 'none',
            'a': 0.5,
            'm': 'x',
            's': 50,
            'label': 'class II'
        },
        3: {
            'c': '#900C3F',
            'e': 'none',
            'a': 1.0,
            'm': 'x',
            's': 50,
            'label': 'class III'
        },
        4: {
            'c': '#F6D973',
            'e': 'none',
            'a': 0.5,
            'm': 'x',
            's': 50,
            'label': 'class IV'
        }
    }
    # Only these classes are drawn.
    shown_classes = [pc for pc in plot_prop if pc == 3]

    bin_width = 0.5
    bin_edges_in = np.arange(0, 20.5, bin_width)

    figure, axes = plt.subplots(figsize=(8, 5))
    for pc_class in shown_classes:
        style = plot_prop[pc_class]
        class_rows = data[data['PC_class'] == pc_class]
        counts, edges = np.histogram(
            a=class_rows['max_mid_diam'],
            bins=bin_edges_in,
        )
        axes.bar(
            edges[:-1],
            counts,
            align='edge',
            alpha=style['a'],
            width=bin_width,
            color=style['c'],
            edgecolor='k',
            label=style['label'],
        )

    axes.legend(fontsize=16)
    axes.axvspan(xmin=4.0, xmax=6.6, facecolor='k', alpha=0.2, hatch="/")
    # HOF.
    axes.axvline(x=13.1, c='k', lw=2, linestyle='--')
    pfn.define_standard_plot(
        axes,
        xtitle=r'$d$ of largest component [$\mathrm{\AA}$]',
        ytitle='# reactions',
        xlim=(0, 17),
        ylim=None,
    )
    figure.tight_layout()
    figure.savefig(filename, dpi=720, bbox_inches='tight')
def cs_purch(purch, not_purch):
    """
    Plot overlaid density histograms of two sets of diameters.

    `purch` is labelled 'purchasable' and `not_purch` is labelled
    'not purchasable'. Both are binned with a width of 0.5 between 0
    and 15.5 and normalised with numpy's `density=True`.

    A hatched band marks x = 4.0 to 6.6 and a dashed vertical line
    marks x = 13.1. The figure is saved as 'chemical_space_purch.pdf'
    in the current working directory. Nothing is returned.

    """
    width = 0.5
    # Both data sets share the same bins, so build the edges once.
    X_bins = np.arange(0, 15.5, width)

    # (data, bar colour, legend label), in drawing order.
    series = (
        (purch, '#FA7268', 'purchasable'),
        (not_purch, '#DAF7A6', 'not purchasable'),
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    for data, colour, label in series:
        hist, bin_edges = np.histogram(a=data, bins=X_bins, density=True)
        ax.bar(
            bin_edges[:-1],
            hist,
            align='edge',
            alpha=0.8,
            width=width,
            color=colour,
            edgecolor='k',
            label=label,
        )

    # Vertical band for different materials.
    ax.axvspan(xmin=4.0, xmax=6.6, facecolor='k', alpha=0.2, hatch="/")
    # HOF size limit:
    ax.axvline(x=13.1, c='k', lw=2, linestyle='--')

    ax.legend(fontsize=16)
    pfn.define_standard_plot(
        ax,
        xtitle=r'intermediate diameter [$\mathrm{\AA}$]',
        ytitle='frequency',
    )
    fig.tight_layout()
    fig.savefig('chemical_space_purch.pdf', dpi=720, bbox_inches='tight')
def _parity_scale_pairs(scale_dir, molecules, diameters):
    """
    Return (kinetic diameter, calculated d) pairs for one scale run.

    The pairs are read from the cache file 'scale_sc_<scale_dir>.txt'
    when it exists. Otherwise they are computed from the
    '<scale_dir>/<name>_diam_result.csv' files and the cache is written
    only after every molecule has been processed, so an error part-way
    through cannot leave a truncated cache for later runs to trust.

    """
    scale_output = f'scale_sc_{scale_dir}.txt'
    pairs = []

    if os.path.exists(scale_output):
        with open(scale_output, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                # Split from the right so a molecule name that itself
                # contains '__' still parses.
                _, kin_diam, mid_diam = line.rstrip().rsplit('__', 2)
                pairs.append((float(kin_diam), float(mid_diam)))
        return pairs

    cache_lines = []
    for name in molecules:
        try:
            kin_diam = float(diameters[name])
        except ValueError:
            print('no radius given for this molecule - skipped')
            continue
        out_file = (f"{scale_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            continue
        mid_diam = min(results['diam2'])
        pairs.append((kin_diam, float(mid_diam)))
        cache_lines.append(
            name + '__' + str(kin_diam) + '__' + str(mid_diam) + '\n'
        )

    with open(scale_output, 'w') as f:
        f.writelines(cache_lines)
    return pairs


def parity_cf_scale_with_known(molecules, diameters, known_df, pars,
                               scale_info):
    """
    Produce a parity plot of calculated diameters and known kinetic
    diameters for multiple input parameters.

    `scale_info` maps a results directory to a five-item sequence
    (scale value, colour, marker, alpha, edge colour). Only the entry
    keyed 'scale09_test' is plotted. For it, the Pearson correlation
    result, the mean absolute error and a chi-squared sum are printed.

    `known_df` and `pars` are accepted but not used. The figure is
    saved as 'parity_scalecf.pdf'. Nothing is returned.

    """
    S = 120
    fig, ax = plt.subplots(figsize=(5, 5))
    for scale_dir in scale_info:
        if scale_dir != 'scale09_test':
            continue
        sc, C, M, A, E = scale_info[scale_dir]
        pairs = _parity_scale_pairs(scale_dir, molecules, diameters)
        kin_diams = [kin for kin, _ in pairs]
        mid_diams = [mid for _, mid in pairs]
        for kin_diam, mid_diam in pairs:
            ax.scatter(kin_diam, mid_diam, c=C,
                       edgecolors=E, marker=M, alpha=A,
                       s=S)

        # NOTE(review): the value printed after 'R^2' is the full
        # return value of pearsonr, not a squared coefficient.
        corr = pearsonr(kin_diams, mid_diams)
        MAE = mean_absolute_error(kin_diams, mid_diams)
        chi2 = sum(
            ((j - i)**2) / i for i, j in zip(kin_diams, mid_diams)
        )
        print(f'{scale_dir} R^2: {corr}, MAE: {MAE}, chi^2: {chi2}')

    # y = x guide line.
    ax.plot(np.linspace(-1, 12, 2), np.linspace(-1, 12, 2), c='k',
            alpha=0.4)
    pfn.define_standard_plot(
        ax,
        xtitle=r'kinetic diameter [$\mathrm{\AA}$]',
        ytitle=r'$d$ [$\mathrm{\AA}$]',
        xlim=(1, 10),
        ylim=(1, 10))

    # legend: off-axis points that only create legend entries.
    for scale_dir in scale_info:
        if scale_dir != 'scale09_test':
            continue
        sc, C, M, A, E = scale_info[scale_dir]
        ax.scatter(-100, -100, c=C,
                   edgecolors=E, marker=M, alpha=A,
                   s=S,
                   label=f'vdW scale = {sc}')
    fig.tight_layout()
    fig.savefig("parity_scalecf.pdf", dpi=720, bbox_inches='tight')
def _dist_scale_pairs(scale_dir, molecules, diameters):
    """
    Return (kinetic diameter, calculated d) pairs for one scale run.

    The pairs are read from the cache file 'scale_sc_<scale_dir>.txt'
    when it exists. Otherwise they are computed from the
    '<scale_dir>/<name>_diam_result.csv' files and the cache is written
    only after every molecule has been processed, so an error part-way
    through cannot leave a truncated cache for later runs to trust.

    """
    scale_output = f'scale_sc_{scale_dir}.txt'
    pairs = []

    if os.path.exists(scale_output):
        with open(scale_output, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                # Split from the right so a molecule name that itself
                # contains '__' still parses.
                _, kin_diam, mid_diam = line.rstrip().rsplit('__', 2)
                pairs.append((float(kin_diam), float(mid_diam)))
        return pairs

    cache_lines = []
    for name in molecules:
        try:
            kin_diam = float(diameters[name])
        except ValueError:
            print('no radius given for this molecule - skipped')
            continue
        out_file = (f"{scale_dir}/"
                    f"{name.replace(' ', '_').replace('/', '__')}"
                    '_diam_result.csv')
        if not os.path.exists(out_file):
            continue
        results = pd.read_csv(out_file)
        if len(results) == 0:
            continue
        mid_diam = min(results['diam2'])
        pairs.append((kin_diam, float(mid_diam)))
        cache_lines.append(
            name + '__' + str(kin_diam) + '__' + str(mid_diam) + '\n'
        )

    with open(scale_output, 'w') as f:
        f.writelines(cache_lines)
    return pairs


def dist_cf_scale_with_known(molecules, diameters, known_df, pars,
                             scale_info):
    """
    Produce a plot of distributions of the deviations of calculated
    diameters and known kinetic diameters for multiple input params.

    `scale_info` maps a results directory to a five-item sequence
    (scale value, colour, marker, alpha, edge colour). For every entry,
    the deviations (calculated minus kinetic) are binned with a width
    of 0.1 between -2 and 2 and drawn as a line through the bin
    centres. The Pearson correlation result, the mean absolute error
    and a chi-squared sum are printed per entry.

    `known_df` and `pars` are accepted but not used. The figure is
    saved as 'dist_scalecf.pdf'. Nothing is returned.

    """
    # Binning is the same for every entry, so it is defined once. This
    # also keeps xlim defined when scale_info is empty.
    width = 0.1
    xlim = (-2, 2)
    X_bins = np.arange(xlim[0], xlim[1], width)

    fig, ax = plt.subplots(figsize=(8, 5))
    for scale_dir in scale_info:
        sc, C, M, A, E = scale_info[scale_dir]
        pairs = _dist_scale_pairs(scale_dir, molecules, diameters)
        kin_diams = [kin for kin, _ in pairs]
        mid_diams = [mid for _, mid in pairs]

        # NOTE(review): the value printed after 'R^2' is the full
        # return value of pearsonr, not a squared coefficient.
        corr = pearsonr(kin_diams, mid_diams)
        MAE = mean_absolute_error(kin_diams, mid_diams)
        chi2 = sum(
            ((j - i)**2) / i for i, j in zip(kin_diams, mid_diams)
        )
        print(f'{scale_dir} R^2: {corr}, MAE: {MAE}, chi^2: {chi2}')

        X_vals = [mid - kin for kin, mid in pairs]
        hist, bin_edges = np.histogram(a=X_vals, bins=X_bins)
        ax.plot(
            X_bins[:-1] + width / 2,
            hist,
            c=C,
            lw=1.5,
            marker='o',
            alpha=1.0,
            label=f'vdW scale = {sc}'
        )

    # NOTE(review): the axis label shows an absolute value, but the
    # plotted deviations are signed and the axis spans -2 to 2.
    pfn.define_standard_plot(
        ax,
        xtitle=r'|$d$ - kinetic diameter| [$\mathrm{\AA}$]',
        ytitle='count',
        xlim=xlim,
        ylim=None)
    ax.legend(fontsize=14)
    fig.tight_layout()
    fig.savefig("dist_scalecf.pdf", dpi=720, bbox_inches='tight')
def target_conformer_plot(parameter_sets, molecules, test_mol,
                          full_results, colours, markers, properties):
    """
    Plot the change in d at target conformer counts against properties.

    Three figures are produced, one each for the properties 'MW', 'NHA'
    and 'NRB', read from positions 0, 1 and 2 of `properties[name]`.
    Only the 'N_conformers' entry of `parameter_sets` is used. For each
    molecule in `molecules` that is also in `test_mol`, the last item
    of `full_results[t][name][v]` is collected for every value `v`.
    The value at the final `v` is subtracted, and the differences at 50
    and 200 conformers are plotted against the property.

    Each figure is saved as 'min_of_mid_N_conformers_v_prop_<PROP>.pdf'.
    Nothing is returned.

    """
    # target no conformers
    targ_confs = [50, 200]

    # (property key, x-axis label, x-axis limits), by column index.
    property_axes = [
        ('MW', 'MW [g/mol]', (0, 120)),
        ('NHA', 'no. heavy atoms', (0, 9)),
        ('NRB', 'no. rotatable bonds', (0, 6)),
    ]

    for p, (PROP, PROP_lab, p_lim) in enumerate(property_axes):
        for t in parameter_sets:
            if t != 'N_conformers':
                continue

            fig, ax = plt.subplots()
            for name in molecules:
                if name not in test_mol:
                    continue

                conf_counts = []
                diameters = []
                prop_values = []
                for v in parameter_sets[t]:
                    _, _, _, _, min_mid = full_results[t][name][v]
                    conf_counts.append(float(v))
                    diameters.append(min_mid)
                    prop_values.append(properties[name][p])
                conf_counts = np.asarray(conf_counts)
                diameters = np.asarray(diameters)
                prop_values = np.asarray(prop_values)

                # Difference from the value at the last parameter.
                shifted = diameters - diameters[-1]
                for targ_conf in targ_confs:
                    at_target = conf_counts == targ_conf
                    ax.scatter(
                        prop_values[at_target],
                        shifted[at_target],
                        c=colours[name],
                        marker=markers[name],
                        label=name,
                        s=80,
                    )

            pfn.define_standard_plot(
                ax,
                xtitle=PROP_lab,
                ytitle=r'$d-d$(1000) [$\mathrm{\AA}$]',
                xlim=p_lim,
                ylim=(-0.1, 0.5),
            )
            ax.axhline(y=0, c='k', linestyle='--')
            fig.tight_layout()
            fig.savefig(
                f"min_of_mid_{t}_v_prop_{PROP}.pdf",
                bbox_inches='tight',
                dpi=720,
            )
def seed_test(seeds):
    """
    Compares the minimum diameter obtained for a set of molecules with
    different random seeds for the ETKDG algorithm.

    For every seed `t` in `seeds` and every molecule, the file
    'seeds_<t>/<name>_diam_result.csv' is read and the average and
    standard deviation of 'diam1' and 'diam2' plus the minimum of
    'diam2' are stored. The collected results are cached in
    'seed_test.pkl' in the current working directory and reused when
    that file exists.

    The minimum of 'diam2' is plotted against the seed for each
    molecule and its spread (largest minus smallest) is printed.
    Seed/molecule pairs without a result are left out of the plot. The
    function waits for the user to press enter before the figure is
    finished and saved as 'min_of_mid_seeds.pdf'. Nothing is returned.

    """
    molecules = {
        'n-hexane': 'CCCCCC',
        'n-heptane': 'CCCCCCC',
        'n-octane': 'CCCCCCCC',
        'toluene': 'CC1=CC=CC=C1',
        'p-nitrophenol': 'C1=CC(=CC=C1[N+](=O)[O-])O',
        'p-nitrophenyl butyrate': 'CCCC(=O)OC1=CC=C(C=C1)[N+](=O)[O-]',
        'butyric acid': 'CCCC(=O)O',
    }
    colours = {
        'n-hexane': 'k',
        'n-heptane': 'r',
        'n-octane': 'b',
        'toluene': 'green',
        'p-nitrophenol': 'purple',
        'p-nitrophenyl butyrate': 'orange',
        'butyric acid': 'darkgray',
    }
    markers = {
        'n-hexane': 'o',
        'n-heptane': 'X',
        'n-octane': 'D',
        'toluene': 'P',
        'p-nitrophenol': '^',
        'p-nitrophenyl butyrate': '>',
        'butyric acid': '<',
    }

    seed_output = "seed_test.pkl"
    if os.path.exists(seed_output):
        # load results
        # NOTE: unpickling can execute arbitrary code. Only a cache
        # written by this function should ever be placed at this path.
        with open(seed_output, "rb") as f:
            full_results = pickle.load(f)
    else:
        # An empty dict marks "no result" for a seed/molecule pair.
        full_results = {
            t: {name: {} for name in molecules} for t in seeds
        }
        for name in molecules:
            for t in seeds:
                output_dir = f'seeds_{t}'
                out_file = (f"{output_dir}/"
                            f"{name.replace(' ', '_').replace('/', '__')}"
                            '_diam_result.csv')
                if not os.path.exists(out_file):
                    continue
                results = pd.read_csv(out_file)
                if len(results) == 0:
                    continue
                full_results[t][name] = (
                    np.average(results['diam1']),
                    np.std(results['diam1']),
                    np.average(results['diam2']),
                    np.std(results['diam2']),
                    min(results['diam2']),
                )
        # save file
        with open(seed_output, "wb") as f:
            pickle.dump(full_results, f)

    fig, ax = plt.subplots()
    for name in molecules:
        X = []
        Y = []
        for t in seeds:
            RES = full_results.get(t, {}).get(name)
            if not RES:
                # No result for this pair. Unpacking the empty
                # placeholder would raise, so leave the point out.
                continue
            _, _, _, _, min_mid = RES
            X.append(int(t))
            Y.append(min_mid)
        if not Y:
            print(name, '-- no results found')
            continue
        print(name, '--', max(Y) - min(Y))
        ax.scatter(X, Y, c=colours[name], marker=markers[name],
                   label=name)

    input('^^ max dev from min_of_mid for seeds')
    t_lim = (0, 850000)
    t_name = 'random seed'
    pfn.define_standard_plot(ax,
                             xtitle=t_name,
                             ytitle=r'$d$ [$\mathrm{\AA}$]',
                             xlim=t_lim,
                             ylim=(4, 8))
    fig.tight_layout()
    fig.savefig("min_of_mid_seeds.pdf", dpi=720, bbox_inches='tight')
def rxn_complexity(data, filename):
    """
    Scatter the two complexity measures of each reaction.

    `data['deltasa']` (x) is plotted against `data['deltabct']` (y) for
    every row in light grey. The rows whose 'max_mid_diam' is below 6.6
    are then drawn again on top in dark blue-grey and labelled
    'viable reactions'.

    The figure is saved to `filename`. Nothing is returned.

    """
    figure, axes = plt.subplots(figsize=(8, 5))
    y_range = (-1000, 1000)
    x_range = (-10, 10)

    # Marker settings shared by both layers.
    marker_style = {
        'edgecolors': 'none',
        'marker': 'o',
        'alpha': 1.0,
        's': 40,
    }

    # Background layer: every reaction.
    axes.scatter(
        data['deltasa'],
        data['deltabct'],
        c='#CCD1D1',
        label='full dataset',
        **marker_style,
    )

    # Foreground layer: reactions under the size cut-off.
    viable = data[data['max_mid_diam'] < 6.6]
    axes.scatter(
        viable['deltasa'],
        viable['deltabct'],
        c='#2C3E50',
        label='viable reactions',
        **marker_style,
    )

    pfn.define_standard_plot(
        axes,
        ylim=y_range,
        xlim=x_range,
        ytitle=r'$\Delta$ BertzCT',
        xtitle=r'$\Delta$ SAscore',
    )
    axes.legend(fontsize=16)
    figure.tight_layout()
    figure.savefig(filename, dpi=720, bbox_inches='tight')