def main():
    """Plot distributions of protein sequence properties on request.

    Command line arguments (``sys.argv``):
        plot_suffix: string to put at the end of plot file names.
        target_EC: file containing target EC numbers, or ``f`` to use
            the file named by the ``EC_file`` entry of the parameters.
        param_file: file passed to ``utilities.read_params``.

    A usage message is printed and the script exits unless exactly
    three arguments are supplied. Each plot is only produced after the
    user answers ``t`` at the matching prompt. The script always ends
    through ``sys.exit()``.
    """
    # Three arguments are read below (sys.argv[1] .. sys.argv[3]); with
    # the script name that makes four entries. The previous check for
    # three entries made sys.argv[3] fail on every accepted call.
    if len(sys.argv) != 4:
        print('Usage: screening.py plot_suffix target_EC param_file\n')
        print(
            ' plot_suffix: string to put at the end of plot file names.'
            ' target_EC: file containing a target EC number'
            ' (f if all EC should be used)'
            ' param_file: '
        )
        sys.exit()

    plot_suffix = sys.argv[1]
    target_EC_file = sys.argv[2] if sys.argv[2] != 'f' else None
    pars = utilities.read_params(sys.argv[3])

    # No options are set in either case; only the EC source differs.
    settings = {}
    if target_EC_file is None:
        # Get all EC numbers from the file named in the parameters.
        search_EC_file = pars['EC_file']
    else:
        # Read ECs from the file given on the command line.
        search_EC_file = target_EC_file
    search_ECs = utilities.get_ECs_from_file(EC_file=search_EC_file)
    print(settings)
    print(search_ECs)

    # Plot distributions of protein sequence properties.
    # NOTE(review): DB_switch and search_output_dir are not assigned in
    # this function; presumably module-level names - confirm.
    if DB_switch != 3:
        distribution_plots = (
            ('do dist_GRAVY? (t/f)', pfn.rs_dist_GRAVY),
            ('do dist_I index? (t/f)', pfn.rs_dist_I_index),
            ('do dist_A index? (t/f)', pfn.rs_dist_A_index),
            ('do dist_TM index? (t/f)', pfn.rs_dist_TM_index),
            ('do dist_pI? (t/f)', pfn.rs_dist_pI),
        )
        for prompt, plot_distribution in distribution_plots:
            if input(prompt) == 't':
                print('doing....')
                plot_distribution(
                    output_dir=search_output_dir,
                    # A fresh generator is needed for every plot.
                    generator=yield_rxn_syst(search_output_dir),
                    plot_suffix=plot_suffix,
                )
    sys.exit()
def main():
    """Calculate, report and plot the sizes of molecules from a file.

    Command line arguments (``sys.argv``):
        molecule_file: file passed to ``rdkf.read_mol_txt_file``.
        rerun_diameter_calc: ``t`` to call
            ``rdkf.calc_molecule_diameters`` again.
        param_file: file passed to ``utilities.read_params``.

    A usage message is printed and the script exits unless exactly
    three arguments are supplied. ``linB_pars`` is used as the output
    directory of every step.
    """
    if len(sys.argv) != 4:
        print(
            ' Usage: linB_screening.py molecule_file rerun_diameter_calc'
            ' param_file '
        )
        sys.exit()

    molecule_file, rerun_flag, param_file = sys.argv[1:4]
    rerun_diameter_calc = rerun_flag == 't'
    pars = utilities.read_params(param_file)
    start = time.time()

    # Ignore MW restrictions.
    pars['MW_thresh'] = 2000
    _df, molecules, _diameters = rdkf.read_mol_txt_file(molecule_file)

    print('--- draw 2D structures...')
    rdkf.draw_svg_for_all_molecules(molecules)

    # Calculate the size of the ellipsoid surrounding all molecules
    # using the input parameters.
    if rerun_diameter_calc:
        print('--- calculating molecular diameters...')
        rdkf.calc_molecule_diameters(
            molecules,
            pars=pars,
            out_dir='linB_pars',
        )

    print('--- print results and plot...')
    pm.print_results(
        molecules,
        threshold=pars['size_thresh'],
        output_dir='linB_pars',
    )
    for make_plot in (pm.categorical, pm.shapes):
        make_plot(
            molecules,
            threshold=pars['size_thresh'],
            output_dir='linB_pars',
            plot_suffix='linB',
        )

    end = time.time()
    print(f'---- total time taken = {round(end-start, 2)} s')
def post(self):
    """Train a model from the posted files and return its statistics.

    The parsed request must provide ``params``, ``raw_data`` and
    ``labels`` entries. Features are built with ``build_features``, a
    model is fitted with ``train_model`` and registered through
    ``self.model_factory.add_pipeline``, and the final classifier is
    scored with ``cross_validate``.

    Returns:
        str: JSON dump of the statistics dictionary, which is also
        stored under ``self.model_factory[pipeline_id]['stats']``.
    """
    started = time.time()
    request = self.parser.parse_args()

    # Read data.
    params = read_params(request['params'].stream)
    raw_frame = read_file(request['raw_data'].stream.read())
    labels = read_file(request['labels'].stream.read())

    # Build features and select the labels by the feature index.
    X_train = build_features(raw_frame, params)
    labels = labels.set_index('example_id').loc[X_train.index]

    # Train model.
    model = train_model(X_train, labels.label, params)
    self.model_factory.add_pipeline(model, params)

    # A TPOT search wraps the fitted pipeline; any other model is used
    # directly and has no evaluated individuals to report.
    is_tpot = isinstance(model, tpot.TPOTClassifier)
    final_classifier = model.fitted_pipeline_ if is_tpot else model
    evaluated_indivs = model.evaluated_individuals_ if is_tpot else None

    model_type = str(final_classifier)
    mean_accuracy, mean_roc_auc = cross_validate(
        final_classifier, X_train, labels.label
    )

    # Format the feature engineering parameters as strings.
    feat_eng_params = params['extract_features'].copy()
    for key, value in list(feat_eng_params.items()):
        if key == 'default_fc_parameters':
            # Only the keys (calculations like min, mean, etc.) are shown.
            shown = value.keys()
        elif key == 'impute_function':
            shown = value.__name__
        else:
            shown = value
        feat_eng_params[key] = str(shown)

    pipeline_id = params['pipeline_id']
    result = {
        'trainTime': time.time() - started,
        'trainShape': X_train.shape,
        'modelType': model_type,
        'featureEngParams': feat_eng_params,
        'modelId': pipeline_id,
        'mean_cv_accuracy': mean_accuracy,
        'mean_cv_roc_auc': mean_roc_auc,
        'evaluated_models': evaluated_indivs,
    }
    self.model_factory[pipeline_id]['stats'] = result
    return json.dumps(result)
def main():
    """Run ``main_analysis`` and report how long it took.

    Command line arguments (``sys.argv``):
        prop_redo: ``t`` for rerun, else to read from prop_done.txt.
        param_file: file passed to ``utilities.read_params``.

    A usage message is printed and the script exits unless exactly two
    arguments are supplied.
    """
    if len(sys.argv) != 3:
        print(
            ' Usage: RS_analysis.py prop_redo param_file'
            ' prop_redo: t for rerun, else to read from prop_done.txt.'
            ' param_file: '
        )
        sys.exit()

    _, redo_flag, param_file = sys.argv
    prop_redo = redo_flag == 't'
    pars = utilities.read_params(param_file)

    temp_time = time.time()
    main_analysis(prop_redo=prop_redo, pars=pars)
    elapsed = time.time() - temp_time
    print(f'--- time taken = {elapsed:.2f} s')
def main():
    """Calculate the diameter of one molecule with step plots enabled.

    Command line arguments (``sys.argv``):
        param_file: file passed to ``utilities.read_params``.
        molecule: molecule file (_unopt.mol) to show the ellipsoid of.

    A usage message is printed and the script exits unless exactly two
    arguments are supplied. The diameter file is written to
    ``ellips_plots/<name>_diam.csv`` where ``<name>`` is the molecule
    file name without ``_unopt.mol``.
    """
    if len(sys.argv) != 3:
        # The synopsis now lists the two arguments that are parsed; it
        # used to advertise three, which this check rejects.
        # NOTE(review): the script name is kept from the old message
        # and may not be the name of this file - confirm.
        print(
            '\n'
            'Usage: molecule_population.py param_file molecule\n'
            '\n'
            '    param_file:\n'
            '    molecule : molecule file (_unopt.mol) to show the '
            'ellipsoid of.\n'
        )
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    molecule = sys.argv[2]

    size_options = {
        'vdwScale': params['vdwScale'],
        'boxMargin': params['boxMargin'],
        'spacing': params['spacing'],
        'show_vdw': params['show_vdw'],
        'plot_ellip': params['plot_ellip'],
        # Set conformers to 1 to speed up.
        'N_conformers': 1,
        'MW_thresh': params['MW_thresh'],
        'rSeed': int(params['seed']),
    }

    name = molecule.replace('_unopt.mol', '')
    diam_file = f'ellips_plots/{name}_diam.csv'
    smiles = rdkf.read_structure_to_smiles(molecule)

    print('>> getting molecular size')
    # The return value is not needed here.
    rdkf.calc_molecule_diameter(
        name,
        smiles,
        out_file=diam_file,
        do_step_plot=True,
        **size_options,
    )
def main():
    """Collect reaction systems and/or report the skipped reactions.

    Command line arguments (``sys.argv``):
        run: ``t`` to run search for new rxn systems into current dir.
        redo: ``t`` to overwrite all rxn systems.
        param_file: file passed to ``utilities.read_params``.
        skipped: ``t`` to see the number of skipped rxns in cwd.

    A usage message is printed and the script exits unless exactly
    four arguments are supplied.
    """
    if len(sys.argv) != 5:
        # The synopsis now includes param_file, which is required as
        # the third argument but was missing from the usage line.
        print(
            '\n'
            'Usage: RS_collection.py run redo param_file skipped\n'
            '\n'
            '    run: t to run search for new rxn systems into current '
            'dir.\n'
            '    redo: t to overwrite all rxn systems.\n'
            '    param_file:\n'
            '    skipped: t to see the number of skipped rxns in cwd.\n'
        )
        sys.exit()

    run = sys.argv[1] == 't'
    redo = sys.argv[2] == 't'
    pars = utilities.read_params(sys.argv[3])
    skipped = sys.argv[4] == 't'

    if run:
        main_run(redo, pars)
    if skipped:
        # The directory is passed with a trailing separator.
        search_output_dir = getcwd() + '/'
        percent_skipped(search_output_dir, pars)
    print('----- All done! ------')
def main():
    """Print a reaction system and the properties of its components.

    Command line arguments (``sys.argv``):
        param_file: file passed to ``utilities.read_params``.
        file: pkl file containing reaction system, resolved against the
            current working directory.

    A usage message is printed and the script exits unless exactly two
    arguments are supplied.
    """
    if len(sys.argv) != 3:
        print(
            ' Usage: visualise_reaction_system.py param_file file'
            ' param_file : (str)'
            ' file : (str) pkl file containing reaction system '
        )
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    working_dir = os.getcwd()
    rs_path = os.path.join(working_dir, sys.argv[2])
    print(rs_path)

    # Read in reaction system.
    rs = get_RS(
        filename=rs_path,
        output_dir=working_dir,
        pars=params,
        verbose=True,
    )
    print(rs)
    if rs.skip_rxn:
        print(f'>>> {rs.skip_reason}')
    # NOTE(review): printed for skipped and non-skipped systems alike;
    # the nesting was inferred from a flattened source - confirm.
    print(f'max d = {rs.max_min_mid_diam}\n')

    # Output molecular components and their properties.
    for component in rs.components:
        print(component)
        print(f'SMILEs = {component.SMILES}')
        print(f'logP = {component.logP}')
        print(f'logS = {component.logS}')
        print(f'SA = {component.Synth_score}')
def main():
    """Screen the molecular size of compounds in known reactions.

    Command line arguments (``sys.argv``):
        rerun_diameter_calc: ``t`` to call
            ``rdkf.calc_molecule_diameters`` again.
        param_file: file passed to ``utilities.read_params``.

    A usage message is printed and the script exits unless exactly two
    arguments are supplied. ``biomin_sizes`` is used as the output
    directory of the size calculation, the reports and the plots.
    """
    if len(sys.argv) != 3:
        print(' Usage: biomin_screening.py rerun_diameter_calc param_file ')
        sys.exit()

    _, rerun_flag, param_file = sys.argv
    rerun_diameter_calc = rerun_flag == 't'
    pars = utilities.read_params(param_file)
    start = time.time()

    # Set parameters; only the molecule set is used below.
    _EC_set, EC_mol_set, _EC_descriptors = EC_sets()
    # Ignore MW restrictions.
    pars['MW_thresh'] = 2000

    rule = '------------------------------------------------'
    print('\n'.join((
        rule,
        'Screen molecular size of compounds in known reactions',
        rule,
    )))

    # Screen known reactant and product molecules.
    print('--- get molecule DB + draw 2D structures...')
    molecules, _diameters = get_molecule_DB(
        EC_mol_set=EC_mol_set,
        output_dir='2d_',
    )

    # Calculate the size of the ellipsoid surrounding all molecules.
    if rerun_diameter_calc:
        print('--- calculate molecular diameters...')
        rdkf.calc_molecule_diameters(
            molecules,
            pars=pars,
            out_dir='biomin_sizes',
        )

    # Print results for each molecule.
    print('--- print results and plot...')
    pm.print_results(
        molecules,
        threshold=pars['size_thresh'],
        output_dir='biomin_sizes',
    )

    # Plotting.
    biomin_known(
        molecules,
        output_dir='biomin_sizes',
        plot_suffix='biomin_known',
    )
    for make_plot in (pm.categorical, pm.shapes):
        make_plot(
            molecules,
            threshold=pars['size_thresh'],
            output_dir='biomin_sizes',
            plot_suffix='biomin_known',
        )
    n_phenyl_assay(output_dir='biomin_sizes')
    cyt_C_perox_assay(output_dir='biomin_sizes')
    HOF_examples(output_dir='biomin_sizes')
    Tash_esters(output_dir='biomin_sizes')

    end = time.time()
    print(f'---- total time taken = {round(end-start, 2)} s')
def post(self):
    """Run a stored pipeline on the posted data and return JSON.

    The parsed request must provide ``params`` and ``raw_data``
    entries. The pipeline is selected by the ``pipeline_id`` entry of
    the parameters.

    Returns:
        The output of ``self.model_factory.use_pipeline`` with its
        index reset, serialised with ``to_json()``.
    """
    request = self.parser.parse_args()
    pipeline_params = read_params(request['params'].stream)
    raw_frame = read_file(request['raw_data'].stream.read())

    pipeline_output = self.model_factory.use_pipeline(
        raw_frame,
        pipeline_params['pipeline_id'],
    )
    return pipeline_output.reset_index().to_json()
def main():
    """Test how the size calculation depends on its parameters.

    Command line arguments (``sys.argv``):
        molecule_file: e.g. enzyme_screen/data/test_molecules.txt
        rerun_diameter_calc: ``t`` or ``f``; ``t`` reruns every size
            calculation (original parameters, scale, seed and
            parameter tests).
        param_file: e.g. enzyme_screen/data/param_file.txt
        do_parity: ``t`` or ``f`` to make the scale comparison figures
            or not.

    A usage message is printed and the script exits unless exactly
    four arguments are supplied.

    NOTE(review): the nesting of the statements below was inferred from
    a flattened source (which statements belong to the
    ``rerun_diameter_calc`` and ``do_parity`` branches) - confirm.
    """
    if len(sys.argv) != 5:
        print(
            ' Usage: param_screening.py'
            ' molecule_file : enzyme_screen/data/test_molecules.txt'
            ' rerun_diameter_calc : t or f'
            ' param_file : enzyme_screen/data/param_file.txt'
            ' do_parity : t or f to make the figure or not '
        )
        sys.exit()

    molecule_file = sys.argv[1]
    rerun_diameter_calc = sys.argv[2] == 't'
    pars = utilities.read_params(sys.argv[3])
    do_parity = sys.argv[4] == 't'
    start = time.time()

    df, molecules, diameters, min2s = rdkf.read_mol_txt_file(molecule_file)
    print(min2s)

    # Ignore MW restrictions.
    pars['MW_thresh'] = 2000

    print('--- draw 2D structures...')
    rdkf.draw_svg_for_all_molecules(molecules)

    parameter_sets = {
        'spacing': [0.3, 0.4, 0.5, 0.6],
        'N_conformers': [10, 50, 100, 200, 300, 400, 600, 1000],
        'boxMargin': [4, 6, 8]
    }
    seeds = [
        1, 1000, 500, 50000, 2123, 345555, 542221, 679293,
        2755, 99982, 825412, 342, 54638, 1982, 77654, 8553, 4
    ]

    # Calculate the size of the ellipsoid surrounding all molecules
    # using the input parameters, then repeat with modified parameters.
    if rerun_diameter_calc:
        print('--- calculating molecular diameters for all tests...')
        rdkf.calc_molecule_diameters(
            molecules,
            pars=pars,
            out_dir='orig_pars',
        )

        # Scale test.
        scale_tests = (
            (1.0, 'scale1_test'),
            (0.9, 'scale09_test'),
            (0.8, 'scale08_test'),
            (0.7, 'scale07_test'),
        )
        for vdw_scale, scale_dir in scale_tests:
            new_pars = pars.copy()
            new_pars['vdwScale'] = vdw_scale
            rdkf.calc_molecule_diameters(
                molecules,
                pars=new_pars,
                out_dir=scale_dir,
            )

        # Seed test.
        print('--------- seed tests! ----------------')
        for seed in seeds:
            new_pars = pars.copy()
            new_pars['seed'] = seed
            print(f'doing seed {seed}')
            print(new_pars)
            new_molecules = {
                'n-hexane': 'CCCCCC',
                'n-heptane': 'CCCCCCC',
                'n-octane': 'CCCCCCCC',
                'toluene': 'CC1=CC=CC=C1',
                'p-nitrophenol': 'C1=CC(=CC=C1[N+](=O)[O-])O',
                'p-nitrophenyl butyrate': (
                    'CCCC(=O)OC1=CC=C(C=C1)[N+](=O)[O-]'
                ),
                'butyric acid': 'CCCC(=O)O',
            }
            print(new_molecules)
            rdkf.calc_molecule_diameters(
                new_molecules,
                pars=new_pars,
                out_dir=f'seeds_{seed}',
            )

        # Parameter tests.
        print('--------- param tests! ----------------')
        test_mol = [
            'n-butane', 'meta-xylene', 'n-hexane', 'n-heptane',
            'n-octane', 'toluene', 'napthalene'
        ]
        new_molecules = {
            name: molecules[name] for name in molecules if name in test_mol
        }
        print(new_molecules)
        for par_name, values in parameter_sets.items():
            for value in values:
                print(f'doing test {par_name} with value {value}')
                new_pars = pars.copy()
                new_pars[par_name] = value
                print(new_pars)
                rdkf.calc_molecule_diameters(
                    new_molecules,
                    pars=new_pars,
                    out_dir=f'ptests_{par_name}_{value}',
                )

    # Comparisons with literature and known data.
    # NOTE(review): cf_cuadardocollardos2019 appears twice in the
    # original and is kept twice here - possibly a copy-paste slip.
    literature_comparisons = (
        cf_verploegh2015,
        cf_polyukhov2019,
        cf_ueda2019,
        cf_cuadardocollardos2019,
        cf_cuadardocollardos2019,
    )
    for compare in literature_comparisons:
        compare(molecules, output_dir='orig_pars')

    print_results_cf_known(
        molecules,
        known_df=df,
        threshold=pars['size_thresh'],
        output_dir='orig_pars',
    )
    parity_with_known(molecules, diameters, output_dir='orig_pars')
    parity_with_known_min2(molecules, min2s, output_dir='orig_pars')
    for plot_with_known in (categorical_with_known, shapes_with_known):
        plot_with_known(
            molecules,
            known_df=df,
            threshold=pars['size_thresh'],
            output_dir='orig_pars',
        )

    # '#FA7268', '#F8A72A', '#DAF7A6'
    if do_parity:
        scale_info = {
            # DIR: (scale, C, M, alpha, edgecolor)
            'scale1_test': (1.0, '#FA7268', 'D', 1.0, 'none'),
            'scale09_test': (0.9, '#900C3F', 'o', 1.0, 'k'),
            'scale08_test': (0.8, '#6BADB0', '>', 1.0, 'none'),
            'scale07_test': (0.7, '#F6D973', '<', 1.0, 'none')
        }
        for compare_scales in (
            parity_cf_scale_with_known,
            dist_cf_scale_with_known,
        ):
            compare_scales(
                molecules,
                diameters,
                known_df=df,
                pars=pars,
                scale_info=scale_info,
            )

    seed_test(seeds=seeds)
    parameter_tests(molecules)

    end = time.time()
    print(f'---- total time taken = {round(end-start, 2)} s')
def _has_complete_attributes(row):
    """Return False if any property of ``row`` still holds a sentinel."""
    sentinels = (
        ('minlogp', 1E10),
        ('maxlogp', -1E10),
        ('minlogs', 1E10),
        ('maxlogs', -1E10),
        ('rmaxsa', 0),
        ('pmaxsa', 0),
        ('rmaxbct', 0),
        ('pmaxbct', 0),
    )
    # all() stops at the first sentinel found, like a chain of checks.
    return all(float(row[col]) != unset for col, unset in sentinels)


def _save_plot(fig, kind, col, file_suffix):
    """Tighten ``fig`` and save it as ``<suffix>/<kind>_<col>_<suffix>.pdf``."""
    fig.tight_layout()
    fig.savefig(
        fname=f"{file_suffix}/{kind}_{col}_{file_suffix}.pdf",
        dpi=720,
        bbox_inches='tight',
    )


def main():
    """Plot the reaction-system properties selected by a case study.

    Command line arguments (``sys.argv``):
        param_file (str): file passed to ``utilities.read_params``.
        case (str): define the case study to use.

    With any other number of arguments a usage message is printed,
    ``case_studies(None, None)`` is called and the script exits.

    The rows of ``rs_properties.csv`` in the current working directory
    are kept when their ``ec`` is among the ECs read from the
    ``EC_file`` parameter and none of their properties holds an
    "unset" sentinel. All plots and the candidate file are written into
    the directory named by the ``file_suffix`` parameter.

    Raises:
        FileNotFoundError: if ``rs_properties.csv`` does not exist.
    """
    if len(sys.argv) != 3:
        print('Usage: screening.py param_file case\n')
        print(""" param_file (str) : case (str) : define the case study to use. """)
        case_studies(None, None)
        sys.exit()

    pars = utilities.read_params(sys.argv[1])
    case = sys.argv[2]
    pars = case_studies(string=case, pars=pars)
    file_suffix = pars['file_suffix']

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(file_suffix, exist_ok=True)

    # Get all EC numbers.
    search_EC_file = pars['EC_file']
    search_ECs = utilities.get_ECs_from_file(EC_file=search_EC_file)

    # Read the properties of all reactions in the directory.
    search_output_dir = os.getcwd()
    prop_output_file = os.path.join(search_output_dir, 'rs_properties.csv')
    if not os.path.exists(prop_output_file):
        raise FileNotFoundError(
            f'{prop_output_file} with all data is missing'
        )
    output_data = pd.read_csv(prop_output_file)

    # Select the wanted rows in one step. DataFrame.append in a loop
    # copied the frame for every row and no longer exists in pandas 2.
    # Positional selection keeps the original index labels and returns
    # an empty frame with all columns when nothing matches.
    keep_positions = [
        position
        for position, (_, row) in enumerate(output_data.iterrows())
        if row['ec'] in search_ECs and _has_complete_attributes(row)
    ]
    target_data = output_data.iloc[keep_positions].copy()

    pr.no_rxns_vs_size(
        data=target_data,
        params=pars,
        plot_suffix=file_suffix,
    )
    pr.save_candidates(
        data=target_data,
        params=pars,
        filename=f"{file_suffix}/candidates_{file_suffix}.csv",
    )
    pr.rxn_space(
        data=target_data,
        filename=f"{file_suffix}/rxn_space_{file_suffix}.pdf",
    )
    pr.rxn_value(
        data=target_data,
        filename=f"{file_suffix}/rxn_value_{file_suffix}.pdf",
    )
    pr.rxn_complexity(
        data=target_data,
        filename=f"{file_suffix}/rxn_complexity_{file_suffix}.pdf",
    )

    plots_to_do = [
        # Column, kind (dist, stacked, pie), xtitle, xlim, width
        ('minlogs', 'dist', 'min. logS', None, 0.5),
        ('minlogs', 'stacked', 'min. logS', None, 0.5),
        ('maxlogp', 'dist', 'max. logP', None, 0.5),
        ('maxlogp', 'stacked', 'max. logP', None, 0.5),
        ('nr', 'dist', 'no reactants', None, 0.5),
        ('np', 'dist', 'no products', None, 0.5),
        ('PC_class', 'pie', 'purchasability class', None, 1),
        ('max_mid_diam', 'dist',
         r'$d$ of largest component [$\mathrm{\AA}$]', None, 0.5),
        ('max_mid_diam', 'stacked',
         r'$d$ of largest component [$\mathrm{\AA}$]', None, 0.5),
        ('deltasa', 'dist', r'$\Delta$ SAscore', (-10, 10), 0.5),
        ('deltasa', 'stacked', r'$\Delta$ SAscore', (-10, 10), 0.5),
        ('deltabct', 'dist', r'$\Delta$ BertzCT', (-1000, 1000), 100),
    ]

    # plot_kind replaces a local called ``type`` that hid the builtin.
    for col, plot_kind, xtitle, xlim, width in plots_to_do:
        if plot_kind == 'stacked':
            # A stacked request produces a stacked and a violin plot.
            fig, ax = pr.stacked_dist(
                data=target_data,
                col=col,
                xtitle=xtitle,
                xlim=xlim,
                width=width,
            )
            _save_plot(fig, 'stacked', col, file_suffix)
            fig, ax = pr.violinplot(
                data=target_data,
                col=col,
                ytitle=xtitle,
                ylim=xlim,
            )
            _save_plot(fig, 'violin', col, file_suffix)
        elif plot_kind == 'dist':
            fig, ax = pr.dist(
                X=target_data[col],
                xtitle=xtitle,
                xlim=xlim,
                width=width,
            )
            _save_plot(fig, 'dist', col, file_suffix)
        elif plot_kind == 'pie':
            fig, ax = pr.pie(
                X=target_data[col],
                xtitle=xtitle,
                xlim=xlim,
                width=width,
            )
            _save_plot(fig, 'pie', col, file_suffix)
def main():
    """Populate molecule properties and optionally plot them.

    Command line arguments (``sys.argv``):
        param_file: file passed to ``utilities.read_params``.
        redo_size: ``t`` to overwrite SIZE of all molecules.
        redo_rest: ``t`` to overwrite properties of all molecules.
        plot: ``t`` to plot distributions of molecule properties.
        mol_file: file name of list of molecules to allow for trivial
            parallelisation, ``f`` if not specified, where all ``mol``
            files are populated.

    A usage message is printed and the script exits unless exactly
    five arguments are supplied.
    """
    if len(sys.argv) != 6:
        # The synopsis now lists the five arguments that are parsed; it
        # used to list three, which this check rejects.
        print(
            '\n'
            'Usage: molecule_population.py param_file redo_size '
            'redo_rest plot mol_file\n'
            '\n'
            '    param_file:\n'
            '    redo_size: t to overwrite SIZE of all molecules.\n'
            '    redo_rest: t to overwrite properties of all molecules.\n'
            '    plot: t to plot distributions of molecule properties.\n'
            '    mol_file : file name of list of molecules to allow for\n'
            '        trivial parallelisation, `f` if not specified, where\n'
            '        all `mol` files are populated.\n'
        )
        sys.exit()

    params = utilities.read_params(sys.argv[1])
    redo_size = sys.argv[2] == 't'
    redo_prop = sys.argv[3] == 't'
    plot = sys.argv[4] == 't'
    mol_file = None if sys.argv[5] == 'f' else sys.argv[5]

    print('settings:')
    print(' Molecule file:', mol_file)
    print(
        'populate the properties attributes for all '
        'molecules in DB...'
    )
    populate_all_molecules(
        params=params,
        mol_file=mol_file,
        redo_size=redo_size,
        redo_prop=redo_prop,
    )

    if not plot:
        return

    suffix = params['file_suffix']

    # (propx, propy, file, xtitle, ytitle) for each parity plot.
    # The first file name carries no suffix, as before.
    parity_plots = (
        ('logP', 'logS', 'logPvslogS', 'logP', 'logS'),
        ('NHA', 'Synth_score', f"NHAvsSA_{suffix}",
         'no. heavy atoms', 'SAScore'),
        ('NHA', 'logP', f"NHAvslogP_{suffix}",
         'no. heavy atoms', 'logP'),
        ('NHA', 'logS', f"NHAvslogS_{suffix}",
         'no. heavy atoms', 'logS'),
    )
    for propx, propy, file, xtitle, ytitle in parity_plots:
        pm.mol_parity(
            propx=propx,
            propy=propy,
            file=file,
            xtitle=xtitle,
            ytitle=ytitle,
        )

    # (propx, propy, file, xtitle, ytitle) for each categorical plot.
    categorical_plots = (
        ('purchasability', 'Synth_score', f"purchvsSA_{suffix}",
         'is purchasable', 'SAscore'),
        ('purchasability', 'bertzCT', f"purchvsbCT_{suffix}",
         'is purchasable', 'BertzCT'),
        ('purchasability', 'size', f"purchvssize_{suffix}",
         'is purchasable', r'$d$ [$\mathrm{\AA}$]'),
        ('size', 'bertzCT', f"sizevsbCT_{suffix}",
         'can diffuse', 'BertzCT'),
        ('size', 'Synth_score', f"sizevsSA_{suffix}",
         'can diffuse', 'SAscore'),
    )
    for propx, propy, file, xtitle, ytitle in categorical_plots:
        pm.mol_categ(
            propx=propx,
            propy=propy,
            file=file,
            xtitle=xtitle,
            ytitle=ytitle,
        )

    pm.mol_all_dist(plot_suffix=suffix)