def predict(cp_folder,
            test_path,
            cp_model_path,
            device,
            check_paths):
    """
    Get and save the prediction results from a ChemProp model.

    One prediction file is written per checkpoint, named
    `test_pred_<i>.csv` inside `cp_model_path`, where `<i>` is the index
    of the checkpoint in `check_paths`.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      test_path (str): path to the file with the test SMILES and their
        properties
      cp_model_path (str): path to the folder with the model of interest
      device (Union[str, int]): device to evaluate the model on. "cpu"
        adds `--no_cuda`; any other value is passed to `--gpu`.
      check_paths (list[str]): paths to the different model checkpoints
    Returns:
      real (dict): dictionary of the form {prop: real}, where `real`
        are the real values of the property `prop`.
      preds (list[dict]): same as `real` but for predicted. One for each
        model.
    """

    script = os.path.join(cp_folder, "predict.py")

    # load the arguments from that model to get the features path
    args_path = f"{cp_model_path}/fold_0/args.json"
    if not os.path.isfile(args_path):
        # Fall back to `args.json` directly in the model folder. The path
        # is built explicitly rather than by stripping "fold_0/" from the
        # string, which would also strip it from `cp_model_path` itself.
        args_path = f"{cp_model_path}/args.json"
    with open(args_path, "r") as f:
        args = json.load(f)
    features_path = args["separate_test_features_path"]

    # These flags do not depend on the checkpoint, so build them once.
    if device == "cpu":
        device_flag = " --no_cuda"
    else:
        device_flag = f" --gpu {device} "

    feature_flag = ""
    if features_path is not None:
        # A bare string would otherwise be joined character by character.
        if isinstance(features_path, str):
            features_path = [features_path]
        feat_str = " ".join(features_path)
        feature_flag = f" --features_path {feat_str}"

    # predictions from different models
    preds = []

    for i, check_path in enumerate(check_paths):

        # Put the index in the file name only; replacing ".csv" in the
        # full path would also rewrite any ".csv" in a directory name.
        this_path = os.path.join(cp_model_path, f"test_pred_{i}.csv")

        # make the chemprop command
        # NOTE: paths are interpolated into a shell string unquoted, so
        # paths containing spaces or shell metacharacters are unsupported.
        cmd = (f"source activate chemprop && python {script} "
               f" --test_path {test_path} --preds_path {this_path} "
               f" --checkpoint_paths {check_path} "
               f"{device_flag}{feature_flag}")

        p = bash_command(cmd)
        p.wait()

        preds.append(read_csv(this_path))

    real = read_csv(test_path)

    return real, preds
def cp_hyperopt(cp_folder,
                hyp_folder,
                rerun):
    """
    Run hyperparameter optimization with ChemProp, or reuse earlier results.

    The optimization is skipped when `best_params.json` is already present
    in `hyp_folder` and `rerun` is false; in that case the stored
    parameters are loaded and returned.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      hyp_folder (str): folder holding the hyperparameter optimization
        `config.json`, and where `best_params.json` is written
      rerun (bool): run the optimization again even if
        `best_params.json` already exists in `hyp_folder`
    Returns:
      best_params (dict): best parameters from hyperparameter optimization
    """

    # completion file written by the optimization script
    param_file = os.path.join(hyp_folder, "best_params.json")
    must_run = rerun or not os.path.isfile(param_file)

    if must_run:
        hyp_script = os.path.join(cp_folder, "hyperparameter_optimization.py")
        config_path = os.path.join(hyp_folder, "config.json")

        with open(config_path, "r") as config_file:
            config = json.load(config_file)

        cmd = get_cp_cmd(hyp_script,
                         config_path,
                         config["data_path"],
                         config["dataset_type"])
        cmd = cmd + f" --config_save_path {param_file}"

        fprint(f"Running hyperparameter optimization in folder {hyp_folder}\n")
        fprint(cmd)

        proc = bash_command(f"source activate chemprop && {cmd}")
        proc.wait()
    else:
        fprint(f"Loading hyperparameter results from {param_file}\n")

    # Either branch ends by reading the (new or pre-existing) results file.
    with open(param_file, "r") as results_file:
        return json.load(results_file)
def cp_train(cp_folder,
             train_folder):
    """
    Train a chemprop model.

    Reads `config.json` from `train_folder`, builds the ChemProp training
    command from it and waits for the command to finish.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      train_folder (str): folder holding the training `config.json`, and
        where you want to store your trained models
    Returns:
      None
    """

    config_path = os.path.join(train_folder, "config.json")
    with open(config_path, "r") as config_file:
        config = json.load(config_file)

    train_cmd = get_cp_cmd(os.path.join(cp_folder, "train.py"),
                           config_path,
                           config["data_path"],
                           config["dataset_type"])

    # run inside the `chemprop` conda environment and block until done
    proc = bash_command(f"source activate chemprop && {train_cmd}")
    proc.wait()
def main(cp_folder,
         feature_folder,
         model_folder_paths,
         device,
         smiles_folder,
         metrics,
         hyper_dset_size,
         **kwargs):
    """
    Get fingerprints from a pre-trained chemprop model.

    For every (model folder, metric) pair, ChemProp's `predict.py` is run
    as a featurizer on the train, val and test csvs in `smiles_folder`.
    Each resulting csv is converted to npz, a hyperparameter optimization
    split is saved per metric, and all saved paths are summarized.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      feature_folder (str): folder in which you're saving the features.
        Created if it does not exist.
      model_folder_paths (list[str]): folders with the different models
        from which you're making fingerprints. Paired one-to-one with
        `metrics`.
      device (Union[str, int]): device to evaluate the model on. "cpu"
        adds `--no_cuda`; any other value is passed to `--gpu`.
      smiles_folder (str): folder with the csvs, named
        `<split>_full.csv`
      metrics (list[str]): names of the metrics corresponding to each
        ChemProp model name.
      hyper_dset_size (int): maximum size of the entire dataset to use in
        hyperparameter optimization.
      **kwargs: ignored.
    Returns:
      None
    """

    script = os.path.join(cp_folder, "predict.py")
    save_paths = []
    hyper_paths = []

    for model_path, metric in zip(model_folder_paths, metrics):

        # load the arguments from that model to get the features path
        args_path = f"{model_path}/fold_0/args.json"
        if not os.path.isfile(args_path):
            # Fall back to `args.json` directly in the model folder. The
            # path is built explicitly rather than by stripping "fold_0/"
            # from the string, which would also strip it from
            # `model_path` itself.
            args_path = f"{model_path}/args.json"
        with open(args_path, "r") as f:
            args = json.load(f)
        features_path = args["separate_test_features_path"]

        # only the first model of the first fold is used as the featurizer
        check_str = os.path.join(model_path, "fold_0/model_0/model.pt")

        # the part of the chemprop command that is shared by every split
        base_cmd = ("source activate chemprop && "
                    f"python {script} "
                    f" --checkpoint_paths {check_str} "
                    f"--as_featurizer ")

        if device == "cpu":
            base_cmd += " --no_cuda"
        else:
            base_cmd += f" --gpu {device} "

        if features_path is not None:
            # A bare string would otherwise be joined character by
            # character.
            if isinstance(features_path, str):
                features_path = [features_path]
            feat_str = " ".join(features_path)
            base_cmd += f" --features_path {feat_str}"

        # once per model is enough; no need to re-check for every split
        os.makedirs(feature_folder, exist_ok=True)

        for split in ["train", "val", "test"]:

            feat_path = os.path.join(feature_folder,
                                     f"{split}_{metric}.csv")
            data_path = os.path.join(smiles_folder, f"{split}_full.csv")

            # Build a fresh command per split. Appending to one shared
            # string would leave the previous splits' `--test_path` /
            # `--preds_path` flags in the later commands.
            cmd = base_cmd + (f" --test_path {data_path} "
                              f" --preds_path {feat_path} ")

            p = bash_command(cmd)
            p.wait()

            # convert it to npz
            np_save_path = to_npz(feat_path)
            save_paths.append(np_save_path)

        # make hyperparameter optimization splits
        hyp_save_path = save_hyperopt(feat_folder=feature_folder,
                                      metric=metric,
                                      smiles_folder=smiles_folder,
                                      cp_save_folder=feature_folder,
                                      dset_size=hyper_dset_size)
        hyper_paths.append(hyp_save_path)

    summarize(save_paths + hyper_paths, feature_folder)
def main(base_config_path,
         hyp_config_path,
         use_hyperopt,
         rerun_hyperopt,
         cp_folder,
         feature_folder,
         model_folder_cp,
         metrics,
         feat_options,
         mpnn_options,
         **kwargs):
    """
    Launch transfer-learning runs over every feature / MPNN / metric combo.

    Fingerprints from 3D models, chosen by their performance on each
    metric, are used as features. One `cp_tl.py` job (taken from the
    current working directory) is run, and waited for, per combination of
    `feat_options`, `mpnn_options` and `metrics`. Combinations that use
    neither features nor an MPNN are skipped.

    Args:
      base_config_path (str): where your basic job config file is, with
        parameters that may or may not be changed depending on the given
        run
      hyp_config_path (str): where your basic hyperopt job config file is,
        with parameters that may or may not be changed depending on the
        given run
      use_hyperopt (bool): do a hyperparameter optimization before
        training the model
      rerun_hyperopt (bool): whether to rerun hyperparameter optimization
        if `hyp_folder` already exists and has the completion file
        `best_params.json`.
      cp_folder (str): path to the chemprop folder on your computer
      feature_folder (str): directory with files for the features of the
        species, named `<split>_<metric>.npz`
      model_folder_cp (str): directory in which you'll be saving your
        model folders
      metrics (list[str]): metrics you want to use
      feat_options (list[bool]): options you want to use for features.
        For example, [True, False] means you want to train one model with
        features and one without, while [True] just means you want to
        train one with features.
      mpnn_options (list[bool]): same idea as `feat_options`, but for
        whether or not to use an MPNN
      **kwargs: ignored.
    Returns:
      None
    """

    script = os.path.join(os.path.abspath("."), "cp_tl.py")

    for feat in feat_options:
        for mpnn in mpnn_options:

            # a model needs at least one of: features, an MPNN
            if not (feat or mpnn):
                continue

            for metric in metrics:
                train_feat_path, val_feat_path, test_feat_path = [
                    os.path.join(feature_folder, f"{split}_{metric}.npz")
                    for split in ("train", "val", "test")
                ]

                train_folder = get_train_folder(
                    model_folder_cp=model_folder_cp,
                    feature_folder=feature_folder,
                    metric=metric,
                    feat=feat,
                    mpnn=mpnn)

                fprint(get_msg(feat, mpnn, train_folder))

                # each piece carries its own trailing space
                pieces = [
                    f"python {script} ",
                    f"--base_config_path {base_config_path} ",
                    f"--hyp_config_path {hyp_config_path} ",
                    f"--metric {metric} ",
                    f"--train_feat_path {train_feat_path} ",
                    f"--val_feat_path {val_feat_path} ",
                    f"--test_feat_path {test_feat_path} ",
                    f"--train_folder {train_folder} ",
                    f"--cp_folder {cp_folder} ",
                ]

                # optional switches, in a fixed order
                if use_hyperopt:
                    pieces.append("--use_hyperopt ")
                if rerun_hyperopt:
                    pieces.append("--rerun_hyperopt ")
                if not mpnn:
                    pieces.append("--features_only ")
                if not feat:
                    pieces.append("--no_features ")

                proc = bash_command("".join(pieces))
                proc.wait()
def make_split(summary_path,
               csv_folder,
               cp_folder,
               props,
               split_sizes,
               split_type,
               max_specs,
               max_atoms,
               dataset_type,
               seed):
    """
    Split the species into train, test, and validation sets.

    The species summary is loaded, transformed and subsampled, written to
    `all.csv` in `csv_folder`, and then split by running ChemProp's
    `scripts/split_data.py` on that file.

    Args:
      summary_path (str): path to the JSON file that summarizes all of
        the information about the species, apart from their conformers.
      csv_folder (str): path to the folder in which we will save our csv
        files with the SMILES, properties and training splits. Created if
        it does not exist.
      cp_folder (str): path to the ChemProp folder on your computer
      props (list[str]): list of property names that you want to predict
      split_sizes (list[float]): list of the form
        [train_split_size, val_split_size, test_split_size].
      split_type (str): how to split the data. Options can be found in
        the Chemprop script `split_data.py`. A good choice is usually
        `scaffold_balanced`, which splits in such a way that similar
        scaffolds are in the same split.
      max_specs (int): maximum number of species allowed in dataset
      max_atoms (int): Maximum number of atoms allowed in a species
      dataset_type (str): type of problem, e.g. "classification" or
        "regression".
      seed (int): random seed for split
    Returns:
      None
    """

    with open(summary_path, "r") as summary_file:
        summary_dic = json.load(summary_file)

    # Transform property values where requested (e.g. a dataset holding
    # the log of a value rather than the value itself).
    apply_transfs(props, summary_dic)

    # keep only species passing the property / size / count filters
    summary_dic = subsample(summary_dic=summary_dic,
                            props=props,
                            max_specs=max_specs,
                            max_atoms=max_atoms,
                            dataset_type=dataset_type,
                            seed=seed)

    # the folder must exist before the csv can be written into it
    if not os.path.isdir(csv_folder):
        os.makedirs(csv_folder)

    # csv file with SMILES and properties for every remaining species
    all_csv = os.path.join(csv_folder, "all.csv")
    to_csv(summary_dic, props, all_csv)

    # chemprop's `split_data.py` makes the splits from `all.csv`
    split_script = os.path.join(cp_folder, "scripts", "split_data.py")
    split_str = " ".join(np.array(split_sizes).astype("str"))

    cmd_parts = [
        "source activate chemprop &&",
        f"python {split_script} --split_type {split_type}",
        f"--split_sizes {split_str}",
        f"--data_path {all_csv}",
        f"--save_dir {csv_folder}",
        f"--seed {seed}",
    ]

    proc = bash_command(" ".join(cmd_parts))
    proc.wait()