Exemple #1
0
def predict(cp_folder, test_path, cp_model_path, device, check_paths):
    """
    Get and save the prediction results from a ChemProp model.

    The ChemProp `predict.py` script is run once per checkpoint. The
    predictions of checkpoint number `i` are written to
    `<cp_model_path>/test_pred_<i>.csv` and then loaded with `read_csv`.

    Note that all paths are interpolated unquoted into a shell command, so
    they must not contain spaces or shell metacharacters.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      test_path (str): path to the file with the test SMILES and their properties
      cp_model_path (str): path to the folder with the model of interest
      device (Union[str, int]): device to evaluate the model on. The string
        "cpu" disables CUDA; anything else is passed to `--gpu`.
      check_paths (list[str]): paths to the different model checkpoints
    Returns:
      real (dict): dictionary of the form {prop: real}, where `real`
          are the real values of the property `prop` (whatever `read_csv`
          loads from `test_path`).
      preds (list[dict]): same as `real` but for predicted. One for each
          checkpoint, in the order of `check_paths`.
    """

    script = os.path.join(cp_folder, "predict.py")

    # Load the arguments from that model to get the features path. The file
    # is looked for in `fold_0` first and then directly in the model folder.
    # Both candidates are built explicitly rather than with `str.replace`,
    # which would also strip a "fold_0/" that is part of `cp_model_path`.
    args_path = os.path.join(cp_model_path, "fold_0", "args.json")
    if not os.path.isfile(args_path):
        args_path = os.path.join(cp_model_path, "args.json")
    with open(args_path, "r") as f:
        args = json.load(f)

    features_path = args["separate_test_features_path"]
    # A bare string would otherwise be joined character by character below
    if isinstance(features_path, str):
        features_path = [features_path]

    # flags that are identical for every checkpoint
    if device == "cpu":
        shared_flags = " --no_cuda"
    else:
        shared_flags = f" --gpu {device} "

    if features_path is not None:
        feat_str = " ".join(features_path)
        shared_flags += f" --features_path {feat_str}"

    # predictions from different models
    preds = []

    for i, check_path in enumerate(check_paths):

        # Name the output file directly instead of patching ".csv" in the
        # full path, which could also hit a ".csv" inside `cp_model_path`.
        this_path = os.path.join(cp_model_path, f"test_pred_{i}.csv")

        # make the chemprop command
        cmd = (f"source activate chemprop && python {script} "
               f" --test_path {test_path} --preds_path {this_path} "
               f" --checkpoint_paths {check_path} ")
        cmd += shared_flags

        # wait for the run to finish before reading its output file
        p = bash_command(cmd)
        p.wait()

        preds.append(read_csv(this_path))

    real = read_csv(test_path)

    return real, preds
Exemple #2
0
def cp_hyperopt(cp_folder, hyp_folder, rerun):
    """
    Run hyperparameter optimization with ChemProp, or reuse earlier results.

    If `best_params.json` is already present in `hyp_folder` and `rerun` is
    falsy, the stored parameters are loaded and no job is launched.
    Otherwise the ChemProp script `hyperparameter_optimization.py` is run
    using `config.json` from `hyp_folder`, and the parameters it saves are
    loaded afterwards.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      hyp_folder (str): where you want to store your hyperparameter
        optimization models. Must contain `config.json` when a run is needed.
      rerun (bool): whether to rerun hyperparameter optimization if
        `hyp_folder` already exists and has the completion file
        `best_params.json`.
    Returns:
      best_params (dict): best parameters from hyperparameter
        optimization, as read from `best_params.json`
    """

    # completion file written by the optimization script
    results_path = os.path.join(hyp_folder, "best_params.json")
    already_done = os.path.isfile(results_path)

    if rerun or not already_done:
        # no usable results (or a rerun was requested): launch the script
        opt_script = os.path.join(cp_folder, "hyperparameter_optimization.py")
        config_path = os.path.join(hyp_folder, "config.json")

        with open(config_path, "r") as config_file:
            job_config = json.load(config_file)

        data_path = job_config["data_path"]
        dataset_type = job_config["dataset_type"]

        base_cmd = get_cp_cmd(opt_script, config_path, data_path,
                              dataset_type)
        cmd = base_cmd + f" --config_save_path {results_path}"

        fprint(f"Running hyperparameter optimization in folder {hyp_folder}\n")
        fprint(cmd)

        # block until the optimization has finished
        bash_command(f"source activate chemprop && {cmd}").wait()
    else:
        fprint(f"Loading hyperparameter results from {results_path}\n")

    # in both cases the answer is whatever is in the completion file
    with open(results_path, "r") as results_file:
        return json.load(results_file)
Exemple #3
0
def cp_train(cp_folder, train_folder):
    """
    Train a chemprop model and wait for the training job to finish.

    The job settings are read from `config.json` inside `train_folder`; its
    `data_path` and `dataset_type` entries are handed to `get_cp_cmd`
    together with the path of the ChemProp `train.py` script.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      train_folder (str): where you want to store your trained models.
        Must already contain `config.json`.
    Returns:
      None
    """

    script_path = os.path.join(cp_folder, "train.py")
    job_config_path = os.path.join(train_folder, "config.json")

    with open(job_config_path, "r") as config_file:
        job_config = json.load(config_file)

    train_cmd = get_cp_cmd(script_path,
                           job_config_path,
                           job_config["data_path"],
                           job_config["dataset_type"])

    # run inside the chemprop environment and block until it is done
    bash_command(f"source activate chemprop && {train_cmd}").wait()
def main(cp_folder, feature_folder, model_folder_paths, device, smiles_folder,
         metrics, hyper_dset_size, **kwargs):
    """
    Get fingerprints from a pre-trained chemprop model.

    For every (model folder, metric) pair, the ChemProp `predict.py` script
    is run with `--as_featurizer` on the train, val and test csv files in
    `smiles_folder`. Each output csv is written to `feature_folder`, passed
    to `to_npz`, and the resulting paths are finally handed to `summarize`
    together with the paths returned by `save_hyperopt`.

    Note that all paths are interpolated unquoted into a shell command, so
    they must not contain spaces or shell metacharacters.

    Args:
      cp_folder (str): path to the chemprop folder on your computer
      feature_folder (str): folder in which you're saving the features.
        Created if it does not exist.
      model_folder_paths (list[str]): folders with the different models from
        which you're making fingerprints.
      device (Union[str, int]): device to evaluate the model on. The string
        "cpu" disables CUDA; anything else is passed to `--gpu`.
      smiles_folder (str): folder with the csvs (`train_full.csv`,
        `val_full.csv` and `test_full.csv`)
      metrics (list[str]): names of the metrics corresponding to each
        ChemProp model name. Paired with `model_folder_paths` by position.
      hyper_dset_size (int): maximum size of the entire dataset to use in
        hyperparameter optimization.
      **kwargs: ignored
    Returns:
      None
    """

    script = os.path.join(cp_folder, "predict.py")
    save_paths = []
    hyper_paths = []

    # the output folder only needs to be created once
    os.makedirs(feature_folder, exist_ok=True)

    for model_path, metric in zip(model_folder_paths, metrics):

        # Load the arguments from that model to get the features path. The
        # file is looked for in `fold_0` first and then directly in the model
        # folder. Both candidates are built explicitly rather than with
        # `str.replace`, which would also strip a "fold_0/" that is part of
        # `model_path`.
        args_path = os.path.join(model_path, "fold_0", "args.json")
        if not os.path.isfile(args_path):
            args_path = os.path.join(model_path, "args.json")
        with open(args_path, "r") as f:
            args = json.load(f)
        features_path = args["separate_test_features_path"]

        # path to the checkpoint used for featurization
        check_str = os.path.join(model_path, "fold_0/model_0/model.pt")

        # the part of the chemprop command shared by all three splits
        base_cmd = ("source activate chemprop && "
                    f"python {script} "
                    f" --checkpoint_paths {check_str} "
                    "--as_featurizer ")

        if device == "cpu":
            base_cmd += " --no_cuda"
        else:
            base_cmd += f" --gpu {device} "

        if features_path is not None:
            feat_str = " ".join(features_path)
            base_cmd += f" --features_path {feat_str}"

        for split in ["train", "val", "test"]:

            feat_path = os.path.join(feature_folder, f"{split}_{metric}.csv")
            data_path = os.path.join(smiles_folder, f"{split}_full.csv")

            # Build a fresh command for each split. Appending to one shared
            # string would leave the previous splits' --test_path and
            # --preds_path in the command as well.
            cmd = (f"{base_cmd} --test_path {data_path} "
                   f" --preds_path {feat_path} ")

            # wait for the run to finish before converting its output
            p = bash_command(cmd)
            p.wait()

            # convert it to npz
            np_save_path = to_npz(feat_path)
            save_paths.append(np_save_path)

        # make hyperparameter optimization splits
        hyp_save_path = save_hyperopt(feat_folder=feature_folder,
                                      metric=metric,
                                      smiles_folder=smiles_folder,
                                      cp_save_folder=feature_folder,
                                      dset_size=hyper_dset_size)
        hyper_paths.append(hyp_save_path)

    summarize(save_paths + hyper_paths, feature_folder)
def main(base_config_path,
         hyp_config_path,
         use_hyperopt,
         rerun_hyperopt,
         cp_folder,
         feature_folder,
         model_folder_cp,
         metrics,
         feat_options,
         mpnn_options,
         **kwargs):
    """
    Launch one transfer-learning job per (features, MPNN, metric) combination.

    Each job runs the script `cp_tl.py` from the current working directory
    and is waited on before the next one starts. The fingerprints come from
    3D models selected by their performance on each metric, and are read
    from `<split>_<metric>.npz` files in `feature_folder`. Combinations that
    use neither features nor an MPNN are skipped.

    Args:
      base_config_path (str): where your basic job config file
        is, with parameters that may or may not be changed depending
        on the given run
      hyp_config_path (str): where your basic hyperopt job config file
        is, with parameters that may or may not be changed depending
        on the given run
      use_hyperopt (bool): do a hyperparameter optimization before training
        the model
      rerun_hyperopt (bool): whether to rerun hyperparameter optimization if
        `hyp_folder` already exists and has the completion file
        `best_params.json`.
      cp_folder (str): path to the chemprop folder on your computer
      feature_folder (str): directory with files for the features of the species
      model_folder_cp (str): directory in which you'll be saving your model
        folders
      metrics (list[str]): metrics you want to use
      feat_options (list[bool]): options you want to use for features. For example,
        [True, False] means you want to train one model with features and one without,
        while [True] just means you want to train one with features.
      mpnn_options (list[bool]): same idea as `feat_options`, but for whether or not to
        use an MPNN
      **kwargs: ignored
    Returns:
      None
    """

    script = os.path.join(os.path.abspath("."), "cp_tl.py")

    for feat in feat_options:
        for mpnn in mpnn_options:
            # a model needs features, an MPNN, or both
            if not (feat or mpnn):
                continue

            for metric in metrics:

                # feature files for the three splits, in a fixed order
                train_feat_path, val_feat_path, test_feat_path = [
                    os.path.join(feature_folder, f"{split}_{metric}.npz")
                    for split in ("train", "val", "test")
                ]

                train_folder = get_train_folder(
                    model_folder_cp=model_folder_cp,
                    feature_folder=feature_folder,
                    metric=metric,
                    feat=feat,
                    mpnn=mpnn)

                fprint(get_msg(feat, mpnn, train_folder))

                # arguments that are always passed
                pieces = [
                    f"python {script}",
                    f"--base_config_path {base_config_path}",
                    f"--hyp_config_path {hyp_config_path}",
                    f"--metric {metric}",
                    f"--train_feat_path {train_feat_path}",
                    f"--val_feat_path {val_feat_path}",
                    f"--test_feat_path {test_feat_path}",
                    f"--train_folder {train_folder}",
                    f"--cp_folder {cp_folder}",
                ]

                # optional switches, added only when their condition holds
                switches = (
                    (use_hyperopt, "--use_hyperopt"),
                    (rerun_hyperopt, "--rerun_hyperopt"),
                    (not mpnn, "--features_only"),
                    (not feat, "--no_features"),
                )
                pieces.extend(flag for enabled, flag in switches if enabled)

                # every argument, including the last, is followed by a space
                cmd = " ".join(pieces) + " "

                # run the jobs one at a time
                bash_command(cmd).wait()
Exemple #6
0
def make_split(summary_path, csv_folder, cp_folder, props, split_sizes,
               split_type, max_specs, max_atoms, dataset_type, seed):
    """
    Split the species into train, test, and validation sets.

    The species summary is loaded, transformed and subsampled, written to
    `all.csv` in `csv_folder`, and then split by running the ChemProp script
    `scripts/split_data.py`, which saves its output in `csv_folder`.

    Args:
      summary_path (str): path to the JSON file that summarizes
        all of the information about the species, apart from their
        conformers.
      csv_folder (str): path to the folder in which we will save our
        csv files with the SMILES, properties and training splits.
        Created if it does not exist.
      cp_folder (str): path to the ChemProp folder on your computer
      props (list[str]): list of property names that you want to predict
      split_sizes (list[float]): list of the form [train_split_size, val_split_size,
        test_split_size].
      split_type (str): how to split the data. Options can be found in the Chemprop
        script `split_data.py`. A good choice is usually `scaffold_balanced`, which splits
        in such a way that similar scaffolds are in the same split.
      max_specs (int): maximum number of species allowed in dataset
      max_atoms (int): Maximum number of atoms allowed in a species
      dataset_type (str): type of problem, e.g. "classification" or
        "regression".
      seed (int): random seed for split
    Returns:
      None

    """

    with open(summary_path, "r") as summary_file:
        species = json.load(summary_file)

    # Apply any transformations to the data, e.g. wanting a dataset that has
    # the log of a value instead of the value itself. The return value is
    # not used, so this presumably modifies `species` in place.
    apply_transfs(props, species)

    # filter based on props, max species and max number of atoms
    species = subsample(summary_dic=species,
                        props=props,
                        max_specs=max_specs,
                        max_atoms=max_atoms,
                        dataset_type=dataset_type,
                        seed=seed)

    # csv file with SMILES and properties; make sure its folder exists
    all_csv = os.path.join(csv_folder, "all.csv")
    if not os.path.isdir(csv_folder):
        os.makedirs(csv_folder)

    # write the contents of `species` to the csv
    to_csv(species, props, all_csv)

    # run the chemprop script `split_data.py` to make the splits
    # from `all.csv`
    script = os.path.join(cp_folder, "scripts", "split_data.py")
    size_strings = np.array(split_sizes).astype("str")
    split_str = " ".join(size_strings)

    cmd_parts = [
        "source activate chemprop &&",
        f"python {script} --split_type {split_type}",
        f"--split_sizes {split_str}",
        f"--data_path {all_csv}",
        f"--save_dir {csv_folder}",
        f"--seed {seed}",
    ]

    # block until the split files have been written
    bash_command(" ".join(cmd_parts)).wait()