Example #1
def parse_csv(pred_path,
              true_path,
              target):
    """
    Get the lists of predicted and real values from csv files.
    Running `predict.sh` on the results of a ChemProp calculation
    produces a csv file of the predictions of each ChemProp fold
    and a JSON file that summarizes the predictions of each fold.

    Args:
        pred_path (str): path to predicted values
        true_path (str): path to real values
        target (str): name of property you're predicting
    Returns:
        pred (list[np.array]): the predictions of this model,
            given as a list of length 1 that contains an array
            of length `num_species` (the number of species). It
            is structured this way for consistency with
            `parse_json` below.
        real (list[np.array]): same as `pred`, but with the real
            values.
    """

    pred_dic = read_csv(pred_path)
    pred = np.array(pred_dic[target])

    real_dic = read_csv(true_path)
    real = np.array(real_dic[target])

    return [pred], [real]
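
A usage sketch, not part of the original snippet: `read_csv` is an assumed helper that maps each csv column name to a list of values. A minimal compatible implementation, followed by a hypothetical call to `parse_csv`:

import csv
import numpy as np

def read_csv(path):
    """Assumed helper: read a csv into a dict of {column: list of values}."""
    with open(path, "r") as f:
        reader = csv.DictReader(f)
        cols = {name: [] for name in reader.fieldnames}
        for row in reader:
            for name, val in row.items():
                try:
                    val = float(val)  # keep numeric columns as floats
                except (TypeError, ValueError):
                    pass
                cols[name].append(val)
    return cols

# hypothetical file names and target property
pred, real = parse_csv(pred_path="test_pred_0.csv",
                       true_path="test_full.csv",
                       target="bind")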
Example #2
def predict(cp_folder, test_path, cp_model_path, device, check_paths):
    """
    Get and save the prediction results from a ChemProp model.
    Args:
      cp_folder (str): path to the chemprop folder on your computer
      test_path (str): path to the file with the test SMILES and their properties
      cp_model_path (str): path to the folder with the model of interest
      device (Union[str, int]): device to evaluate the model on
      check_paths (list[str]): paths to the different model checkpoints
    Returns:
      real (dict): dictionary of the form {prop: values}, where
          `values` are the real values of the property `prop`.
      preds (list[dict]): same as `real`, but with the predicted
          values. One dictionary for each model.
    """

    script = os.path.join(cp_folder, "predict.py")
    preds_path = os.path.join(cp_model_path, "test_pred.csv")

    # load the arguments from that model to get the features path
    args_path = f"{cp_model_path}/fold_0/args.json"
    if not os.path.isfile(args_path):
        args_path = args_path.replace("fold_0/", "")
    with open(args_path, "r") as f:
        args = json.load(f)
    features_path = args["separate_test_features_path"]

    # predictions from different models
    preds = []

    for i, check_path in enumerate(check_paths):

        # make the chemprop command

        this_path = preds_path.replace(".csv", f"_{i}.csv")
        cmd = (f"source activate chemprop && python {script} "
               f" --test_path {test_path} --preds_path {this_path} "
               f" --checkpoint_paths {check_path} ")

        if device == "cpu":
            cmd += f" --no_cuda"
        else:
            cmd += f" --gpu {device} "

        if features_path is not None:
            feat_str = " ".join(features_path)
            cmd += f" --features_path {feat_str}"

        p = bash_command(cmd)
        p.wait()

        pred = read_csv(this_path)
        preds.append(pred)

    real = read_csv(test_path)

    return real, preds
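
A usage sketch, not part of the original snippet: `bash_command` is an assumed helper that runs a shell command, and every path below is a hypothetical placeholder:

import subprocess

def bash_command(cmd):
    # assumed helper: run `cmd` through bash and return the process handle
    return subprocess.Popen(cmd, shell=True, executable="/bin/bash")

# hypothetical paths; `device=0` evaluates on GPU 0
real, preds = predict(cp_folder="chemprop",
                      test_path="test_full.csv",
                      cp_model_path="cp_model",
                      device=0,
                      check_paths=["cp_model/fold_0/model_0/model.pt"])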
Example #3
def get_splits(sample_dic, csv_folder):
    """
    Figure out which split (train, val or test) each SMILES in
    `sample_dic` belongs to.

    Args:
        sample_dic (dict): Sample of `summary_dic` that is used
            in this combined dataset. `summary_dic` contains
            information about all SMILES strings we have, except
            for their conformers.
        csv_folder (str): path to the folder that contains the csv
            files with the train/val/test SMILES.
    Returns:
        sample_dic (dict): `sample_dic`, but with each sub-dictionary
            updated to contain the split assignment of the SMILES. 
    """

    for name in ["train", "val", "test"]:
        path = os.path.join(csv_folder, f"{name}_full.csv")
        csv_dic = read_csv(path)
        for i, smiles in enumerate(csv_dic["smiles"]):
            # add any properties present in the csv
            props = {
                key: csv_dic[key][i]
                for key in csv_dic.keys() if key != "smiles"
            }
            sample_dic[smiles].update({"split": name, **props})

    # get rid of anything that doesn't have a split label
    keys = list(sample_dic.keys())
    for key in keys:
        if "split" not in sample_dic[key]:
            sample_dic.pop(key)

    return sample_dic
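
A usage sketch with hypothetical data, assuming `csv_folder` holds `train_full.csv`, `val_full.csv`, and `test_full.csv`, each with a `smiles` column:

# hypothetical `sample_dic` with one (empty) sub-dictionary per SMILES
sample_dic = {"CCO": {}, "CCN": {}}
sample_dic = get_splits(sample_dic=sample_dic, csv_folder="csvs")
# each SMILES found in a csv now carries, e.g., {"split": "train", "bind": 1.0};
# entries without a split label have been removed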
Example #4
def load_data(train_path, val_path, test_path):
    """
    Load data from csvs into a dictionary for the different splits.
    Args:
      train_path (str): path to csv with training data
      val_path (str): path to csv with validation data
      test_path (str): path to csv with test data
    Returns:
      data (dict): dictionary of the form {split: sub_dic} for each
        split, where sub_dic contains SMILES strings and values for
        each property.

    """
    data = {}
    paths = [train_path, val_path, test_path]
    names = ["train", "val", "test"]
    for name, path in zip(names, paths):
        data[name] = read_csv(path)

    return data
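
A hypothetical call, assuming the csvs follow the layout used above (a `smiles` column plus one column per property):

data = load_data(train_path="train_full.csv",
                 val_path="val_full.csv",
                 test_path="test_full.csv")
train_smiles = data["train"]["smiles"]  # SMILES strings of the training split
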
def make_hyp_csvs(base_config_path, max_specs, seed):
    """
    Make csv files for the subsection of the SMILES that will be used
    for hyperparameter optimization.
    Args:
      base_config_path (str): path to your basic job config file,
        with parameters that may or may not be changed depending
        on the given run
      max_specs (int): maximum number of species to use in hyperparameter
        optimization.
      seed (int): random seed to use for split.
    Returns:
      None
    """

    # load the base config
    with open(base_config_path, "r") as f:
        base_dic = json.load(f)

    # load the SMILES strings from the train and validation
    # paths, then sample them
    train_path = base_dic["data_path"]
    val_path = base_dic.get("separate_val_path")
    # initialize the dictionary by reading the train data
    prop_dic = read_csv(train_path)

    # if the validation data is separate, add the data lists
    # together

    if val_path is not None:
        new_dic = read_csv(val_path)
        for key, val in new_dic.items():
            prop_dic[key] += val

    # generate a proportional sample by first getting the
    # properties to be predicted, then making a `sample_dic`,
    # and finally calling `prop_split`

    props = list(filter(lambda x: x != "smiles", prop_dic.keys()))
    dataset_type = base_dic.get("dataset_type", "regression")

    num_smiles = len(prop_dic["smiles"])
    sample_dic = {
        prop_dic["smiles"][idx]: {prop: prop_dic[prop][idx]
                                  for prop in props}
        for idx in range(num_smiles)
    }

    keep_smiles = prop_split(max_specs=max_specs,
                             dataset_type=dataset_type,
                             props=props,
                             sample_dic=sample_dic,
                             seed=seed)

    # save to csv
    new_dic = {"smiles": keep_smiles}
    for prop in props:
        new_dic.update({prop: [sample_dic[key][prop] for key in keep_smiles]})

    smiles_folder = os.path.dirname(train_path)
    hyp_path = os.path.join(smiles_folder, "hyperopt_full.csv")
    write_csv(hyp_path, new_dic)
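
A hypothetical call; the config file is assumed to contain at least `data_path`, and optionally `separate_val_path` and `dataset_type`:

make_hyp_csvs(base_config_path="base_config.json",  # hypothetical config
              max_specs=5000,  # hypothetical cap on the number of species
              seed=0)
# writes `hyperopt_full.csv` into the folder that holds the training csv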