예제 #1
0
def chemprop_predict():
    """Parses Chemprop predicting arguments and runs prediction using a trained Chemprop model.

    This is the entry point for the command line command :code:`chemprop_predict`.
    """
    args = PredictArgs().parse_args()
    all_smiles = None
    if args.smiles is not None:
        smiles = args.smiles.split(',')
        all_smiles = [[s] for s in smiles]

    _, preds = make_predictions(args=PredictArgs().parse_args(),
                                smiles=all_smiles)
    return preds
예제 #2
0
def predict_outside(args_dict):
    """
    Used for calling this script from another python script.
    :dict args_dict: dict of args to use
    """
    sys.argv = create_args(args_dict, 'predict.py')

    args = PredictArgs().parse_args()
    make_predictions(args)
예제 #3
0
def predict(data, model_predict):
    """
    Modify this method to add pre and post processing for scoring calls.  For example, this can be
    used to implement one-hot encoding for models that don't include it on their own.

    Parameters
    ----------
    data: pd.DataFrame
    model_predict: Callable[[pd.DataFrame], pd.DataFrame]

    Returns
    -------
    pd.DataFrame
    """
    # Execute any steps you need to do before scoring

    # This method makes predictions against the raw, deserialized model
    #predictions = model_predict(data)

    data.to_csv("/opt/code/chemprop_folder/for_scoring.csv", index=False)

    args = PredictArgs().parse_args([
        '--test_path', '/opt/chemprop_folder/for_scoring.csv',
        '--checkpoint_path', '/opt/code/model.pth', '--preds_path',
        '/opt/chemprop_folder/preds.csv'
    ])

    make_predictions(args)

    preds_df = pds.read_csv("/opt/chemprop_folder/preds.csv")
    sh = str(preds_df.shape)
    print(sh)

    preds_df = preds_df.rename(columns={"p_np": "positive_class_label"})
    preds_df = preds_df.drop(columns=['smiles'])
    preds_df["negative_class_label"] = 1 - preds_df["positive_class_label"]

    print(preds_df.head())

    # Execute any steps you need to do after scoring
    # Note: To properly send predictions back to DataRobot, the returned DataFrame should contain a
    # column for each output label for classification or a single value column for regression
    return preds_df
예제 #4
0
def chemprop_predict() -> None:
    """Parses Chemprop predicting arguments and runs prediction using a trained Chemprop model.

    This is the entry point for the command line command :code:`chemprop_predict`.
    """
    make_predictions(args=PredictArgs().parse_args())
예제 #5
0
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles
                  ] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None', '--preds_path',
        os.path.join(app.config['TEMP_FOLDER'],
                     app.config['PREDICTIONS_FILENAME']), '--checkpoint_paths',
        *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0,
                      len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"]
        if None in preds else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
예제 #6
0
def chemprop_predict() -> None:
    """Runs chemprop predicting."""
    make_predictions(PredictArgs().parse_args())
예제 #7
0
def chemprop_fingerprint() -> None:
    """
    Parses Chemprop predicting arguments and returns the latent representation vectors for
    provided molecules, according to a previously trained model.
    """
    molecule_fingerprint(args=PredictArgs().parse_args())
예제 #8
0
"""Loads a trained model checkpoint and makes predictions on a dataset."""
import torch
import numpy as np
import random

from chemprop.args import PredictArgs
from chemprop.train import make_predictions

if __name__ == '__main__':
    args = PredictArgs().parse_args()
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)
    # this one is needed for torchtext random call (shuffled iterator)
    # in multi gpu it ensures datasets are read in the same order
    random.seed(0)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

    make_predictions(args)
예제 #9
0
def make_predictions(
    args: PredictArgs,
    smiles: List[List[str]] = None,
    model_objects: Tuple[PredictArgs, TrainArgs, List[MoleculeModel],
                         List[StandardScaler], int, List[str], ] = None,
    calibrator: UncertaintyCalibrator = None,
    return_invalid_smiles: bool = True,
    return_index_dict: bool = False,
    return_uncertainty: bool = False,
) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :param model_objects: Tuple of output of load_model function which can be called separately outside this function. Preloaded model objects should have
                used the non-generator option for load_model if the objects are to be used multiple times or are intended to be used for calibration as well.
    :param calibrator: A :class: `~chemprop.uncertainty.UncertaintyCalibrator` object, for use in calibrating uncertainty predictions.
                Can be preloaded and provided as a function input or constructed within the function from arguments. The models and scalers used
                to initiate the calibrator must be lists instead of generators if the same calibrator is to be used multiple times or
                if the same models and scalers objects are also part of the provided model_objects input.
    :param return_invalid_smiles: Whether to return predictions of "Invalid SMILES" for invalid SMILES, otherwise will skip them in returned predictions.
    :param return_index_dict: Whether to return the prediction results as a dictionary keyed from the initial data indexes.
    :param return_uncertainty: Whether to return uncertainty predictions alongside the model value predictions.
    :return: A list of lists of target predictions. If returning uncertainty, a tuple containing first prediction values then uncertainty estimates.
    """
    if model_objects:
        (
            args,
            train_args,
            models,
            scalers,
            num_tasks,
            task_names,
        ) = model_objects
    else:
        (
            args,
            train_args,
            models,
            scalers,
            num_tasks,
            task_names,
        ) = load_model(args, generator=True)

    num_models = len(args.checkpoint_paths)

    set_features(args, train_args)

    # Note: to get the invalid SMILES for your data, use the get_invalid_smiles_from_file or get_invalid_smiles_from_list functions from data/utils.py
    full_data, test_data, test_data_loader, full_to_valid_indices = load_data(
        args, smiles)

    if args.uncertainty_method is None and (args.calibration_method is not None
                                            or args.evaluation_methods
                                            is not None):
        if args.dataset_type in ['classification', 'multiclass']:
            args.uncertainty_method = 'classification'
        else:
            raise ValueError(
                'Cannot calibrate or evaluate uncertainty without selection of an uncertainty method.'
            )

    if calibrator is None and args.calibration_path is not None:

        calibration_data = get_data(
            path=args.calibration_path,
            smiles_columns=args.smiles_columns,
            target_columns=task_names,
            features_path=args.calibration_features_path,
            features_generator=args.features_generator,
            phase_features_path=args.calibration_phase_features_path,
            atom_descriptors_path=args.calibration_atom_descriptors_path,
            bond_features_path=args.calibration_bond_features_path,
            max_data_size=args.max_data_size,
            loss_function=args.loss_function,
        )

        calibration_data_loader = MoleculeDataLoader(
            dataset=calibration_data,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
        )

        if isinstance(models, List) and isinstance(scalers, List):
            calibration_models = models
            calibration_scalers = scalers
        else:
            calibration_model_objects = load_model(args, generator=True)
            calibration_models = calibration_model_objects[2]
            calibration_scalers = calibration_model_objects[3]

        calibrator = build_uncertainty_calibrator(
            calibration_method=args.calibration_method,
            uncertainty_method=args.uncertainty_method,
            interval_percentile=args.calibration_interval_percentile,
            regression_calibrator_metric=args.regression_calibrator_metric,
            calibration_data=calibration_data,
            calibration_data_loader=calibration_data_loader,
            models=calibration_models,
            scalers=calibration_scalers,
            num_models=num_models,
            dataset_type=args.dataset_type,
            loss_function=args.loss_function,
            uncertainty_dropout_p=args.uncertainty_dropout_p,
            dropout_sampling_size=args.dropout_sampling_size,
            spectra_phase_mask=getattr(train_args, "spectra_phase_mask", None),
        )

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        preds = [None] * len(full_data)
        unc = [None] * len(full_data)
    else:
        preds, unc = predict_and_save(
            args=args,
            train_args=train_args,
            test_data=test_data,
            task_names=task_names,
            num_tasks=num_tasks,
            test_data_loader=test_data_loader,
            full_data=full_data,
            full_to_valid_indices=full_to_valid_indices,
            models=models,
            scalers=scalers,
            num_models=num_models,
            calibrator=calibrator,
            return_invalid_smiles=return_invalid_smiles,
        )

    if return_index_dict:
        preds_dict = {}
        unc_dict = {}
        for i in range(len(full_data)):
            if return_invalid_smiles:
                preds_dict[i] = preds[i]
                unc_dict[i] = unc[i]
            else:
                valid_index = full_to_valid_indices.get(i, None)
                if valid_index is not None:
                    preds_dict[i] = preds[valid_index]
                    unc_dict[i] = unc[valid_index]
        if return_uncertainty:
            return preds_dict, unc_dict
        else:
            return preds_dict
    else:
        if return_uncertainty:
            return preds, unc
        else:
            return preds
예제 #10
0
def predict():
    args = PredictArgs()