Example 1
def cross_validate_sklearn(args: SklearnTrainArgs,
                           logger: Logger = None) -> Tuple[float, float]:
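    """Runs k-fold cross-validation for a scikit-learn model and returns the mean
    and standard deviation of test performance across folds."""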
    info = logger.info if logger is not None else print
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(
            f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}'
        )

    # Report scores across folds
    avg_scores = np.nanmean(
        all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
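The makedirs helper used above (and called with isfile=True in the prediction examples below) is not shown in these excerpts. A minimal sketch of such a helper, assuming it simply wraps os.makedirs and, when isfile=True, creates only the containing directory of a file path:

import os


def makedirs(path: str, isfile: bool = False) -> None:
    """Sketch: create a directory, or the parent directory of a file path,
    if it does not already exist."""
    if isfile:
        path = os.path.dirname(path)
    if path != '':
        os.makedirs(path, exist_ok=True)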
Example 2
def sklearn_train() -> None:
    """Parses scikit-learn training arguments and trains a scikit-learn model.

    This is the entry point for the command line command :code:`sklearn_train`.
    """
    cross_validate(args=SklearnTrainArgs().parse_args(),
                   train_func=run_sklearn)
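Example 2 delegates the fold loop to a generic cross_validate driver rather than the sklearn-specific loop of Example 1. Judging from the signature of run_sklearn in Example 8, the train_func argument is expected to follow roughly this contract (a sketch inferred from these excerpts, not the library's actual type alias):

from logging import Logger
from typing import Callable, Dict, List

# Hypothetical alias: train_func takes the parsed args, a dataset, and a
# logger, and returns a mapping from metric name to per-task scores.
TrainFunc = Callable[['SklearnTrainArgs', 'MoleculeDataset', Logger],
                     Dict[str, List[float]]]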
Example 3
def sklearn_train() -> None:
    """Runs sklearn training."""
    args = SklearnTrainArgs().parse_args()
    logger = create_logger(name='sklearn-train',
                           save_dir=args.save_dir,
                           quiet=args.quiet)
    cross_validate_sklearn(args, logger)
Example 4
def sklearn_train() -> None:
    """Parses scikit-learn training arguments and trains a scikit-learn model.

    This is the entry point for the command line command :code:`sklearn_train`.
    """
    args = SklearnTrainArgs().parse_args()
    logger = create_logger(name='sklearn-train',
                           save_dir=args.save_dir,
                           quiet=args.quiet)
    cross_validate_sklearn(args, logger)
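The create_logger factory used by these entry points is defined elsewhere in chemprop. A rough stand-in can be built with the standard logging module; the handler layout below (console output, plus a debug-level log file under save_dir) is an assumption rather than chemprop's exact implementation:

import logging
import os


def create_logger(name: str, save_dir: str = None,
                  quiet: bool = False) -> logging.Logger:
    """Sketch of a logger factory: console output (warnings only when
    quiet=True) plus an optional debug-level log file under save_dir."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    console = logging.StreamHandler()
    console.setLevel(logging.WARNING if quiet else logging.INFO)
    logger.addHandler(console)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        file_handler = logging.FileHandler(os.path.join(save_dir, 'debug.log'))
        file_handler.setLevel(logging.DEBUG)
        logger.addHandler(file_handler)

    return logger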
Example 5
def predict_sklearn(args: SklearnPredictArgs) -> None:
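    """Loads data and a trained scikit-learn model and uses the model to make
    predictions on the data."""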
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_column=args.smiles_column,
                    target_columns=[])

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(
            morgan_fingerprint(mol=datapoint.smiles,
                               radius=train_args.radius,
                               num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
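This example (and Example 7 below) depends on a predict helper, not shown in these excerpts, that dispatches on model_type and dataset_type. A plausible sketch using only standard scikit-learn calls (predict for regression, predict_proba for random forest classifiers, decision_function for SVMs), and assuming predictions come back as one row per molecule and one column per task:

from typing import List

import numpy as np


def predict(model, model_type: str, dataset_type: str,
            features: List[List[float]]) -> List[List[float]]:
    """Sketch of the dispatch helper assumed by the examples above."""
    if dataset_type == 'regression':
        preds = np.array(model.predict(features))
        if preds.ndim == 1:  # single task: promote to a column vector
            preds = preds.reshape(-1, 1)
    elif dataset_type == 'classification':
        if model_type == 'random_forest':
            # Multi-output forests return one (n, 2) array per task;
            # keep the probability of the positive class for each.
            proba = model.predict_proba(features)
            if isinstance(proba, list):  # multi-task
                preds = np.stack([p[:, 1] for p in proba], axis=1)
            else:  # single task
                preds = proba[:, 1].reshape(-1, 1)
        elif model_type == 'svm':
            # Uncalibrated decision scores for a binary SVC (assumption).
            preds = np.array(model.decision_function(features)).reshape(-1, 1)
        else:
            raise ValueError(f'Model type "{model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{dataset_type}" not supported')

    return preds.tolist()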
Example 6
def cross_validate_sklearn(args: SklearnTrainArgs) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation for a scikit-learn model.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=SKLEARN_TRAIN_LOGGER_NAME,
                           save_dir=args.save_dir,
                           quiet=args.quiet)
    info = logger.info if logger is not None else print
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(
            f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}'
        )

    # Report scores across folds
    avg_scores = np.nanmean(
        all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
Example 7
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(
                morgan_fingerprint(mol=s,
                                   radius=train_args.radius,
                                   num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)    #TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
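get_features_generator('morgan') returns the featurizer applied to each SMILES string above. A minimal stand-in built directly on RDKit (the radius and num_bits defaults are assumptions; the function accepts a SMILES string or an RDKit Mol, matching how the examples pass datapoint.smiles as mol):

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


def morgan_binary_features_generator(mol, radius: int = 2,
                                     num_bits: int = 2048) -> np.ndarray:
    """Sketch of a Morgan (ECFP-style) binary fingerprint generator."""
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius,
                                                        nBits=num_bits)
    features = np.zeros((num_bits,))
    DataStructs.ConvertToNumpyArray(fingerprint, features)
    return features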
Example 8
def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metrics=args.metrics,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metrics=args.metrics,
                                    args=args,
                                    logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
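The single_task_sklearn and multi_task_sklearn helpers called above are not reproduced in these excerpts. As an illustration of the per-task branch only, here is a sketch that fits one copy of the model per task and skips datapoints whose label for that task is missing; the metric_funcs mapping is an assumed stand-in for chemprop's metric handling:

from copy import deepcopy
from typing import Callable, Dict, List

import numpy as np


def single_task_sklearn_sketch(model, train_data, test_data,
                               metric_funcs: Dict[str, Callable]
                               ) -> Dict[str, List[float]]:
    """Illustrative sketch: one model per task, missing labels (None) skipped."""
    scores = {name: [] for name in metric_funcs}

    for task_num in range(train_data.num_tasks()):
        def task_xy(dataset):
            xs, ys = [], []
            for feats, targets in zip(dataset.features(), dataset.targets()):
                if targets[task_num] is not None:
                    xs.append(feats)
                    ys.append(targets[task_num])
            return np.array(xs), np.array(ys)

        train_X, train_y = task_xy(train_data)
        test_X, test_y = task_xy(test_data)

        task_model = deepcopy(model)
        task_model.fit(train_X, train_y)
        # For classification metrics such as AUC, predict_proba would be
        # used here instead of predict.
        preds = task_model.predict(test_X)

        for name, metric_func in metric_funcs.items():
            scores[name].append(metric_func(test_y, preds))

    return scores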
Example 9
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
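    """Loads data, trains a scikit-learn model, and returns the test score for each task."""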
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metric_func=metric_func,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metric_func=metric_func,
                                    args=args,
                                    logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores
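Example 9 resolves args.metric to a callable via get_metric_func. A minimal version of such a registry, covering only the two defaults that Example 10 selects (rmse for regression, auc for classification) and using scikit-learn's metrics:

from typing import Callable

import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score


def rmse(targets, preds) -> float:
    """Root-mean-squared error."""
    return float(np.sqrt(mean_squared_error(targets, preds)))


def get_metric_func(metric: str) -> Callable:
    """Sketch of a name-to-function metric registry (only the metrics used
    as defaults in Example 10 are included)."""
    if metric == 'rmse':
        return rmse
    if metric == 'auc':
        return roc_auc_score
    raise ValueError(f'Metric "{metric}" not supported')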
Example 10
from chemprop.args import SklearnTrainArgs
from chemprop.sklearn_train import cross_validate_sklearn
from chemprop.utils import create_logger

if __name__ == '__main__':
    args = SklearnTrainArgs().parse_args()

    logger = create_logger(name='sklearn-train',
                           save_dir=args.save_dir,
                           quiet=args.quiet)

    if args.metric is None:
        if args.dataset_type == 'regression':
            args.metric = 'rmse'
        elif args.dataset_type == 'classification':
            args.metric = 'auc'
        else:
            raise ValueError(
                f'Default metric not supported for dataset_type "{args.dataset_type}"'
            )

    cross_validate_sklearn(args, logger)
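Example 10 doubles as a usage template. Since SklearnTrainArgs exposes the argparse-style parse_args interface seen throughout these examples, the same flow can presumably be driven programmatically by passing an explicit argument list; the flag names below mirror the attributes referenced in the examples, and the paths are placeholders:

# Hypothetical: drive the Example 10 flow programmatically by passing an
# explicit argument list (placeholder paths) instead of relying on sys.argv.
from chemprop.args import SklearnTrainArgs

args = SklearnTrainArgs().parse_args([
    '--data_path', 'data.csv',            # placeholder input CSV
    '--dataset_type', 'classification',
    '--model_type', 'random_forest',
    '--save_dir', 'sklearn_checkpoints',  # placeholder output directory
])
# ...then continue exactly as in Example 10: create_logger, the default-metric
# check, and cross_validate_sklearn(args, logger).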