def cross_validate_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> Tuple[float, float]:
    info = logger.info if logger is not None else print

    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
def sklearn_train() -> None:
    """Parses scikit-learn training arguments and trains a scikit-learn model.

    This is the entry point for the command line command :code:`sklearn_train`.
    """
    cross_validate(args=SklearnTrainArgs().parse_args(), train_func=run_sklearn)
def sklearn_train() -> None:
    """Runs sklearn training."""
    args = SklearnTrainArgs().parse_args()
    logger = create_logger(name='sklearn-train', save_dir=args.save_dir, quiet=args.quiet)
    cross_validate_sklearn(args, logger)
def sklearn_train() -> None:
    """Parses scikit-learn training arguments and trains a scikit-learn model.

    This is the entry point for the command line command :code:`sklearn_train`.
    """
    args = SklearnTrainArgs().parse_args()
    logger = create_logger(name='sklearn-train', save_dir=args.save_dir, quiet=args.quiet)
    cross_validate_sklearn(args, logger)
def predict_sklearn(args: SklearnPredictArgs):
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_column=args.smiles_column,
                    target_columns=[])

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles,
                                                  radius=train_args.radius,
                                                  num_bits=train_args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
def cross_validate_sklearn(args: SklearnTrainArgs) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation for a scikit-learn model.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=SKLEARN_TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    info = logger.info if logger is not None else print

    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
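# Minimal, self-contained sketch (not part of the original module) of the score
# aggregation performed at the end of cross_validate_sklearn above: per-fold scores
# are first averaged across tasks with np.nanmean, then the mean and standard
# deviation are taken across folds. The fold/task scores below are made-up values.
def _example_score_aggregation() -> None:
    import numpy as np

    # Shape: (num_folds, num_tasks); np.nan marks a task with no valid test score.
    all_scores = np.array([
        [0.81, 0.77, np.nan],
        [0.79, 0.80, 0.75],
        [0.83, np.nan, 0.78],
    ])
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each fold across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    print(f'Overall test auc = {mean_score:.6f} +/- {std_score:.6f}')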
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(morgan_fingerprint(mol=s,
                                                         radius=train_args.radius,
                                                         num_bits=train_args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)  # TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
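# Rough, self-contained sketch (an assumption, not the chemprop implementation) of what
# the 'morgan' features generator used above produces for one SMILES string: a binary
# Morgan fingerprint of length num_bits. Requires RDKit; the SMILES, radius, and
# num_bits values below are arbitrary examples.
def _example_morgan_fingerprint(smiles: str = 'CCO', radius: int = 2, num_bits: int = 2048):
    import numpy as np
    from rdkit import Chem
    from rdkit.Chem import AllChem

    mol = Chem.MolFromSmiles(smiles)
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    # Convert the RDKit bit vector to a plain numpy array of 0s and 1s.
    return np.array(bit_vect)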
def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(f'SVM can only handle single-task data but found {data.num_tasks()} tasks')

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)

    debug(f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}')

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles,
                                                      radius=args.radius,
                                                      num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metrics=args.metrics,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metrics=args.metrics,
                                    args=args,
                                    logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
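# Self-contained, illustrative sketch of the model-selection branch in run_sklearn above:
# regression data gets a RandomForestRegressor or SVR, classification data gets a
# RandomForestClassifier or SVC. The random feature matrix stands in for the Morgan
# fingerprints computed in run_sklearn; sizes and hyperparameters are placeholders.
def _example_build_and_fit(dataset_type: str = 'regression', model_type: str = 'random_forest'):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.svm import SVC, SVR

    if dataset_type == 'regression':
        model = RandomForestRegressor(n_estimators=100, n_jobs=-1) if model_type == 'random_forest' else SVR()
        targets = np.random.rand(50)                     # continuous targets
    else:
        model = RandomForestClassifier(n_estimators=100, n_jobs=-1) if model_type == 'random_forest' else SVC()
        targets = np.random.randint(0, 2, size=50)       # binary class labels
    features = np.random.randint(0, 2, size=(50, 2048))  # stand-in for Morgan fingerprint bits
    model.fit(features, targets)
    return model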
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(f'SVM can only handle single-task data but found {data.num_tasks()} tasks')

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          args=args)

    debug(f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}')

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(morgan_fingerprint(mol=datapoint.smiles,
                                                      radius=args.radius,
                                                      num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metric_func=metric_func,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metric_func=metric_func,
                                    args=args,
                                    logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores
from chemprop.args import SklearnTrainArgs
from chemprop.sklearn_train import cross_validate_sklearn
from chemprop.utils import create_logger

if __name__ == '__main__':
    args = SklearnTrainArgs().parse_args()
    logger = create_logger(name='sklearn-train', save_dir=args.save_dir, quiet=args.quiet)

    if args.metric is None:
        if args.dataset_type == 'regression':
            args.metric = 'rmse'
        elif args.dataset_type == 'classification':
            args.metric = 'auc'
        else:
            raise ValueError(f'Default metric not supported for dataset_type "{args.dataset_type}"')

    cross_validate_sklearn(args, logger)
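# Example command line for the script above (illustrative only: the data path and save
# directory are placeholders, and it assumes the script is saved as sklearn_train.py;
# flag names mirror the SklearnTrainArgs attributes used in this module):
#
#   python sklearn_train.py --data_path data/my_regression_data.csv --dataset_type regression \
#       --model_type random_forest --num_folds 3 --save_dir sklearn_checkpoints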