def run_sklearn(args: SklearnTrainArgs, data: MoleculeDataset, logger: Logger = None) -> Dict[str, List[float]]:
    """Train a scikit-learn model on pre-loaded data and return its test scores.

    Splits the data, featurizes every molecule with Morgan fingerprints,
    builds a random-forest or SVM model matching the dataset type, trains it
    (single- or multi-task), and reports per-metric test scores.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments
                 for loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    # Fall back to plain print when no logger is supplied.
    debug, info = (logger.debug, logger.info) if logger is not None else (print, print)

    # SVR/SVC are inherently single-output models.
    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(
        data=data,
        split_type=args.split_type,
        seed=args.seed,
        sizes=args.split_sizes,
        num_folds=args.num_folds,
        args=args,
    )

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    fingerprint_fn = get_features_generator('morgan')
    for split in (train_data, test_data):
        for mol_datapoint in tqdm(split, total=len(split)):
            mol_datapoint.set_features(
                fingerprint_fn(
                    mol=mol_datapoint.smiles,
                    radius=args.radius,
                    num_bits=args.num_bits,
                )
            )

    debug('Building model')
    # Dispatch table keyed on (dataset_type, model_type); each entry is a
    # zero-argument factory so nothing is constructed until selected.
    model_factories = {
        ('regression', 'random_forest'): lambda: RandomForestRegressor(
            n_estimators=args.num_trees, n_jobs=-1
        ),
        ('regression', 'svm'): SVR,
        ('classification', 'random_forest'): lambda: RandomForestClassifier(
            n_estimators=args.num_trees, n_jobs=-1, class_weight=args.class_weight
        ),
        ('classification', 'svm'): SVC,
    }
    if args.dataset_type not in ('regression', 'classification'):
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')
    factory = model_factories.get((args.dataset_type, args.model_type))
    if factory is None:
        raise ValueError(f'Model type "{args.model_type}" not supported')
    model = factory()

    debug(model)

    # Stash the full argument set on the model so checkpoints are self-describing.
    model.train_args = args.as_dict()

    debug('Training')
    trainer = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = trainer(
        model=model,
        train_data=train_data,
        test_data=test_data,
        metrics=args.metrics,
        args=args,
        logger=logger,
    )

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    """Load data from disk, train a scikit-learn model, and return test scores.

    Unlike the dataset-accepting variant, this function reads the data from
    :code:`args.data_path` itself and evaluates a single metric
    (:code:`args.metric`) rather than a list of metrics.

    NOTE(review): this module appears to define ``run_sklearn`` twice; if both
    definitions share one module the later one shadows the earlier — confirm
    which is intended to be public.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments
                 for loading data and training the scikit-learn model.
    :param logger: A logger to record output.
    :return: A list of test scores, one value per task.
    """
    # Fall back to plain print when no logger is supplied.
    debug, info = (logger.debug, logger.info) if logger is not None else (print, print)

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(
        path=args.data_path,
        smiles_column=args.smiles_column,
        target_columns=args.target_columns,
    )
    args.task_names = get_task_names(
        path=args.data_path,
        smiles_column=args.smiles_column,
        target_columns=args.target_columns,
        ignore_columns=args.ignore_columns,
    )

    # SVR/SVC are inherently single-output models.
    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(
        data=data,
        split_type=args.split_type,
        seed=args.seed,
        sizes=args.split_sizes,
        args=args,
    )

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    fingerprint_fn = get_features_generator('morgan')
    for split in (train_data, test_data):
        for mol_datapoint in tqdm(split, total=len(split)):
            mol_datapoint.set_features(
                fingerprint_fn(
                    mol=mol_datapoint.smiles,
                    radius=args.radius,
                    num_bits=args.num_bits,
                )
            )

    debug('Building model')
    # Dispatch table keyed on (dataset_type, model_type); each entry is a
    # zero-argument factory so nothing is constructed until selected.
    model_factories = {
        ('regression', 'random_forest'): lambda: RandomForestRegressor(
            n_estimators=args.num_trees, n_jobs=-1
        ),
        ('regression', 'svm'): SVR,
        ('classification', 'random_forest'): lambda: RandomForestClassifier(
            n_estimators=args.num_trees, n_jobs=-1, class_weight=args.class_weight
        ),
        ('classification', 'svm'): SVC,
    }
    if args.dataset_type not in ('regression', 'classification'):
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')
    factory = model_factories.get((args.dataset_type, args.model_type))
    if factory is None:
        raise ValueError(f'Model type "{args.model_type}" not supported')
    model = factory()

    debug(model)

    # Stash the full argument set on the model so checkpoints are self-describing.
    model.train_args = args.as_dict()

    debug('Training')
    trainer = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = trainer(
        model=model,
        train_data=train_data,
        test_data=test_data,
        metric_func=metric_func,
        args=args,
        logger=logger,
    )

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores