Example 1
# Imports reconstructed for this excerpt; the chemprop paths below are assumed
# from chemprop v1.x and may differ between releases.
from logging import Logger
from pprint import pformat
from typing import Dict, List

import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR

from chemprop.args import SklearnTrainArgs
from chemprop.data import MoleculeDataset, get_data, get_task_names, split_data
from chemprop.features import get_features_generator
from chemprop.utils import save_smiles_splits

# single_task_sklearn and multi_task_sklearn are defined alongside this
# function in the same module (sklearn_train.py), so they need no import here.


def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns the model's test scores.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

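    # Note: the dataset loaded below shadows the `data` argument, so training
    # always uses a fresh read from args.data_path.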
    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_columns=args.smiles_columns,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Split off a validation set (discarded here) so that the train and test
    # sets match those produced when training the MPN model.
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
        )

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

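    # Morgan fingerprints are fixed-length binary circular fingerprints
    # (ECFP-like). One is computed per SMILES string and appended to the
    # datapoint's feature vector via extend_features.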
    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            for s in datapoint.smiles:
                datapoint.extend_features(
                    morgan_fingerprint(mol=s,
                                       radius=args.radius,
                                       num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1,
                                          random_state=args.seed)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

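    # Attach the training arguments to the fitted estimator so they travel
    # with it (useful, e.g., if the model is later pickled; this is a plain
    # attribute assignment, not a scikit-learn API).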
    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metrics=args.metrics,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metrics=args.metrics,
                                    args=args,
                                    logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
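
A minimal invocation sketch for Example 1, assuming chemprop v1-style APIs; the file name, column layout, and save directory below are hypothetical:

# Hypothetical driver: build SklearnTrainArgs as the chemprop CLI would, then train.
from chemprop.args import SklearnTrainArgs
from chemprop.data import get_data

args = SklearnTrainArgs().parse_args([
    '--data_path', 'freesolv.csv',       # hypothetical CSV with SMILES + targets
    '--dataset_type', 'regression',
    '--model_type', 'random_forest',
    '--save_dir', 'sklearn_checkpoints', # hypothetical output directory
])
data = get_data(path=args.data_path)     # run_sklearn re-loads from args.data_path regardless
scores = run_sklearn(args=args, data=data)
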
Example 2
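# Example 2 is an earlier revision of the same function. It assumes the same
# module context as Example 1, plus get_metric_func (import path assumed:
# chemprop.utils in older chemprop releases).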
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    metric_func = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Split off a validation set (discarded here) so that the train and test
    # sets match those produced when training the MPN model.
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

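    # Unlike Example 1, this revision assumes one SMILES string per datapoint
    # and uses set_features (replace) rather than extend_features (append).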
    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metric_func=metric_func,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metric_func=metric_func,
                                    args=args,
                                    logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores
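
Compared with Example 1, this older revision evaluates a single args.metric through get_metric_func rather than a list of args.metrics, reads one smiles_column, returns a flat List[float] of per-task scores, and neither seeds the random forest nor saves the SMILES splits.

For reference, the core pipeline both examples wrap (Morgan fingerprints fed to a scikit-learn estimator) can be sketched without chemprop. This is an illustrative toy, not chemprop's implementation; radius 2 and 2048 bits mirror chemprop's usual Morgan defaults (treat that as an assumption), and the molecules and targets are made up:

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestRegressor

def morgan_features(smiles: str, radius: int = 2, num_bits: int = 2048) -> np.ndarray:
    """Binary Morgan (ECFP-like) fingerprint for a single molecule."""
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    return np.array(fp)

# Toy training set: SMILES strings with invented regression targets.
train_smiles = ['CCO', 'CCN', 'c1ccccc1', 'CC(=O)O']
train_y = [0.5, 0.7, 1.2, 0.3]

X = np.stack([morgan_features(s) for s in train_smiles])
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X, train_y)

print(model.predict(np.stack([morgan_features('CCC')])))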