Example #1
def multi_task_sklearn(model,
                       train_data: MolPairDataset,
                       val_data: MolPairDataset,
                       test_data: MolPairDataset,
                       metric_func: Callable,
                       args: Namespace,
                       logger: Logger = None) -> List[float]:
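    """Trains a scikit-learn model jointly on all tasks, saves it to disk,
    and returns a list of test scores, one per task."""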
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    feats = compose_feats(train_data)
    debug(f'Using {feats[0].shape} features')
    model.fit(feats, train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    debug('Predicting')
    test_preds = predict(model=model,
                         model_type=args.model_type,
                         dataset_type=args.dataset_type,
                         features=compose_feats(test_data))

    if args.save_preds:
        val_preds = predict(model=model,
                            model_type=args.model_type,
                            dataset_type=args.dataset_type,
                            features=compose_feats(val_data))
        val_scores = evaluate_predictions(preds=val_preds,
                                          targets=val_data.targets(),
                                          num_tasks=num_tasks,
                                          metric_func=metric_func,
                                          dataset_type=args.dataset_type,
                                          logger=logger)
        info(f'Val {args.metric} = {np.nanmean(val_scores)}')
        save_predictions(args.save_dir, None, val_data, test_data, None,
                         val_preds, test_preds, args.task_names)

    scores = evaluate_predictions(preds=test_preds,
                                  targets=test_data.targets(),
                                  num_tasks=num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  logger=logger)
    return scores
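The example above pickles the fitted estimator so it can be reloaded for later evaluation. Below is a minimal, self-contained sketch of that save/load round trip, assuming a plain scikit-learn regressor; the checkpoints directory and the toy data are illustrative only, not part of the original code.

import os
import pickle

from sklearn.ensemble import RandomForestRegressor

save_dir = 'checkpoints'  # hypothetical save directory
os.makedirs(save_dir, exist_ok=True)

X_train = [[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]]
y_train = [0.1, 0.9, 0.5]

model = RandomForestRegressor(n_estimators=10)
model.fit(X_train, y_train)

# Persist the fitted estimator, mirroring the model.pkl convention above.
with open(os.path.join(save_dir, 'model.pkl'), 'wb') as f:
    pickle.dump(model, f)

# Reload and predict later, e.g. from a separate evaluation script.
with open(os.path.join(save_dir, 'model.pkl'), 'rb') as f:
    loaded_model = pickle.load(f)

print(loaded_model.predict([[0.2, 0.8]]))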
Example #2
def multi_task_sklearn(model,
                       train_data: MoleculeDataset,
                       test_data: MoleculeDataset,
                       metric_func: Callable,
                       args: SklearnTrainArgs,
                       logger: Logger = None) -> List[float]:
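    """Trains a scikit-learn model jointly on all tasks using precomputed
    molecule features, saves it to disk, and returns a list of test scores,
    one per task."""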
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    model.fit(train_data.features(), train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    test_preds = predict(model=model,
                         model_type=args.model_type,
                         dataset_type=args.dataset_type,
                         features=test_data.features())

    scores = evaluate_predictions(preds=test_preds,
                                  targets=test_data.targets(),
                                  num_tasks=num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  logger=logger)

    return scores
Example #3
def multi_task_random_forest(train_data: MoleculeDataset,
                             test_data: MoleculeDataset,
                             metric_func: Callable,
                             args: Namespace) -> List[float]:
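    """Trains a multi-output random forest on all tasks jointly and returns
    a list of test scores, one per task."""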
    if args.dataset_type == 'regression':
        model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
    elif args.dataset_type == 'classification':
        model = RandomForestClassifier(n_estimators=args.num_trees, n_jobs=-1)
    else:
        raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    model.fit(train_data.features(), train_targets)

    test_preds = model.predict(test_data.features())
    if train_data.num_tasks() == 1:
        test_preds = [[pred] for pred in test_preds]

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        metric_func=metric_func,
        dataset_type=args.dataset_type
    )

    return scores
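The multi-task variant above leans on scikit-learn's native multi-output support: fitting RandomForestRegressor with a 2-D target matrix trains one forest that predicts all tasks jointly, while a 1-D target yields 1-D predictions, which is why the single-task case is wrapped as [[pred]] before scoring. A minimal sketch with synthetic data:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.random((50, 8))   # 50 molecules, 8 features
Y = rng.random((50, 3))   # 3 regression tasks

model = RandomForestRegressor(n_estimators=20, n_jobs=-1)
model.fit(X, Y)           # 2-D targets -> one forest predicting all tasks

print(model.predict(X[:5]).shape)  # (5, 3): one column per task

# With a single task the targets are 1-D and predict() returns a 1-D array,
# hence the [[pred] for pred in test_preds] reshaping in the example above.
single = RandomForestRegressor(n_estimators=20).fit(X, Y[:, 0])
print(single.predict(X[:5]).shape)  # (5,)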
Example #4
def compute_values(dataset: str, preds: List[List[List[float]]],
                   targets: List[List[List[float]]]) -> List[float]:
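    """Evaluates each (preds, targets) pair with the dataset's metric and
    returns the NaN-aware mean score for each pair."""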
    num_tasks = len(preds[0][0])

    values = [
        evaluate_predictions(preds=pred,
                             targets=target,
                             num_tasks=num_tasks,
                             metric_func=DATASETS[dataset]['metric'],
                             dataset_type=DATASETS[dataset]['type'],
                             logger=FAKE_LOGGER)
        for pred, target in tqdm(zip(preds, targets), total=len(preds))
    ]

    values = [np.nanmean(value) for value in values]

    return values
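np.nanmean is used above because a task whose score cannot be computed shows up as NaN, and a plain mean would propagate that NaN instead of averaging over the remaining tasks. A tiny illustration:

import numpy as np

per_task_scores = [0.81, float('nan'), 0.74]
print(np.mean(per_task_scores))     # nan
print(np.nanmean(per_task_scores))  # 0.775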
Example #5
def single_task_random_forest(train_data: MoleculeDataset,
                              test_data: MoleculeDataset,
                              metric_func: Callable,
                              args: Namespace,
                              logger: Logger = None) -> List[float]:
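    """Trains one random forest per task, skipping molecules whose target for
    that task is missing, and returns a list of test scores, one per task."""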
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        if args.dataset_type == 'regression':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.dataset_type == 'classification':
            model = RandomForestClassifier(class_weight=args.class_weight,
                                           n_estimators=args.num_trees,
                                           n_jobs=-1)
        else:
            raise ValueError(
                f'dataset_type "{args.dataset_type}" not supported.')

        model.fit(train_features, train_targets)

        test_preds = model.predict(test_features)

        test_preds = [[pred] for pred in test_preds]
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores
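The filtering above uses zip(*...) to transpose a list of (features, target) pairs, keeping only molecules whose target for the current task is not None; the same idiom appears in Example #6 below. A minimal sketch with made-up features and targets:

features = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]
targets = [[1.0, None], [None, 0.0], [0.5, 1.0]]  # 3 molecules, 2 tasks

task_num = 1
task_features, task_targets = zip(
    *[(f, t[task_num]) for f, t in zip(features, targets)
      if t[task_num] is not None])

print(task_features)  # ([0.3, 0.4], [0.5, 0.6])
print(task_targets)   # (0.0, 1.0)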
Example #6
def single_task_sklearn(model,
                        train_data: MoleculeDataset,
                        test_data: MoleculeDataset,
                        metric_func: Callable,
                        args: SklearnTrainArgs,
                        logger: Logger = None) -> List[float]:
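    """Fits the given scikit-learn model separately for each task, skipping
    molecules whose target for that task is missing, and returns a list of
    test scores, one per task."""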
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        model.fit(train_features, train_targets)

        test_preds = predict(model=model,
                             model_type=args.model_type,
                             dataset_type=args.dataset_type,
                             features=test_features)
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores
Example #7
def run_training(args: Namespace, logger: Logger = None) -> List[float]:
    """
    Trains a model and returns test scores on the model checkpoint with the highest validation score.

    :param args: Arguments.
    :param logger: Logger.
    :return: A list of ensemble scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.2, 0.0),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        with open(args.data_path, 'r') as f:
            reader = csv.reader(f)
            header = next(reader)

            lines_by_smiles = {}
            indices_by_smiles = {}
            for i, line in enumerate(reader):
                smiles = line[0]
                lines_by_smiles[smiles] = line
                indices_by_smiles[smiles] = i

        all_split_indices = []
        for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                              (test_data, 'test')]:
            with open(os.path.join(args.save_dir, name + '_smiles.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(['smiles'])
                for smiles in dataset.smiles():
                    writer.writerow([smiles])
            with open(os.path.join(args.save_dir, name + '_full.csv'),
                      'w') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                for smiles in dataset.smiles():
                    writer.writerow(lines_by_smiles[smiles])
            split_indices = []
            for smiles in dataset.smiles():
                split_indices.append(indices_by_smiles[smiles])
            split_indices = sorted(split_indices)
            all_split_indices.append(split_indices)
        with open(os.path.join(args.save_dir, 'split_indices.pckl'),
                  'wb') as f:
            pickle.dump(all_split_indices, f)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting the mean and dividing by the standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except TypeError:
            # Some SummaryWriter versions expect `logdir` instead of `log_dir`.
            writer = SummaryWriter(logdir=save_dir)
        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    current_args=args,
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = build_model(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
            model = model.cuda()

        # Ensure the model is saved in the correct location for evaluation, even if trained for 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data=train_data,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data=val_data,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  batch_size=args.batch_size,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if (args.minimize_score and avg_val_score < best_score) or \
                    (not args.minimize_score and avg_val_score > best_score):
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                cuda=args.cuda,
                                logger=logger)

        test_preds = predict(model=model,
                             data=test_data,
                             batch_size=args.batch_size,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')
    writer.add_scalar(f'ensemble_test_{args.metric}', avg_ensemble_test_score,
                      0)

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
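The ensemble evaluation at the end sums each model's test predictions into sum_test_preds and divides by the ensemble size before scoring the averaged predictions. A minimal numpy sketch of that averaging, with random arrays standing in for the per-model predictions:

import numpy as np

ensemble_size = 3
n_molecules, n_tasks = 4, 2

rng = np.random.default_rng(0)
sum_test_preds = np.zeros((n_molecules, n_tasks))
for _ in range(ensemble_size):
    model_preds = rng.random((n_molecules, n_tasks))  # stand-in for predict(...)
    sum_test_preds += model_preds

avg_test_preds = (sum_test_preds / ensemble_size).tolist()
print(avg_test_preds)  # one averaged prediction per molecule and task, as passed to evaluate_predictions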