Esempio n. 1
0
def class_balance(data_path: str, split_type: str):
    # Update args
    args.val_fold_index, args.test_fold_index = 1, 2
    args.split_type = 'predetermined'

    # Load data
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = args.target_columns or get_task_names(
        path=args.data_path, smiles_columns=args.smiles_column)

    # Average class sizes
    all_class_sizes = {'train': [], 'val': [], 'test': []}

    for i in range(10):
        print(f'Fold {i}')

        # Update args
        data_name = os.path.splitext(os.path.basename(data_path))[0]
        args.folds_file = f'/data/rsg/chemistry/yangk/lsc_experiments_dump_splits/data/{data_name}/{split_type}/fold_{i}/0/split_indices.pckl'

        if not os.path.exists(args.folds_file):
            print(f'Fold indices do not exist')
            continue

        # Split data
        train_data, val_data, test_data = split_data(
            data=data, split_type=args.split_type, args=args)

        # Determine class balance
        for data_split, split_name in [(train_data, 'train'),
                                       (val_data, 'val'), (test_data, 'test')]:
            class_sizes = get_class_sizes(data_split)
            print(f'Class sizes for {split_name}')

            for i, task_class_sizes in enumerate(class_sizes):
                print(
                    f'{args.task_names[i]} '
                    f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
                )

            all_class_sizes[split_name].append(class_sizes)

        print()

    # Mean and std across folds
    for split_name in ['train', 'val', 'test']:
        print(f'Average class sizes for {split_name}')

        mean_class_sizes, std_class_sizes = np.mean(
            all_class_sizes[split_name],
            axis=0), np.std(all_class_sizes[split_name], axis=0)

        for i, (mean_task_class_sizes, std_task_class_sizes) in enumerate(
                zip(mean_class_sizes, std_class_sizes)):
            print(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {mean_size * 100:.2f}% +/- {std_size * 100:.2f}%" for cls, (mean_size, std_size) in enumerate(zip(mean_task_class_sizes, std_task_class_sizes)))}'
            )
Esempio n. 2
0
 def test_features_and_phase_features(self):
     """Testing the handling of phase features"""
     data = get_data(path=self.data_path,
                     features_path=['dummy_path.csv'],
                     phase_features_path='dummy_path.csv')
     self.assertTrue(
         np.array_equal(data.features(),
                        [[0, 1, 0, 1], [1, 0, 1, 0], [1, 0, 1, 0]]))
def examine_split_balance(split_type: str):
    results = []

    for dataset in DATASETS:
        # Load task names for the dataset
        data_path = os.path.join(BASE, dataset, f'{dataset}.csv')
        data = get_data(data_path)

        # Get class balance ratios for full dataset
        ratios = compute_ratios(data)

        # Initialize array of diffs between ratios
        ratio_diffs = []

        # Loop through folds
        for fold in os.listdir(os.path.join(BASE, dataset, split_type)):
            # Open fold indices
            with open(
                    os.path.join(BASE, dataset, split_type, fold, '0',
                                 'split_indices.pckl'), 'rb') as f:
                indices = pickle.load(f)

            # Get test data
            test_data = MoleculeDataset([data[index] for index in indices[2]])

            # Get test ratios
            test_ratios = compute_ratios(test_data)

            # Compute ratio diff
            ratio_diff = np.maximum(ratios / test_ratios, test_ratios / ratios)
            ratio_diff[np.where(np.isinf(ratio_diff))[0]] = np.nan

            # Add ratio diff
            ratio_diffs.append(ratio_diff)

        # Convert to numpy array
        ratio_diffs = np.array(ratio_diffs)  # num_folds x num_tasks

        # Determine number of folds and number of failures
        num_folds = len(ratio_diffs)
        num_failures = np.sum(np.isnan(ratio_diffs))

        # Average across tasks
        ratio_diffs = np.nanmean(ratio_diffs, axis=1)  # num_folds

        # Compute mean and standard deviation across folds
        mean, std = np.nanmean(ratio_diffs), np.nanstd(ratio_diffs)

        # Add results
        results.append({
            'dataset': dataset,
            'mean': mean,
            'std': std,
            'num_folds': num_folds,
            'num_failures': num_failures
        })

    pprint(results)
Esempio n. 4
0
 def test_features(self):
     """Testing the features return"""
     data = get_data(
         path=self.data_path,
         features_path=['dummy_path.csv'],
     )
     print(data.features())
     self.assertTrue(
         np.array_equal(data.features(), [[0, 1], [2, 3], [4, 5]]))
Esempio n. 5
0
def average_duplicates(args: Args):
    """Averages duplicate data points in a dataset."""
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=args.target_columns)
    print(f'Data size = {len(data):,}')

    # Map SMILES string to lists of targets
    smiles_in_order = []
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(flatten=True), data.targets()):
        smiles_to_targets[smiles].append(targets)
        if len(smiles_to_targets[smiles]) == 1:
            smiles_in_order.append(smiles)

    # Find duplicates
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles in smiles_in_order:
        all_targets = smiles_to_targets[smiles]
        duplicate_count += len(all_targets) - 1
        num_tasks = len(all_targets[0])

        targets_by_task = [[] for _ in range(num_tasks)]
        for task in range(num_tasks):
            for targets in all_targets:
                if targets[task] is not None:
                    targets_by_task[task].append(targets[task])

        stds.append([
            np.std(task_targets) if len(task_targets) > 0 else 0.0
            for task_targets in targets_by_task
        ])
        means = [
            np.mean(task_targets) if len(task_targets) > 0 else None
            for task_targets in targets_by_task
        ]
        new_data.append((smiles, means))

    print(f'Number of duplicates = {duplicate_count:,}')
    print(
        f'Duplicate standard deviation per task = {", ".join(f":{std:.4e}" for std in np.mean(stds, axis=0))}'
    )
    print(f'New data size = {len(new_data):,}')

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' + ','.join(
                str(value) if value is not None else ''
                for value in avg_targets) + '\n')
Esempio n. 6
0
 def test_2features(self):
     """Testing the features return for two features paths"""
     data = get_data(
         path=self.data_path,
         features_path=['dummy_path.csv', 'also_dummy_path.csv'],
     )
     print(data.features())
     self.assertTrue(
         np.array_equal(data.features(),
                        [[0, 1, 0, 1], [2, 3, 2, 3], [4, 5, 4, 5]]))
Esempio n. 7
0
def overlap(args: Args):
    data_1 = get_data(path=args.data_path_1,
                      smiles_column=args.smiles_column_1)
    data_2 = get_data(path=args.data_path_2,
                      smiles_column=args.smiles_column_2)

    smiles1 = set(data_1.smiles())
    smiles2 = set(data_2.smiles())
    size_1, size_2 = len(smiles1), len(smiles2)
    intersection = smiles1.intersection(smiles2)
    size_intersect = len(intersection)
    print(f'Size of dataset 1: {size_1}')
    print(f'Size of dataset 2: {size_2}')
    print(f'Size of intersection: {size_intersect}')
    print(
        f'Size of intersection as frac of dataset 1: {size_intersect / size_1}'
    )
    print(
        f'Size of intersection as frac of dataset 2: {size_intersect / size_2}'
    )

    if args.save_intersection_path is not None:
        with open(args.data_path_1,
                  'r') as rf, open(args.save_intersection_path, 'w') as wf:
            reader, writer = csv.reader(rf), csv.writer(wf)
            header = next(reader)
            writer.writerow(header)
            for line in reader:
                if line[0] in intersection:
                    writer.writerow(line)

    if args.save_difference_path is not None:
        with open(args.data_path_1, 'r') as rf, open(args.save_difference_path,
                                                     'w') as wf:
            reader, writer = csv.reader(rf), csv.writer(wf)
            header = next(reader)
            writer.writerow(header)
            for line in reader():
                if line[0] not in intersection:
                    writer.writerow(line)
Esempio n. 8
0
def create_time_splits(args: Args):
    # ASSUME DATA GIVEN IN CHRONOLOGICAL ORDER.
    # this will dump a very different format of indices, with all in one file; TODO modify as convenient later.
    data = get_data(path=args.data_path, smiles_columns=args.smiles_columns)
    num_data = len(data)
    all_indices = list(range(num_data))
    fold_indices = {'random': [], 'scaffold': [], 'time': []}
    for i in range(args.num_folds - args.time_folds_per_train_set - 1):
        begin, end = int(i * num_data / args.num_folds), int(
            (i + args.time_folds_per_train_set + 2) * num_data /
            args.num_folds)
        subset_indices = all_indices[begin:end]
        subset_data = MoleculeDataset(data[begin:end])
        fold_indices['random'].append(
            split_indices(deepcopy(subset_indices),
                          args.time_folds_per_train_set + 2))
        fold_indices['scaffold'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          scaffold=True,
                          split_key_molecule=args.split_key_molecule,
                          data=subset_data))
        fold_indices['time'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          shuffle=False))
    for split_type in ['random', 'scaffold', 'time']:
        all_splits = []
        for i in range(len(fold_indices[split_type])):
            os.makedirs(os.path.join(args.save_dir, split_type,
                                     'fold_' + str(i), '0'),
                        exist_ok=True)
            with open(
                    os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                                 '0', 'split_indices.pckl'), 'wb') as wf:
                train = np.concatenate([
                    fold_indices[split_type][i][j]
                    for j in range(args.time_folds_per_train_set)
                ])
                # train = []
                # for fold in train_folds:
                #     train += fold
                val = fold_indices[split_type][i][-2]
                test = fold_indices[split_type][i][-1]
                pickle.dump(
                    [train, val, test], wf
                )  # each is a pickle file containing a list of length-3 index lists for train/val/test
                all_splits.append([train, val, test])
        with open(
                os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                             'split_indices.pckl'), 'wb') as wf:
            pickle.dump(all_splits, wf)
Esempio n. 9
0
def load_data(args: PredictArgs, smiles: List[List[str]]):
    """
    Function to load data from a list of smiles or a file.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: A list of list of smiles, or None if data is to be read from file
    :return: A tuple of a :class:`~chemprop.data.MoleculeDataset` containing all datapoints, a :class:`~chemprop.data.MoleculeDataset` containing only valid datapoints,
                 a :class:`~chemprop.data.MoleculeDataLoader` and a dictionary mapping full to valid indices.
    """
    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    print(f"Test size = {len(test_data):,}")

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    return full_data, test_data, test_data_loader, full_to_valid_indices
Esempio n. 10
0
def create_crossval_splits(args: Args):
    data = get_data(path=args.data_path, smiles_columns=args.smiles_columns)
    num_data = len(data)
    if args.split_type == 'random':
        all_indices = list(range(num_data))
        fold_indices = split_indices(all_indices,
                                     args.num_folds,
                                     scaffold=False)
    elif args.split_type == 'scaffold':
        all_indices = list(range(num_data))
        fold_indices = split_indices(
            all_indices,
            args.num_folds,
            scaffold=True,
            split_key_molecule=args.split_key_molecule,
            data=data)
    else:
        raise ValueError
    random.shuffle(fold_indices)
    for i in range(args.test_folds_to_test):
        all_splits = []
        for j in range(1, args.val_folds_per_test + 1):
            os.makedirs(os.path.join(args.save_dir, args.split_type,
                                     f'fold_{i}', f'{j - 1}'),
                        exist_ok=True)
            with open(
                    os.path.join(args.save_dir, args.split_type, f'fold_{i}',
                                 f'{j - 1}', 'split_indices.pckl'),
                    'wb') as wf:
                val_idx = (i + j) % args.num_folds
                val = fold_indices[val_idx]
                test = fold_indices[i]
                train = []
                for k in range(args.num_folds):
                    if k != i and k != val_idx:
                        train.append(fold_indices[k])
                train = np.concatenate(train)
                pickle.dump([train, val, test], wf)
                all_splits.append([train, val, test])
        with open(
                os.path.join(args.save_dir, args.split_type, f'fold_{i}',
                             'split_indices.pckl'), 'wb') as wf:
            pickle.dump(all_splits, wf)
Esempio n. 11
0
def cross_validate(
    args: TrainArgs, train_func: Callable[[TrainArgs, MoleculeDataset, Logger],
                                          Dict[str, List[float]]]
) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME,
                           save_dir=args.save_dir,
                           quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path,
                    args=args,
                    logger=logger,
                    skip_none_targets=True)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = train_func(
            args, deepcopy(data),
            logger)  # deepcopy since data may be modified
        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(
                f'\tSeed {init_seed + fold_num} ==> test {metric} = {np.nanmean(scores[fold_num]):.6f}'
            )

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(
                        f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}'
                    )

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = np.nanmean(
            scores, axis=1)  # average score for each model across tasks
        mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(
                    f'\tOverall test {task_name} {metric} = '
                    f'{np.nanmean(scores[:, task_num]):.6f} +/- {np.nanstd(scores[:, task_num]):.6f}'
                )

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        for task_num, task_name in enumerate(args.task_names):
            row = [task_name]
            for metric, scores in all_scores.items():
                task_scores = scores[:, task_num]
                mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = np.nanmean(all_scores[args.metric], axis=1)
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([
            pd.read_csv(
                os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
            for fold_num in range(args.num_folds)
        ])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
Esempio n. 12
0
def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_columns=args.smiles_columns,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
        )

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for dataset in [train_data, test_data]:
        for datapoint in tqdm(dataset, total=len(dataset)):
            for s in datapoint.smiles:
                datapoint.extend_features(
                    morgan_fingerprint(mol=s,
                                       radius=args.radius,
                                       num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1,
                                          random_state=args.seed)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    if args.single_task:
        scores = single_task_sklearn(model=model,
                                     train_data=train_data,
                                     test_data=test_data,
                                     metrics=args.metrics,
                                     args=args,
                                     logger=logger)
    else:
        scores = multi_task_sklearn(model=model,
                                    train_data=train_data,
                                    test_data=test_data,
                                    metrics=args.metrics,
                                    args=args,
                                    logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
Esempio n. 13
0
def molecule_fingerprint(
        args: FingerprintArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    if args.fingerprint_type == 'MPN':  # only need to supply input features if using FFN latent representation and if model calls for them.
        validate_feature_sources = False
    else:
        validate_feature_sources = True
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    #set explicit H option and reaction option
    reset_featurization_parameters()
    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    if args.fingerprint_type == 'MPN':
        if args.atom_descriptors == "descriptor":  # special case when we have 'descriptor' extra dimensions need to be added
            total_fp_size = (
                args.hidden_size +
                test_data.atom_descriptors_size()) * args.number_of_molecules
        else:
            if args.reaction_solvent:
                total_fp_size = args.hidden_size + args.hidden_size_solvent
            else:
                total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError(
                'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.'
            )
    elif args.fingerprint_type == 'last_FFN':
        if args.ffn_num_layers != 1:
            total_fp_size = args.ffn_hidden_size
        else:
            raise ValueError(
                'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.'
            )
    else:
        raise ValueError(
            f'Fingerprint type {args.fingerprint_type} not supported')
    all_fingerprints = np.zeros(
        (len(test_data), total_fp_size, len(args.checkpoint_paths)))

    # Load model
    print(
        f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.'
    )

    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
            args.checkpoint_paths[index])

        # Normalize features
        if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        if args.fingerprint_type == 'MPN' and (
                args.features_path is not None or args.features_generator
        ):  # truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints) #TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    fingerprint_columns = []
    if args.fingerprint_type == 'MPN':
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size // args.number_of_molecules):
                for k in range(args.number_of_molecules):
                    fingerprint_columns.append(f'fp_{j}_mol_{k}')
        else:
            for j in range(total_fp_size // args.number_of_molecules):
                for i in range(len(args.checkpoint_paths)):
                    for k in range(args.number_of_molecules):
                        fingerprint_columns.append(f'fp_{j}_mol_{k}_model_{i}')

    else:  # args == 'last_FNN'
        if len(args.checkpoint_paths) == 1:
            for j in range(total_fp_size):
                fingerprint_columns.append(f'fp_{j}')
        else:
            for j in range(total_fp_size):
                for i in range(len(args.checkpoint_paths)):
                    fingerprint_columns.append(f'fp_{j}_model_{i}')

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = all_fingerprints[valid_index].reshape(
            (len(args.checkpoint_paths) * total_fp_size
             )) if valid_index is not None else ['Invalid SMILES'] * len(
                 args.checkpoint_paths) * total_fp_size

        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
Esempio n. 14
0
    print()
    print(
        f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}'
    )
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(' | '.join([
        f'{i}% = {np.percentile(similarities, i):.4f}'
        for i in range(0, 101, 10)
    ]))


if __name__ == '__main__':
    args = Args().parse_args()

    data_1 = get_data(path=args.data_path_1,
                      smiles_column=args.smiles_column_1)
    data_2 = get_data(path=args.data_path_2,
                      smiles_column=args.smiles_column_2)

    if args.similarity_measure == 'scaffold':
        scaffold_similarity(data_1.smiles(), data_2.smiles())
    elif args.similarity_measure == 'morgan':
        morgan_similarity(data_1.smiles(), data_2.smiles(), args.radius,
                          args.sample_rate)
    else:
        raise ValueError(
            f'Similarity measure "{args.similarity_measure}" not supported.')
Esempio n. 15
0
def molecule_fingerprint(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    """

    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    #set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print(f'Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError(
            "Fingerprint generation only supports one model, cannot use an ensemble"
        )

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
        args.checkpoint_paths[0])

    # Normalize features
    if args.features_scaling or train_args.atom_descriptor_scaling or train_args.bond_feature_scaling:
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler,
                                         scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to full_data
    total_hidden_size = args.hidden_size * args.number_of_molecules
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = model_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * total_hidden_size

        fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]
        for i in range(len(fingerprint_columns)):
            datapoint.row[fingerprint_columns[i]] = preds[i]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns +
                                fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
Esempio n. 16
0
 def test_targets(self):
     """Testing the base case targets"""
     data = get_data(path=self.data_path)
     self.assertEqual(data.targets(), [[0, 1], [2, 3], [4, 5]])
Esempio n. 17
0
def make_predictions(
    args: PredictArgs,
    smiles: List[List[str]] = None,
    model_objects: Tuple[PredictArgs, TrainArgs, List[MoleculeModel],
                         List[StandardScaler], int, List[str], ] = None,
    calibrator: UncertaintyCalibrator = None,
    return_invalid_smiles: bool = True,
    return_index_dict: bool = False,
    return_uncertainty: bool = False,
) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :param model_objects: Tuple of output of load_model function which can be called separately outside this function. Preloaded model objects should have
                used the non-generator option for load_model if the objects are to be used multiple times or are intended to be used for calibration as well.
    :param calibrator: A :class: `~chemprop.uncertainty.UncertaintyCalibrator` object, for use in calibrating uncertainty predictions.
                Can be preloaded and provided as a function input or constructed within the function from arguments. The models and scalers used
                to initiate the calibrator must be lists instead of generators if the same calibrator is to be used multiple times or
                if the same models and scalers objects are also part of the provided model_objects input.
    :param return_invalid_smiles: Whether to return predictions of "Invalid SMILES" for invalid SMILES, otherwise will skip them in returned predictions.
    :param return_index_dict: Whether to return the prediction results as a dictionary keyed from the initial data indexes.
    :param return_uncertainty: Whether to return uncertainty predictions alongside the model value predictions.
    :return: A list of lists of target predictions. If returning uncertainty, a tuple containing first prediction values then uncertainty estimates.
    """
    if model_objects:
        (
            args,
            train_args,
            models,
            scalers,
            num_tasks,
            task_names,
        ) = model_objects
    else:
        (
            args,
            train_args,
            models,
            scalers,
            num_tasks,
            task_names,
        ) = load_model(args, generator=True)

    num_models = len(args.checkpoint_paths)

    set_features(args, train_args)

    # Note: to get the invalid SMILES for your data, use the get_invalid_smiles_from_file or get_invalid_smiles_from_list functions from data/utils.py
    full_data, test_data, test_data_loader, full_to_valid_indices = load_data(
        args, smiles)

    if args.uncertainty_method is None and (args.calibration_method is not None
                                            or args.evaluation_methods
                                            is not None):
        if args.dataset_type in ['classification', 'multiclass']:
            args.uncertainty_method = 'classification'
        else:
            raise ValueError(
                'Cannot calibrate or evaluate uncertainty without selection of an uncertainty method.'
            )

    if calibrator is None and args.calibration_path is not None:

        calibration_data = get_data(
            path=args.calibration_path,
            smiles_columns=args.smiles_columns,
            target_columns=task_names,
            features_path=args.calibration_features_path,
            features_generator=args.features_generator,
            phase_features_path=args.calibration_phase_features_path,
            atom_descriptors_path=args.calibration_atom_descriptors_path,
            bond_features_path=args.calibration_bond_features_path,
            max_data_size=args.max_data_size,
            loss_function=args.loss_function,
        )

        calibration_data_loader = MoleculeDataLoader(
            dataset=calibration_data,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
        )

        if isinstance(models, List) and isinstance(scalers, List):
            calibration_models = models
            calibration_scalers = scalers
        else:
            calibration_model_objects = load_model(args, generator=True)
            calibration_models = calibration_model_objects[2]
            calibration_scalers = calibration_model_objects[3]

        calibrator = build_uncertainty_calibrator(
            calibration_method=args.calibration_method,
            uncertainty_method=args.uncertainty_method,
            interval_percentile=args.calibration_interval_percentile,
            regression_calibrator_metric=args.regression_calibrator_metric,
            calibration_data=calibration_data,
            calibration_data_loader=calibration_data_loader,
            models=calibration_models,
            scalers=calibration_scalers,
            num_models=num_models,
            dataset_type=args.dataset_type,
            loss_function=args.loss_function,
            uncertainty_dropout_p=args.uncertainty_dropout_p,
            dropout_sampling_size=args.dropout_sampling_size,
            spectra_phase_mask=getattr(train_args, "spectra_phase_mask", None),
        )

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        preds = [None] * len(full_data)
        unc = [None] * len(full_data)
    else:
        preds, unc = predict_and_save(
            args=args,
            train_args=train_args,
            test_data=test_data,
            task_names=task_names,
            num_tasks=num_tasks,
            test_data_loader=test_data_loader,
            full_data=full_data,
            full_to_valid_indices=full_to_valid_indices,
            models=models,
            scalers=scalers,
            num_models=num_models,
            calibrator=calibrator,
            return_invalid_smiles=return_invalid_smiles,
        )

    if return_index_dict:
        preds_dict = {}
        unc_dict = {}
        for i in range(len(full_data)):
            if return_invalid_smiles:
                preds_dict[i] = preds[i]
                unc_dict[i] = unc[i]
            else:
                valid_index = full_to_valid_indices.get(i, None)
                if valid_index is not None:
                    preds_dict[i] = preds[valid_index]
                    unc_dict[i] = unc[valid_index]
        if return_uncertainty:
            return preds_dict, unc_dict
        else:
            return preds_dict
    else:
        if return_uncertainty:
            return preds, unc
        else:
            return preds
Esempio n. 18
0
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
Esempio n. 19
0
def train():
    """Renders the train page and performs training if request method is POST."""
    global PROGRESS, TRAINING

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, ensemble_size, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), \
        int(request.form['ensembleSize']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv')
    dataset_type = request.form.get('datasetType', 'regression')

    # Create and modify args
    args = TrainArgs().parse_args([
        '--data_path', data_path, '--dataset_type', dataset_type, '--epochs',
        str(epochs), '--ensemble_size',
        str(ensemble_size)
    ])

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    unique_targets = {
        target
        for row in targets for target in row if target is not None
    }

    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append(
            'Selected classification dataset but not all labels are 0 or 1. Select regression instead.'
        )

        return render_train(warnings=warnings, errors=errors)

    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append(
            'Selected regression dataset but all labels are 0 or 1. Select classification instead.'
        )

        return render_train(warnings=warnings, errors=errors)

    if gpu is not None:
        if gpu == 'None':
            args.cuda = False
        else:
            args.gpu = int(gpu)

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name, current_user,
                                        args.dataset_type, args.epochs,
                                        args.ensemble_size, len(targets))

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir

        process = mp.Process(target=progress_bar, args=(args, PROGRESS))
        process.start()
        TRAINING = 1

        # Run training
        logger = create_logger(name='train',
                               save_dir=args.save_dir,
                               quiet=args.quiet)
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        TRAINING = 0
        PROGRESS = mp.Value('d', 0.0)

        # Check if name overlap
        if checkpoint_name != ckpt_name:
            warnings.append(
                name_already_exists_message('Checkpoint', checkpoint_name,
                                            ckpt_name))

        # Move models
        for root, _, files in os.walk(args.save_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    model_id = db.insert_model(ckpt_id)
                    save_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                             f'{model_id}.pt')
                    shutil.move(os.path.join(args.save_dir, root, fname),
                                save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
Esempio n. 20
0
 def test_dataweights(self):
     """Testing the handling of data weights"""
     data = get_data(path=self.data_path,
                     data_weights_path='dummy_path.csv')
     self.assertEqual(data.data_weights(), [1, 1.5, 0.5])
Esempio n. 21
0
 def test_return_dataset(self):
     """Testing the return type"""
     data = get_data(path=self.data_path)
     self.assertIsInstance(data, MoleculeDataset)
Esempio n. 22
0
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print("Loading training args")
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == "feature":
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f"Test size = {len(test_data):,}")

    # Predict with each model individually and sum predictions
    if args.dataset_type == "multiclass":
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=0 if sys.platform == "darwin" else args.num_workers,
    )

    # Partial results for variance robust calculation.
    if args.ensemble_variance:
        all_preds = np.zeros(
            (len(test_data), num_tasks, len(args.checkpoint_paths)))

    print(
        f"Predicting with an ensemble of {len(args.checkpoint_paths)} models")
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        (
            scaler,
            features_scaler,
            atom_descriptor_scaler,
            bond_feature_scaler,
        ) = load_scalers(checkpoint_path)

        # Normalize features
        if (args.features_scaling or train_args.atom_descriptor_scaling
                or train_args.bond_feature_scaling):
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if (train_args.atom_descriptor_scaling
                    and args.atom_descriptors is not None):
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2)
        all_epi_uncs = all_epi_uncs.tolist()

    # Save predictions
    print(f"Saving predictions to {args.preds_path}")
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == "multiclass":
        task_names = [
            f"{name}_class_{i}" for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None else
                 ["Invalid SMILES"] * len(task_names))
        if args.ensemble_variance:
            epi_uncs = (all_epi_uncs[valid_index] if valid_index is not None
                        else ["Invalid SMILES"] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns

            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + "_epi_unc"] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
Esempio n. 23
0
def run_training(args: TrainArgs, logger: Logger = None) -> List[float]:
    """
    Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param logger: A logger to record output.
    :return: A list of model scores for each task.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path, args=args, logger=logger)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(path=args.separate_test_path,
                             args=args,
                             features_path=args.separate_test_features_path,
                             logger=logger)
    if args.separate_val_path:
        val_data = get_data(path=args.separate_val_path,
                            args=args,
                            features_path=args.separate_val_features_path,
                            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(train_data=train_data,
                           val_data=val_data,
                           test_data=test_data,
                           data_path=args.data_path,
                           save_dir=args.save_dir,
                           smiles_column=args.smiles_column)

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        train_smiles, train_targets = train_data.smiles(), train_data.targets()
        scaler = StandardScaler().fit(train_targets)
        scaled_targets = scaler.transform(train_targets).tolist()
        train_data.set_targets(scaled_targets)
    else:
        scaler = None

    # Get loss and metric functions
    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        cache = True
        num_workers = 0
    else:
        cache = False
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           cache=cache,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers,
                                         cache=cache)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers,
                                          cache=cache)

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except:
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        debug(model)
        debug(f'Number of parameters = {param_count(model):,}')
        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, 'model.pt'), model, scaler,
                        features_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')

            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            # Average validation score
            avg_val_score = np.nanmean(val_scores)
            debug(f'Validation {args.metric} = {avg_val_score:.6f}')
            writer.add_scalar(f'validation_{args.metric}', avg_val_score,
                              n_iter)

            if args.show_individual_scores:
                # Individual validation scores
                for task_name, val_score in zip(args.task_names, val_scores):
                    debug(
                        f'Validation {task_name} {args.metric} = {val_score:.6f}'
                    )
                    writer.add_scalar(f'validation_{task_name}_{args.metric}',
                                      val_score, n_iter)

            # Save model checkpoint if improved validation score
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir, 'model.pt'), model,
                                scaler, features_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, 'model.pt'),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        avg_test_score = np.nanmean(test_scores)
        info(f'Model {model_idx} test {args.metric} = {avg_test_score:.6f}')
        writer.add_scalar(f'test_{args.metric}', avg_test_score, 0)

        if args.show_individual_scores:
            # Individual test scores
            for task_name, test_score in zip(args.task_names, test_scores):
                info(
                    f'Model {model_idx} test {task_name} {args.metric} = {test_score:.6f}'
                )
                writer.add_scalar(f'test_{task_name}_{args.metric}',
                                  test_score, n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metric_func=metric_func,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    # Average ensemble score
    avg_ensemble_test_score = np.nanmean(ensemble_scores)
    info(f'Ensemble test {args.metric} = {avg_ensemble_test_score:.6f}')

    # Individual ensemble scores
    if args.show_individual_scores:
        for task_name, ensemble_score in zip(args.task_names, ensemble_scores):
            info(
                f'Ensemble test {task_name} {args.metric} = {ensemble_score:.6f}'
            )

    return ensemble_scores
Esempio n. 24
0
def run_training(args: TrainArgs,
                 data: MoleculeDataset,
                 logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a Chemprop model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`args.metrics` to a list of values for each task.

    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set pytorch seed for random initial weights
    torch.manual_seed(args.pytorch_seed)

    # Split data
    debug(f'Splitting data with seed {args.seed}')
    if args.separate_test_path:
        test_data = get_data(
            path=args.separate_test_path,
            args=args,
            features_path=args.separate_test_features_path,
            atom_descriptors_path=args.separate_test_atom_descriptors_path,
            bond_features_path=args.separate_test_bond_features_path,
            phase_features_path=args.separate_test_phase_features_path,
            smiles_columns=args.smiles_columns,
            logger=logger)
    if args.separate_val_path:
        val_data = get_data(
            path=args.separate_val_path,
            args=args,
            features_path=args.separate_val_features_path,
            atom_descriptors_path=args.separate_val_atom_descriptors_path,
            bond_features_path=args.separate_val_bond_features_path,
            phase_features_path=args.separate_val_phase_features_path,
            smiles_columns=args.smiles_columns,
            logger=logger)

    if args.separate_val_path and args.separate_test_path:
        train_data = data
    elif args.separate_val_path:
        train_data, _, test_data = split_data(data=data,
                                              split_type=args.split_type,
                                              sizes=(0.8, 0.0, 0.2),
                                              seed=args.seed,
                                              num_folds=args.num_folds,
                                              args=args,
                                              logger=logger)
    elif args.separate_test_path:
        train_data, val_data, _ = split_data(data=data,
                                             split_type=args.split_type,
                                             sizes=(0.8, 0.2, 0.0),
                                             seed=args.seed,
                                             num_folds=args.num_folds,
                                             args=args,
                                             logger=logger)
    else:
        train_data, val_data, test_data = split_data(
            data=data,
            split_type=args.split_type,
            sizes=args.split_sizes,
            seed=args.seed,
            num_folds=args.num_folds,
            args=args,
            logger=logger)

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
        debug('Class sizes')
        for i, task_class_sizes in enumerate(class_sizes):
            debug(
                f'{args.task_names[i]} '
                f'{", ".join(f"{cls}: {size * 100:.2f}%" for cls, size in enumerate(task_class_sizes))}'
            )

    if args.save_smiles_splits:
        save_smiles_splits(
            data_path=args.data_path,
            save_dir=args.save_dir,
            task_names=args.task_names,
            features_path=args.features_path,
            train_data=train_data,
            val_data=val_data,
            test_data=test_data,
            smiles_columns=args.smiles_columns,
            logger=logger,
        )

    if args.features_scaling:
        features_scaler = train_data.normalize_features(replace_nan_token=0)
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    if args.atom_descriptor_scaling and args.atom_descriptors is not None:
        atom_descriptor_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_atom_descriptors=True)
        val_data.normalize_features(atom_descriptor_scaler,
                                    scale_atom_descriptors=True)
        test_data.normalize_features(atom_descriptor_scaler,
                                     scale_atom_descriptors=True)
    else:
        atom_descriptor_scaler = None

    if args.bond_feature_scaling and args.bond_features_size > 0:
        bond_feature_scaler = train_data.normalize_features(
            replace_nan_token=0, scale_bond_features=True)
        val_data.normalize_features(bond_feature_scaler,
                                    scale_bond_features=True)
        test_data.normalize_features(bond_feature_scaler,
                                     scale_bond_features=True)
    else:
        bond_feature_scaler = None

    args.train_data_size = len(train_data)

    debug(
        f'Total size = {len(data):,} | '
        f'train size = {len(train_data):,} | val size = {len(val_data):,} | test size = {len(test_data):,}'
    )

    # Initialize scaler and scale training targets by subtracting mean and dividing standard deviation (regression only)
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = train_data.normalize_targets()
    elif args.dataset_type == 'spectra':
        debug(
            'Normalizing spectra and excluding spectra regions based on phase')
        args.spectra_phase_mask = load_phase_mask(args.spectra_phase_mask_path)
        for dataset in [train_data, test_data, val_data]:
            data_targets = normalize_spectra(
                spectra=dataset.targets(),
                phase_features=dataset.phase_features(),
                phase_mask=args.spectra_phase_mask,
                excluded_sub_value=None,
                threshold=args.spectra_target_floor,
            )
            dataset.set_targets(data_targets)
        scaler = None
    else:
        scaler = None

    # Get loss function
    loss_func = get_loss_func(args)

    # Set up test set evaluation
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    if args.dataset_type == 'multiclass':
        sum_test_preds = np.zeros(
            (len(test_smiles), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_test_preds = np.zeros((len(test_smiles), args.num_tasks))

    # Automatically determine whether to cache
    if len(data) <= args.cache_cutoff:
        set_cache_graph(True)
        num_workers = 0
    else:
        set_cache_graph(False)
        num_workers = args.num_workers

    # Create data loaders
    train_data_loader = MoleculeDataLoader(dataset=train_data,
                                           batch_size=args.batch_size,
                                           num_workers=num_workers,
                                           class_balance=args.class_balance,
                                           shuffle=True,
                                           seed=args.seed)
    val_data_loader = MoleculeDataLoader(dataset=val_data,
                                         batch_size=args.batch_size,
                                         num_workers=num_workers)
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=num_workers)

    if args.class_balance:
        debug(
            f'With class_balance, effective train size = {train_data_loader.iter_size:,}'
        )

    # Train ensemble of models
    for model_idx in range(args.ensemble_size):
        # Tensorboard writer
        save_dir = os.path.join(args.save_dir, f'model_{model_idx}')
        makedirs(save_dir)
        try:
            writer = SummaryWriter(log_dir=save_dir)
        except:
            writer = SummaryWriter(logdir=save_dir)

        # Load/build model
        if args.checkpoint_paths is not None:
            debug(
                f'Loading model {model_idx} from {args.checkpoint_paths[model_idx]}'
            )
            model = load_checkpoint(args.checkpoint_paths[model_idx],
                                    logger=logger)
        else:
            debug(f'Building model {model_idx}')
            model = MoleculeModel(args)

        # Optionally, overwrite weights:
        if args.checkpoint_frzn is not None:
            debug(
                f'Loading and freezing parameters from {args.checkpoint_frzn}.'
            )
            model = load_frzn_model(model=model,
                                    path=args.checkpoint_frzn,
                                    current_args=args,
                                    logger=logger)

        debug(model)

        if args.checkpoint_frzn is not None:
            debug(f'Number of unfrozen parameters = {param_count(model):,}')
            debug(f'Total number of parameters = {param_count_all(model):,}')
        else:
            debug(f'Number of parameters = {param_count_all(model):,}')

        if args.cuda:
            debug('Moving model to cuda')
        model = model.to(args.device)

        # Ensure that model is saved in correct location for evaluation if 0 epochs
        save_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME), model, scaler,
                        features_scaler, atom_descriptor_scaler,
                        bond_feature_scaler, args)

        # Optimizers
        optimizer = build_optimizer(model, args)

        # Learning rate schedulers
        scheduler = build_lr_scheduler(optimizer, args)

        # Run training
        best_score = float('inf') if args.minimize_score else -float('inf')
        best_epoch, n_iter = 0, 0
        for epoch in trange(args.epochs):
            debug(f'Epoch {epoch}')
            n_iter = train(model=model,
                           data_loader=train_data_loader,
                           loss_func=loss_func,
                           optimizer=optimizer,
                           scheduler=scheduler,
                           args=args,
                           n_iter=n_iter,
                           logger=logger,
                           writer=writer)
            if isinstance(scheduler, ExponentialLR):
                scheduler.step()
            val_scores = evaluate(model=model,
                                  data_loader=val_data_loader,
                                  num_tasks=args.num_tasks,
                                  metrics=args.metrics,
                                  dataset_type=args.dataset_type,
                                  scaler=scaler,
                                  logger=logger)

            for metric, scores in val_scores.items():
                # Average validation score
                avg_val_score = np.nanmean(scores)
                debug(f'Validation {metric} = {avg_val_score:.6f}')
                writer.add_scalar(f'validation_{metric}', avg_val_score,
                                  n_iter)

                if args.show_individual_scores:
                    # Individual validation scores
                    for task_name, val_score in zip(args.task_names, scores):
                        debug(
                            f'Validation {task_name} {metric} = {val_score:.6f}'
                        )
                        writer.add_scalar(f'validation_{task_name}_{metric}',
                                          val_score, n_iter)

            # Save model checkpoint if improved validation score
            avg_val_score = np.nanmean(val_scores[args.metric])
            if args.minimize_score and avg_val_score < best_score or \
                    not args.minimize_score and avg_val_score > best_score:
                best_score, best_epoch = avg_val_score, epoch
                save_checkpoint(os.path.join(save_dir,
                                             MODEL_FILE_NAME), model, scaler,
                                features_scaler, atom_descriptor_scaler,
                                bond_feature_scaler, args)

        # Evaluate on test set using model with best validation score
        info(
            f'Model {model_idx} best validation {args.metric} = {best_score:.6f} on epoch {best_epoch}'
        )
        model = load_checkpoint(os.path.join(save_dir, MODEL_FILE_NAME),
                                device=args.device,
                                logger=logger)

        test_preds = predict(model=model,
                             data_loader=test_data_loader,
                             scaler=scaler)
        test_scores = evaluate_predictions(preds=test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

        if len(test_preds) != 0:
            sum_test_preds += np.array(test_preds)

        # Average test score
        for metric, scores in test_scores.items():
            avg_test_score = np.nanmean(scores)
            info(f'Model {model_idx} test {metric} = {avg_test_score:.6f}')
            writer.add_scalar(f'test_{metric}', avg_test_score, 0)

            if args.show_individual_scores and args.dataset_type != 'spectra':
                # Individual test scores
                for task_name, test_score in zip(args.task_names, scores):
                    info(
                        f'Model {model_idx} test {task_name} {metric} = {test_score:.6f}'
                    )
                    writer.add_scalar(f'test_{task_name}_{metric}', test_score,
                                      n_iter)
        writer.close()

    # Evaluate ensemble on test set
    avg_test_preds = (sum_test_preds / args.ensemble_size).tolist()

    ensemble_scores = evaluate_predictions(preds=avg_test_preds,
                                           targets=test_targets,
                                           num_tasks=args.num_tasks,
                                           metrics=args.metrics,
                                           dataset_type=args.dataset_type,
                                           logger=logger)

    for metric, scores in ensemble_scores.items():
        # Average ensemble score
        avg_ensemble_test_score = np.nanmean(scores)
        info(f'Ensemble test {metric} = {avg_ensemble_test_score:.6f}')

        # Individual ensemble scores
        if args.show_individual_scores:
            for task_name, ensemble_score in zip(args.task_names, scores):
                info(
                    f'Ensemble test {task_name} {metric} = {ensemble_score:.6f}'
                )

    # Save scores
    with open(os.path.join(args.save_dir, 'test_scores.json'), 'w') as f:
        json.dump(ensemble_scores, f, indent=4, sort_keys=True)

    # Optionally save test preds
    if args.save_preds:
        test_preds_dataframe = pd.DataFrame(
            data={'smiles': test_data.smiles()})

        for i, task_name in enumerate(args.task_names):
            test_preds_dataframe[task_name] = [
                pred[i] for pred in avg_test_preds
            ]

        test_preds_dataframe.to_csv(os.path.join(args.save_dir,
                                                 'test_preds.csv'),
                                    index=False)

    return ensemble_scores
Esempio n. 25
0
 def test_smiles(self):
     """Testing the base case smiles"""
     data = get_data(path=self.data_path)
     self.assertEqual(data.smiles(),
                      [['C', 'CC'], ['CC', 'CN'], ['O', 'CO']])
Esempio n. 26
0
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

   :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(
                morgan_fingerprint(mol=s,
                                   radius=train_args.radius,
                                   num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)    #TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
Esempio n. 27
0
 def test_features_and_phase_features(self):
     """Testing the handling of phase features"""
     with self.assertRaises(ValueError):
         data = get_data(path=self.data_path,
                         features_path=['dummy_path.csv'],
                         phase_features_path='dummy_path.csv')
Esempio n. 28
0
def cross_validate(args: TrainArgs,
                   train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
                   ) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path, smiles_columns=args.smiles_columns,
                                     target_columns=args.target_columns, ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    makedirs(args.save_dir)
    try:
        args.save(os.path.join(args.save_dir, 'args.json'))
    except subprocess.CalledProcessError:
        debug('Could not write the reproducibility section of the arguments to file, thus omitting this section.')
        args.save(os.path.join(args.save_dir, 'args.json'), with_reproducibility=False)

    # set explicit H option and reaction option
    reset_featurization_parameters(logger=logger)
    set_explicit_h(args.explicit_h)
    set_adding_hs(args.adding_h)
    if args.reaction:
        set_reaction(args.reaction, args.reaction_mode)
    elif args.reaction_solvent:
        set_reaction(True, args.reaction_mode)
    
    # Get data
    debug('Loading data')
    data = get_data(
        path=args.data_path,
        args=args,
        logger=logger,
        skip_none_targets=True,
        data_weights_path=args.data_weights_path
    )
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()

    if args.atom_descriptors == 'descriptor':
        args.atom_descriptors_size = data.atom_descriptors_size()
        args.ffn_hidden_size += args.atom_descriptors_size
    elif args.atom_descriptors == 'feature':
        args.atom_features_size = data.atom_features_size()
        set_extra_atom_fdim(args.atom_features_size)
    if args.bond_features_path is not None:
        args.bond_features_size = data.bond_features_size()
        set_extra_bond_fdim(args.bond_features_size)

    debug(f'Number of tasks = {args.num_tasks}')

    if args.target_weights is not None and len(args.target_weights) != args.num_tasks:
        raise ValueError('The number of provided target weights must match the number and order of the prediction tasks')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        data.reset_features_and_targets()

        # If resuming experiment, load results from trained models
        test_scores_path = os.path.join(args.save_dir, 'test_scores.json')
        if args.resume_experiment and os.path.exists(test_scores_path):
            print('Loading scores')
            with open(test_scores_path) as f:
                model_scores = json.load(f)
        # Otherwise, train the models
        else:
            model_scores = train_func(args, data, logger)

        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    contains_nan_scores = False
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {multitask_mean(scores[fold_num], metric):.6f}')

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')
                    if np.isnan(score):
                        contains_nan_scores = True

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = multitask_mean(scores, axis=1, metric=metric)  # average score for each model across tasks
        mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(f'\tOverall test {task_name} {metric} = '
                     f'{np.mean(scores[:, task_num]):.6f} +/- {np.std(scores[:, task_num]):.6f}')

    if contains_nan_scores:
        info("The metric scores observed for some fold test splits contain 'nan' values. \
            This can occur when the test set does not meet the requirements \
            for a particular metric, such as having no valid instances of one \
            task in the test set or not having positive examples for some classification metrics. \
            Before v1.5.1, the default behavior was to ignore nan values in individual folds or tasks \
            and still return an overall average for the remaining folds or tasks. The behavior now \
            is to include them in the average, converting overall average metrics to 'nan' as well.")

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        if args.dataset_type == 'spectra': # spectra data type has only one score to report
            row = ['spectra']
            for metric, scores in all_scores.items():
                task_scores = scores[:,0]
                mean, std = np.mean(task_scores), np.std(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)
        else: # all other data types, separate scores by task
            for task_num, task_name in enumerate(args.task_names):
                row = [task_name]
                for metric, scores in all_scores.items():
                    task_scores = scores[:, task_num]
                    mean, std = np.mean(task_scores), np.std(task_scores)
                    row += [mean, std] + task_scores.tolist()
                writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = multitask_mean(all_scores[args.metric], metric=args.metric, axis=1)
    mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([pd.read_csv(os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
                               for fold_num in range(args.num_folds)])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
Esempio n. 29
0
def predict_and_save(
    args: PredictArgs,
    train_args: TrainArgs,
    test_data: MoleculeDataset,
    task_names: List[str],
    num_tasks: int,
    test_data_loader: MoleculeDataLoader,
    full_data: MoleculeDataset,
    full_to_valid_indices: dict,
    models: List[MoleculeModel],
    scalers: List[List[StandardScaler]],
    num_models: int,
    calibrator: UncertaintyCalibrator = None,
    return_invalid_smiles: bool = False,
    save_results: bool = True,
):
    """
    Function to predict with a model and save the predictions to file.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param train_args: A :class:`~chemprop.args.TrainArgs` object containing arguments for training the model.
    :param test_data: A :class:`~chemprop.data.MoleculeDataset` containing valid datapoints.
    :param task_names: A list of task names.
    :param num_tasks: Number of tasks.
    :param test_data_loader: A :class:`~chemprop.data.MoleculeDataLoader` to load the test data.
    :param full_data:  A :class:`~chemprop.data.MoleculeDataset` containing all (valid and invalid) datapoints.
    :param full_to_valid_indices: A dictionary dictionary mapping full to valid indices.
    :param models: A list or generator object of :class:`~chemprop.models.MoleculeModel`\ s.
    :param scalers: A list or generator object of :class:`~chemprop.features.scaler.StandardScaler` objects.
    :param num_models: The number of models included in the models and scalers input.
    :param calibrator: A :class: `~chemprop.uncertainty.UncertaintyCalibrator` object, for use in calibrating uncertainty predictions.
    :param return_invalid_smiles: Whether to return predictions of "Invalid SMILES" for invalid SMILES, otherwise will skip them in returned predictions.
    :param save_results: Whether to save the predictions in a csv. Function returns the predictions regardless.
    :return:  A list of lists of target predictions.
    """
    estimator = UncertaintyEstimator(
        test_data=test_data,
        test_data_loader=test_data_loader,
        uncertainty_method=args.uncertainty_method,
        models=models,
        scalers=scalers,
        num_models=num_models,
        dataset_type=args.dataset_type,
        loss_function=args.loss_function,
        uncertainty_dropout_p=args.uncertainty_dropout_p,
        dropout_sampling_size=args.dropout_sampling_size,
        individual_ensemble_predictions=args.individual_ensemble_predictions,
        spectra_phase_mask=getattr(train_args, "spectra_phase_mask", None),
    )

    preds, unc = estimator.calculate_uncertainty(
        calibrator=calibrator)  # preds and unc are lists of shape(data,tasks)

    if args.individual_ensemble_predictions:
        individual_preds = (
            estimator.individual_predictions()
        )  # shape(data, tasks, ensemble) or (data, tasks, classes, ensemble)

    if args.evaluation_methods is not None:

        evaluation_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=task_names,
            features_path=args.features_path,
            features_generator=args.features_generator,
            phase_features_path=args.phase_features_path,
            atom_descriptors_path=args.atom_descriptors_path,
            bond_features_path=args.bond_features_path,
            max_data_size=args.max_data_size,
            loss_function=args.loss_function,
        )

        evaluators = []
        for evaluation_method in args.evaluation_methods:
            evaluator = build_uncertainty_evaluator(
                evaluation_method=evaluation_method,
                calibration_method=args.calibration_method,
                uncertainty_method=args.uncertainty_method,
                dataset_type=args.dataset_type,
                loss_function=args.loss_function,
                calibrator=calibrator,
            )
            evaluators.append(evaluator)
    else:
        evaluators = None

    if evaluators is not None:
        evaluations = []
        print(f"Evaluating uncertainty for tasks {task_names}")
        for evaluator in evaluators:
            evaluation = evaluator.evaluate(targets=evaluation_data.targets(),
                                            preds=preds,
                                            uncertainties=unc,
                                            mask=evaluation_data.mask())
            evaluations.append(evaluation)
            print(
                f"Using evaluation method {evaluator.evaluation_method}: {evaluation}"
            )
    else:
        evaluations = None

    # Save results
    if save_results:
        print(f"Saving predictions to {args.preds_path}")
        assert len(test_data) == len(preds)
        assert len(test_data) == len(unc)

        makedirs(args.preds_path, isfile=True)

        # Set multiclass column names, update num_tasks definitions
        if args.dataset_type == "multiclass":
            original_task_names = task_names
            task_names = [
                f"{name}_class_{i}" for name in task_names
                for i in range(args.multiclass_num_classes)
            ]
            num_tasks = num_tasks * args.multiclass_num_classes
        if args.uncertainty_method == "spectra_roundrobin":
            num_unc_tasks = 1
        else:
            num_unc_tasks = num_tasks

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            if valid_index is not None:
                d_preds = preds[valid_index]
                d_unc = unc[valid_index]
                if args.individual_ensemble_predictions:
                    ind_preds = individual_preds[valid_index]
            else:
                d_preds = ["Invalid SMILES"] * num_tasks
                d_unc = ["Invalid SMILES"] * num_unc_tasks
                if args.individual_ensemble_predictions:
                    ind_preds = [["Invalid SMILES"] *
                                 len(args.checkpoint_paths)] * num_tasks
            # Reshape multiclass to merge task and class dimension, with updated num_tasks
            if args.dataset_type == "multiclass":
                d_preds = np.array(d_preds).reshape((num_tasks))
                d_unc = np.array(d_unc).reshape((num_unc_tasks))
                if args.individual_ensemble_predictions:
                    ind_preds = ind_preds.reshape(
                        (num_tasks, len(args.checkpoint_paths)))

            # If extra columns have been dropped, add back in SMILES columns
            if args.drop_extra_columns:
                datapoint.row = OrderedDict()

                smiles_columns = args.smiles_columns

                for column, smiles in zip(smiles_columns, datapoint.smiles):
                    datapoint.row[column] = smiles

            # Add predictions columns
            if args.uncertainty_method == "spectra_roundrobin":
                unc_names = [estimator.label]
            else:
                unc_names = [
                    name + f"_{estimator.label}" for name in task_names
                ]

            for pred_name, unc_name, pred, un in zip(task_names, unc_names,
                                                     d_preds, d_unc):
                datapoint.row[pred_name] = pred
                if args.uncertainty_method is not None:
                    datapoint.row[unc_name] = un
            if args.individual_ensemble_predictions:
                for pred_name, model_preds in zip(task_names, ind_preds):
                    for idx, pred in enumerate(model_preds):
                        datapoint.row[pred_name + f"_model_{idx}"] = pred

        # Save
        with open(args.preds_path, "w") as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()
            for datapoint in full_data:
                writer.writerow(datapoint.row)

        if evaluations is not None and args.evaluation_scores_path is not None:
            print(
                f"Saving uncertainty evaluations to {args.evaluation_scores_path}"
            )
            if args.dataset_type == "multiclass":
                task_names = original_task_names
            with open(args.evaluation_scores_path, "w") as f:
                writer = csv.writer(f)
                writer.writerow(["evaluation_method"] + task_names)
                for i, evaluation_method in enumerate(args.evaluation_methods):
                    writer.writerow([evaluation_method] + evaluations[i])

    if return_invalid_smiles:
        full_preds = []
        full_unc = []
        for full_index in range(len(full_data)):
            valid_index = full_to_valid_indices.get(full_index, None)
            if valid_index is not None:
                pred = preds[valid_index]
                un = unc[valid_index]
            else:
                pred = ["Invalid SMILES"] * num_tasks
                un = ["Invalid SMILES"] * num_unc_tasks
            full_preds.append(pred)
            full_unc.append(un)
        return full_preds, full_unc
    else:
        return preds, unc