Example #1
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=self.train_args)
        valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = np.zeros((len(test_data), 1))
        for model in self.checkpoints:
            model_preds = predict(
                model=model,
                data=test_data,
                batch_size=batch_size,
                scaler=self.scaler
            )
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
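A minimal usage sketch for a wrapper exposing this __call__ interface; the class name ChempropPredictor and its constructor are hypothetical stand-ins for whatever object holds self.checkpoints, self.scaler, self.features_scaler, and self.train_args.

# Hypothetical wrapper; only the __call__ behaviour shown above is assumed.
predictor = ChempropPredictor(checkpoint_dir='model_checkpoints/')

smiles = ['CCO', 'c1ccccc1', 'not_a_smiles']
preds = predictor(smiles, batch_size=256)

# One float per input SMILES; invalid SMILES come back as 0.0.
for s, p in zip(smiles, preds):
    print(f'{s}\t{p:.3f}')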
Example #2
def compute_molecule_vectors(model: nn.Module, data: MoleculeDataset, batch_size: int) -> List[np.ndarray]:
    """
    Computes the molecule vectors output from the last layer of a MoleculeModel.

    :param model: A MoleculeModel.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :return: A list of 1D numpy arrays of length hidden_size containing
    the molecule vectors generated by the model for each molecule provided.
    """
    model.eval()
    model.ffn[-1] = Identity()  # Replace last linear layer with identity
    if hasattr(model, 'sigmoid'):
        model.sigmoid = Identity()

    vecs = []

    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch

        with torch.no_grad():
            batch_vecs = model(batch, features_batch)

        # Collect vectors
        batch_vecs = batch_vecs.data.cpu().numpy()
        vecs.extend(batch_vecs)

    return vecs
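A short sketch of calling compute_molecule_vectors to extract learned molecule embeddings; load_checkpoint and get_data are the chemprop-style helpers used elsewhere in these examples, and the file paths are placeholders.

model = load_checkpoint('model.pt')   # trained MoleculeModel (placeholder path)
data = get_data('test.csv')           # MoleculeDataset (placeholder path)

vecs = compute_molecule_vectors(model=model, data=data, batch_size=50)

# One 1D array of length hidden_size per molecule.
print(len(vecs), vecs[0].shape)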
Example #3
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=self.train_args)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data=test_data,
                                  batch_size=batch_size,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)
        return avg_preds
Example #4
    def __call__(self,
                 smiles: List[str],
                 batch_size: int = 500) -> List[List[float]]:
        test_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=self.args.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)

        return avg_preds
def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            scaler: StandardScaler = None,
            uncertainty: bool = False) -> List[List[float]]:
    """
    Makes predictions on a dataset using an ensemble of models.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param scaler: A StandardScaler object fit on the training targets.
    :param uncertainty: Whether uncertainty values should be returned.
    :return: A list of lists of predictions. The outer list is examples
    while the inner list is tasks.
    """
    model.eval()

    preds = []

    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch

        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = batch_preds.data.cpu().numpy()

        # Collect predictions
        batch_preds = batch_preds.tolist()
        preds.extend(batch_preds)

    if model.uncertainty:
        p = []
        c = []
        for i in range(len(preds)):
            p.append([preds[i][j] for j in range(len(preds[i])) if j % 2 == 0])
            c.append([preds[i][j] for j in range(len(preds[i])) if j % 2 == 1])

        if scaler is not None:
            p = scaler.inverse_transform(p).tolist()
            c = (scaler.stds**2 * c).tolist()

        if uncertainty:
            return p, c

        return p

    if scaler is not None:
        preds = scaler.inverse_transform(preds).tolist()

    return preds
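When the model was trained with uncertainty, the output above interleaves predictions (even columns) and uncertainty values (odd columns), and passing uncertainty=True unpacks them. A sketch, assuming model is such a trained MoleculeModel and test_data is a MoleculeDataset:

preds, uncertainties = predict(model=model,
                               data=test_data,
                               batch_size=50,
                               scaler=scaler,      # or None if targets were not scaled
                               uncertainty=True)

# One prediction and one uncertainty value per task for each molecule.
for smi, p, u in zip(test_data.smiles(), preds, uncertainties):
    print(smi, p[0], u[0])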
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using the model saved at checkpoint_path.

    :param checkpoint_path: Path to a model checkpoint.
    :param smile: The SMILES string to predict on.
    :return: The prediction for the first task of the molecule.
    """
    smiles = [smile]
    args = Namespace()
    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    # Ensemble predictions
    return sum_preds[0][0]
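Usage is a single checkpoint path plus one SMILES string; the return value is the scalar prediction for the first task. Paths and SMILES below are placeholders.

value = predict_smile('fold_0/model_0/model.pt', 'CC(=O)Oc1ccccc1C(=O)O')
print(f'predicted value: {value:.4f}')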
Example #7
    def __init__(self, train_data: MoleculeDataset, val_data: MoleculeDataset,
                 test_data: MoleculeDataset, scaler: StandardScaler,
                 args: Namespace):
        super().__init__(train_data, val_data, test_data, scaler, args)

        self.sum_val_uncertainty = np.zeros(
            (len(val_data.smiles()), args.num_tasks))

        self.sum_test_uncertainty = np.zeros(
            (len(test_data.smiles()), args.num_tasks))
Example #8
def single_task_sklearn(model: Union[RandomForestRegressor, RandomForestClassifier, SVR, SVC],
                        train_data: MoleculeDataset,
                        test_data: MoleculeDataset,
                        metrics: List[str],
                        args: SklearnTrainArgs,
                        logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains a single-task scikit-learn model, meaning a separate model is trained for each task.

    This is necessary if some tasks have None (unknown) values.

    :param model: The scikit-learn model to train.
    :param train_data: The training data.
    :param test_data: The test data.
    :param metrics: A list of names of metric functions.
    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 training the scikit-learn model.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    scores = {}
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(*[(features, targets[task_num])
                                              for features, targets in zip(train_data.features(), train_data.targets())
                                              if targets[task_num] is not None])
        test_features, test_targets = zip(*[(features, targets[task_num])
                                            for features, targets in zip(test_data.features(), test_data.targets())
                                            if targets[task_num] is not None])

        model.fit(train_features, train_targets)

        test_preds = predict(
            model=model,
            model_type=args.model_type,
            dataset_type=args.dataset_type,
            features=test_features
        )
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=1,
            metrics=metrics,
            dataset_type=args.dataset_type,
            logger=logger
        )
        for metric in metrics:
            if metric not in scores:
                scores[metric] = []
            scores[metric].append(score[metric][0])

    return scores
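A sketch of plugging a scikit-learn regressor into single_task_sklearn; train_data and test_data are assumed to be featurized MoleculeDatasets, and args is assumed to be a SklearnTrainArgs with at least model_type and dataset_type set.

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=500, n_jobs=-1)
scores = single_task_sklearn(model=model,
                             train_data=train_data,   # featurized MoleculeDataset
                             test_data=test_data,     # featurized MoleculeDataset
                             metrics=['rmse', 'mae'],
                             args=args)

# scores maps each metric name to one value per task.
print(scores['rmse'])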
Example #9
    def test_filter_good_smiles(self):
        """Test pass through for all good smiles"""
        smiles_list = [['C'], ['CC'], ['CN'], ['O']]
        dataset = MoleculeDataset([MoleculeDatapoint(s) for s in smiles_list])
        filtered_dataset = filter_invalid_smiles(dataset)
        self.assertEqual(filtered_dataset.smiles(),
                         [['C'], ['CC'], ['CN'], ['O']])
def run_split_data(data_path: str, split_type: str,
                   split_sizes: Tuple[int, int,
                                      int], seed: int, save_dir: str):
    with open(data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    data = []
    for line in tqdm(lines):
        datapoint = MoleculeDatapoint(line=line)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    train, dev, test = split_data(data=data,
                                  split_type=split_type,
                                  sizes=split_sizes,
                                  seed=seed)

    makedirs(save_dir)

    for name, dataset in [('train', train), ('dev', dev), ('test', test)]:
        with open(os.path.join(save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)
Example #11
    def setUp(self):
        smiles_list = [['C', 'CC'], ['CC', 'CCC'], ['CCC', 'CN'],
                       ['CN', 'CCN'], ['CCN', 'CCCN'], ['CCCN', 'CCCCN'],
                       ['CCCCN', 'CO'], ['CO', 'CCO'], ['CO', 'CCCO'],
                       ['CN', 'CCC']]
        self.dataset = MoleculeDataset(
            [MoleculeDatapoint(s) for s in smiles_list])
def evaluate(model: nn.Module,
             data: MoleculeDataset,
             num_tasks: int,
             metric_func: Callable,
             batch_size: int,
             dataset_type: str,
             scaler: StandardScaler = None,
             logger: logging.Logger = None) -> List[float]:
    """
    Evaluates a model on a dataset.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param num_tasks: Number of tasks.
    :param metric_func: Metric function which takes in a list of targets and a list of predictions.
    :param batch_size: Batch size.
    :param dataset_type: Dataset type.
    :param scaler: A StandardScaler object fit on the training targets.
    :param logger: Logger.
    :return: A list with the score for each task based on `metric_func`.
    """
    preds, check_fp = predict(
        model=model, data=data, batch_size=batch_size,
        scaler=scaler)  # check_fp is returned for debugging the fingerprint at each depth

    targets = data.targets()

    results = evaluate_predictions(preds=preds,
                                   targets=targets,
                                   num_tasks=num_tasks,
                                   metric_func=metric_func,
                                   dataset_type=dataset_type,
                                   logger=logger)

    return results
def split_indices(all_indices: List[int],
                  num_folds: int,
                  scaffold: bool = False,
                  data: MoleculeDataset = None,
                  shuffle: bool = True) -> List[List[int]]:
    num_data = len(all_indices)
    if scaffold:
        scaffold_to_indices = scaffold_to_smiles(data.mols(), use_indices=True)
        index_sets = sorted(list(scaffold_to_indices.values()),
                            key=lambda index_set: len(index_set),
                            reverse=True)
        fold_indices = [[] for _ in range(num_folds)]
        for s in index_sets:
            length_array = [len(fi) for fi in fold_indices]
            min_index = length_array.index(min(length_array))
            fold_indices[min_index] += s
        if shuffle:
            random.shuffle(fold_indices)
    else:  # random
        if shuffle:
            random.shuffle(all_indices)
        fold_indices = []
        for i in range(num_folds):
            begin, end = int(i * num_data / num_folds), int(
                (i + 1) * num_data / num_folds)
            fold_indices.append(np.array(all_indices[begin:end]))
    return fold_indices
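A small sketch of building k folds of indices with split_indices, either at random or by scaffold; data is assumed to be a MoleculeDataset and numpy is assumed to be imported as np, as in the function above.

all_indices = list(range(len(data)))

# Random 5-fold split (each fold is an array of indices).
random_folds = split_indices(all_indices, num_folds=5)

# Scaffold-based folds need the dataset so scaffolds can be computed.
scaffold_folds = split_indices(list(range(len(data))),
                               num_folds=5,
                               scaffold=True,
                               data=data)

# For example, use fold 0 as test and the rest as train.
test_idx = random_folds[0]
train_idx = np.concatenate(random_folds[1:])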
Example #14
def run_split_data(args: Args):
    # Load raw data
    with open(args.data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Load SMILES
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)

    # Make sure lines and smiles line up
    assert len(lines) == len(smiles)
    assert all(smile in line for smile, line in zip(smiles, lines))

    # Create data
    data = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        datapoint = MoleculeDatapoint(smiles=smile)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    makedirs(args.save_dir)

    for name, dataset in [('train', train), ('val', val), ('test', test)]:
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)
def examine_split_balance(split_type: str):
    results = []

    for dataset in DATASETS:
        # Load task names for the dataset
        data_path = os.path.join(BASE, dataset, f'{dataset}.csv')
        data = get_data(data_path)

        # Get class balance ratios for full dataset
        ratios = compute_ratios(data)

        # Initialize array of diffs between ratios
        ratio_diffs = []

        # Loop through folds
        for fold in os.listdir(os.path.join(BASE, dataset, split_type)):
            # Open fold indices
            with open(
                    os.path.join(BASE, dataset, split_type, fold, '0',
                                 'split_indices.pckl'), 'rb') as f:
                indices = pickle.load(f)

            # Get test data
            test_data = MoleculeDataset([data[index] for index in indices[2]])

            # Get test ratios
            test_ratios = compute_ratios(test_data)

            # Compute ratio diff
            ratio_diff = np.maximum(ratios / test_ratios, test_ratios / ratios)
            ratio_diff[np.where(np.isinf(ratio_diff))[0]] = np.nan

            # Add ratio diff
            ratio_diffs.append(ratio_diff)

        # Convert to numpy array
        ratio_diffs = np.array(ratio_diffs)  # num_folds x num_tasks

        # Determine number of folds and number of failures
        num_folds = len(ratio_diffs)
        num_failures = np.sum(np.isnan(ratio_diffs))

        # Average across tasks
        ratio_diffs = np.nanmean(ratio_diffs, axis=1)  # num_folds

        # Compute mean and standard deviation across folds
        mean, std = np.nanmean(ratio_diffs), np.nanstd(ratio_diffs)

        # Add results
        results.append({
            'dataset': dataset,
            'mean': mean,
            'std': std,
            'num_folds': num_folds,
            'num_failures': num_failures
        })

    pprint(results)
def single_task_random_forest(train_data: MoleculeDataset,
                              test_data: MoleculeDataset,
                              metric_func: Callable,
                              args: Namespace,
                              logger: Logger = None) -> List[float]:
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        if args.dataset_type == 'regression':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.dataset_type == 'classification':
            model = RandomForestClassifier(class_weight=args.class_weight,
                                           n_estimators=args.num_trees,
                                           n_jobs=-1)
        else:
            raise ValueError(
                f'dataset_type "{args.dataset_type}" not supported.')

        model.fit(train_features, train_targets)

        test_preds = model.predict(test_features)

        test_preds = [[pred] for pred in test_preds]
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores
Example #17
def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            scaler: StandardScaler = None) -> List[List[float]]:
    """
    Makes predictions on a dataset using an ensemble of models.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param scaler: A StandardScaler object fit on the training targets.
    :return: A list of lists of predictions. The outer list is examples
    while the inner list is tasks.
    """
    model.eval()

    preds = []

    num_iters, iter_step = len(data), batch_size
    smiles_batch_all = []

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch

        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = [x.data.cpu().numpy() for x in batch_preds]

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect predictions
        preds.append(batch_preds)
        smiles_batch_all.extend(smiles_batch)
    preds = [np.concatenate(x) for x in zip(*preds)]

    return preds, smiles_batch_all
Example #18
def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            disable_progress_bar: bool = False,
            scaler: StandardScaler = None) -> List[List[float]]:
    """
    Makes predictions on a dataset using an ensemble of models.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param disable_progress_bar: Whether to disable the progress bar.
    :param scaler: A StandardScaler object fit on the training targets.
    :return: A list of lists of predictions. The outer list is examples
    while the inner list is tasks.
    """
    model.eval()

    preds = []

    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step, disable=disable_progress_bar):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch

        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = batch_preds.data.cpu().numpy()

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect predictions
        batch_preds = batch_preds.tolist()
        preds.extend(batch_preds)

    return preds
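The __call__ wrappers in the earlier examples all apply the same ensemble pattern around this predict function: run every checkpoint over the same data and average. A compact sketch, assuming checkpoints is a list of loaded models and num_tasks matches the model output size:

sum_preds = np.zeros((len(test_data), num_tasks))
for model in checkpoints:
    sum_preds += np.array(predict(model=model,
                                  data=test_data,
                                  batch_size=50,
                                  disable_progress_bar=True,
                                  scaler=scaler))
avg_preds = sum_preds / len(checkpoints)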
    def gcnn_predict(self) -> Tuple[array, array]:
        """
        Function that handles graph convolutinal neural network predictions, enters them into the predictions DataFrame and reports any errors

        Parameters:
            rdkit_mols (array): a numpy array containing RDKit molecules

        Returns:
            predictions, prediction_labels (Tuple[array, array]): predictions and labels
        """

        smiles = self.rdkit_mols.tolist()
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=50,
                                              num_workers=0)

        model_preds = predict(model=rlm_gcnn_model,
                              data_loader=test_data_loader,
                              scaler=rlm_gcnn_scaler)
        predictions = np.ma.empty(len(full_data))
        predictions.mask = True
        labels = np.ma.empty(len(full_data))
        labels.mask = True
        for key in full_to_valid_indices.keys():
            full_index = int(key)
            predictions[full_index] = model_preds[
                full_to_valid_indices[key]][0]
            labels[full_index] = np.round(
                model_preds[full_to_valid_indices[key]][0], 0)

        self.predictions_df['GCNN'] = pd.Series(
            pd.Series(labels).fillna('').astype(str) + ' (' +
            pd.Series(predictions).round(2).astype(str) + ')').str.replace(
                '(nan)', '', regex=False)
        if len(self.predictions_df.index) > len(
                predictions) or np.ma.count_masked(predictions) > 0:
            self.model_errors.append('graph convolutional neural network')
            self.has_errors = True

        return predictions, labels
Example #20
def multi_task_sklearn(model,
                       train_data: MoleculeDataset,
                       test_data: MoleculeDataset,
                       metric_func: Callable,
                       args: SklearnTrainArgs,
                       logger: Logger = None) -> List[float]:
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    model.fit(train_data.features(), train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    test_preds = predict(model=model,
                         model_type=args.model_type,
                         dataset_type=args.dataset_type,
                         features=test_data.features())

    scores = evaluate_predictions(preds=test_preds,
                                  targets=test_data.targets(),
                                  num_tasks=num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  logger=logger)

    return scores
Example #21
def multi_task_random_forest(train_data: MoleculeDataset,
                             test_data: MoleculeDataset,
                             metric_func: Callable,
                             args: Namespace) -> List[float]:
    if args.dataset_type == 'regression':
        model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
    elif args.dataset_type == 'classification':
        model = RandomForestClassifier(n_estimators=args.num_trees, n_jobs=-1)
    else:
        raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    model.fit(train_data.features(), train_targets)

    test_preds = model.predict(test_data.features())
    if train_data.num_tasks() == 1:
        test_preds = [[pred] for pred in test_preds]

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        metric_func=metric_func,
        dataset_type=args.dataset_type
    )

    return scores
Example #22
def create_time_splits(args: Args):
    # ASSUME DATA GIVEN IN CHRONOLOGICAL ORDER.
    # This dumps a different format of indices, with all folds in one file; TODO: modify as convenient later.
    data = get_data(path=args.data_path, smiles_columns=args.smiles_columns)
    num_data = len(data)
    all_indices = list(range(num_data))
    fold_indices = {'random': [], 'scaffold': [], 'time': []}
    for i in range(args.num_folds - args.time_folds_per_train_set - 1):
        begin, end = int(i * num_data / args.num_folds), int(
            (i + args.time_folds_per_train_set + 2) * num_data /
            args.num_folds)
        subset_indices = all_indices[begin:end]
        subset_data = MoleculeDataset(data[begin:end])
        fold_indices['random'].append(
            split_indices(deepcopy(subset_indices),
                          args.time_folds_per_train_set + 2))
        fold_indices['scaffold'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          scaffold=True,
                          split_key_molecule=args.split_key_molecule,
                          data=subset_data))
        fold_indices['time'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          shuffle=False))
    for split_type in ['random', 'scaffold', 'time']:
        all_splits = []
        for i in range(len(fold_indices[split_type])):
            os.makedirs(os.path.join(args.save_dir, split_type,
                                     'fold_' + str(i), '0'),
                        exist_ok=True)
            with open(
                    os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                                 '0', 'split_indices.pckl'), 'wb') as wf:
                train = np.concatenate([
                    fold_indices[split_type][i][j]
                    for j in range(args.time_folds_per_train_set)
                ])
                # train = []
                # for fold in train_folds:
                #     train += fold
                val = fold_indices[split_type][i][-2]
                test = fold_indices[split_type][i][-1]
                pickle.dump(
                    [train, val, test], wf
                )  # each is a pickle file containing a list of length-3 index lists for train/val/test
                all_splits.append([train, val, test])
        with open(
                os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                             'split_indices.pckl'), 'wb') as wf:
            pickle.dump(all_splits, wf)
Example #23
def single_task_sklearn(model,
                        train_data: MoleculeDataset,
                        test_data: MoleculeDataset,
                        metric_func: Callable,
                        args: SklearnTrainArgs,
                        logger: Logger = None) -> List[float]:
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        model.fit(train_features, train_targets)

        test_preds = predict(model=model,
                             model_type=args.model_type,
                             dataset_type=args.dataset_type,
                             features=test_features)
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores
def load_data(args: PredictArgs, smiles: List[List[str]]):
    """
    Function to load data from a list of smiles or a file.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: A list of list of smiles, or None if data is to be read from file
    :return: A tuple of a :class:`~chemprop.data.MoleculeDataset` containing all datapoints, a :class:`~chemprop.data.MoleculeDataset` containing only valid datapoints,
                 a :class:`~chemprop.data.MoleculeDataLoader` and a dictionary mapping full to valid indices.
    """
    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    print(f"Test size = {len(test_data):,}")

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    return full_data, test_data, test_data_loader, full_to_valid_indices
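A sketch of consuming load_data's return value and mapping model output back onto the original input order, leaving None for invalid SMILES; args is a PredictArgs instance, model a loaded MoleculeModel, and predict is the data_loader-based variant used elsewhere in these examples.

full_data, test_data, test_data_loader, full_to_valid_indices = load_data(args, smiles=None)

model_preds = predict(model=model, data_loader=test_data_loader, scaler=scaler)

# Re-expand to the full input order.
full_preds = [None] * len(full_data)
for full_index, valid_index in full_to_valid_indices.items():
    full_preds[full_index] = model_preds[valid_index]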
Example #25
    def __call__(self,
                 smiles: List[str],
                 batch_size: int = 500) -> List[List[float]]:
        """
        Makes predictions on a list of SMILES.

        :param smiles: A list of SMILES to make predictions on.
        :param batch_size: The batch size.
        :return: A list of lists of floats containing the predicted values.
        """
        test_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=self.args.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)
        if self.train_args.atom_descriptor_scaling and self.args.atom_descriptors is not None:
            test_data.normalize_features(self.atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if self.train_args.bond_feature_scaling and self.args.bond_features_size > 0:
            test_data.normalize_features(self.bond_feature_scaler,
                                         scale_bond_features=True)

        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)

        return avg_preds
Example #26
def evaluate(model: nn.Module,
             data: MoleculeDataset,
             metric_func: Callable,
             args: Namespace,
             scaler: StandardScaler = None,
             logger: logging.Logger = None) -> List[float]:
    """
    Evaluates an ensemble of models on a dataset.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param metric_func: Metric function which takes in a list of targets and a list of predictions.
    :param dataset_type: Dataset type.
    :param args: Arguments.
    :param scaler: A StandardScaler object fit on the training targets.
    :param logger: Logger.
    :return: A list with the score for each task based on `metric_func`.
    """
    preds = predict(
        model=model,
        data=data,
        args=args,
        scaler=scaler,
        bert_save_memory=True,
        logger=logger
    )

    if args.maml:
        preds, targets = preds  # in this case the targets are determined by the tasks sampled during prediction
    else:
        targets = data.targets()
        if args.dataset_type == 'bert_pretraining':
            # Only predict targets that are masked out
            targets['vocab'] = [target if mask == 0 else None for target, mask in zip(targets['vocab'], data.mask())]

    results = evaluate_predictions(
        preds=preds,
        targets=targets,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        args=args,
        logger=logger
    )

    return results
Example #27
def multi_task_sklearn(model: Union[RandomForestRegressor, RandomForestClassifier, SVR, SVC],
                       train_data: MoleculeDataset,
                       test_data: MoleculeDataset,
                       metrics: List[str],
                       args: SklearnTrainArgs,
                       logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains a multi-task scikit-learn model, meaning one model is trained simultaneously on all tasks.

    This is only possible if none of the tasks have None (unknown) values.

    :param model: The scikit-learn model to train.
    :param train_data: The training data.
    :param test_data: The test data.
    :param metrics: A list of names of metric functions.
    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 training the scikit-learn model.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    model.fit(train_data.features(), train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    test_preds = predict(
        model=model,
        model_type=args.model_type,
        dataset_type=args.dataset_type,
        features=test_data.features()
    )

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        num_tasks=num_tasks,
        metrics=metrics,
        dataset_type=args.dataset_type,
        logger=logger
    )

    return scores
Example #28
def async_mol2graph(q: Queue, 
                    data: MoleculeDataset, 
                    args: Namespace,
                    num_iters: int,
                    iter_size: int,
                    exit_q: Queue,
                    last_batch: bool=False):
    batches = []
    for i in range(0, num_iters, iter_size):  # will only go up to max size of queue, then yield
        if not last_batch and i + args.batch_size > len(data):
            break
        batch = MoleculeDataset(data[i:i + args.batch_size])
        batches.append(batch)
        if len(batches) == args.batches_per_queue_group:  # many at a time, since synchronization is expensive
            with Pool() as pool:
                processed_batches = pool.map(mol2graph_helper, [(batch, args) for batch in batches])
            q.put(processed_batches)
            batches = []
    if len(batches) > 0:
        with Pool() as pool:
            processed_batches = pool.map(mol2graph_helper, [(batch, args) for batch in batches])
        q.put(processed_batches)
    exit_q.get()  # prevent from exiting until main process tells it to; otherwise we apparently can't read the end of the queue and crash
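async_mol2graph is intended to run in a separate process that featurizes batches ahead of the training loop. A rough launch sketch; the queue size and the assumption that args carries batch_size and batches_per_queue_group mirror the function above.

from multiprocessing import Process, Queue

q, exit_q = Queue(maxsize=10), Queue()
num_iters, iter_size = len(train_data), args.batch_size

featurizer = Process(target=async_mol2graph,
                     args=(q, train_data, args, num_iters, iter_size, exit_q, True))
featurizer.start()

# ... consume pre-featurized batch groups with q.get() in the training loop ...

exit_q.put(None)   # let the worker exit (it blocks on exit_q.get() when done)
featurizer.join()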
def train(inv_model, src_data, tgt_data, loss_func, inv_opt, args):
    inv_model.train()
    src_data.shuffle()

    new_size = len(tgt_data) / args.batch_size * args.src_batch_size
    new_size = int(new_size)

    src_pos_data = [d for d in src_data if d.targets[0] == 1]
    src_neg_data = [d for d in src_data if d.targets[0] == 0]
    print(len(tgt_data))
    print(len(src_pos_data), len(src_neg_data), new_size)
    src_data = MoleculeDataset(src_pos_data + src_neg_data[:new_size])

    src_data.shuffle()
    tgt_data.shuffle()

    src_iter = range(0, len(src_data), args.src_batch_size)
    tgt_iter = range(0, len(tgt_data), args.batch_size)

    for i, j in zip(src_iter, tgt_iter):
        inv_model.zero_grad()
        src_batch = src_data[i:i + args.src_batch_size]
        src_batch = MoleculeDataset(src_batch)
        src_loss = forward(inv_model, src_batch, loss_func, is_source=True)

        tgt_batch = tgt_data[j:j + args.batch_size]
        tgt_batch = MoleculeDataset(tgt_batch)
        tgt_loss = forward(inv_model, tgt_batch, loss_func, is_source=False)

        loss = (src_loss + tgt_loss) / 2
        loss.backward()
        inv_opt[0].step()
        inv_opt[1].step()

        lr = inv_opt[1].get_lr()[0]
        ignorm = compute_gnorm(inv_model)
        print(f'lr: {lr:.5f}, loss: {loss:.4f}, gnorm: {ignorm:.4f}')
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(
            smiles=[[s] for s in smiles],
            skip_invalid_smiles=False,
            features_generator=self.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol[0] is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = np.zeros((len(test_data), 1))
        for model, scaler, features_scaler in zip(self.checkpoints,
                                                  self.scalers,
                                                  self.features_scalers):
            test_data.reset_features_and_targets()
            if features_scaler is not None:
                test_data.normalize_features(features_scaler)

            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
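Unlike Example #1, this last variant keeps one scaler and one features_scaler per checkpoint and resets the dataset's features before each model, so every checkpoint's own normalization is applied. A sketch of assembling those per-checkpoint lists, assuming the load_checkpoint and load_scalers helpers used in the predict_smile example above:

checkpoints, scalers, features_scalers = [], [], []
for path in checkpoint_paths:          # checkpoint_paths is a list of saved .pt files
    scaler, features_scaler = load_scalers(path)
    checkpoints.append(load_checkpoint(path))
    scalers.append(scaler)
    features_scalers.append(features_scaler)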