Example #1
    def __call__(self,
                 smiles: List[str],
                 batch_size: int = 500) -> List[List[float]]:
        test_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=self.args.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)

        return avg_preds
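
This __call__ makes a checkpoint ensemble usable as a plain function over SMILES strings. A minimal usage sketch; the Predictor class name and its constructor argument are assumptions for illustration, only the __call__ signature comes from the example above:

# Hypothetical wrapper exposing the __call__ defined in Example #1.
# The constructor argument is an assumption, not part of the example.
predictor = Predictor(checkpoint_dir='ckpts/')

smiles = ['CCO', 'c1ccccc1', 'CC(=O)O']
preds = predictor(smiles, batch_size=256)  # one row of task predictions per valid SMILES
print(preds)
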
Example #2
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=self.train_args)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data=test_data,
                                  batch_size=batch_size,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)
        return avg_preds
Example #3
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=self.train_args)
        valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = np.zeros((len(test_data), 1))
        for model in self.checkpoints:
            model_preds = predict(
                model=model,
                data=test_data,
                batch_size=batch_size,
                scaler=self.scaler
            )
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
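
Example #3 pads its output back to the length of the original input, writing 0.0 for SMILES that failed to parse. The scatter-back pattern in isolation (values made up):

import numpy as np

valid_indices = [0, 2, 3]           # e.g. input 1 failed to parse
avg_preds = [0.91, 0.12, 0.55]      # one prediction per valid input

full_preds = [0.0] * 4              # one slot per original input
for i, si in enumerate(valid_indices):
    full_preds[si] = avg_preds[i]

print(np.array(full_preds, dtype=np.float32))  # [0.91 0.   0.12 0.55]
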
Example #4
    def gcnn_predict(self) -> Tuple[array, array]:
        """
        Handles graph convolutional neural network predictions, enters them
        into the predictions DataFrame, and reports any errors.

        Reads the input molecules from self.rdkit_mols (a numpy array of
        RDKit molecules); this method takes no parameters.

        Returns:
            predictions, labels (Tuple[array, array]): predictions and labels
        """

        smiles = self.rdkit_mols.tolist()
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=50,
                                              num_workers=0)

        model_preds = predict(model=rlm_gcnn_model,
                              data_loader=test_data_loader,
                              scaler=rlm_gcnn_scaler)
        predictions = np.ma.empty(len(full_data))
        predictions.mask = True
        labels = np.ma.empty(len(full_data))
        labels.mask = True
        for key in full_to_valid_indices.keys():
            full_index = int(key)
            predictions[full_index] = model_preds[
                full_to_valid_indices[key]][0]
            labels[full_index] = np.round(
                model_preds[full_to_valid_indices[key]][0], 0)

        self.predictions_df['GCNN'] = pd.Series(
            pd.Series(labels).fillna('').astype(str) + ' (' +
            pd.Series(predictions).round(2).astype(str) + ')').str.replace(
                '(nan)', '', regex=False)
        if len(self.predictions_df.index) > len(
                predictions) or np.ma.count_masked(predictions) > 0:
            self.model_errors.append('graph convolutional neural network')
            self.has_errors = True

        return predictions, labels
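
Example #4 reports missing predictions with NumPy masked arrays instead of a sentinel like 0.0, so downstream code can detect failures via np.ma.count_masked. The idiom in isolation (values made up):

import numpy as np

preds = np.ma.empty(4)    # one slot per input molecule
preds.mask = True         # start fully masked: "no prediction yet"

preds[0] = 0.87           # assigning a value unmasks that slot
preds[3] = 0.12

print(preds)                        # [0.87 -- -- 0.12]
print(np.ma.count_masked(preds))    # 2 -> two inputs produced no prediction
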
Example #5
    def __call__(self,
                 smiles: List[str],
                 batch_size: int = 500) -> List[List[float]]:
        """
        Makes predictions on a list of SMILES.

        :param smiles: A list of SMILES to make predictions on.
        :param batch_size: The batch size.
        :return: A list of lists of floats containing the predicted values.
        """
        test_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=self.args.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)
        if self.train_args.atom_descriptor_scaling and self.args.atom_descriptors is not None:
            test_data.normalize_features(self.atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if self.train_args.bond_feature_scaling and self.args.bond_features_size > 0:
            test_data.normalize_features(self.bond_feature_scaler,
                                         scale_bond_features=True)

        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)

        return avg_preds
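
The ensemble step in Examples #1 through #5 is plain elementwise averaging of the per-model prediction arrays. A self-contained numeric sketch (values made up):

import numpy as np

# Two models' predictions for 3 molecules, 1 task each.
model_preds = [np.array([[0.9], [0.2], [0.6]]),
               np.array([[0.7], [0.4], [0.6]])]

sum_preds = sum(model_preds)               # elementwise sum over the list
avg_preds = sum_preds / len(model_preds)   # [[0.8], [0.3], [0.6]]
print(avg_preds)
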
Example #6
def run_training(args, save_dir):
    tgt_data, val_data, test_data, src_data = prepare_data(args)
    inv_model = prepare_model(args)

    print('invariant', inv_model)

    optimizer = build_optimizer(inv_model, args)
    scheduler = build_lr_scheduler(optimizer, args)
    inv_opt = (optimizer, scheduler)

    loss_func = get_loss_func(args)
    metric_func = get_metric_func(metric=args.metric)

    best_score = float('inf') if args.minimize_score else -float('inf')
    best_epoch = 0
    for epoch in range(args.epochs):
        print(f'Epoch {epoch}')
        train(inv_model, src_data, tgt_data, loss_func, inv_opt, args)

        val_scores = evaluate(inv_model, val_data, args.num_tasks, metric_func,
                              args.batch_size, args.dataset_type)
        avg_val_score = np.nanmean(val_scores)
        print(f'Validation {args.metric} = {avg_val_score:.4f}')
        if (args.minimize_score and avg_val_score < best_score) or \
                (not args.minimize_score and avg_val_score > best_score):
            best_score, best_epoch = avg_val_score, epoch
            save_checkpoint(os.path.join(save_dir, 'model.pt'),
                            inv_model,
                            args=args)

    print(f'Loading model checkpoint from epoch {best_epoch}')
    model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda)
    test_smiles, test_targets = test_data.smiles(), test_data.targets()
    test_preds = predict(model, test_data, args.batch_size)
    test_scores = evaluate_predictions(test_preds, test_targets,
                                       args.num_tasks, metric_func,
                                       args.dataset_type)

    avg_test_score = np.nanmean(test_scores)
    print(f'Test {args.metric} = {avg_test_score:.4f}')
    return avg_test_score
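
run_training keeps only the checkpoint from the best validation epoch and reloads it for the final test evaluation. The selection rule in isolation (scores made up):

# Early-stopping-style checkpoint selection, as in run_training above.
minimize_score = True      # e.g. RMSE; use False for AUC-style metrics
best_score = float('inf') if minimize_score else -float('inf')
best_epoch = 0

for epoch, score in enumerate([0.52, 0.47, 0.49, 0.45, 0.46]):
    if (minimize_score and score < best_score) or \
            (not minimize_score and score > best_score):
        best_score, best_epoch = score, epoch   # a real run would save a checkpoint here

print(best_epoch, best_score)   # 3 0.45
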
Example #7
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(
            smiles=[[s] for s in smiles],
            skip_invalid_smiles=False,
            features_generator=self.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol[0] is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = np.zeros((len(test_data), 1))
        for model, scaler, features_scaler in zip(self.checkpoints,
                                                  self.scalers,
                                                  self.features_scalers):
            test_data.reset_features_and_targets()
            if features_scaler is not None:
                test_data.normalize_features(features_scaler)

            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
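
Unlike Example #1, which shares one scaler across the whole ensemble, Example #7 pairs each checkpoint with its own scalers, so the dataset's features must be reset to their raw values before each model's normalization is applied. A toy illustration of why the reset matters (plain lists stand in for a MoleculeDataset):

raw = [1.0, 2.0, 3.0]

def normalize(xs, mean, std):
    return [(x - mean) / std for x in xs]

scalers = [(2.0, 1.0), (0.0, 2.0)]   # per-checkpoint (mean, std), made up
for mean, std in scalers:
    features = list(raw)             # analogue of reset_features_and_targets()
    features = normalize(features, mean, std)
    # ... run this checkpoint's predict() on `features` ...
    # Without the reset, the second model would see doubly-scaled features.
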
Example #8
    def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
        """
        Handles graph convolutional neural network predictions, enters them
        into the predictions DataFrame, and reports any errors.

        Parameters:
            model: the trained graph convolutional model
            scaler: the scaler applied to the model's outputs

        Returns:
            predictions, labels (Tuple[array, array]): predictions and labels
        """

        smiles = self.kekule_smiles.tolist()
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # create data loader
        data_loader = MoleculeDataLoader(
            dataset=data,
            batch_size=50,
            num_workers=0
        )

        model_preds = predict(
            model=model,
            data_loader=data_loader,
            scaler=scaler
        )

        predictions = np.ma.empty(len(full_data))
        predictions.mask = True
        labels = np.ma.empty(len(full_data), dtype=np.int32)
        labels.mask = True
        for key in full_to_valid_indices.keys():
            full_index = int(key)
            predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
            labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

        if self.smiles is not None:
            utc_timestamp = datetime.datetime.now(timezone.utc).timestamp()

            # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
            self.raw_predictions_df = pd.concat(
                [self.raw_predictions_df,
                 pd.DataFrame({'SMILES': self.smiles,
                               'model': self.model_name,
                               'prediction': predictions,
                               'timestamp': utc_timestamp})],
                ignore_index=True
            )

        # if self.interpret == True:
        #     intrprt_df = get_interpretation(self.smiles, self.model_name)
        # else:
        #     col_names = ['smiles', 'rationale_smiles', 'rationale_score']
        #     intrprt_df = pd.DataFrame(columns = col_names)


        #self.predictions_df['smiles'] = pd.Series(np.where(intrprt_df['rationale_scores']>0, intrprt_df['smiles'] + '_' + intrprt_df['rationale_smiles'], intrprt_df['smiles']))

        self.predictions_df[self.column_dict_key] = pd.Series(
            pd.Series(labels).fillna('').astype(str) + ' (' +
            pd.Series(np.where(predictions >= 0.5, predictions,
                               (1 - predictions))).round(2).astype(str) +
            ')').str.replace('(nan)', '', regex=False)
        if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
            self.model_errors.append('graph convolutional neural network')
            self.has_errors = True

        return predictions, labels
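
The final formatting step in Example #8 renders each result as "label (confidence)", where the confidence is the probability of the predicted class. A self-contained sketch of that string-building logic (values made up):

import numpy as np
import pandas as pd

predictions = pd.Series([0.87, 0.12, np.nan])   # NaN = invalid molecule
labels = pd.Series([1.0, 0.0, np.nan])

confidence = pd.Series(np.where(predictions >= 0.5,
                                predictions, 1 - predictions))
out = (labels.fillna('').astype(str) + ' (' +
       confidence.round(2).astype(str) + ')').str.replace('(nan)', '',
                                                          regex=False)
print(out.tolist())   # ['1.0 (0.87)', '0.0 (0.88)', ' ']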