Example #1
def find_similar_mols(test_smiles: List[str],
                      train_smiles: List[str],
                      distance_measure: str,
                      model: MoleculeModel = None,
                      num_neighbors: int = None,
                      batch_size: int = 50) -> List[OrderedDict]:
    """
    For each test molecule, finds the N most similar training molecules according to some distance measure.

    :param test_smiles: A list of test SMILES strings.
    :param train_smiles: A list of train SMILES strings.
    :param distance_measure: The distance measure to use to determine nearest neighbors.
    :param model: A trained MoleculeModel (only needed for distance_measure == 'embedding').
    :param num_neighbors: The number of nearest training molecules to find for each test molecule.
    :param batch_size: Batch size.
    :return: A list of OrderedDicts containing the test smiles, the num_neighbors nearest training smiles,
    and other relevant distance info.
    """
    test_data, train_data = get_data_from_smiles(test_smiles), get_data_from_smiles(train_smiles)
    train_smiles_set = set(train_smiles)

    print(f'Computing {distance_measure} vectors')
    if distance_measure == 'embedding':
        assert model is not None
        test_vecs = np.array(compute_molecule_vectors(model=model, data=test_data, batch_size=batch_size))
        train_vecs = np.array(compute_molecule_vectors(model=model, data=train_data, batch_size=batch_size))
        metric = 'cosine'
    elif distance_measure == 'morgan':
        test_vecs = np.array([morgan_binary_features_generator(smiles) for smiles in tqdm(test_smiles, total=len(test_smiles))])
        train_vecs = np.array([morgan_binary_features_generator(smiles) for smiles in tqdm(train_smiles, total=len(train_smiles))])
        metric = 'jaccard'
    else:
        raise ValueError(f'Distance measure "{distance_measure}" not supported.')

    print('Computing distances')
    distances = cdist(test_vecs, train_vecs, metric=metric)

    print('Finding neighbors')
    neighbors = []
    for test_index, test_smile in enumerate(test_smiles):
        # Find the num_neighbors molecules in the training set which are most similar to the test molecule
        nearest_train_indices = np.argsort(distances[test_index])[:num_neighbors]

        # Build dictionary with distance info
        neighbor = OrderedDict()
        neighbor['test_smiles'] = test_smile
        neighbor['test_in_train'] = test_smile in train_smiles_set

        for i, train_index in enumerate(nearest_train_indices):
            neighbor[f'train_{i + 1}_smiles'] = train_smiles[train_index]
            neighbor[f'train_{i + 1}_{distance_measure}_{metric}_distance'] = distances[test_index][train_index]

        neighbors.append(neighbor)

    return neighbors
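
A minimal usage sketch for the function above. The SMILES strings and neighbor count are illustrative, and the 'morgan' branch is used so no trained MoleculeModel is needed:

# Hypothetical usage of find_similar_mols; inputs are made up for illustration.
test = ['CCO', 'c1ccccc1']
train = ['CCN', 'CCC', 'Oc1ccccc1']

neighbors = find_similar_mols(test_smiles=test,
                              train_smiles=train,
                              distance_measure='morgan',
                              num_neighbors=2)

for entry in neighbors:
    # Key names follow the pattern built in the function: train_1_smiles, train_1_morgan_jaccard_distance, ...
    print(entry['test_smiles'],
          entry['train_1_smiles'],
          entry['train_1_morgan_jaccard_distance'])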
Example #2
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=self.train_args)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data=test_data,
                                  batch_size=batch_size,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)
        return avg_preds
Example #3
    def __call__(self,
                 smiles: List[str],
                 batch_size: int = 500) -> List[List[float]]:
        test_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=self.args.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = []
        for model in self.checkpoints:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=self.scaler,
                                  disable_progress_bar=True)
            sum_preds.append(np.array(model_preds))

        # Ensemble predictions
        sum_preds = sum(sum_preds)
        avg_preds = sum_preds / len(self.checkpoints)

        return avg_preds
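
The ensemble step in the two __call__ variants above is a plain element-wise mean over per-model predictions. A standalone numpy sketch of the same computation, with made-up prediction arrays:

import numpy as np

# Pretend predictions from a 3-model ensemble on 4 molecules, 1 task each.
per_model_preds = [np.array([[0.2], [0.8], [0.5], [0.1]]),
                   np.array([[0.3], [0.7], [0.4], [0.2]]),
                   np.array([[0.1], [0.9], [0.6], [0.0]])]

# Sum-then-divide, as in the examples above ...
avg_preds = sum(per_model_preds) / len(per_model_preds)

# ... is equivalent to a mean over the stacked model axis.
assert np.allclose(avg_preds, np.mean(np.stack(per_model_preds), axis=0))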
Example #4
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=self.train_args)
        valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        sum_preds = np.zeros((len(test_data), 1))
        for model in self.checkpoints:
            model_preds = predict(
                model=model,
                data=test_data,
                batch_size=batch_size,
                scaler=self.scaler
            )
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
Example #5
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction on a single SMILES string using the model saved at checkpoint_path.

    :param checkpoint_path: Path to the model checkpoint.
    :param smile: The SMILES string to make a prediction on.
    :return: The prediction for the first task of the molecule.
    """
    smiles = [smile]
    args = Namespace()
    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    # Ensemble predictions
    return sum_preds[0][0]
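
A hedged usage sketch of predict_smile; the checkpoint path below is a placeholder, not a real file:

# Hypothetical checkpoint path, purely for illustration.
pred = predict_smile(checkpoint_path='checkpoints/fold_0/model_0/model.pt',
                     smile='CC(=O)Oc1ccccc1C(=O)O')
print(pred)  # a single float: the prediction for the first task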
Example #6
    def gcnn_predict(self) -> Tuple[array, array]:
        """
        Function that handles graph convolutional neural network predictions, enters them into the predictions DataFrame, and reports any errors

        Parameters:
            rdkit_mols (array): a numpy array containing RDKit molecules

        Returns:
            predictions, prediction_labels (Tuple[array, array]): predictions and labels
        """

        smiles = self.rdkit_mols.tolist()
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=50,
                                              num_workers=0)

        model_preds = predict(model=rlm_gcnn_model,
                              data_loader=test_data_loader,
                              scaler=rlm_gcnn_scaler)
        predictions = np.ma.empty(len(full_data))
        predictions.mask = True
        labels = np.ma.empty(len(full_data))
        labels.mask = True
        for key in full_to_valid_indices.keys():
            full_index = int(key)
            predictions[full_index] = model_preds[
                full_to_valid_indices[key]][0]
            labels[full_index] = np.round(
                model_preds[full_to_valid_indices[key]][0], 0)

        self.predictions_df['GCNN'] = pd.Series(
            pd.Series(labels).fillna('').astype(str) + ' (' +
            pd.Series(predictions).round(2).astype(str) + ')').str.replace(
                '(nan)', '', regex=False)
        if len(self.predictions_df.index) > len(
                predictions) or np.ma.count_masked(predictions) > 0:
            self.model_errors.append('graph convolutional neural network')
            self.has_errors = True

        return predictions, labels
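
gcnn_predict above uses numpy masked arrays so that rows for invalid SMILES stay masked instead of holding placeholder values. A minimal sketch of that pattern in isolation, with made-up data:

import numpy as np

full_size = 4                       # total input molecules
model_preds = [[0.91], [0.12]]      # predictions for the 2 valid molecules only
full_to_valid = {0: 0, 3: 1}        # map: index in full input -> index in model_preds

predictions = np.ma.empty(full_size)
predictions.mask = True             # everything starts masked (missing)
for full_index, valid_index in full_to_valid.items():
    predictions[full_index] = model_preds[valid_index][0]  # assignment unmasks these entries

print(predictions)                      # [0.91 -- -- 0.12]
print(np.ma.count_masked(predictions))  # 2 invalid molecules remain masked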
Example #7
def create_crossval_splits():
    assay_file = '../data/assay_matrix_discrete_270_assays.csv'

    smiles = []
    smiles_ = []

    with open(assay_file) as f:
        next(f)
        reader = csv.reader(f)
        for row in reader:
            smiles.append([row[0]])
            smiles_.append(row[0])

    data = get_data_from_smiles(smiles)
    all_indices = list(range(len(data)))
    fold_indices = split_indices(all_indices, num_folds=5, data=data)
    array = np.array(fold_indices)
    print(array.shape)
    np.savez('../data/scaffold_based_split_jan22.npz', features=array)
Example #8
    def __call__(self, smiles, batch_size=500):
        test_data = get_data_from_smiles(
            smiles=[[s] for s in smiles],
            skip_invalid_smiles=False,
            features_generator=self.features_generator)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol[0] is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=batch_size)

        sum_preds = np.zeros((len(test_data), 1))
        for model, scaler, features_scaler in zip(self.checkpoints,
                                                  self.scalers,
                                                  self.features_scalers):
            test_data.reset_features_and_targets()
            if features_scaler is not None:
                test_data.normalize_features(features_scaler)

            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.checkpoints)
        avg_preds = avg_preds.squeeze(-1).tolist()

        # Put zero for invalid smiles
        full_preds = [0.0] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        return np.array(full_preds, dtype=np.float32)
Example #9
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [i for i in range(len(test_data)) if test_data[i].drug_mol is not None]
    full_data = test_data
    test_data = MolPairDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = ['drugSMILE', 'cmpdSMILE']

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = [test_smiles[i][0], test_smiles[i][1]]

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    row.extend([''] * args.num_tasks)

            writer.writerow(row)

    return avg_preds
Example #10
    def __call__(
            self,
            smiles: List[Optional[str]] = None,
            smiles2: List[Optional[str]] = None
    ) -> List[Optional[List[float]]]:
        if self.computed_prop:
            # Identify non-None smiles
            if len(smiles) > 0:
                valid_indices, valid_smiles = zip(
                    *[(i, smile) for i, smile in enumerate(smiles)
                      if smile is not None])
            else:
                valid_indices, valid_smiles = [], []

            valid_props = [
                self.scorer(valid_smile) for valid_smile in valid_smiles
            ]

            # Combine properties of non-None smiles with Nones
            props = [None] * len(smiles)
            for i, prop in zip(valid_indices, valid_props):
                props[i] = prop

            return props

        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
        valid_indices = [
            i for i in range(len(test_data)) if test_data[i].mol is not None
        ]
        full_data = test_data
        test_data = MoleculeDataset([test_data[i] for i in valid_indices])
        if self.features_generator is not None:
            self.generate_features(test_data)
            valid_indices = [
                i for i in range(len(test_data))
                if test_data[i].mol is not None
                and test_data[i].features is not None
            ]
            test_data = MoleculeDataset([test_data[i] for i in valid_indices])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        # Normalize features
        if self.train_args.features_scaling:
            test_data.normalize_features(self.features_scaler)

        # Predict with each model individually and sum predictions
        sum_preds = np.zeros((len(test_data), self.num_tasks))
        for chemprop_model in self.chemprop_models:
            model_preds = predict(model=chemprop_model,
                                  data=test_data,
                                  batch_size=self.batch_size,
                                  scaler=self.scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(self.chemprop_models)
        avg_preds = avg_preds.tolist()

        # Put Nones for invalid smiles
        full_preds = [None] * len(full_data)
        for i, si in enumerate(valid_indices):
            full_preds[si] = avg_preds[i]

        if self.neg_threshold:
            return [
                -p[self.prop_index] if p is not None else None
                for p in full_preds
            ]
        else:
            return [
                p[self.prop_index] if p is not None else None
                for p in full_preds
            ]
Example #11
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None
         or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             args=args,
                             target_columns=[],
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        task_names = task_names

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = avg_preds[valid_index] if valid_index is not None else [
            'Invalid SMILES'
        ] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
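
A hedged sketch of how this function might be invoked. The paths are placeholders, and it assumes chemprop's Tap-based PredictArgs with the usual command-line flag names; neither is confirmed by the listing above:

# Hypothetical invocation (placeholder paths, assumed flag names).
args = PredictArgs().parse_args([
    '--test_path', 'data/test_smiles.csv',
    '--checkpoint_dir', 'model_checkpoints',
    '--preds_path', 'predictions.csv',
])

# Either read SMILES from --test_path ...
preds = make_predictions(args)

# ... or pass SMILES directly and skip the file.
preds = make_predictions(args, smiles=['CCO', 'CC(=O)Oc1ccccc1C(=O)O'])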
Example #12
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_ale_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
        sum_epi_uncs = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
        sum_ale_uncs = np.zeros((len(test_data), args.num_tasks))
        sum_epi_uncs = np.zeros((len(test_data), args.num_tasks))

    # Partial results for variance robust calculation.
    all_preds = np.zeros(
        (len(test_data), args.num_tasks, len(args.checkpoint_paths)))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths))):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds
        print('\nmodel_preds\n', model_preds)
        print('ale_uncs\n', ale_uncs)

    # Ensemble predictions

    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty. This overwrites existing uncertainty estimates.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- var(preds)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = np.var(all_preds, axis=2)
        avg_epi_uncs = avg_epi_uncs.tolist()

    else:
        # Use another method to estimate uncertainty.
        # preds <- mean(preds), ale_uncs <- mean(ale_uncs), epi_uncs <- mean(epi_uncs)
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        avg_ale_uncs = sum_ale_uncs / len(args.checkpoint_paths)
        avg_ale_uncs = avg_ale_uncs.tolist()

        avg_epi_uncs = sum_epi_uncs / len(args.checkpoint_paths)
        avg_epi_uncs = avg_epi_uncs.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)

    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)

    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]

    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs

    test_smiles = full_data.smiles()

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)

        header = []

        if args.use_compound_names:
            header.append('compound_names')

        header.append('smiles')

        if args.dataset_type == 'multiclass':
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)

            header.extend([tn + "_ale_unc" for tn in args.task_names])

            header.extend([tn + "_epi_unc" for tn in args.task_names])

        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []

            if args.use_compound_names:
                row.append(compound_names[i])

            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if args.dataset_type == 'multiclass':
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                    row.extend(avg_ale_uncs[i])
                    row.extend(avg_epi_uncs[i])
            else:
                if args.dataset_type == 'multiclass':
                    row.extend([''] * args.num_tasks *
                               args.multiclass_num_classes)
                else:
                    # Both the prediction, the aleatoric uncertainty and the epistemic uncertainty are None
                    row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return avg_preds
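
The estimate_variance branch above uses the spread of per-model predictions as the epistemic-uncertainty estimate. A tiny numpy sketch of that reduction, with made-up numbers:

import numpy as np

# Shape (num_molecules, num_tasks, num_models): 2 molecules, 1 task, 3 models.
all_preds = np.array([[[0.20, 0.30, 0.10]],
                      [[0.80, 0.70, 0.90]]])

avg_preds = np.mean(all_preds, axis=2)   # ensemble prediction: mean over models
epi_uncs = np.var(all_preds, axis=2)     # epistemic uncertainty: variance over models

print(avg_preds)  # [[0.2] [0.8]]
print(epi_uncs)   # [[0.00666...] [0.00666...]]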
Example #13
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        compound_names = test_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    #for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
    # Load model
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    test_preds, test_smiles_batch = predict(model=model,
                                            data=test_data,
                                            batch_size=args.batch_size,
                                            scaler=scaler)

    return test_preds, test_smiles_batch
Example #14
    def gcnn_predict(self, model, scaler) -> Tuple[array, array]:
        """
        Function that handles graph convolutional neural network predictions, enters them into the predictions DataFrame, and reports any errors

        Parameters:
            model (model): the model to predict with
            scaler (scaler): the scaler applied to the model predictions

        Returns:
            predictions, prediction_labels (Tuple[array, array]): predictions and labels
        """

        smiles = self.kekule_smiles.tolist()
        full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
        full_to_valid_indices = {}
        valid_index = 0
        for full_index in range(len(full_data)):
            if full_data[full_index].mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

        # create data loader
        data_loader = MoleculeDataLoader(
            dataset=data,
            batch_size=50,
            num_workers=0
        )

        model_preds = predict(
            model=model,
            data_loader=data_loader,
            scaler=scaler
        )

        predictions = np.ma.empty(len(full_data))
        predictions.mask = True
        labels = np.ma.empty(len(full_data), dtype=np.int32)
        labels.mask = True
        for key in full_to_valid_indices.keys():
            full_index = int(key)
            predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
            labels[full_index] = np.round(model_preds[full_to_valid_indices[key]][0], 0)

        if self.smiles is not None:
            dt = datetime.datetime.now(timezone.utc)
            utc_time = dt.replace(tzinfo=timezone.utc)
            utc_timestamp = utc_time.timestamp()

            self.raw_predictions_df = self.raw_predictions_df.append(
                pd.DataFrame(
                    { 'SMILES': self.smiles, 'model': self.model_name, 'prediction': predictions, 'timestamp': utc_timestamp }
                ),
                ignore_index = True
            )

        # if self.interpret == True:
        #     intrprt_df = get_interpretation(self.smiles, self.model_name)
        # else:
        #     col_names = ['smiles', 'rationale_smiles', 'rationale_score']
        #     intrprt_df = pd.DataFrame(columns = col_names)


        #self.predictions_df['smiles'] = pd.Series(np.where(intrprt_df['rationale_scores']>0, intrprt_df['smiles'] + '_' + intrprt_df['rationale_smiles'], intrprt_df['smiles']))

        self.predictions_df[self.column_dict_key] = pd.Series(pd.Series(labels).fillna('').astype(str) + ' (' + pd.Series(np.where(predictions>=0.5, predictions, (1 - predictions))).round(2).astype(str) + ')').str.replace('(nan)', '', regex=False)
        if len(self.predictions_df.index) > len(predictions) or np.ma.count_masked(predictions) > 0:
            self.model_errors.append('graph convolutional neural network')
            self.has_errors = True

        return predictions, labels
Example #15
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """Makes predictions."""
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    if invalid_smiles_warning is not None:
        success_indices = []
        for i, s in enumerate(smiles):
            mol = Chem.MolFromSmiles(s)
            if mol is not None:
                success_indices.append(i)
        full_smiles = smiles
        smiles = [smiles[i] for i in success_indices]

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles)
    else:
        test_data = get_data(args.test_path,
                             args,
                             use_compound_names=args.compound_names)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()
    print('Test size = {:,}'.format(len(test_data)))

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print('Predicting with an ensemble of {} models'.format(
        len(args.checkpoint_paths)))
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=test_data,
                              args=args,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / args.ensemble_size
    avg_preds = avg_preds.tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print('Saving predictions to {}'.format(args.preds_path))

    with open(args.preds_path, 'w') as f:
        if args.write_smiles:
            f.write('smiles,')
        if args.compound_names:
            f.write('compound_name,')
        f.write(','.join(args.task_names) + '\n')

        for i in range(len(avg_preds)):
            if args.write_smiles:
                f.write(test_smiles[i] + ',')
            if args.compound_names:
                f.write(compound_names[i] + ',')
            f.write(','.join(str(p) for p in avg_preds[i]) + '\n')

    if invalid_smiles_warning is not None:
        full_preds = [[invalid_smiles_warning]
                      for _ in range(len(full_smiles))]
        for i, si in enumerate(success_indices):
            full_preds[si] = avg_preds[i]
        return full_preds

    return avg_preds
Example #16
with open(assay_train_file) as f:
    next(f)
    reader = csv.reader(f)
    for row in reader:
        smiles.append([row[0]])
        smiles_.append(row[0])

with open(assay_test_file) as f:
    next(f)
    reader = csv.reader(f)
    for row in reader:
        smiles.append([row[0]])
        smiles_.append(row[0])

fingerprints = []
data = get_data_from_smiles(smiles)
data = data.mols(flatten=True)
for i in range(len(data)):
    mf = morgan_binary_features_generator(data[i])
    fingerprints.append(mf)

fingerprints = np.array(fingerprints)

#print(fingerprints.shape)
np.savez('fingerprints.npz', features=fingerprints)

fps = []
for i in range(len(data)):
    fps.append(Chem.RDKFingerprint(data[i]))

similarity = np.zeros([16978, 16978])
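
The snippet is cut off just after allocating the similarity matrix. A hedged sketch of a plausible continuation, filling the matrix with pairwise Tanimoto similarities over the RDKFingerprint list built above; whether the original script did exactly this is an assumption:

from rdkit import DataStructs

# Assumed continuation: pairwise Tanimoto similarity between all fingerprints.
similarity = np.zeros([len(fps), len(fps)])
for i in range(len(fps)):
    # Compare fingerprint i against the whole list in one call.
    similarity[i, :] = DataStructs.BulkTanimotoSimilarity(fps[i], fps)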