Exemple #1
0
    def __init__(self,
                 smiles: str,
                 targets: List[Optional[float]] = None,
                 row: OrderedDict = None,
                 features: np.ndarray = None,
                 features_generator: List[str] = None):
        """
        :param smiles: The SMILES string for the molecule.
        :param targets: A list of targets for the molecule (contains None for unknown target values).
        :param row: The raw CSV row containing the information for this molecule.
        :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
        :param features_generator: A list of features generators to use.
        """
        # Pre-loaded features and on-the-fly generation are mutually exclusive.
        if features is not None and features_generator is not None:
            raise ValueError(
                'Cannot provide both loaded features and a features generator.'
            )

        self.smiles = smiles
        self.targets = targets
        self.row = row
        self.features = features
        self.features_generator = features_generator
        # Sentinel string 'None' means "not parsed yet", so a later value of
        # None can unambiguously mean "invalid molecule".
        self._mol = 'None'

        # Generate additional features if a generator was requested
        if self.features_generator is not None:
            generated = []
            for generator_name in self.features_generator:
                generator = get_features_generator(generator_name)
                # Skip invalid molecules and molecules with no heavy atoms
                if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                    generated.extend(generator(self.mol))
            self.features = np.array(generated)

        # Replace any NaNs in the features with zeros
        if self.features is not None:
            self.features = np.where(np.isnan(self.features), 0, self.features)

        # Keep unscaled copies to enable different scaling later on.
        # NOTE(review): these are the constructor *arguments*, so raw_features
        # is None when a features generator was used — confirm downstream use.
        self.raw_features, self.raw_targets = features, targets
Exemple #2
0
    def __init__(self,
                 smiles: str,
                 mol: Chem.Mol,
                 targets: np.ndarray,
                 features: np.ndarray = None,
                 features_generator: List[str] = None,
                 compound_name: str = None):
        '''
        Initializes a MoleculeDatapoint, which contains a single molecule.

        :param smiles: Smiles string
        :param mol: Chem.Mol
        :param targets: A numpy array containing targets
        :param features: A numpy array containing additional features
        (e.g. Morgan fingerprint).
        :param features_generator: List of strings of features_generator names.
        :param compound_name: Optional compound name for this molecule.
        '''
        self.smiles = smiles
        self.mol = mol
        self.compound_name = compound_name

        # Bug fix: use explicit `is not None` checks. Truth-testing a numpy
        # array with more than one element raises
        # "ValueError: The truth value of an array ... is ambiguous".
        if features is not None and features_generator is not None:
            raise ValueError('Currently cannot provide both loaded features'
                             ' and a features generator.')

        if features_generator:
            # Generate additional features if given a generator:
            features = []

            for fg in features_generator:
                # Renamed from `features_generator` to avoid shadowing the
                # list being iterated.
                generator = get_features_generator(fg)

                # Skip invalid molecules and molecules without heavy atoms
                if self.mol and self.mol.GetNumHeavyAtoms():
                    # Bug fix: extend the local `features` list. The original
                    # extended `self.features`, which is never created before
                    # this point and raised AttributeError.
                    features.extend(generator(self.mol))

            # Bug fix: convert the local list (not the nonexistent self.features)
            features = np.array(features)

        self.set_features(features)
        self.set_targets(targets)
Exemple #3
0
def predict_sklearn(args: Namespace):
    """Predict on a test set with an ensemble of pickled scikit-learn models and save a CSV of predictions."""
    print('Loading data')
    data = get_data(path=args.test_path)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        datapoint.set_features(
            morgan_fingerprint(mol=datapoint.smiles,
                               radius=args.radius,
                               num_bits=args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Each checkpoint is an independently trained, pickled sklearn model
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        sum_preds += np.array(
            predict(model=model,
                    model_type=args.model_type,
                    dataset_type=args.dataset_type,
                    features=data.features()))

    # Average the accumulated per-model predictions into ensemble predictions
    avg_preds = (sum_preds / len(args.checkpoint_paths)).tolist()

    print('Saving predictions')
    assert len(data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['smiles'] + get_task_names(args.test_path))
        for smiles, pred in zip(data.smiles(), avg_preds):
            writer.writerow([smiles] + pred)
Exemple #4
0
    def __init__(self,
                 smiles: List[str],
                 targets: List[Optional[float]] = None,
                 row: OrderedDict = None,
                 features: np.ndarray = None,
                 features_generator: List[str] = None,
                 atom_features: np.ndarray = None,
                 atom_descriptors: np.ndarray = None):
        """
        :param smiles: A list of the SMILES strings for the molecules.
        :param targets: A list of targets for the molecule (contains None for unknown target values).
        :param row: The raw CSV row containing the information for this molecule.
        :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
        :param features_generator: A list of features generators to use.
        :param atom_features: A numpy array containing additional atom features.
        :param atom_descriptors: A numpy array containing additional atom descriptors.
        """
        # Loaded features and generated features are mutually exclusive
        if features is not None and features_generator is not None:
            raise ValueError(
                'Cannot provide both loaded features and a features generator.'
            )

        self.smiles = smiles
        self.targets = targets
        self.row = row
        self.features = features
        self.features_generator = features_generator
        self.atom_descriptors = atom_descriptors
        self.atom_features = atom_features

        # Generate additional features if a generator was requested
        if self.features_generator is not None:
            generated = []
            for generator_name in self.features_generator:
                generator = get_features_generator(generator_name)
                for mol in self.mol:
                    if mol is None:
                        continue  # invalid molecule: contributes nothing
                    if mol.GetNumHeavyAtoms() > 0:
                        generated.extend(generator(mol))
                    else:
                        # Molecule with no heavy atoms (e.g. H2): pad with zeros.
                        # Not all generators produce equally long vectors, so
                        # methane is used as a dummy molecule to get the length.
                        generated.extend(
                            np.zeros(
                                len(generator(Chem.MolFromSmiles('C')))))
            self.features = np.array(generated)

        # Replace NaNs with replace_token in every feature array that is present
        replace_token = 0
        if self.features is not None:
            self.features = np.where(np.isnan(self.features), replace_token,
                                     self.features)
        for attr_name in ('atom_descriptors', 'atom_features'):
            value = getattr(self, attr_name)
            if value is not None:
                setattr(self, attr_name,
                        np.where(np.isnan(value), replace_token, value))

        # Save a copy of the raw features and targets to enable different scaling later on
        self.raw_features, self.raw_targets = self.features, self.targets
Exemple #5
0
def generate_and_save_features(args: Namespace):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Supports restarting from partially computed features stored in a temporary
    directory next to :code:`args.save_path`.

    :param args: Arguments (data_path, save_path, features_generator, restart,
                 sequential, save_frequency).
    :raises ValueError: If :code:`args.save_path` already exists and
                        :code:`args.restart` is False.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function
    data = get_data(path=args.data_path, max_data_size=None)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(
                f'"{args.save_path}" already exists and args.restart is False.'
            )

        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    data = data[len(
        features
    ):]  # restrict to data for which features have not been computed yet
    mols = (d.mol for d in data)

    # Bug fix: keep a handle on the Pool so its worker processes can be
    # closed and joined after the map is consumed (previously leaked).
    pool = None
    if args.sequential:
        features_map = map(features_generator, mols)
    else:
        pool = Pool()
        features_map = pool.imap(features_generator, mols)

    # Get features
    temp_features = []
    for i, feats in tqdm(enumerate(features_map), total=len(data)):
        temp_features.append(feats)

        # Save temporary features every save_frequency
        if (i > 0 and
            (i + 1) % args.save_frequency == 0) or i == len(data) - 1:
            save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'),
                          temp_features)
            features.extend(temp_features)
            temp_features = []
            temp_num += 1

    # Release worker processes now that all features have been consumed
    if pool is not None:
        pool.close()
        pool.join()

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print(
            'Features array is too large to save as a single file. Instead keeping features as a directory of files.'
        )
def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns test scores for the model checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    if logger is None:
        debug = info = print
    else:
        debug, info = logger.debug, logger.info

    # SVMs in sklearn are inherently single-output
    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # A validation split is still produced (and discarded) so the train and
    # test sets match those used when training an MPN.
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    fingerprint = get_features_generator('morgan')
    for split in (train_data, test_data):
        for datapoint in tqdm(split, total=len(split)):
            datapoint.set_features(
                fingerprint(mol=datapoint.smiles,
                            radius=args.radius,
                            num_bits=args.num_bits))

    debug('Building model')
    # Choose the estimator from (dataset_type, model_type)
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    # Attach the training arguments to the model for later inspection
    model.train_args = args.as_dict()

    debug('Training')
    trainer = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = trainer(model=model,
                     train_data=train_data,
                     test_data=test_data,
                     metrics=args.metrics,
                     args=args,
                     logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
Exemple #7
0
def compare_datasets_tsne(args: Args):
    """Plot a joint t-SNE embedding (Morgan fingerprints, Jaccard metric) of several SMILES datasets.

    Each dataset gets its own color/size (or HDBSCAN cluster colors when
    ``args.cluster`` is set); the figure is saved to ``args.save_path``.
    """
    if len(args.smiles_paths) > len(args.colors) or len(
            args.smiles_paths) > len(args.sizes):
        raise ValueError(
            'Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Load the smiles datasets
    print('Loading data')
    # slices[i] indexes dataset i's rows within the concatenated `smiles` list
    smiles, slices, labels = [], [], []
    for smiles_path in args.smiles_paths:
        # Get label (dataset file name without extension)
        label = os.path.basename(smiles_path).replace('.csv', '')

        # Get SMILES
        new_smiles = get_smiles(path=smiles_path,
                                smiles_columns=args.smiles_column,
                                flatten=True)
        print(f'{label}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles,
                                          size=args.max_per_dataset,
                                          replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        labels.append(label)
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [
        morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))
    ]

    print('Running t-SNE')
    start = time.time()
    # Jaccard distance is the natural metric for binary Morgan fingerprints
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start:.2f} seconds')

    if args.cluster:
        import hdbscan  # pip install hdbscan
        print('Running HDBSCAN')
        start = time.time()
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        # Cluster labels double as point colors below
        colors = clusterer.fit_predict(X)
        print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    # Min-max normalize embedding coordinates to [0, 1] per axis
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity',
              fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    if args.cluster:
        # Single scatter colored by HDBSCAN cluster assignment
        plt.scatter(X[:, 0],
                    X[:, 1],
                    s=150 * np.mean(args.sizes),
                    c=colors,
                    cmap='nipy_spectral')
    else:
        # One scatter (or molecule-image overlay) per dataset
        for slc, color, label, size in zip(slices, args.colors, labels,
                                           args.sizes):
            if args.plot_molecules:
                # Plots molecules
                handles.append(mpatches.Patch(color=color, label=label))

                for smile, (x, y) in zip(smiles[slc], X[slc]):
                    img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)],
                                               molsPerRow=1,
                                               subImgSize=(200, 200))
                    imagebox = offsetbox.AnnotationBbox(
                        offsetbox.OffsetImage(img), (x, y),
                        bboxprops=dict(color=color))
                    ax.add_artist(imagebox)
            else:
                # Plots points
                plt.scatter(X[slc, 0],
                            X[slc, 1],
                            s=150 * size,
                            color=color,
                            label=label)

        if args.plot_molecules:
            legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
Exemple #8
0
    def __init__(self,
                 drug_smiles: str,
                 cmpd_smiles: str,
                 targets: List[float],
                 args: Namespace = None,
                 drug_feats: np.ndarray = None,
                 cmpd_feats: np.ndarray = None,
                 context: np.ndarray = None,
                 use_compound_names: bool = False):
        """
        Initializes a MolPairDatapoint, which contains a molecule pair.

        :param drug_smiles: SMILES string of the first molecule of the pair.
        :param cmpd_smiles: SMILES string of the second molecule of the pair.
        :param targets: A list of target values for the pair.
        :param args: Arguments.
        :param drug_feats: A numpy array containing additional features for molecule 1 (ex. Morgan fingerprint).
        :param cmpd_feats: A numpy array containing additional features for molecule 2 (ex. Morgan fingerprint).
        :param context: A numpy array containing additional features defining context (ex. cell line).

        NOT SUPPORTED
        :param use_compound_names: Whether the data CSV includes the compound name on each line.
        """
        if args is None:
            self.features_generator = self.args = None
        else:
            self.features_generator = args.features_generator
            self.args = args

        # Loaded features and a features generator are mutually exclusive
        loaded_feats = drug_feats is not None or cmpd_feats is not None
        if loaded_feats and self.features_generator is not None:
            raise ValueError(
                'Currently cannot provide both loaded features and a features generator.'
            )

        self.drug_feats = drug_feats
        self.cmpd_feats = cmpd_feats
        self.context = context

        if use_compound_names:
            raise NotImplementedError
        self.compound_name = None

        # Parse both SMILES strings into RDKit molecules (None if invalid)
        self.drug_smiles = drug_smiles  # str
        self.drug_mol = Chem.MolFromSmiles(self.drug_smiles)

        self.cmpd_smiles = cmpd_smiles  # str
        self.cmpd_mol = Chem.MolFromSmiles(self.cmpd_smiles)
        # Create targets
        self.targets = targets

        # Generate additional features if given a generator
        if self.features_generator is not None:
            drug_generated, cmpd_generated = [], []

            for generator_name in self.features_generator:
                generator = get_features_generator(generator_name)
                # Only featurize valid molecules that have heavy atoms
                if self.drug_mol is not None and self.drug_mol.GetNumHeavyAtoms() > 0:
                    drug_generated.extend(generator(self.drug_mol))
                if self.cmpd_mol is not None and self.cmpd_mol.GetNumHeavyAtoms() > 0:
                    cmpd_generated.extend(generator(self.cmpd_mol))

            self.drug_feats = np.array(drug_generated)
            self.cmpd_feats = np.array(cmpd_generated)

        # Replace NaNs with zeros in every feature array that is present
        replace_token = 0
        for attr_name in ('drug_feats', 'cmpd_feats', 'context'):
            value = getattr(self, attr_name)
            if value is not None:
                setattr(self, attr_name,
                        np.where(np.isnan(value), replace_token, value))
Exemple #9
0
    def __init__(
        self,
        smiles: List[str],
        targets: List[Optional[float]] = None,
        row: OrderedDict = None,
        features: np.ndarray = None,
        features_generator: List[str] = None,
        atom_features: np.ndarray = None,
        atom_descriptors: np.ndarray = None,
        bond_features: np.ndarray = None,
        overwrite_default_atom_features: bool = False,
        overwrite_default_bond_features: bool = False,
    ):
        """
        :param smiles: A list of the SMILES strings for the molecules.
        :param targets: A list of targets for the molecule (contains None for unknown target values).
        :param row: The raw CSV row containing the information for this molecule.
        :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
        :param features_generator: A list of features generators to use.
        :param atom_features: A numpy array containing additional atom features to featurize the molecule
        :param atom_descriptors: A numpy array containing additional atom descriptors to featurize the molecule
        :param bond_features: A numpy array containing additional bond features to featurize the molecule
        :param overwrite_default_atom_features: Boolean to overwrite default atom features by atom_features
        :param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features
        """
        # Loaded features and generated features are mutually exclusive
        if features is not None and features_generator is not None:
            raise ValueError(
                "Cannot provide both loaded features and a features generator."
            )

        self.smiles = smiles
        self.targets = targets
        self.row = row
        self.features = features
        self.features_generator = features_generator
        self.atom_descriptors = atom_descriptors
        self.atom_features = atom_features
        self.bond_features = bond_features
        self.overwrite_default_atom_features = overwrite_default_atom_features
        self.overwrite_default_bond_features = overwrite_default_bond_features

        # One-hot vocabulary: maps each supported SMILES character to its index
        vocabulary = "#%)(+-/.0123456789=A@CBEDGFIHKMLONPSRUTWVY[Z]\\acbedgfihmlonsrutpy"
        self.one_hot_lookup = dict(
            (char, idx) for idx, char in enumerate(vocabulary))
        self.smiles_one_hot_encoding = self.one_hot_encode_smiles()

        # Generate additional features if a generator was requested
        if self.features_generator is not None:
            generated = []
            for generator_name in self.features_generator:
                generator = get_features_generator(generator_name)
                for mol in self.mol:
                    if mol is None:
                        continue  # invalid molecule: contributes nothing
                    if mol.GetNumHeavyAtoms() > 0:
                        generated.extend(generator(mol))
                    else:
                        # Molecule with no heavy atoms (e.g. H2): pad with zeros.
                        # Not all generators produce equally long vectors, so
                        # methane is used as a dummy molecule to get the length.
                        generated.extend(
                            np.zeros(
                                len(generator(Chem.MolFromSmiles("C")))))
            self.features = np.array(generated)

        # Replace NaNs with replace_token in every feature array that is present
        replace_token = 0
        for attr_name in ("features", "atom_descriptors", "atom_features",
                          "bond_features"):
            value = getattr(self, attr_name)
            if value is not None:
                setattr(self, attr_name,
                        np.where(np.isnan(value), replace_token, value))

        # Save raw (unscaled) copies to enable different scaling later on
        self.raw_features, self.raw_targets = self.features, self.targets
        self.raw_atom_descriptors = self.atom_descriptors
        self.raw_atom_features = self.atom_features
        self.raw_bond_features = self.bond_features
Exemple #10
0
def predict_sklearn(args: SklearnPredictArgs):
    """
    Predicts on a test set with an ensemble of pickled scikit-learn models,
    processing the data in parcels and writing one predictions CSV per parcel.

    :param args: A :class:`SklearnPredictArgs` object with paths, parcel sizing
                 and model/dataset type information.
    """
    # Determine how many parcels to process; with no parcel sizing configured,
    # everything is handled in a single iteration.
    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
    offset = 0

    for iteration in range(num_iterations):

        # Advance the data window by one parcel per iteration
        if iteration > 0:
            offset = offset + args.parcel_size
            max_data_size = max_data_size + args.parcel_size

        print('Loading data')
        data = get_data(path=args.test_path,
                        smiles_column=args.smiles_column,
                        target_columns=[],
                        max_data_size=max_data_size,
                        data_offset=offset)

        print('Computing morgan fingerprints')
        morgan_fingerprint = get_features_generator('morgan')
        for datapoint in tqdm(data, total=len(data)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        sum_preds = np.zeros((len(data), args.num_tasks))

        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # Each checkpoint is an independently trained, pickled sklearn model
            with open(checkpoint_path, 'rb') as f:
                model = pickle.load(f)

            model_preds = predict(model=model,
                                  model_type=args.model_type,
                                  dataset_type=args.dataset_type,
                                  features=data.features())
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        print(f'Saving predictions to {args.preds_path}')
        assert len(data) == len(avg_preds)
        makedirs(args.preds_path, isfile=True)

        # Copy predictions over to data
        task_names = get_task_names(path=args.test_path)
        for datapoint, preds in zip(data, avg_preds):
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        # Save: iterations after the first get a numbered suffix so parcels
        # do not clobber each other
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = "{name}.{it}{ext}".format(name=name,
                                                   it=iteration,
                                                   ext=ext)
        else:
            preds_path = args.preds_path
        # Bug fix: open the per-iteration preds_path. The original opened
        # args.preds_path, so every parcel overwrote the same file and only
        # the last parcel's predictions survived.
        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
            writer.writeheader()

            for datapoint in data:
                writer.writerow(datapoint.row)
Exemple #11
0
def compare_datasets_tsne(smiles_paths: List[str],
                          smiles_col_name: str,
                          colors: List[str],
                          plot_molecules: bool,
                          max_num_per_dataset: int,
                          save_path: str):
    """Plot a joint t-SNE embedding (Morgan fingerprints, Jaccard metric) of several SMILES datasets.

    :param smiles_paths: CSV files, each containing one dataset of SMILES.
    :param smiles_col_name: Name of the SMILES column in each CSV.
    :param colors: One plot color per dataset (must be at least as many as datasets).
    :param plot_molecules: If True, draw molecule images instead of scatter points.
    :param max_num_per_dataset: Subsample each dataset down to this many molecules.
    :param save_path: Where to save the resulting figure.
    """
    assert len(smiles_paths) <= len(colors)

    # Random seed for random subsampling
    np.random.seed(1)

    # Genenrate labels based on file name
    labels = [os.path.basename(path).replace('.csv', '') for path in smiles_paths]

    # Load the smiles datasets
    print('Loading data')
    # slices[i] indexes dataset i's rows within the concatenated `smiles` list
    smiles, slices = [], []
    for smiles_path, color in zip(smiles_paths, colors):
        # Get SMILES
        new_smiles = pd.read_csv(smiles_path)[smiles_col_name]
        new_smiles = list(new_smiles[new_smiles.notna()])  # Exclude empty strings
        print(f'{os.path.basename(smiles_path)}: {len(new_smiles):,}')

        # Subsample if dataset is too large
        if len(new_smiles) > max_num_per_dataset:
            print(f'Subsampling to {max_num_per_dataset:,} molecules')
            new_smiles = np.random.choice(new_smiles, size=max_num_per_dataset, replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(new_smiles)))
        smiles += new_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    morgans = [morgan_generator(smile) for smile in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    import time
    start = time.time()
    # Jaccard distance is the natural metric for binary Morgan fingerprints
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    X = tsne.fit_transform(morgans)
    print(f'time = {time.time() - start}')

    print('Plotting t-SNE')
    # Min-max normalize embedding coordinates to [0, 1] per axis
    x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
    X = (X - x_min) / (x_max - x_min)

    makedirs(save_path, isfile=True)

    plt.clf()
    scale = 10
    fontsize = 5 * scale
    fig = plt.figure(figsize=(6.4 * scale, 4.8 * scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity', fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    # One scatter (or molecule-image overlay) per dataset
    for slc, color, label in zip(slices, colors, labels):
        if plot_molecules:
            # Plots molecules
            handles.append(mpatches.Patch(color=color, label=label))

            for smile, (x, y) in zip(smiles[slc], X[slc]):
                img = Draw.MolsToGridImage([Chem.MolFromSmiles(smile)], molsPerRow=1, subImgSize=(200, 200))
                imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(img), (x, y), bboxprops=dict(color=color))
                ax.add_artist(imagebox)
        else:
            # Plots points; the 'sars_pos' dataset is drawn larger for emphasis
            s = 450 if label == 'sars_pos' else 150
            plt.scatter(X[slc, 0], X[slc, 1], s=s, color=color, label=label)

    if plot_molecules:
        legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])
    plt.savefig(save_path)

    # Plot pairs of sars_pos and other dataset
    if 'sars_pos' in labels:
        pos_index = labels.index('sars_pos')
        for index in range(len(labels) - 1):
            plt.clf()
            fontsize = 50
            plt.figure(figsize=(6.4 * 10, 4.8 * 10))
            plt.title('t-SNE using Morgan fingerprint with Jaccard similarity', fontsize=2 * fontsize)

            # Overlay one other dataset with sars_pos drawn larger on top
            plt.scatter(X[slices[index], 0], X[slices[index], 1], s=150, color=colors[index], label=labels[index])
            plt.scatter(X[slices[pos_index], 0], X[slices[pos_index], 1], s=450, color=colors[pos_index], label=labels[pos_index])

            plt.xticks([]), plt.yticks([])
            plt.legend(loc='upper right', fontsize=fontsize)
            plt.savefig(save_path.replace('.png', f'_{labels[index]}.png'))
import os
from typing import List, Optional, Tuple, Union

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from tqdm import tqdm

from chemprop.data.utils import get_smiles
from chemprop.features import get_features_generator

# Sentinel SMILES marker for unconditional evaluation.
# NOTE(review): the consumer of this marker is outside this chunk — confirm usage.
UNCONDITIONAL_EVAL_DUMMY = '<DUMMY>'

# Cache mapping SMILES -> Morgan fingerprint.
# NOTE(review): appears to be filled elsewhere in the module — confirm.
SMILES_TO_MORGAN = {}

# Module-level Morgan fingerprint generator from chemprop's features registry.
morgan_fingerprint = get_features_generator('morgan')


def get_pred_smiles(
    pred_smiles_dir: str,
    return_num_decode: bool = False
) -> Union[List[str], Tuple[List[str], int]]:
    # Collect all paths to .txt files
    pred_paths = []
    for root, _, files in os.walk(pred_smiles_dir):
        pred_paths += [
            os.path.join(root, fname) for fname in files
            if fname.endswith('.txt')
        ]

    # Get SMILES
    all_smiles = []
Exemple #13
0
    def __init__(self,
                 line: pd.Series,
                 args: Namespace = None,
                 features: np.ndarray = None,
                 use_compound_names: bool = False,
                 pred: bool = False):
        """
        Initializes a MoleculeDatapoint, which contains a single molecule.

        :param line: A pandas Series holding the SMILES, the targets, and
                     (optionally) the compound name for one molecule.
        :param args: Arguments.
        :param features: A numpy array containing additional features (ex. Morgan fingerprint).
        :param use_compound_names: Whether the data CSV includes the compound name on each line.
        :param pred: If True, skip target creation (prediction-only datapoint).
        :raises ValueError: If both loaded features and a features generator are provided.
        """
        if args is not None:
            self.features_generator = args.features_generator
            self.args = args
        else:
            self.features_generator = self.args = None

        if features is not None and self.features_generator is not None:
            raise ValueError(
                'Currently cannot provide both loaded features and a features generator.'
            )

        self.features = features

        if use_compound_names:
            self.compound_name = line['compound_name']  # str
            line = line.drop('compound_name')
        else:
            self.compound_name = None

        self.smiles = line['smiles']  # str
        self.mol = Chem.MolFromSmiles(self.smiles)

        # Generate additional features if given a generator
        if self.features_generator is not None:
            self.features = []

            for fg in self.features_generator:
                features_generator = get_features_generator(fg)
                # Skip invalid SMILES (mol is None) and atom-free molecules.
                if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                    self.features.extend(features_generator(self.mol))

            self.features = np.array(self.features)

        # Fix nans in features
        if self.features is not None:
            replace_token = 0
            self.features = np.where(np.isnan(self.features), replace_token,
                                     self.features)

        line = line.drop('smiles')

        if pred:
            return

        # Create targets
        # atom targets
        self.atom_targets = line[args.atom_targets].values.tolist()

        # Bug fix: Chem.AddHs was previously re-applied inside the loop below,
        # once per bond target column. Adding explicit hydrogens is needed at
        # most once, before iterating the bonds.
        if args.bond_targets and args.explicit_Hs:
            self.mol = Chem.AddHs(self.mol)

        bond_targets = []
        for b_target_name in args.bond_targets:
            bond_target = line[b_target_name]
            bond_target_arranged = []
            # Re-order target matrix entries to follow RDKit's bond ordering.
            for b in self.mol.GetBonds():
                bond_target_arranged.append(
                    bond_target[b.GetBeginAtom().GetIdx(),
                                b.GetEndAtom().GetIdx()])
            bond_targets.append(bond_target_arranged)

        self.bond_targets = bond_targets
Exemple #14
0
    def __init__(self,
                 drug_smiles: str,
                 cmpd_smiles: str,
                 targets: List[float],
                 args: Namespace = None,
                 drug_feats: np.ndarray = None,
                 cmpd_feats: np.ndarray = None,
                 context: np.ndarray = None,
                 no_generation: bool = False):
        """
        Initializes a MolPairDatapoint, which contains a molecule pair.

        :param drug_smiles: SMILES string of molecule 1 (the drug).
        :param cmpd_smiles: SMILES string of molecule 2 (the compound).
        :param targets: A list of target values for the pair.
        :param args: Arguments.
        :param drug_feats: A numpy array containing additional features for molecule 1 (ex. Morgan fingerprint).
        :param cmpd_feats: A numpy array containing additional features for molecule 2 (ex. Morgan fingerprint).
        :param context: A numpy array containing additional features defining context (ex. cell line).
        :param no_generation: Overrides args. If True, generation doesn't happen.

        NOT SUPPORTED
        :param use_compound_names: Whether the data CSV includes the compound name on each line.
        """
        if args is not None:
            self.features_generator = args.features_generator
            self.args = args
        else:
            self.features_generator = self.args = None

        self.compound_name = None

        self.drug_smiles = drug_smiles  # str
        self.drug_mol = Chem.MolFromSmiles(self.drug_smiles)

        self.cmpd_smiles = cmpd_smiles  # str
        self.cmpd_mol = Chem.MolFromSmiles(self.cmpd_smiles)
        # Create targets
        self.targets = targets

        # Set features
        self.drug_feats = drug_feats
        self.cmpd_feats = cmpd_feats
        self.context = context
        # Generate additional features if given a generator
        if not no_generation and self.features_generator is not None:
            gen_drug_feats = []
            gen_cmpd_feats = []

            for fg in self.features_generator:
                features_generator = get_features_generator(fg)
                # Skip invalid SMILES (mol is None) and atom-free molecules.
                if self.drug_mol is not None and self.drug_mol.GetNumHeavyAtoms() > 0:
                    gen_drug_feats.extend(features_generator(self.drug_mol))
                if self.cmpd_mol is not None and self.cmpd_mol.GetNumHeavyAtoms() > 0:
                    gen_cmpd_feats.extend(features_generator(self.cmpd_mol))

            # Bug fix: the previous truthiness tests (`if self.drug_feats:`)
            # raise "truth value of an array ... is ambiguous" for any
            # multi-element numpy array; test against None explicitly.
            if self.drug_feats is not None:  # Concatenate with given input feats if already exist
                self.drug_feats = np.concatenate((self.drug_feats, gen_drug_feats))
            else:
                self.drug_feats = np.array(gen_drug_feats)
            if self.cmpd_feats is not None:
                self.cmpd_feats = np.concatenate((self.cmpd_feats, gen_cmpd_feats))
            else:
                self.cmpd_feats = np.array(gen_cmpd_feats)

        # Fix nans in features
        if self.drug_feats is not None:
            replace_token = 0
            self.drug_feats = np.where(np.isnan(self.drug_feats), replace_token, self.drug_feats)
        if self.cmpd_feats is not None:
            replace_token = 0
            self.cmpd_feats = np.where(np.isnan(self.cmpd_feats), replace_token, self.cmpd_feats)
        if self.context is not None:
            replace_token = 0
            self.context = np.where(np.isnan(self.context), replace_token, self.context)
Exemple #15
0
    def __init__(self,
                 line: List[str],
                 args: Namespace = None,
                 features: np.ndarray = None,
                 use_compound_names: bool = False):
        """
        Initializes a MoleculeDatapoint, which contains a single molecule.

        :param line: A list of strings generated by separating a line in a data CSV file by comma.
        :param args: Arguments.
        :param features: A numpy array containing additional features (ex. Morgan fingerprint).
        :param use_compound_names: Whether the data CSV includes the compound name on each line.
        :raises ValueError: If both loaded features and a features generator are provided.
        """
        if args is not None:
            self.features_generator = args.features_generator
            self.args = args
        else:
            self.features_generator = self.args = None

        if features is not None and self.features_generator is not None:
            raise ValueError(
                'Currently cannot provide both loaded features and a features generator.'
            )

        self.features = features

        if use_compound_names:
            self.compound_name = line[0]  # str
            line = line[1:]
        else:
            self.compound_name = None

        self.smiles = line[0]  # str
        self.mol = Chem.MolFromSmiles(self.smiles)

        # Generate additional features if given a generator
        if self.features_generator is not None:
            self.features = []

            for fg in self.features_generator:
                features_generator = get_features_generator(fg)
                # Skip invalid SMILES (mol is None) and atom-free molecules.
                if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                    self.features.extend(features_generator(self.mol))

            self.features = np.array(self.features)

        # Fix nans in features
        if self.features is not None:
            replace_token = 0
            self.features = np.where(np.isnan(self.features), replace_token,
                                     self.features)

        # Create targets: only the single target column at index 1 is used
        # (empty string means an unknown target).
        self.targets = [float(line[1]) if line[1] != '' else None]

        # Optional per-datapoint weight in column 2; defaults to 1.0.
        if len(line) > 2:
            self.weights = [float(line[2]) if line[2] != '' else None]
        else:
            self.weights = [1.0]
Exemple #16
0
    def __init__(self,
                 smiles: List[str],
                 targets: List[Optional[float]] = None,
                 row: OrderedDict = None,
                 data_weight: float = None,
                 gt_targets: List[bool] = None,
                 lt_targets: List[bool] = None,
                 features: np.ndarray = None,
                 features_generator: List[str] = None,
                 phase_features: List[float] = None,
                 atom_features: np.ndarray = None,
                 atom_descriptors: np.ndarray = None,
                 bond_features: np.ndarray = None,
                 overwrite_default_atom_features: bool = False,
                 overwrite_default_bond_features: bool = False):
        """
        :param smiles: A list of the SMILES strings for the molecules.
        :param targets: A list of targets for the molecule (contains None for unknown target values).
        :param row: The raw CSV row containing the information for this molecule.
        :param data_weight: Weighting of the datapoint for the loss function.
        :param gt_targets: Indicates whether the targets are an inequality regression target of the form ">x".
        :param lt_targets: Indicates whether the targets are an inequality regression target of the form "<x".
        :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
        :param features_generator: A list of features generators to use.
        :param phase_features: A one-hot vector indicating the phase of the data, as used in spectra data.
        :param atom_features: A numpy array containing additional atom features to featurize the molecule
        :param atom_descriptors: A numpy array containing additional atom descriptors to featurize the molecule
        :param bond_features: A numpy array containing additional bond features to featurize the molecule
        :param overwrite_default_atom_features: Boolean to overwrite default atom features by atom_features
        :param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features

        :raises ValueError: If both loaded features and a features generator are provided.
        """
        if features is not None and features_generator is not None:
            raise ValueError(
                'Cannot provide both loaded features and a features generator.'
            )

        self.smiles = smiles
        self.targets = targets
        self.row = row
        self.features = features
        self.features_generator = features_generator
        self.phase_features = phase_features
        self.atom_descriptors = atom_descriptors
        self.atom_features = atom_features
        self.bond_features = bond_features
        self.overwrite_default_atom_features = overwrite_default_atom_features
        self.overwrite_default_bond_features = overwrite_default_bond_features
        # Snapshot the global reaction/hydrogen configuration at construction time.
        self.is_reaction = is_reaction()
        self.is_explicit_h = is_explicit_h()
        self.is_adding_hs = is_adding_hs()

        # Only set these attributes when provided, so `hasattr` checks elsewhere
        # can distinguish "not given" from a given value.
        if data_weight is not None:
            self.data_weight = data_weight
        if gt_targets is not None:
            self.gt_targets = gt_targets
        if lt_targets is not None:
            self.lt_targets = lt_targets

        # Generate additional features if given a generator
        if self.features_generator is not None:
            self.features = []

            for fg in self.features_generator:
                features_generator = get_features_generator(fg)
                # NOTE(review): self.mol is not assigned in this method —
                # presumably a property on the class that parses self.smiles;
                # confirm against the enclosing class definition.
                for m in self.mol:
                    if not self.is_reaction:
                        if m is not None and m.GetNumHeavyAtoms() > 0:
                            self.features.extend(features_generator(m))
                        # for H2
                        elif m is not None and m.GetNumHeavyAtoms() == 0:
                            # not all features are equally long, so use methane as dummy molecule to determine length
                            self.features.extend(
                                np.zeros(
                                    len(
                                        features_generator(
                                            Chem.MolFromSmiles('C')))))
                    else:
                        # Reaction mode: m is a (reactant, product) pair; features
                        # are generated from the reactant side only.
                        if m[0] is not None and m[
                                1] is not None and m[0].GetNumHeavyAtoms() > 0:
                            self.features.extend(features_generator(m[0]))
                        elif m[0] is not None and m[1] is not None and m[
                                0].GetNumHeavyAtoms() == 0:
                            # Same methane-based zero-padding trick as above.
                            self.features.extend(
                                np.zeros(
                                    len(
                                        features_generator(
                                            Chem.MolFromSmiles('C')))))

            self.features = np.array(self.features)

        # Fix nans in features
        replace_token = 0
        if self.features is not None:
            self.features = np.where(np.isnan(self.features), replace_token,
                                     self.features)

        # Fix nans in atom_descriptors
        if self.atom_descriptors is not None:
            self.atom_descriptors = np.where(np.isnan(self.atom_descriptors),
                                             replace_token,
                                             self.atom_descriptors)

        # Fix nans in atom_features
        if self.atom_features is not None:
            self.atom_features = np.where(np.isnan(self.atom_features),
                                          replace_token, self.atom_features)

        # Fix nans in bond_descriptors
        if self.bond_features is not None:
            self.bond_features = np.where(np.isnan(self.bond_features),
                                          replace_token, self.bond_features)

        # Save a copy of the raw features and targets to enable different scaling later on
        self.raw_features, self.raw_targets = self.features, self.targets
        self.raw_atom_descriptors, self.raw_atom_features, self.raw_bond_features = \
            self.atom_descriptors, self.atom_features, self.bond_features
Exemple #17
0
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    # SECURITY NOTE: pickle.load executes arbitrary code from the checkpoint
    # file — only load checkpoints from trusted sources.
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(
                morgan_fingerprint(mol=s,
                                   radius=train_args.radius,
                                   num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions: average over all checkpoints
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds)    #TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save (newline='' per the csv module docs, to avoid blank lines on Windows)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
Exemple #18
0
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    """Train and evaluate a scikit-learn model on the dataset described by args.

    :param args: Training arguments (data paths, model/dataset type, split config).
    :param logger: Optional logger; falls back to print when absent.
    :return: A list of test scores, one per task.
    """
    # Route log output either through the logger or plain print.
    if logger is None:
        debug = info = print
    else:
        debug, info = logger.debug, logger.info

    debug(pformat(vars(args)))

    metric = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # SVMs here are strictly single-output models.
    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    fingerprint = get_features_generator('morgan')
    for split in (train_data, test_data):
        for datapoint in tqdm(split, total=len(split)):
            datapoint.set_features(
                fingerprint(mol=datapoint.smiles,
                            radius=args.radius,
                            num_bits=args.num_bits))

    debug('Building model')
    # Pick the constructor from a (dataset type -> model type) dispatch table.
    if args.dataset_type == 'regression':
        builders = {
            'random_forest':
                lambda: RandomForestRegressor(n_estimators=args.num_trees,
                                              n_jobs=-1),
            'svm': SVR,
        }
    elif args.dataset_type == 'classification':
        builders = {
            'random_forest':
                lambda: RandomForestClassifier(n_estimators=args.num_trees,
                                               n_jobs=-1,
                                               class_weight=args.class_weight),
            'svm': SVC,
        }
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    if args.model_type not in builders:
        raise ValueError(f'Model type "{args.model_type}" not supported')
    model = builders[args.model_type]()

    debug(model)

    model.train_args = args.as_dict()

    debug('Training')
    # Both trainers share the same signature; select then call once.
    trainer = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = trainer(model=model,
                     train_data=train_data,
                     test_data=test_data,
                     metric_func=metric,
                     args=args,
                     logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores