Exemple #1
0
 def test_default_featurizer(self):
   """A featurizer built without arguments should featurize one complex."""
   default_featurizer = RdkitGridFeaturizer()
   self.assertIsInstance(default_featurizer, RdkitGridFeaturizer)

   pair = (self.ligand_file, self.protein_file)
   features = default_featurizer.featurize([pair])
   self.assertIsInstance(features, np.ndarray)
Exemple #2
0
 def test_force_flatten(self):
   """Check that a flat feature type produces a 2-D feature matrix."""
   grid_featurizer = RdkitGridFeaturizer(
       flatten=False, feature_types=['ecfp_hashed'])
   # Flattening is switched on by hand before featurizing; the note that
   # accompanies this line reads "False should be ignored with ecfp_hashed".
   grid_featurizer.flatten = True

   pair = (self.ligand_file, self.protein_file)
   features = grid_featurizer.featurize([pair])
   self.assertIsInstance(features, np.ndarray)

   expected_width = 2 * 2**grid_featurizer.ecfp_power
   self.assertEqual(features.shape, (1, expected_width))
Exemple #3
0
 def test_rotations(self):
   """Featurize one complex with nb_rotations=3 and check the output shape."""
   settings = dict(
       nb_rotations=3,
       feature_types=['voxel_combined'],
       flatten=False,
       sanitize=True)
   rotating_featurizer = RdkitGridFeaturizer(**settings)

   pair = (self.ligand_file, self.protein_file)
   rotated = rotating_featurizer.featurize([pair])
   self.assertEqual(rotated.shape, (1, 4, 16, 16, 16, 40))
Exemple #4
0
  def test_combined(self):
    """Check output shapes for the voxel, flat and all-combined feature sets."""
    ecfp_power = 5
    splif_power = 5
    box_width = 75.0
    voxel_width = 1.0

    # --- voxel features -----------------------------------------------------
    voxel_featurizer = RdkitGridFeaturizer(
        voxel_width=voxel_width,
        box_width=box_width,
        feature_types=['voxel_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=False,
        sanitize=True)
    voxel_features = voxel_featurizer.featurize(
        [(self.ligand_file, self.protein_file)])
    self.assertIsInstance(voxel_features, np.ndarray)

    edge = int(box_width / voxel_width)
    splif_bins = len(voxel_featurizer.cutoffs['splif_contact_bins'])
    hbond_bins = len(voxel_featurizer.cutoffs['hbond_dist_bins'])
    channels = 2**ecfp_power + splif_bins * 2**splif_power + hbond_bins + 5
    self.assertEqual(voxel_features.shape, (1, edge, edge, edge, channels))

    # --- flat features ------------------------------------------------------
    flat_featurizer = RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=75.0,
        feature_types=['flat_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        sanitize=True)
    flat_features = flat_featurizer.featurize(
        [(self.ligand_file, self.protein_file)])
    self.assertIsInstance(flat_features, np.ndarray)

    splif_bins = len(flat_featurizer.cutoffs['splif_contact_bins'])
    hbond_bins = len(flat_featurizer.cutoffs['hbond_dist_bins'])
    flat_width = 3 * 2**ecfp_power + splif_bins * 2**splif_power + hbond_bins
    self.assertEqual(flat_features.shape, (1, flat_width))

    # --- aromatic features must be dropped when sanitize=False --------------
    unsanitized_featurizer = RdkitGridFeaturizer(
        voxel_width=1.0,
        box_width=75.0,
        feature_types=['all_combined'],
        ecfp_power=ecfp_power,
        splif_power=splif_power,
        flatten=True,
        sanitize=False)
    self.assertTrue('pi_stack' not in unsanitized_featurizer.feature_types)
    self.assertTrue('cation_pi' not in unsanitized_featurizer.feature_types)

    all_features = unsanitized_featurizer.featurize(
        [(self.ligand_file, self.protein_file)])
    self.assertIsInstance(all_features, np.ndarray)
    self.assertEqual(all_features.shape, (1, 56109538))
Exemple #5
0
 def test_example_featurizer(self):
   """Run the configuration used by the examples and expect a NumPy array."""
   example_settings = dict(
       voxel_width=16.0,
       feature_types=['ecfp', 'splif', 'hbond', 'salt_bridge'],
       ecfp_power=9,
       splif_power=9,
       flatten=True)
   example_featurizer = RdkitGridFeaturizer(**example_settings)

   pair = (self.ligand_file, self.protein_file)
   features = example_featurizer.featurize([pair])
   self.assertIsInstance(features, np.ndarray)
Exemple #6
0
 def __init__(self, model, feat="grid"):
     """Store ``model`` and build the featurizer selected by ``feat``.

     Only ``feat="grid"`` is handled; any other value raises ``ValueError``
     after the model has been stored.
     """
     self.model = model
     if feat != "grid":
         raise ValueError("feat not defined.")

     # TODO: also request "pi_stack" and "cation_pi". That is not trivial
     # because those features require sanitized molecules.
     grid_settings = dict(
         voxel_width=16.0,
         feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
         ecfp_power=9,
         splif_power=9,
         flatten=True)
     self.featurizer = RdkitGridFeaturizer(**grid_settings)
Exemple #7
0
class GridPoseScorer(object):
    """Scores protein/ligand poses by featurizing them and querying a model."""

    def __init__(self, model, feat="grid"):
        """Store ``model`` and build the featurizer selected by ``feat``.

        Only ``feat="grid"`` is handled; any other value raises
        ``ValueError`` after the model has been stored.
        """
        self.model = model
        if feat != "grid":
            raise ValueError("feat not defined.")

        # TODO: also request "pi_stack" and "cation_pi". That is not trivial
        # because those features require sanitized molecules.
        self.featurizer = RdkitGridFeaturizer(
            voxel_width=16.0,
            feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
            ecfp_power=9,
            splif_power=9,
            flatten=True)

    def score(self, protein_file, ligand_file):
        """Featurize one protein/ligand pair and return the model prediction."""
        ligands = [ligand_file]
        proteins = [protein_file]
        complex_features = self.featurizer.featurize_complexes(
            ligands, proteins)
        return self.model.predict(
            NumpyDataset(X=complex_features, y=None, w=None, ids=None))
Exemple #8
0
class GridPoseScorer(object):
  """Scores protein/ligand poses by featurizing them and querying a model."""

  def __init__(self, model, feat="grid"):
    """Store ``model`` and build the featurizer selected by ``feat``.

    Only ``feat="grid"`` is handled; any other value raises ``ValueError``
    after the model has been stored.
    """
    self.model = model
    if feat != "grid":
      raise ValueError("feat not defined.")

    # TODO: also request "pi_stack" and "cation_pi". That is not trivial
    # because those features require sanitized molecules.
    grid_settings = dict(
        voxel_width=16.0,
        feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
        ecfp_power=9,
        splif_power=9,
        flatten=True)
    self.featurizer = RdkitGridFeaturizer(**grid_settings)

  def score(self, protein_file, ligand_file):
    """Featurize one protein/ligand pair and return the model prediction."""
    # featurize_complexes yields a pair here; only the first item is used.
    featurized = self.featurizer.featurize_complexes([ligand_file],
                                                     [protein_file])
    complex_features, _ = featurized
    return self.model.predict(
        NumpyDataset(X=complex_features, y=None, w=None, ids=None))
class GridPoseScorer(object):
    """Scores protein/ligand poses by featurizing them and querying a model."""

    def __init__(self, model, feat="grid"):
        """Store ``model`` and build the featurizer selected by ``feat``.

        Only ``feat="grid"`` is handled; any other value raises
        ``ValueError`` after the model has been stored.
        """
        self.model = model
        if feat != "grid":
            raise ValueError("feat not defined.")

        # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
        # causes segfaults; until then both stay out of voxel_feature_types.
        self.featurizer = RdkitGridFeaturizer(
            voxel_width=16.0,
            feature_types="voxel_combined",
            voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
            ecfp_power=9,
            splif_power=9,
            parallel=True,
            flatten=True)

    def score(self, protein_file, ligand_file):
        """Featurize one protein/ligand pair and return the model prediction."""
        ligands = [ligand_file]
        proteins = [protein_file]
        complex_features = self.featurizer.featurize_complexes(
            ligands, proteins)
        return self.model.predict(
            NumpyDataset(X=complex_features, y=None, w=None, ids=None))
 def __init__(self, model, feat="grid"):
     """Store ``model`` and build the featurizer selected by ``feat``.

     Only ``feat="grid"`` is handled; any other value raises ``ValueError``
     after the model has been stored.
     """
     self.model = model
     if feat != "grid":
         raise ValueError("feat not defined.")

     # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
     # causes segfaults; until then both stay out of voxel_feature_types.
     grid_settings = dict(
         voxel_width=16.0,
         feature_types="voxel_combined",
         voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
         ecfp_power=9,
         splif_power=9,
         parallel=True,
         flatten=True)
     self.featurizer = RdkitGridFeaturizer(**grid_settings)
Exemple #11
0
 def test_custom_cutoffs(self):
   """Cutoffs given as keyword arguments should equal featurizer.cutoffs."""
   requested_cutoffs = dict(
       hbond_dist_bins=[(2., 3.), (3., 3.5)],
       hbond_angle_cutoffs=[5, 90],
       splif_contact_bins=[(0, 3.5), (3.5, 6.0)],
       ecfp_cutoff=5.0,
       sybyl_cutoff=3.0,
       salt_bridges_cutoff=4.0,
       pi_stack_dist_cutoff=5.0,
       pi_stack_angle_cutoff=15.0,
       cation_pi_dist_cutoff=5.5,
       cation_pi_angle_cutoff=20.0)
   configured = RdkitGridFeaturizer(**requested_cutoffs)
   self.assertEqual(configured.cutoffs, requested_cutoffs)
Exemple #12
0
 def __init__(self, model, feat="grid"):
   """Keep ``model`` and create the featurizer for the requested ``feat``.

   Raises ``ValueError`` (after storing the model) when ``feat`` is anything
   other than "grid".
   """
   self.model = model
   if feat != "grid":
     raise ValueError("feat not defined.")

   # TODO: also request "pi_stack" and "cation_pi". That is not trivial
   # because those features require sanitized molecules.
   self.featurizer = RdkitGridFeaturizer(
       voxel_width=16.0,
       feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
       ecfp_power=9,
       splif_power=9,
       flatten=True)
Exemple #13
0
  def test_rotations(self):
    """Check output shapes with nb_rotations=3 for voxel and flat features."""
    # Settings common to both featurizers under test.
    shared = dict(nb_rotations=3, box_width=75., voxel_width=1., sanitize=True)

    voxel_featurizer = RdkitGridFeaturizer(
        feature_types=['voxel_combined'], flatten=False, **shared)
    voxel_output = voxel_featurizer.featurize(
        [(self.ligand_file, self.protein_file)])
    self.assertEqual(voxel_output.shape, (1, 300, 75, 75, 40))

    flat_featurizer = RdkitGridFeaturizer(
        feature_types=['flat_combined'], flatten=True, **shared)
    flat_output = flat_featurizer.featurize(
        [(self.ligand_file, self.protein_file)])
    self.assertEqual(flat_output.shape, (1, 204))
Exemple #14
0
  def test_failures(self):
    """Featurize a valid complex next to a bogus ('nan', 'nan') entry.

    Each case checks that the output still has one row per input complex.
    """
    # (nb_rotations, feature_types, flatten, expected output shape)
    cases = [
        (0, ['voxel_combined'], True, (2, 16875000)),  # flattened voxels
        (0, ['voxel_combined'], False, (2, 75, 75, 75, 40)),  # voxel grid
        (0, ['flat_combined'], True, (2, 51)),  # flat features
        (5, ['flat_combined'], True, (2, 306)),  # flat features + rotations
    ]
    for nb_rotations, feature_types, flatten, expected_shape in cases:
      featurizer = RdkitGridFeaturizer(
          nb_rotations=nb_rotations,
          box_width=75.,
          voxel_width=1.,
          feature_types=feature_types,
          flatten=flatten,
          sanitize=True)
      features = featurizer.featurize([(self.ligand_file, self.protein_file),
                                       ('nan', 'nan')])
      self.assertEqual(features.shape, expected_shape)
Exemple #15
0
def load_pdbbind_from_dir(data_folder,
                          index_files,
                          featurizer="grid",
                          split="random",
                          ex_ids=None,
                          save_dir=None):
    """Load and featurize a raw PDBBind dataset from a local directory.

    Parameters
    ----------
    data_folder: String
      Directory holding one sub-folder per PDB ID (containing
      ``<pdb>_protein.pdb`` and ``<pdb>_ligand.sdf``) plus the index files.
    index_files: List
      Two paths relative to ``data_folder``: first the index file listing the
      PDB IDs to load, then the labels file.
    featurizer: Str
      Either "grid" or "atomic" for grid and atomic featurizations.
    split: Str
      Either "random" or "index". Pass None to get the unsplit dataset back
      as ``(dataset, None, None)``.
    ex_ids: List, optional
      PDB IDs to avoid loading if present. Defaults to no exclusions.
    save_dir: String, optional
      Path to store the split datasets. Nothing is saved when it is falsy or
      when ``split`` is None.

    Returns
    -------
    tuple
      ``(pdbbind_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
      If ``featurizer`` is neither "grid" nor "atomic".
    """
    pdbbind_tasks = ["-logKd/Ki"]
    excluded = () if ex_ids is None else ex_ids

    index_file = os.path.join(data_folder, index_files[0])
    labels_file = os.path.join(data_folder, index_files[1])

    # Collect the PDB IDs to load. Exclusions are applied here, once, so that
    # the file lists and the label array below stay aligned entry by entry.
    pdbs = []
    with open(index_file, "r") as index_handle:
        for line in index_handle:
            pdb = line.split(" ")[0]
            if len(pdb) == 4 and pdb not in excluded:
                pdbs.append(pdb)

    protein_files = [
        os.path.join(data_folder, pdb, "%s_protein.pdb" % pdb) for pdb in pdbs
    ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Extract labels.
    # Lines have format
    # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
    labels_by_pdb = {}
    with open(labels_file, "r") as labels_handle:
        for line in labels_handle:
            fields = line.split()
            # Skip blank lines and comment lines.
            if not fields or line.startswith("#"):
                continue
            # The base-10 logarithm, -log kd/pk, stored as a number.
            labels_by_pdb[fields[0]] = float(fields[3])

    labels = np.array([labels_by_pdb[pdb] for pdb in pdbs])
    print(labels)

    # Featurize Data
    if featurizer == "grid":
        complex_featurizer = RdkitGridFeaturizer(voxel_width=2.0,
                                                 feature_types=[
                                                     'ecfp', 'splif', 'hbond',
                                                     'salt_bridge', 'pi_stack',
                                                     'cation_pi', 'charge'
                                                 ],
                                                 flatten=True)
    elif featurizer == "atomic":
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        frag2_num_atoms = 24000  # for protein atoms
        complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        complex_featurizer = ComplexNeighborListFragmentAtomicCoordinates(
            frag1_num_atoms, frag2_num_atoms, complex_num_atoms,
            max_num_neighbors, neighbor_cutoff)
    else:
        raise ValueError("Featurizer not supported")

    print("Featurizing Complexes")
    features, failures = complex_featurizer.featurize(ligand_files,
                                                      protein_files)
    # Delete labels for failing elements
    labels = np.delete(labels, failures)
    dataset = deepchem.data.DiskDataset.from_numpy(features, labels)
    # No transformations of data
    transformers = []
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset)
    all_dataset = (train, valid, test)
    if save_dir:
        deepchem.utils.data_utils.save_dataset_to_disk(save_dir, train, valid,
                                                       test, transformers)
    return pdbbind_tasks, all_dataset, transformers
Exemple #16
0
def load_pdbbind(reload=True,
                 data_dir=None,
                 subset="core",
                 load_binding_pocket=False,
                 featurizer="grid",
                 split="random",
                 split_seed=None,
                 save_dir=None,
                 save_timestamp=False):
    """Load raw PDBBind dataset by featurization and split.

    Parameters
    ----------
    reload: Bool, optional
      Reload saved featurized and splitted dataset or not.
    data_dir: Str, optional
      Specifies the directory storing the raw dataset. Defaults to
      ``DEFAULT_DATA_DIR``.
    subset: Str
      Specifies which subset of PDBBind, only "core" or "refined" for now.
    load_binding_pocket: Bool, optional
      Load binding pocket or full protein.
    featurizer: Str
      One of "grid", "atomic" or "atomic_conv".
    split: Str
      Either "random" or "index". Pass None to get the unsplit dataset back
      as ``(dataset, None, None)``; nothing is saved in that case.
    split_seed: Int, optional
      Specifies the random seed for splitter.
    save_dir: Str, optional
      Specifies the directory to store the featurized and splitted dataset when
      reload is False. If reload is True, it will load saved dataset inside
      save_dir. Defaults to ``<DEFAULT_DATA_DIR>/from-pdbbind``.
    save_timestamp: Bool, optional
      Save featurized and splitted dataset with timestamp or not. Set it as True
      when running similar or same jobs simultaneously on multiple compute nodes.

    Returns
    -------
    tuple
      ``(pdbbind_tasks, (train, valid, test), transformers)``.

    Raises
    ------
    ValueError
      If ``subset`` or ``featurizer`` is not one of the supported values.
    """

    pdbbind_tasks = ["-logKd/Ki"]

    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    data_folder = os.path.join(data_dir, "pdbbind", "v2015")

    if save_dir is None:
        save_dir = os.path.join(DEFAULT_DATA_DIR, "from-pdbbind")
    # The folder name records what was featurized and how, while
    # ``featurizer`` is still the plain string passed by the caller.
    if load_binding_pocket:
        save_folder = os.path.join(
            save_dir, "protein_pocket-%s-%s-%s" % (subset, featurizer, split))
    else:
        save_folder = os.path.join(
            save_dir, "full_protein-%s-%s-%s" % (subset, featurizer, split))

    if save_timestamp:
        # Suffix: local date plus the fractional part of the current epoch
        # time, so simultaneous jobs get distinct folders.
        save_folder = "%s-%s-%s" % (
            save_folder, time.strftime("%Y%m%d", time.localtime()),
            re.search(r"\.(.*)", str(time.time())).group(1))

    if reload:
        if os.path.exists(save_folder):
            print("\nLoading featurized and splitted dataset from:\n%s\n" %
                  save_folder)
            # Only attempt the load when there is something on disk.
            loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
                save_folder)
            if loaded:
                return pdbbind_tasks, all_dataset, transformers
        else:
            print("Dataset does not exist at {}. Reconstructing...".format(
                save_folder))

    dataset_file = os.path.join(data_dir, "pdbbind_v2015.tar.gz")
    if not os.path.exists(dataset_file):
        logger.warning(
            "About to download PDBBind full dataset. Large file, 2GB")
        deepchem.utils.data_utils.download_url(
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/pdbbind_v2015.tar.gz",
            dest_dir=data_dir)
    if os.path.exists(data_folder):
        logger.info("PDBBind full dataset already exists.")
    else:
        print("Untarring full dataset...")
        deepchem.utils.data_utils.untargz_file(dataset_file,
                                               dest_dir=os.path.join(
                                                   data_dir, "pdbbind"))

    print("\nRaw dataset:\n%s" % data_folder)
    print("\nFeaturized and splitted dataset:\n%s" % save_folder)

    if subset == "core":
        index_labels_file = os.path.join(data_folder, "INDEX_core_data.2013")
    elif subset == "refined":
        index_labels_file = os.path.join(data_folder,
                                         "INDEX_refined_data.2015")
    else:
        raise ValueError("Other subsets not supported")

    # Extract PDB IDs and labels in a single pass over the index file.
    # Lines have format
    # PDB code, resolution, release year, -logKd/Ki, Kd/Ki, reference, ligand name
    pdbs = []
    label_values = []
    with open(index_labels_file, "r") as index_handle:
        for line in index_handle:
            # Skip comment lines and blank lines.
            if line.startswith("#") or not line.strip():
                continue
            pdbs.append(line[:4])
            # The base-10 logarithm, -log kd/pk
            label_values.append(float(line.split()[3]))
    labels = np.array(label_values)

    # Extract locations of data
    protein_suffix = "pocket" if load_binding_pocket else "protein"
    protein_files = [
        os.path.join(data_folder, pdb, "%s_%s.pdb" % (pdb, protein_suffix))
        for pdb in pdbs
    ]
    ligand_files = [
        os.path.join(data_folder, pdb, "%s_ligand.sdf" % pdb) for pdb in pdbs
    ]

    # Featurize Data. The featurizer object gets its own name so the string
    # argument is never compared against an already-built featurizer.
    if featurizer == "grid":
        complex_featurizer = RdkitGridFeaturizer(voxel_width=2.0,
                                                 feature_types=[
                                                     'ecfp', 'splif', 'hbond',
                                                     'salt_bridge', 'pi_stack',
                                                     'cation_pi', 'charge'
                                                 ],
                                                 flatten=True)
    elif featurizer in ("atomic", "atomic_conv"):
        # Pulled from PDB files. For larger datasets with more PDBs, would use
        # max num atoms instead of exact.
        frag1_num_atoms = 70  # for ligand atoms
        if load_binding_pocket:
            frag2_num_atoms = 1000
            complex_num_atoms = 1070
        else:
            frag2_num_atoms = 24000  # for protein atoms
            complex_num_atoms = 24070  # in total
        max_num_neighbors = 4
        # Cutoff in angstroms
        neighbor_cutoff = 4
        if featurizer == "atomic":
            complex_featurizer = ComplexNeighborListFragmentAtomicCoordinates(
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                max_num_neighbors=max_num_neighbors,
                neighbor_cutoff=neighbor_cutoff)
        else:
            complex_featurizer = AtomicConvFeaturizer(
                labels=labels,
                frag1_num_atoms=frag1_num_atoms,
                frag2_num_atoms=frag2_num_atoms,
                complex_num_atoms=complex_num_atoms,
                neighbor_cutoff=neighbor_cutoff,
                max_num_neighbors=max_num_neighbors,
                batch_size=64)
    else:
        raise ValueError("Featurizer not supported")

    print("\nFeaturizing Complexes for \"%s\" ...\n" % data_folder)
    feat_t1 = time.time()
    features, failures = complex_featurizer.featurize(ligand_files,
                                                      protein_files)
    feat_t2 = time.time()
    print("\nFeaturization finished, took %0.3f s." % (feat_t2 - feat_t1))

    # Delete labels and ids for failing elements
    labels = np.delete(labels, failures)
    labels = labels.reshape((len(labels), 1))
    ids = np.delete(pdbs, failures)

    print("\nConstruct dataset excluding failing featurization elements...")
    dataset = deepchem.data.DiskDataset.from_numpy(features, y=labels, ids=ids)

    # No transformations of data
    transformers = []

    # Split dataset
    print("\nSplit dataset...\n")
    if split is None:
        return pdbbind_tasks, (dataset, None, None), transformers

    # TODO(rbharath): This should be modified to contain a cluster split so
    # structures of the same protein aren't in both train/test
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         seed=split_seed)

    all_dataset = (train, valid, test)
    print("\nSaving dataset to \"%s\" ..." % save_folder)
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
    return pdbbind_tasks, all_dataset, transformers