class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

  Candidate pockets come from a ConvexHullPocketFinder. Each candidate is
  featurized together with the ligand and scored by a model that is
  downloaded and reloaded in the constructor; candidates whose class-1
  probability exceeds the cutoff are reported as active.
  """

  # Length of the ligand fingerprint. Shared by the featurizer and the
  # feature matrix so the two sizes cannot drift apart.
  _N_LIGAND_FEATURES = 1024
  # TODO(rbharath): For now, using a weak cutoff. Fix later.
  _ACTIVE_PROBA_CUTOFF = .15

  def __init__(self, pad=5):
    """Download the trained model and set up the featurizers.

    Parameters
    ----------
    pad:
      Forwarded unchanged to ConvexHullPocketFinder.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    # Commands are given as explicit argument lists (not formatted strings
    # that are split on whitespace) so that a temporary directory whose path
    # contains spaces is still passed to `mv` as a single argument.
    call([
        "wget", "-c",
        "http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ])
    call(["tar", "-zxvf", "pocket_random_refined_RF.tar.gz"])
    call(["mv", "pocket_random_refined_RF", self.base_dir])
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Reload the downloaded model from model_dir
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=self._N_LIGAND_FEATURES)

  def find_pockets(self, protein_file, ligand_file):
    """Return the candidate pockets that the model scores as active.

    TODO(rbharath): This has a lot of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.

    Parameters
    ----------
    protein_file:
      Protein file, forwarded to the convex hull finder and to the pocket
      featurizer.
    ligand_file:
      Ligand file; its name must end in ".sdf".

    Returns
    -------
    A tuple (active_pockets, active_pocket_atoms_map, active_pocket_coords):
    the selected pockets, a dict from each selected pocket to its entry in the
    convex hull finder's atom map, and the matching coordinate entries. If the
    converted ligand cannot be read, (None, None, None) is returned.

    Raises
    ------
    ValueError
      If ligand_file does not end in ".sdf".
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

    # Write mol2 file for ligand
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))

    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      # Same arity as the normal return so callers can always unpack three
      # values.
      return None, None, None
    n_ligand_features = self._N_LIGAND_FEATURES
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize pocket
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    if n_pockets == 0:
      # Nothing to score; avoid handing an empty dataset to the model.
      return [], {}, []
    n_pocket_features = BindingPocketFeaturizer.n_features

    features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # Note broadcast operation: the single ligand row is repeated per pocket.
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    # Reshape to one row per pocket rather than np.squeeze: squeeze would also
    # drop the pocket axis when there is exactly one pocket, and the
    # per-pocket lookup below would then index into a scalar.
    pocket_pred_proba = np.asarray(
        self.model.predict_proba(dataset)).reshape((n_pockets, -1))

    # Find pockets which are active
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind, pocket in enumerate(pockets):
      if pocket_pred_proba[pocket_ind, 1] > self._ACTIVE_PROBA_CUTOFF:
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords
Exemple #2
0
    # NOTE(review): this is a fragment of a larger routine; test_dataset,
    # support_generator, metric, model_dir, fold, fold_tasks and all_scores
    # are defined outside the visible code.
    # Compute accuracies
    # task_scores maps each task index to the list of scores collected for
    # it, one entry per (task, support) pair yielded by the generator.
    task_scores = {
        task: []
        for task in range(len(test_dataset.get_task_names()))
    }
    for (task, support) in support_generator:
        # Train model on support
        # A fresh 50-tree, class-balanced random forest is fit for every
        # support set.
        sklearn_model = RandomForestClassifier(class_weight="balanced",
                                               n_estimators=50)
        model = SklearnModel(sklearn_model, model_dir)
        model.fit(support)

        # Test model
        # presumably the task's data with the support examples removed --
        # confirm against get_task_dataset_minus_support
        task_dataset = get_task_dataset_minus_support(test_dataset, support,
                                                      task)
        y_pred = model.predict_proba(task_dataset)
        # The metric is computed from the dataset's y and w arrays and the
        # predicted probabilities.
        score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
        #print("Score on task %s is %s" % (str(task), str(score)))
        task_scores[task].append(score)

    # Join information for all tasks.
    # Each task's collected scores are averaged into a single value.
    mean_task_scores = {}
    for task in range(len(test_dataset.get_task_names())):
        mean_task_scores[task] = np.mean(np.array(task_scores[task]))
    print("Fold %s" % str(fold))
    print(mean_task_scores)

    # Store each mean under the corresponding entry of fold_tasks; zip pairs
    # fold_tasks with the local task indices by position.
    for (fold_task, task) in zip(fold_tasks,
                                 range(len(test_dataset.get_task_names()))):
        all_scores[fold_task] = mean_task_scores[task]
class RFConvexHullPocketFinder(BindingPocketFinder):
    """Uses pre-trained RF model + ConvexHullPocketFinder to select pockets.

    A ConvexHullPocketFinder proposes candidate pockets. Every candidate is
    featurized alongside the ligand and scored by the model that the
    constructor downloads and reloads; a candidate is kept when its class-1
    probability is above the cutoff.
    """

    # Ligand fingerprint length, used both to build the featurizer and to
    # size the feature matrix.
    _N_LIGAND_FEATURES = 1024
    # TODO(rbharath): For now, using a weak cutoff. Fix later.
    _ACTIVE_PROBA_CUTOFF = .15

    def __init__(self, pad=5):
        """Fetch the trained model and create the featurizers.

        Parameters
        ----------
        pad:
            Passed through unchanged to ConvexHullPocketFinder.
        """
        self.pad = pad
        self.convex_finder = ConvexHullPocketFinder(pad)

        # Load binding pocket model
        self.base_dir = tempfile.mkdtemp()
        print("About to download trained model.")
        # TODO(rbharath): Shift refined to full once trained.
        # Each command is an explicit argument list. Formatting the temp
        # directory into a string and splitting on whitespace would break
        # the `mv` destination apart if that path contained spaces.
        call([
            "wget", "-c",
            "http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
        ])
        call(["tar", "-zxvf", "pocket_random_refined_RF.tar.gz"])
        call(["mv", "pocket_random_refined_RF", self.base_dir])
        self.model_dir = os.path.join(self.base_dir,
                                      "pocket_random_refined_RF")

        # Reload the downloaded model from model_dir
        self.model = SklearnModel(model_dir=self.model_dir)
        self.model.reload()

        # Create featurizers
        self.pocket_featurizer = BindingPocketFeaturizer()
        self.ligand_featurizer = CircularFingerprint(
            size=self._N_LIGAND_FEATURES)

    def find_pockets(self, protein_file, ligand_file):
        """Pick out the candidate pockets the model considers active.

        TODO(rbharath): This has a lot of code overlap with
        compute_binding_pocket_features in
        examples/binding_pockets/binding_pocket_datasets.py. Find way to
        refactor to avoid code duplication.

        Parameters
        ----------
        protein_file:
            Protein file, handed to the convex hull finder and to the pocket
            featurizer.
        ligand_file:
            Ligand file; the name has to end in ".sdf".

        Returns
        -------
        A tuple (active_pockets, active_pocket_atoms_map,
        active_pocket_coords): the retained pockets, a dict from each
        retained pocket to its entry in the convex hull finder's atom map,
        and the corresponding coordinate entries. When the converted ligand
        cannot be read, (None, None, None) is returned.

        Raises
        ------
        ValueError
            If ligand_file does not end in ".sdf".
        """
        if not ligand_file.endswith(".sdf"):
            raise ValueError("Only .sdf ligand files can be featurized.")
        ligand_basename = os.path.basename(ligand_file).split(".")[0]
        ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

        # Write mol2 file for ligand
        obConversion = ob.OBConversion()
        obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
        ob_mol = ob.OBMol()
        obConversion.ReadFile(ob_mol, str(ligand_file))
        obConversion.WriteFile(ob_mol, str(ligand_mol2))

        # Featurize ligand
        mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
        if mol is None:
            # Three values, like the normal return, so that tuple unpacking
            # at the call site never fails on this path.
            return None, None, None
        n_ligand_features = self._N_LIGAND_FEATURES
        ligand_features = self.ligand_featurizer.featurize([mol])

        # Featurize pocket
        pockets, pocket_atoms_map, pocket_coords = (
            self.convex_finder.find_pockets(protein_file, ligand_file))
        n_pockets = len(pockets)
        if n_pockets == 0:
            # No candidates: skip scoring instead of passing the model an
            # empty dataset.
            return [], {}, []
        n_pocket_features = BindingPocketFeaturizer.n_features

        features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
        pocket_features = self.pocket_featurizer.featurize(
            protein_file, pockets, pocket_atoms_map, pocket_coords)
        # Note broadcast operation: the one ligand row fills every pocket row.
        features[:, :n_pocket_features] = pocket_features
        features[:, n_pocket_features:] = ligand_features
        dataset = NumpyDataset(X=features)
        # np.squeeze would remove the pocket axis too when only one pocket
        # exists, making the per-pocket indexing below fail; reshaping to one
        # row per pocket keeps that axis in every case.
        pocket_pred_proba = np.asarray(
            self.model.predict_proba(dataset)).reshape((n_pockets, -1))

        # Find pockets which are active
        active_pockets = []
        active_pocket_atoms_map = {}
        active_pocket_coords = []
        for pocket_ind, pocket in enumerate(pockets):
            if pocket_pred_proba[pocket_ind, 1] > self._ACTIVE_PROBA_CUTOFF:
                active_pockets.append(pocket)
                active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
                active_pocket_coords.append(pocket_coords[pocket_ind])
        return active_pockets, active_pocket_atoms_map, active_pocket_coords
Exemple #4
0
  # NOTE(review): this is a fragment of a larger routine; test_dataset, n_pos,
  # n_neg, n_trials, replace, metric, model_dir, fold, fold_tasks and
  # all_scores are defined outside the visible code.
  # The generator is built over every task index of test_dataset and is
  # iterated below as (task, support) pairs.
  support_generator = SupportGenerator(
      test_dataset, range(len(test_dataset.get_task_names())), n_pos, n_neg,
      n_trials, replace)

  # Compute accuracies
  # task_scores maps each task index to the list of scores collected for it.
  task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
  for (task, support) in support_generator:
    # Train model on support
    # A fresh 50-tree, class-balanced random forest is fit for every support
    # set.
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=50)
    model = SklearnModel(sklearn_model, model_dir)
    model.fit(support)

    # Test model
    # presumably the task's data with the support examples removed -- confirm
    # against get_task_dataset_minus_support
    task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
    y_pred = model.predict_proba(task_dataset)
    # The metric is computed from the dataset's y and w arrays and the
    # predicted probabilities.
    score = metric.compute_metric(
        task_dataset.y, y_pred, task_dataset.w)
    #print("Score on task %s is %s" % (str(task), str(score)))
    task_scores[task].append(score)

  # Join information for all tasks.
  # Each task's collected scores are averaged into a single value.
  mean_task_scores = {}
  for task in range(len(test_dataset.get_task_names())):
    mean_task_scores[task] = np.mean(np.array(task_scores[task]))
  print("Fold %s" % str(fold))
  print(mean_task_scores)

  # Store each mean under the corresponding entry of fold_tasks; zip pairs
  # fold_tasks with the local task indices by position.
  for (fold_task, task) in zip(fold_tasks, range(len(test_dataset.get_task_names()))):
    all_scores[fold_task] = mean_task_scores[task]