Example 1
0
    model.fit(train_dataset)

    # Evaluate on the training set with both metrics and append the scores
    # to the running train_results array for this estimator setting.
    # NOTE(review): second metric is mae_score but it is labeled "mse"
    # below — verify which one is intended.
    train_scores = model.evaluate(
        train_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    train_results = np.concatenate(
        (train_results, list(train_scores.values())))
    valid_scores = model.evaluate(
        valid_dataset,
        [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    test_results = np.concatenate((test_results, list(valid_scores.values())))

    # Write per-sample predictions for this estimator to CSV.
    # to_csv returns None, so predict_train/predict_valid are always None —
    # presumably only the file side effect is wanted; TODO confirm.
    predict_train = pd.DataFrame(
        model.predict(train_dataset),
        columns=['prediction']).to_csv(modeldir + "predict_train_" +
                                       str(estimator) + '.csv')
    predict_valid = pd.DataFrame(
        model.predict(valid_dataset),
        columns=['prediction']).to_csv(modeldir + "predict_validation_" +
                                       str(estimator) + '.csv')

    # Record which estimator/metric label each appended score pair belongs to
    # (two entries per iteration: one per metric).
    estimators = np.concatenate((estimators, [estimator, estimator]))
    metrics = np.concatenate((metrics, ["r2", "mse"]))
# Fit trained model
model.save()
print("Dataframe for n_estimator selection")
df = pd.DataFrame({
Example 2
0
# Train a kernel ridge regression model (wrapped for deepchem), evaluate it
# on train/validation with r2 (median-aggregated) and MAE, dump per-sample
# predictions, and save a score summary CSV.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.median)

# Do setup required for the sklearn-backed model
sklmodel = KernelRidge(kernel="rbf", alpha=1e-5, gamma=0.05)
model = SklearnModel(sklmodel, modeldir)

# Fit and persist the trained model
print("Fitting model")
model.fit(train_dataset)
model.save()
print("Evaluating model")
train_scores = model.evaluate(
    train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])
# BUG FIX: the validation evaluation result was never bound to a name
# (the line read " = model.evaluate(...)"), so the prints and the DataFrame
# below that reference valid_scores could not work.
valid_scores = model.evaluate(
    valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])

# Write per-sample predictions to CSV (to_csv returns None, so these names
# are always None — only the file side effect is used).
predict_train = pd.DataFrame(
    model.predict(train_dataset),
    columns=['prediction']).to_csv(modeldir + "predict_train.csv")
predict_valid = pd.DataFrame(
    model.predict(valid_dataset),
    columns=['prediction']).to_csv(modeldir + "predict_validation.csv")

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)

# NOTE(review): second metric above is mae_score but the label says "mse" —
# verify which is intended.
metrics = np.concatenate(([], ["r2", "mse"]))
print("Dataframe for n_estimator selection")
df = pd.DataFrame({'metrics': metrics,
                   'train_scores': list(train_scores.values()),
                   'result_scores': list(valid_scores.values())})
df.to_csv(modeldir + "KRRAnalysis.csv", index=False)

Example 3
0
class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses a pre-trained RF model + ConvexHullPocketFinder to select pockets."""

  def __init__(self, pad=5):
    """Download and reload the pre-trained random-forest pocket model.

    Parameters
    ----------
    pad: int or float
      Padding forwarded to ConvexHullPocketFinder (units not specified
      here — presumably angstroms; confirm against ConvexHullPocketFinder).
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Fetch the serialized binding pocket model into a scratch directory.
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Reload the pre-trained sklearn model from disk.
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Featurize a protein/ligand complex and return predicted-active pockets.

    Returns a 3-tuple (active_pockets, active_pocket_atoms_map,
    active_pocket_coords), or (None, None, None) if RDKit cannot parse
    the converted ligand.

    TODO(rbharath): This has a lot of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.
    """
    if not ligand_file.endswith(".sdf"):
      raise ValueError("Only .sdf ligand files can be featurized.")
    ligand_basename = os.path.basename(ligand_file).split(".")[0]
    ligand_mol2 = os.path.join(
        self.base_dir, ligand_basename + ".mol2")

    # Convert the ligand to mol2 with OpenBabel so RDKit can read it below.
    obConversion = ob.OBConversion()
    obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
    ob_mol = ob.OBMol()
    obConversion.ReadFile(ob_mol, str(ligand_file))
    obConversion.WriteFile(ob_mol, str(ligand_mol2))

    # Featurize ligand
    mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
    if mol is None:
      # BUG FIX: previously returned a 2-tuple here while the success path
      # returns a 3-tuple; callers unpacking three values would crash.
      return None, None, None
    # Default for CircularFingerprint
    n_ligand_features = 1024
    ligand_features = self.ligand_featurizer.featurize([mol])

    # Featurize pocket
    pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
        protein_file, ligand_file)
    n_pockets = len(pockets)
    n_pocket_features = BindingPocketFeaturizer.n_features

    features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
    pocket_features = self.pocket_featurizer.featurize(
        protein_file, pockets, pocket_atoms_map, pocket_coords)
    # Note broadcast operation: the single ligand fingerprint row is
    # repeated across all pocket rows.
    features[:, :n_pocket_features] = pocket_features
    features[:, n_pocket_features:] = ligand_features
    dataset = NumpyDataset(X=features)
    pocket_preds = self.model.predict(dataset)
    pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

    # Keep pockets whose predicted probability of the positive class
    # exceeds the (deliberately weak) cutoff.
    active_pockets = []
    active_pocket_atoms_map = {}
    active_pocket_coords = []
    for pocket_ind in range(len(pockets)):
      #################################################### DEBUG
      # TODO(rbharath): For now, using a weak cutoff. Fix later.
      #if pocket_preds[pocket_ind] == 1:
      if pocket_pred_proba[pocket_ind][1] > .15:
      #################################################### DEBUG
        pocket = pockets[pocket_ind]
        active_pockets.append(pocket)
        active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
        active_pocket_coords.append(pocket_coords[pocket_ind])
    return active_pockets, active_pocket_atoms_map, active_pocket_coords
Example 4
0
class RFConvexHullPocketFinder(BindingPocketFinder):
    """Uses a pre-trained RF model + ConvexHullPocketFinder to select pockets."""

    def __init__(self, pad=5):
        """Download and reload the pre-trained random-forest pocket model.

        Parameters
        ----------
        pad: int or float
          Padding forwarded to ConvexHullPocketFinder (units not specified
          here — presumably angstroms; confirm against ConvexHullPocketFinder).
        """
        self.pad = pad
        self.convex_finder = ConvexHullPocketFinder(pad)

        # Fetch the serialized binding pocket model into a scratch directory.
        self.base_dir = tempfile.mkdtemp()
        print("About to download trained model.")
        # TODO(rbharath): Shift refined to full once trained.
        call((
            "wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
        ).split())
        call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
        call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
        self.model_dir = os.path.join(self.base_dir,
                                      "pocket_random_refined_RF")

        # Reload the pre-trained sklearn model from disk.
        self.model = SklearnModel(model_dir=self.model_dir)
        self.model.reload()

        # Create featurizers
        self.pocket_featurizer = BindingPocketFeaturizer()
        self.ligand_featurizer = CircularFingerprint(size=1024)

    def find_pockets(self, protein_file, ligand_file):
        """Featurize a protein/ligand complex and return predicted-active pockets.

        Returns a 3-tuple (active_pockets, active_pocket_atoms_map,
        active_pocket_coords), or (None, None, None) if RDKit cannot parse
        the converted ligand.

        TODO(rbharath): This has a lot of code overlap with
        compute_binding_pocket_features in
        examples/binding_pockets/binding_pocket_datasets.py. Find way to
        refactor to avoid code duplication.
        """
        if not ligand_file.endswith(".sdf"):
            raise ValueError("Only .sdf ligand files can be featurized.")
        ligand_basename = os.path.basename(ligand_file).split(".")[0]
        ligand_mol2 = os.path.join(self.base_dir, ligand_basename + ".mol2")

        # Convert the ligand to mol2 with OpenBabel so RDKit can read it below.
        obConversion = ob.OBConversion()
        obConversion.SetInAndOutFormats(str("sdf"), str("mol2"))
        ob_mol = ob.OBMol()
        obConversion.ReadFile(ob_mol, str(ligand_file))
        obConversion.WriteFile(ob_mol, str(ligand_mol2))

        # Featurize ligand
        mol = Chem.MolFromMol2File(str(ligand_mol2), removeHs=False)
        if mol is None:
            # BUG FIX: previously returned a 2-tuple here while the success
            # path returns a 3-tuple; callers unpacking three values crash.
            return None, None, None
        # Default for CircularFingerprint
        n_ligand_features = 1024
        ligand_features = self.ligand_featurizer.featurize([mol])

        # Featurize pocket
        pockets, pocket_atoms_map, pocket_coords = self.convex_finder.find_pockets(
            protein_file, ligand_file)
        n_pockets = len(pockets)
        n_pocket_features = BindingPocketFeaturizer.n_features

        features = np.zeros((n_pockets, n_pocket_features + n_ligand_features))
        pocket_features = self.pocket_featurizer.featurize(
            protein_file, pockets, pocket_atoms_map, pocket_coords)
        # Note broadcast operation: the single ligand fingerprint row is
        # repeated across all pocket rows.
        features[:, :n_pocket_features] = pocket_features
        features[:, n_pocket_features:] = ligand_features
        dataset = NumpyDataset(X=features)
        pocket_preds = self.model.predict(dataset)
        pocket_pred_proba = np.squeeze(self.model.predict_proba(dataset))

        # Keep pockets whose predicted probability of the positive class
        # exceeds the (deliberately weak) cutoff.
        active_pockets = []
        active_pocket_atoms_map = {}
        active_pocket_coords = []
        for pocket_ind in range(len(pockets)):
            #################################################### DEBUG
            # TODO(rbharath): For now, using a weak cutoff. Fix later.
            #if pocket_preds[pocket_ind] == 1:
            if pocket_pred_proba[pocket_ind][1] > .15:
                #################################################### DEBUG
                pocket = pockets[pocket_ind]
                active_pockets.append(pocket)
                active_pocket_atoms_map[pocket] = pocket_atoms_map[pocket]
                active_pocket_coords.append(pocket_coords[pocket_ind])
        return active_pockets, active_pocket_atoms_map, active_pocket_coords