def test_sklearn_reload(self):
    """Test that trained model can be reloaded correctly."""
    task_names = ["task0"]
    task_types = {t: "classification" for t in task_names}
    num_samples, num_features = 10, 3
    num_tasks = len(task_names)

    # Build a small random classification dataset.
    np.random.seed(123)
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.randint(2, size=(num_samples, num_tasks))
    w = np.ones((num_samples, num_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, task_names)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(task_names, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=RandomForestClassifier())

    # Fit and persist the model.
    model.fit(dataset)
    model.save()

    # Reload from disk into a fresh wrapper.
    reloaded_model = SklearnModel(task_names, task_types, model_params,
                                  self.model_dir, mode="classification")
    reloaded_model.reload()

    # The reloaded model should still score well on its own training data.
    transformers = []
    evaluator = Evaluator(reloaded_model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9
def test_sklearn_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    frac_train = .7
    n_samples = len(X)
    # BUG FIX: slice indices must be integers. The original sliced with the
    # float expression frac_train*n_samples, which raises a TypeError.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    print("X_train.shape, y_train.shape, X_test.shape, y_test.shape")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}
    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=LogisticRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)
    assert scores[classification_metric.name] > .5
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API.

    Fixes: removed the unused local ``splittype`` (a ScaffoldSplitter is
    constructed directly below) and renamed the transform loop variable,
    which previously shadowed ``dataset``.
    """
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    model_params = {}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    # Apply all transformers to both splits (stats come from train only).
    for dset in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(dset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
    """Test of singletask RF ECFP regression API: sharded edition.

    Fixes: removed the unused local ``splittype`` and renamed the transform
    loop variable, which previously shadowed ``dataset``.
    """
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["label"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dset in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(dset)

    # NOTE(review): a previous comment claimed a shard size was set "above"
    # to force multiple shards (pdbbind_core has ~200 examples), but no
    # shard size is configured in this test — confirm whether featurize's
    # default sharding still exercises the multi-shard path.
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a 500-tree random-forest regressor wrapped in a SklearnModel."""
    forest = RandomForestRegressor(n_estimators=500)
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=forest, verbosity=verbosity)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a balanced 500-tree random-forest classifier SklearnModel."""
    forest = RandomForestClassifier(class_weight="balanced",
                                    n_estimators=500)
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=forest, verbosity=verbosity)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a logistic-regression SklearnModel.

    BUG FIX: the fourth parameter was named ``model_builder``, shadowing the
    function itself; it receives the model directory (callers pass it
    positionally) and is renamed accordingly. ``verbosity`` is now forwarded
    instead of being silently dropped, matching the sibling builders.
    """
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=LogisticRegression(),
                        verbosity=verbosity)
def rf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds random forests given hyperparameters.

    Last two arguments only for tensorflow models and ignored.
    """
    forest = RandomForestRegressor(
        n_estimators=params_dict["n_estimators"],
        max_features=params_dict["max_features"])
    return SklearnModel(tasks, task_types, params_dict, model_dir,
                        mode="regression", model_instance=forest)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a random-forest classification SklearnModel."""
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="classification",
                        model_instance=RandomForestClassifier(),
                        verbosity=verbosity)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a linear-regression SklearnModel."""
    regressor = LinearRegression()
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="regression", model_instance=regressor,
                        verbosity=verbosity)
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a logistic-regression classification SklearnModel."""
    classifier = LogisticRegression()
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        mode="classification", model_instance=classifier,
                        verbosity=verbosity)
def test_sklearn_classification_overfit(self):
    """Test that sklearn models can overfit simple classification datasets."""
    num_samples, num_features, num_tasks = 10, 3, 1

    # Build a tiny random dataset the forest can memorize.
    np.random.seed(123)
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.randint(2, size=(num_samples, num_tasks))
    w = np.ones((num_samples, num_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(RandomForestClassifier(), self.model_dir)

    # Train and persist.
    model.fit(dataset)
    model.save()

    # A random forest should nearly memorize 10 samples.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9
def __init__(self, pad=5):
    """Download a pre-trained RF pocket model and set up featurizers.

    Parameters
    ----------
    pad: int
      Padding (presumably in angstroms — TODO confirm) forwarded to
      ConvexHullPocketFinder.

    Side effects: downloads and extracts a tarball via shell commands and
    creates a temporary directory that is never cleaned up.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    # NOTE(review): wget/tar/mv run relative to the current working
    # directory, so concurrent instances would clobber each other's files.
    call((
        "wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")
    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()
    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)
def test_sklearn_skewed_classification_overfit(self):
    """Test sklearn models can overfit 0/1 datasets with few actives."""
    task_names = ["task0"]
    task_types = {t: "classification" for t in task_names}
    num_samples, num_features = 100, 3
    num_tasks = len(task_names)

    # Build a skewed dataset: only ~5% positives.
    np.random.seed(123)
    active_frac = .05
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.binomial(1, active_frac, size=(num_samples, num_tasks))
    w = np.ones((num_samples, num_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, task_names)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(task_names, task_types, model_params, self.model_dir,
                         mode="classification",
                         model_instance=RandomForestClassifier())

    # Train and persist.
    model.fit(dataset)
    model.save()

    # Even with class skew the forest should memorize the training set.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9
def test_sklearn_regression(self):
    """Test that sklearn models can learn on simple regression datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    frac_train = .7
    n_samples = len(X)
    # BUG FIX: slice indices must be integers. The original sliced with the
    # float expression frac_train*n_samples, which raises a TypeError.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "regression" for task in tasks}
    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=LinearRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    print("scores")
    print(scores)
    assert scores[regression_metric.name] > .5
def test_sklearn_regression_overfit(self):
    """Test that sklearn models can overfit simple regression datasets."""
    task_names = ["task0"]
    task_types = {t: "regression" for t in task_names}
    num_samples, num_features = 10, 3
    num_tasks = len(task_names)

    # Build a tiny random regression dataset the forest can memorize.
    np.random.seed(123)
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.rand(num_samples, num_tasks)
    w = np.ones((num_samples, num_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, task_names)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}
    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(task_names, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Train and persist.
    model.fit(dataset)
    model.save()

    # Training-set R^2 should be high for a memorized forest.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    assert scores[regression_metric.name] > .7
def model_builder(tasks, task_types, params_dict, model_dir, verbosity=verbosity):
    """Build a SklearnModel around ``model_class`` with given hyperparameters.

    NOTE(review): the default ``verbosity=verbosity`` is evaluated once at
    definition time and captures ``verbosity`` from the enclosing scope, as
    does ``model_class`` in the body — confirm both are defined where this
    builder is declared. The ``verbosity`` parameter itself is never
    forwarded to SklearnModel.
    """
    # Hyperparameters supplied by the search driver.
    n_estimators = params_dict["n_estimators"]
    max_features = params_dict["max_features"]
    return SklearnModel(tasks, task_types, params_dict, model_dir,
                        model_instance=model_class(
                            n_estimators=n_estimators,
                            max_features=max_features))
def test_singletask_sklearn_rf_ECFP_regression_API(self):
    """Test of singletask RF ECFP regression API.

    Fixes: removed the unused local ``splittype``; the output transformers
    were constructed but never applied, so the model trained on raw labels
    while the Evaluator untransformed predictions — they are now applied to
    both splits, matching the sibling tests in this file.
    """
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)
    ]
    transformers = input_transformers + output_transformers
    # BUG FIX: apply the transformers to both splits (previously missing).
    for dset in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(dset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [
        Metric(metrics.r2_score),
        Metric(metrics.mean_squared_error),
        Metric(metrics.mean_absolute_error)
    ]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def test_sklearn_reload(self):
    """Test that trained model can be reloaded correctly."""
    task_names = ["task0"]
    task_types = {t: "classification" for t in task_names}
    num_samples, num_features = 10, 3
    num_tasks = len(task_names)

    # Build a small random classification dataset.
    np.random.seed(123)
    ids = np.arange(num_samples)
    X = np.random.rand(num_samples, num_features)
    y = np.random.randint(2, size=(num_samples, num_tasks))
    w = np.ones((num_samples, num_tasks))
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, task_names)

    model_params = {"batch_size": None,
                    "data_shape": dataset.get_data_shape()}
    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(
        task_names,
        task_types,
        model_params,
        self.model_dir,
        mode="classification",
        model_instance=RandomForestClassifier(),
    )

    # Train and persist.
    model.fit(dataset)
    model.save()

    # Reload from disk into a fresh wrapper.
    reloaded_model = SklearnModel(task_names, task_types, model_params,
                                  self.model_dir, mode="classification")
    reloaded_model.reload()

    # The reloaded model should still score well on its own training data.
    transformers = []
    evaluator = Evaluator(reloaded_model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > 0.9
def test_sklearn_transformed_regression(self):
    """Test that sklearn models can learn on simple transformed regression datasets."""
    np.random.seed(123)
    diabetes = sklearn.datasets.load_diabetes()
    X, y = diabetes.data, diabetes.target
    train_fraction = .7
    total = len(X)
    split_at = int(train_fraction * total)
    X_train, y_train = X[:split_at], y[:split_at]
    X_test, y_test = X[split_at:], y[split_at:]
    train_dataset = DiskDataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

    # Normalize/clip X and normalize y, with stats fit on the train split.
    transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset),
        NormalizationTransformer(transform_y=True, dataset=train_dataset)
    ]
    for split in (train_dataset, test_dataset):
        for transformer in transformers:
            transformer.transform(split)

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(LinearRegression(), self.model_dir)

    # Train and persist.
    model.fit(train_dataset)
    model.save()

    # Both splits should be learnable for a linear model on diabetes.
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    assert train_scores[regression_metric.name] > .5

    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    assert scores[regression_metric.name] > .5
def __init__(self, pad=5):
    """Download a pre-trained RF pocket model and set up featurizers.

    Parameters
    ----------
    pad: int
      Padding (presumably in angstroms — TODO confirm) forwarded to
      ConvexHullPocketFinder.

    Side effects: downloads and extracts a tarball via shell commands and
    creates a temporary directory that is never cleaned up.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    # NOTE(review): wget/tar/mv run relative to the current working
    # directory, so concurrent instances would clobber each other's files.
    call(("wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz").split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")
    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()
    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API.

    Fixes: removed the unused local ``splittype`` and renamed the transform
    loop variable, which previously shadowed ``dataset``.
    """
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    # Apply all transformers to both splits (stats come from train only).
    for dset in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(dset)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
def test_sklearn_classification(self):
    """Test that sklearn models can learn on simple classification datasets."""
    np.random.seed(123)
    digits = sklearn.datasets.load_digits(n_class=2)
    X, y = digits.data, digits.target
    train_fraction = .7
    total = len(X)
    split_at = int(train_fraction * total)
    X_train, y_train = X[:split_at], y[:split_at]
    X_test, y_test = X[split_at:], y[split_at:]
    train_dataset = DiskDataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = SklearnModel(LogisticRegression(), self.model_dir)

    # Train and persist.
    model.fit(train_dataset)
    model.save()

    # Score on train (not asserted) and on the held-out split.
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])

    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .5
# Script fragment: train a 500-tree RF on pdbbind and score the train split.
# Relies on names defined elsewhere (pdbbind_tasks, verbosity, train_dataset,
# model_dir, transformers) — presumably set earlier in the script.
pdbbind_task_types = {task: "regression" for task in pdbbind_tasks}

# NOTE(review): misleadingly named — this is an R^2 *regression* metric,
# not a classification metric; consider renaming once all users are visible.
classification_metric = Metric(metrics.r2_score, verbosity=verbosity,
                               mode="regression")
params_dict = {
    "batch_size": None,
    "data_shape": train_dataset.get_data_shape(),
}

# Start from an empty model directory.
if os.path.exists(model_dir):
    shutil.rmtree(model_dir)
os.makedirs(model_dir)
model = SklearnModel(pdbbind_tasks, pdbbind_task_types, params_dict,
                     model_dir,
                     model_instance=RandomForestRegressor(n_estimators=500),
                     verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
def model_builder(tasks, task_types, model_params, model_dir, verbosity=None):
    """Build a class-balanced logistic-regression SklearnModel."""
    classifier = LogisticRegression(class_weight="balanced")
    return SklearnModel(tasks, task_types, model_params, model_dir,
                        model_instance=classifier, verbosity=verbosity)
class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""

  def __init__(self, pad=5):
    """Download the pre-trained RF pocket model and set up featurizers.

    Side effects: downloads/extracts a tarball via shell commands and
    creates a temporary directory that is never cleaned up.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    logger.info("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call((
        "wget -nv -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Compute features for a given complex

    TODO(rbharath): This has a log of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.

    Unimplemented. CLEANUP: a large block of commented-out reference code
    (openbabel sdf->mol2 conversion, ligand/pocket featurization, and
    RF-probability filtering of pockets at a .15 cutoff) was removed; see
    examples/binding_pockets/binding_pocket_datasets.py for the intended
    implementation.
    """
    # TODO(LESWING)
    raise ValueError("Karl Implement")
# Script fragment: build and evaluate a kernel-ridge SklearnModel.
# Relies on names defined elsewhere (dc, dataset_file, dataset, modeldir, pd).
#Split dataset according to index (why?)
splitter = dc.splits.IndexSplitter(dataset_file)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)

"""
MODEL BUILDING
"""
# Fit
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.median)

# Do setup required for tf/keras models
sklmodel = KernelRidge(kernel="rbf", alpha=1e-5, gamma=0.05)
model = SklearnModel(sklmodel, modeldir)

# Fit trained model
print("Fitting model")
model.fit(train_dataset)
model.save()

print("Evaluating model")
train_scores = model.evaluate(
    train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])
# BUG FIX: this statement previously began with a bare "=" (a syntax
# error); the validation-set scores are now bound to a name.
valid_scores = model.evaluate(
    valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])

#Error kernel
predict_train = pd.DataFrame(
    model.predict(train_dataset),
    columns=['prediction']).to_csv(modeldir + "predict_train.csv")
predict_valid = pd.DataFrame(
    model.predict(valid_dataset),
    columns=['prediction']).to_csv(modeldir + "predict_validation.csv")

print("Train scores")
print(train_scores)
def model_builder(model_dir):
    """Build a default random-forest regression SklearnModel."""
    return SklearnModel(RandomForestRegressor(), model_dir)
def test_sklearn_transformed_regression(self):
    """Test that sklearn models can learn on simple transformed regression datasets."""
    np.random.seed(123)
    dataset = sklearn.datasets.load_diabetes()
    X, y = dataset.data, dataset.target
    frac_train = .7
    n_samples = len(X)
    # BUG FIX: slice indices must be integers. The original sliced with the
    # float expression frac_train*n_samples, which raises a TypeError.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    # Eval model on train
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for transformer in transformers:
        transformer.transform(train_dataset)
    for transformer in transformers:
        transformer.transform(test_dataset)

    tasks = train_dataset.get_task_names()
    task_types = {task: "regression" for task in tasks}
    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=LinearRegression())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [regression_metric])
    print("train_scores")
    print(train_scores)
    assert train_scores[regression_metric.name] > .5

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    print("scores")
    print(scores)
    assert scores[regression_metric.name] > .5
def model_builder(model_dir):
    """Build a 500-tree random-forest regression SklearnModel."""
    forest = RandomForestRegressor(n_estimators=500)
    return SklearnModel(forest, model_dir)
def model_builder(model_dir):
    """Build a balanced 500-tree random-forest classification SklearnModel."""
    forest = RandomForestClassifier(class_weight="balanced",
                                    n_estimators=500)
    return SklearnModel(forest, model_dir)
def generate_rf_model():
    """Build a 500-tree RF regression SklearnModel rooted in the cwd."""
    forest = RandomForestRegressor(n_estimators=500)
    return SklearnModel(forest, ".")
def model_builder(model_dir):
    """Build a logistic-regression SklearnModel."""
    return SklearnModel(LogisticRegression(), model_dir)
class RFConvexHullPocketFinder(BindingPocketFinder):
  """Uses pre-trained RF model + ConvexHulPocketFinder to select pockets."""

  def __init__(self, pad=5):
    """Download the pre-trained RF pocket model and set up featurizers.

    Side effects: downloads/extracts a tarball via shell commands and
    creates a temporary directory that is never cleaned up.
    """
    self.pad = pad
    self.convex_finder = ConvexHullPocketFinder(pad)

    # Load binding pocket model
    self.base_dir = tempfile.mkdtemp()
    print("About to download trained model.")
    # TODO(rbharath): Shift refined to full once trained.
    call((
        "wget -c http://deepchem.io.s3-website-us-west-1.amazonaws.com/trained_models/pocket_random_refined_RF.tar.gz"
    ).split())
    call(("tar -zxvf pocket_random_refined_RF.tar.gz").split())
    call(("mv pocket_random_refined_RF %s" % (self.base_dir)).split())
    self.model_dir = os.path.join(self.base_dir, "pocket_random_refined_RF")

    # Fit model on dataset
    self.model = SklearnModel(model_dir=self.model_dir)
    self.model.reload()

    # Create featurizers
    self.pocket_featurizer = BindingPocketFeaturizer()
    self.ligand_featurizer = CircularFingerprint(size=1024)

  def find_pockets(self, protein_file, ligand_file):
    """Compute features for a given complex

    TODO(rbharath): This has a log of code overlap with
    compute_binding_pocket_features in
    examples/binding_pockets/binding_pocket_datasets.py. Find way to refactor
    to avoid code duplication.

    Unimplemented. CLEANUP: a large block of commented-out reference code
    (openbabel sdf->mol2 conversion, ligand/pocket featurization, and
    RF-probability filtering of pockets at a .15 cutoff) was removed; see
    examples/binding_pockets/binding_pocket_datasets.py for the intended
    implementation.
    """
    # TODO(LESWING)
    raise ValueError("Karl Implement")
def model_builder(model_dir):
  """Factory for a DeepChem-wrapped random forest classifier.

  Parameters
  ----------
  model_dir: str
    Directory in which the wrapped model persists its state.

  Returns
  -------
  SklearnModel wrapping a freshly constructed RandomForestClassifier
  with default hyperparameters.
  """
  forest = RandomForestClassifier()
  return SklearnModel(forest, model_dir)
# Persist the raw target values so later predictions can be compared
# against the originals on disk.
pd.DataFrame(train_dataset.y, columns=['prediction']).to_csv(modeldir + "train_original.csv")
pd.DataFrame(valid_dataset.y, columns=['prediction']).to_csv(modeldir + "valid_original.csv")
# Sweep over forest sizes, training one random-forest regressor per setting.
for estimator in n_estimators:
    print('n_estimators = {0}'.format(estimator))
    # Create model — half the available CPUs are given to the forest
    # (presumably to leave headroom for other work; TODO confirm).
    sklmodel = RandomForestRegressor(n_estimators=estimator, criterion="mse", max_features=max_features, bootstrap=True, oob_score=False, n_jobs=int(cpus / 2))
    model = SklearnModel(sklmodel, modeldir)
    model.fit(train_dataset)
    # Append train scores (primary metric plus MAE) to the running results.
    train_scores = model.evaluate(
        train_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    train_results = np.concatenate(
        (train_results, list(train_scores.values())))
    valid_scores = model.evaluate(
        valid_dataset, [metric, dc.metrics.Metric(dc.metrics.mae_score)])
    test_results = np.concatenate((test_results, list(valid_scores.values())))
    # Append validation scores and build the prediction frame.
    # NOTE(review): this statement is truncated in the visible source.
    predict_train = pd.DataFrame(
# Get supports on test-set: draw (task, support) pairs and score a
# freshly trained random forest on each support's complementary data.
num_tasks = len(test_dataset.get_task_names())
support_generator = SupportGenerator(
    test_dataset, range(num_tasks), n_pos, n_neg, n_trials, replace)
# Per-task accumulators for accuracies across support draws.
task_scores = {task_idx: [] for task_idx in range(num_tasks)}
for task, support in support_generator:
  # Train a class-balanced forest on the support examples only.
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=50)
  model = SklearnModel(sklearn_model, model_dir)
  model.fit(support)
  # Test model on the task's remaining (non-support) examples.
  task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
  y_pred = model.predict_proba(task_dataset)
  score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
  #print("Score on task %s is %s" % (str(task), str(score)))
  task_scores[task].append(score)
# Join information for all tasks: average each task's scores.
mean_task_scores = {
    task_idx: np.mean(np.array(task_scores[task_idx]))
    for task_idx in range(num_tasks)
}
print("Fold %s" % str(fold))
def rf_model_builder(model_params, model_dir):
  """Build a random-forest SklearnModel from a hyperparameter dict.

  Parameters
  ----------
  model_params: dict
    Keyword arguments forwarded verbatim to RandomForestClassifier.
  model_dir: str
    Directory in which the wrapped model persists its state.

  Returns
  -------
  SklearnModel wrapping the configured RandomForestClassifier.
  """
  forest = RandomForestClassifier(**model_params)
  return SklearnModel(forest, model_dir)
num_tasks = len(test_dataset.get_task_names())
# Global task indices covered by the current fold.
fold_tasks = range(fold * num_tasks, (fold + 1) * num_tasks)
# Get supports on test-set: draw (task, support) pairs and score a
# freshly trained random forest on each support's complementary data.
support_generator = SupportGenerator(
    test_dataset, range(num_tasks), n_pos, n_neg, n_trials, replace)
# Per-task accumulators for accuracies across support draws.
task_scores = {task_idx: [] for task_idx in range(num_tasks)}
for task, support in support_generator:
  # Train a class-balanced forest on the support examples only.
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=50)
  model = SklearnModel(sklearn_model, model_dir)
  model.fit(support)
  # Test model on the task's remaining (non-support) examples.
  task_dataset = get_task_dataset_minus_support(test_dataset, support, task)
  y_pred = model.predict_proba(task_dataset)
  score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
  #print("Score on task %s is %s" % (str(task), str(score)))
  task_scores[task].append(score)
# Join information for all tasks: average each task's scores.
mean_task_scores = {
    task_idx: np.mean(np.array(task_scores[task_idx]))
    for task_idx in range(num_tasks)
}
print("Fold %s" % str(fold))