def test_ensemble_model():
    """Check that ``ensemble_model`` averages the predictions of its members.

    Two scenarios are exercised:

    * an ensemble of a random forest and a neural network must predict the
      mean of the two member predictions, and its ``score`` must equal the
      R^2 of those predictions;
    * an ensemble holding a single model must give the same predictions and
      score as that model.
    """
    X = np.vstack((np.arange(30, 10, -2, dtype='float64'),
                   np.arange(100, 90, -1, dtype='float64'))).T
    Y = np.arange(10, dtype='float64')

    rf = regressors.randomforest(random_state=42)
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((rf, nn))
    # The underlying models are deliberately not fitted on their own: the
    # test relies on them being changed (fitted) when the ensemble is fitted.
    ensemble.fit(X, Y)
    pred = ensemble.predict(X)
    mean_pred = np.vstack((rf.predict(X), nn.predict(X))).mean(axis=0)
    assert_array_almost_equal(pred, mean_pred)
    assert_almost_equal(ensemble.score(X, Y), r2_score(Y, pred))

    # Ensemble of a single model should behave exactly like this model.
    # The regressor is taken from the ``regressors`` namespace, as above, so
    # both halves of the test use the same kind of estimator and do not
    # depend on a separately imported bare ``neuralnetwork`` name.
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((nn, ))
    ensemble.fit(X, Y)
    assert_array_almost_equal(ensemble.predict(X), nn.predict(X))
    assert_almost_equal(ensemble.score(X, Y), nn.score(X, Y))
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Build the random-forest model and descriptors of an RF-Score variant.

    Parameters
    ----------
    protein : object, optional
        Receptor handed to the descriptor generators.
    n_jobs : int (default=-1)
        Forwarded to ``randomforest`` as ``n_jobs``.
    version : int (default=1)
        Descriptor variant:

        * 1 - close contacts with a single cutoff of 12,
        * 2 - close contacts with cutoffs 0, 2, ..., 12,
        * 3 - the version 1 contacts combined with
          ``autodock_vina_descriptor``.
    spr : int (default=0)
        Stored on the instance; not used further by this method.
    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.
    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         **kwargs)

    if version == 1 or version == 3:
        cutoff = 12
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
    else:
        # Previously an unknown version fell through every branch and
        # failed later with an unbound ``descriptors`` variable.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    # All variants share the same close-contacts descriptor; only the
    # cutoff differs.
    descriptors = close_contacts(protein,
                                 cutoff=cutoff,
                                 protein_types=protein_atomic_nums,
                                 ligand_types=ligand_atomic_nums)
    if version == 3:
        # Version 3 places the Vina descriptor before the contacts.
        vina = autodock_vina_descriptor(protein)
        descriptors = ensemble_descriptor((vina, descriptors))

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def test_regressors():
    """Fit each regressor wrapper on a tiny data set and check its quality.

    For every model the test requires an absolute training error below 1 on
    each sample, a score above 0.9, and identical predictions after a pickle
    round-trip.
    """
    feature_rows = (np.arange(30, 10, -2, dtype='float64'),
                    np.arange(100, 90, -1, dtype='float64'))
    X = np.vstack(feature_rows).T
    Y = np.arange(10, dtype='float64')

    np.random.seed(42)
    models = (
        regressors.svm(C=10),
        regressors.randomforest(random_state=42),
        regressors.neuralnetwork(solver='lbfgs',
                                 random_state=42,
                                 hidden_layer_sizes=(20, 20)),
        regressors.mlr(),
    )

    for regressor in models:
        regressor.fit(X, Y)
        pred = regressor.predict(X)

        abs_error = np.abs(pred.flatten() - Y)
        assert_true((abs_error < 1).all())
        assert_greater(regressor.score(X, Y), 0.9)

        # A pickled and restored model must predict exactly as before.
        reloaded = pickle.loads(pickle.dumps(regressor))
        pred_reloaded = reloaded.predict(X)
        assert_array_almost_equal(pred, pred_reloaded)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Build the random-forest model and descriptors of an RF-Score variant.

    Parameters
    ----------
    protein : object, optional
        Receptor handed to the descriptor generators.
    n_jobs : int (default=-1)
        Forwarded to ``randomforest`` as ``n_jobs``.
    version : int (default=1)
        Descriptor variant:

        * 1 - close contacts with a single cutoff of 12,
        * 2 - close contacts with cutoffs 0, 2, ..., 12,
        * 3 - the version 1 contacts combined with
          ``autodock_vina_descriptor``.
    spr : int (default=0)
        Stored on the instance; not used further by this method.
    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.

    Notes
    -----
    The score title passed to the parent class is ``'rfscore'`` for every
    version.
    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         **kwargs)

    if version == 1 or version == 3:
        cutoff = 12
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
    else:
        # Previously an unknown version fell through every branch and
        # failed later with an unbound ``descriptors`` variable.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    # All variants share the same close-contacts descriptor; only the
    # cutoff differs.
    descriptors = close_contacts(protein,
                                 cutoff=cutoff,
                                 protein_types=protein_atomic_nums,
                                 ligand_types=ligand_atomic_nums)
    if version == 3:
        # Version 3 places the Vina descriptor before the contacts.
        vina = autodock_vina_descriptor(protein)
        descriptors = ensemble_descriptor((vina, descriptors))

    super(rfscore, self).__init__(model, descriptors, score_title='rfscore')
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Build the random-forest model and descriptors of an RF-Score variant.

    Parameters
    ----------
    protein : object, optional
        Receptor handed to the descriptor generators.
    n_jobs : int (default=-1)
        Forwarded to ``randomforest`` as ``n_jobs``.
    version : int (default=1)
        Descriptor variant:

        * 1 - close contacts with a single cutoff of 12
          (``max_features=6``),
        * 2 - close contacts with cutoffs 0, 2, ..., 12
          (``max_features=14``),
        * 3 - the version 1 contacts combined with six Vina partial scores
          from ``oddt_vina_descriptor`` (``max_features=6``).
    spr : int (default=0)
        Stored on the instance; not used further by this method.
    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.
    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    # Per-version settings: contact cutoff(s) and the forest's max_features.
    if version == 1 or version == 3:
        cutoff = 12
        mtry = 6
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
    else:
        # Previously an unknown version fell through every branch and
        # failed later with unbound ``mtry``/``descriptors`` variables.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    descriptors = close_contacts_descriptor(
        protein,
        cutoff=cutoff,
        protein_types=protein_atomic_nums,
        ligand_types=ligand_atomic_nums)

    if version == 3:
        vina_scores = ['vina_gauss1',
                       'vina_gauss2',
                       'vina_repulsion',
                       'vina_hydrophobic',
                       'vina_hydrogen',
                       'vina_num_rotors']
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        # Version 3 places the Vina descriptor before the contacts.
        descriptors = ensemble_descriptor((vina, descriptors))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Scoring function implementing RF-Score variants. It predicts the
    binding affinity (pKi/d) of ligand in a complex utilizing simple
    descriptors (close contacts of atoms <12A) with sophisticated
    machine-learning model (random forest). The third variant supplements
    those contacts with Vina partial scores.

    For further details see RF-Score publications v1[1]_, v2[2]_, v3[3]_.

    Parameters
    ----------
    protein : oddt.toolkit.Molecule object
        Receptor for the scored ligands

    n_jobs: int (default=-1)
        Number of cores to use for scoring and training. By default (-1)
        all cores are allocated.

    version: int (default=1)
        Scoring function variant. The default is the simplest one (v1).
        Supported values are 1, 2 and 3.

    spr: int (default=0)
        The minimum number of contacts in each pair of atom types in
        the training set for the column to be included in training.
        This is a way of removal of not frequent and empty contacts.

    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.

    References
    ----------
    .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
        predicting protein-ligand binding affinity with applications to
        molecular docking. Bioinformatics. 2010;26: 1169-1175.
        doi:10.1093/bioinformatics/btq112

    .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
        chemical description of protein-ligand complexes lead to more
        accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
        944-955. doi:10.1021/ci500091r

    .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
        Using Random Forest: The Growing Accuracy of Binding Affinity
        Prediction by the Effective Exploitation of Larger Data Sets. Mol
        Inform. WILEY-VCH Verlag; 2015;34: 115-126.
        doi:10.1002/minf.201400132

    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    # Per-version settings: contact cutoff(s) and the forest's max_features.
    if version == 1 or version == 3:
        cutoff = 12
        mtry = 6
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
    else:
        # Previously an unknown version fell through every branch and
        # failed later with unbound ``mtry``/``descriptors`` variables.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    descriptors = close_contacts_descriptor(
        protein,
        cutoff=cutoff,
        protein_types=protein_atomic_nums,
        ligand_types=ligand_atomic_nums)

    if version == 3:
        vina_scores = ['vina_gauss1',
                       'vina_gauss2',
                       'vina_repulsion',
                       'vina_hydrophobic',
                       'vina_hydrogen',
                       'vina_num_rotors']
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        # Version 3 places the Vina descriptor before the contacts.
        descriptors = ensemble_descriptor((vina, descriptors))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)
def __init__(self, protein=None, n_jobs=-1, **kwargs):
    """Build the random forest and close-contacts descriptor for RF-Score.

    Parameters
    ----------
    protein : object, optional
        Receptor handed to ``close_contacts``.
    n_jobs : int (default=-1)
        Forwarded to ``randomforest`` as ``n_jobs``.
    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.
    """
    self.protein = protein
    self.n_jobs = n_jobs

    forest = randomforest(
        n_estimators=500,
        oob_score=True,
        n_jobs=n_jobs,
        **kwargs
    )
    # NOTE(review): ``cutoff`` is not defined in this method; it is resolved
    # from an enclosing scope (presumably module level) - confirm.
    contacts = close_contacts(
        protein,
        cutoff=cutoff,
        protein_types=protein_atomic_nums,
        ligand_types=ligand_atomic_nums
    )

    super(rfscore, self).__init__(forest, contacts, score_title='rfscore')
assert cls.score(X, Y) == 1.0 prob = cls.predict_proba(X) assert_array_almost_equal(prob, [[0, 1]] * 5 + [[1, 0]] * 5, decimal=1) log_prob = cls.predict_log_proba(X) assert_array_almost_equal(np.log(prob), log_prob) pickled = pickle.dumps(cls) reloaded = pickle.loads(pickled) prob_reloaded = reloaded.predict_proba(X) assert_array_almost_equal(prob, prob_reloaded) @pytest.mark.parametrize('reg', [ regressors.svm(C=10), regressors.randomforest(random_state=42), regressors.neuralnetwork( solver='lbfgs', random_state=42, hidden_layer_sizes=(20, 20)), regressors.mlr() ]) def test_regressors(reg): X = np.vstack( (np.arange(30, 10, -2, dtype='float64'), np.arange(100, 90, -1, dtype='float64'))).T Y = np.arange(10, dtype='float64') np.random.seed(42)
def __init__(self, protein=None, n_jobs=-1, version=1, spr=0, **kwargs):
    """Scoring function implementing RF-Score variants. It predicts the
    binding affinity (pKi/d) of ligand in a complex utilizing simple
    descriptors (close contacts of atoms <12A) with sophisticated
    machine-learning model (random forest). The third variant supplements
    those contacts with Vina partial scores.

    For further details see RF-Score publications v1[1]_, v2[2]_, v3[3]_.

    Parameters
    ----------
    protein : oddt.toolkit.Molecule object
        Receptor for the scored ligands

    n_jobs: int (default=-1)
        Number of cores to use for scoring and training. By default (-1)
        all cores are allocated.

    version: int (default=1)
        Scoring function variant. The default is the simplest one (v1).
        Supported values are 1, 2 and 3.

    spr: int (default=0)
        The minimum number of contacts in each pair of atom types in
        the training set for the column to be included in training.
        This is a way of removal of not frequent and empty contacts.

    **kwargs
        Extra keyword arguments forwarded to ``randomforest``.

    Raises
    ------
    ValueError
        If ``version`` is not 1, 2 or 3.

    References
    ----------
    .. [1] Ballester PJ, Mitchell JBO. A machine learning approach to
        predicting protein-ligand binding affinity with applications to
        molecular docking. Bioinformatics. 2010;26: 1169-1175.
        doi:10.1093/bioinformatics/btq112

    .. [2] Ballester PJ, Schreyer A, Blundell TL. Does a more precise
        chemical description of protein-ligand complexes lead to more
        accurate prediction of binding affinity? J Chem Inf Model. 2014;54:
        944-955. doi:10.1021/ci500091r

    .. [3] Li H, Leung K-S, Wong M-H, Ballester PJ. Improving AutoDock Vina
        Using Random Forest: The Growing Accuracy of Binding Affinity
        Prediction by the Effective Exploitation of Larger Data Sets. Mol
        Inform. WILEY-VCH Verlag; 2015;34: 115-126.
        doi:10.1002/minf.201400132

    """
    self.protein = protein
    self.n_jobs = n_jobs
    self.version = version
    self.spr = spr

    # Per-version settings: contact cutoff(s) and the forest's max_features.
    if version == 1 or version == 3:
        cutoff = 12
        mtry = 6
    elif version == 2:
        cutoff = np.array([0, 2, 4, 6, 8, 10, 12])
        mtry = 14
    else:
        # Previously an unknown version fell through every branch and
        # failed later with unbound ``mtry``/``descriptors`` variables.
        raise ValueError('Unsupported RF-Score version: %r '
                         '(expected 1, 2 or 3)' % (version,))

    descriptors = close_contacts_descriptor(
        protein,
        cutoff=cutoff,
        protein_types=protein_atomic_nums,
        ligand_types=ligand_atomic_nums)

    if version == 3:
        vina_scores = ['vina_gauss1',
                       'vina_gauss2',
                       'vina_repulsion',
                       'vina_hydrophobic',
                       'vina_hydrogen',
                       'vina_num_rotors']
        vina = oddt_vina_descriptor(protein, vina_scores=vina_scores)
        # Version 3 places the Vina descriptor before the contacts.
        descriptors = ensemble_descriptor((vina, descriptors))

    model = randomforest(n_estimators=500,
                         oob_score=True,
                         n_jobs=n_jobs,
                         max_features=mtry,
                         bootstrap=True,
                         min_samples_split=6,
                         **kwargs)

    super(rfscore, self).__init__(model,
                                  descriptors,
                                  score_title='rfscore_v%i' % self.version)