def prepare_predictor(input_file, weights_file=None, model_file=None, mean_and_std_file=None):
    """Create a ``Predictor`` and load its input, architecture and parameters.

    Args:
        input_file: Path handed to ``Predictor.load_input``.
        weights_file: Forwarded to ``load_parameters`` as ``param_path``.
        model_file: Optional architecture file; ``load_architecture`` is only
            called when this is not ``None``.
        mean_and_std_file: Forwarded to ``load_parameters`` as
            ``mean_and_std_path``.

    Returns:
        The configured ``Predictor`` instance.
    """
    configured = Predictor()
    configured.load_input(input_file)

    # An explicit architecture file is optional; without one the predictor
    # keeps whatever load_input() set up.
    if model_file is not None:
        configured.load_architecture(model_file)

    configured.load_parameters(
        param_path=weights_file,
        mean_and_std_path=mean_and_std_file,
    )
    return configured
def load_estimator(model_path):
    """Load a trained ``Predictor`` from the files stored in ``model_path``.

    The directory is expected to hold ``predictor_input.py`` and
    ``full_train.h5``; ``full_train.json`` and ``full_train_mean_std.npz``
    are used only when they exist on disk.

    Args:
        model_path: Directory containing the saved model files.

    Returns:
        A ``(estimator, uncertainty)`` tuple, where ``uncertainty`` is
        ``True`` exactly when ``full_train.json`` was found and loaded as the
        architecture.
    """
    input_path = os.path.join(model_path, 'predictor_input.py')
    weights_path = os.path.join(model_path, 'full_train.h5')
    architecture_path = os.path.join(model_path, 'full_train.json')
    stats_path = os.path.join(model_path, 'full_train_mean_std.npz')

    estimator = Predictor()
    estimator.load_input(input_path)

    # The flag simply mirrors whether an architecture file was available.
    uncertainty = os.path.exists(architecture_path)
    if uncertainty:
        estimator.load_architecture(architecture_path)

    # Normalization statistics are optional; pass None when they are absent.
    if not os.path.exists(stats_path):
        stats_path = None

    estimator.load_parameters(param_path=weights_path,
                              mean_and_std_path=stats_path)
    return estimator, uncertainty
def load_estimator(model_path):
    """Build a ``Predictor`` from the saved files under ``model_path``.

    Loads ``predictor_input.py`` and the ``full_train.h5`` weights. The
    ``full_train.json`` architecture and ``full_train_mean_std.npz``
    statistics are applied only if the corresponding file exists.

    Args:
        model_path: Directory holding the saved model files.

    Returns:
        Tuple ``(estimator, uncertainty)``; ``uncertainty`` is ``True`` when
        the architecture file existed and was loaded, otherwise ``False``.
    """
    def artifact(filename):
        # Resolve a file name relative to the model directory.
        return os.path.join(model_path, filename)

    estimator = Predictor()
    estimator.load_input(artifact('predictor_input.py'))

    architecture_file = artifact('full_train.json')
    uncertainty = False
    if os.path.exists(architecture_file):
        estimator.load_architecture(architecture_file)
        uncertainty = True

    # Fall back to None so load_parameters() is not given a missing path.
    stats_file = artifact('full_train_mean_std.npz')
    if not os.path.exists(stats_file):
        stats_file = None

    estimator.load_parameters(param_path=artifact('full_train.h5'),
                              mean_and_std_path=stats_file)
    return estimator, uncertainty
# Log start timestamp logging.info('CNN training initiated at ' + time.asctime() + '\n') from rmgpy.rmg.main import RMG rmg = RMG() rmg.logHeader() # Importing Keras should happen after setting random seed of Numpy from dde.predictor import Predictor predictor = Predictor(data_file=data_file, save_tensors_dir=save_tensors_dir, keep_tensors=keep_tensors, out_dir=out_dir, normalize=normalize) predictor.load_input(input_file) predictor.load_parameters(param_path=weights_file) lr_func = "float({0} * np.exp(- epoch / {1}))".format(lr0, lr1) save_model_path = os.path.join(out_dir, 'saved_model') if not os.path.exists(save_model_path): os.mkdir(save_model_path) if train_mode == 'in_house': predictor.kfcv_train(folds=folds, batch_size=batch_size, lr_func=lr_func, save_model_path=save_model_path, nb_epoch=nb_epoch, patience=patience, training_ratio=training_ratio, testing_ratio=testing_ratio,
class TestPredictor(unittest.TestCase):
    """Unit tests for ``Predictor``.

    Covers model construction, input/parameter loading, dataset
    specification, prediction, output normalization and the three training
    entry points (``kfcv_train``, ``full_train``, ``kfcv_batch_train``).
    Fixtures are read from the ``test_data`` directory next to the ``dde``
    package.
    """

    def setUp(self):
        # A fresh predictor per test keeps the tests independent.
        self.predictor = Predictor()

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _test_data_path(*parts):
        """Return a path below ``<dde package dir>/test_data``."""
        return os.path.join(os.path.dirname(dde.__file__), 'test_data', *parts)

    def _minimal_path(self, filename):
        """Return the path of a file in the ``minimal_predictor`` fixture."""
        return self._test_data_path('minimal_predictor', filename)

    def _use_datafile_and_scratch_dir(self):
        """Configure the predictor for a training run on ``datafile.csv``.

        Creates ``test_data/test_out`` and its ``saved_model`` subdirectory
        and registers removal of the scratch directory as a cleanup, so it is
        deleted even when the test fails part-way through.

        Returns:
            Tuple ``(out_dir, save_model_path)``.
        """
        self.predictor.data_file = self._test_data_path('datafile.csv')
        self.predictor.get_data_from_file = True

        out_dir = self._test_data_path('test_out')
        save_model_path = os.path.join(out_dir, 'saved_model')
        self.predictor.out_dir = out_dir

        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Registered right after the directory is known to exist: a failing
        # assertion must not leave stale output behind for the next run.
        self.addCleanup(shutil.rmtree, out_dir)
        if not os.path.exists(save_model_path):
            os.mkdir(save_model_path)
        return out_dir, save_model_path

    def _reset_run_settings(self):
        """Clear the I/O related attributes set for a training run."""
        self.predictor.input_file = None
        self.predictor.data_file = None
        self.predictor.save_tensors_dir = None
        self.predictor.keep_tensors = False
        self.predictor.out_dir = None

    def _assert_files_exist(self, directory, *filenames):
        """Assert that every name in ``filenames`` exists in ``directory``."""
        for filename in filenames:
            path = os.path.join(directory, filename)
            self.assertTrue(os.path.exists(path),
                            'expected output file is missing: ' + path)

    def _assert_leading_dims(self, tensor, expected):
        """Assert that the first ``len(expected)`` dims of ``tensor`` match.

        The symbolic shape is evaluated once instead of once per dimension.
        """
        shape = tensor.shape.eval()
        self.assertEqual(list(shape[:len(expected)]), list(expected))

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------
    def test_model(self):
        """The default model has four layers with the expected sizes."""
        self.predictor.build_model()
        layers = self.predictor.model.layers

        self.assertEqual(len(layers), 4)
        self.assertIsInstance(layers[1], MoleculeConv)
        self.assertIsInstance(layers[2], Dense)

        self.assertEqual(layers[1].inner_dim, 32)
        self.assertEqual(layers[1].units, 512)

    def test_load_input(self):
        """Loading the minimal input file builds the expected architecture."""
        self.predictor.load_input(self._minimal_path('predictor_input.py'))
        layers = self.predictor.model.layers

        self.assertEqual(len(layers), 4)
        self.assertIsInstance(layers[1], MoleculeConv)
        self.assertIsInstance(layers[2], Dense)
        self.assertIsInstance(layers[3], Dense)

        gfp, dense1, dense2 = layers[1], layers[2], layers[3]

        self._assert_leading_dims(gfp.W_inner, (4, 38, 38))
        self._assert_leading_dims(gfp.b_inner, (4, 1, 38))
        self._assert_leading_dims(gfp.W_output, (4, 38, 512))
        self._assert_leading_dims(gfp.b_output, (4, 1, 512))

        self._assert_leading_dims(dense1.W, (512, 50))
        self._assert_leading_dims(dense1.b, (50,))

        self._assert_leading_dims(dense2.W, (50, 1))
        self._assert_leading_dims(dense2.b, (1,))

    def test_specify_datasets(self):
        """
        Test the datasets specification is done properly
        """
        datasets_file = self._minimal_path('datasets.txt')
        self.predictor.specify_datasets(datasets_file)

        expected_datasets = [('rmg', 'sdata134k', 'polycyclic_2954_table', 0.1),
                             ('rmg', 'sdata134k', 'cyclic_O_only_table', 0.1)]
        self.assertEqual(self.predictor.datasets, expected_datasets)

    def test_load_parameters(self):
        """Weights from ``weights.h5`` end up in the model layers."""
        self.predictor.load_input(self._minimal_path('predictor_input.py'))
        self.predictor.load_parameters(self._minimal_path('weights.h5'))

        layers = self.predictor.model.layers
        gfp, dense1, dense2 = layers[1], layers[2], layers[3]

        self.assertAlmostEqual(gfp.W_inner.eval()[0][0][0], 1.000, 3)
        self.assertAlmostEqual(gfp.b_inner.eval()[0][0][0], 0.000, 3)
        self.assertAlmostEqual(gfp.W_output.eval()[0][0][0], 0.040, 3)
        self.assertAlmostEqual(gfp.b_output.eval()[0][0][0], -0.561, 3)

        self.assertAlmostEqual(dense1.W.eval()[0][0], -0.023, 3)
        self.assertAlmostEqual(dense1.b.eval()[0], 1.517, 3)

        self.assertAlmostEqual(dense2.W.eval()[0][0], -4.157, 3)
        self.assertAlmostEqual(dense2.b.eval()[0], 1.515, 3)

    def test_predict(self):
        """
        Test predictor is predicting within a reasonable range.

        weights.h5 should be regenerated every time the feature space changes.
        """
        self.predictor.load_input(self._minimal_path('predictor_input.py'))
        self.assertTrue(self.predictor.add_extra_atom_attribute)
        self.assertTrue(self.predictor.add_extra_bond_attribute)

        self.predictor.load_parameters(self._minimal_path('weights.h5'))

        mol_test = Molecule().fromAdjacencyList(
            """1 C u0 p0 c0 {2,B} {6,B} {7,S}
2 C u0 p0 c0 {1,B} {3,B} {8,S}
3 C u0 p0 c0 {2,B} {4,B} {9,S}
4 C u0 p0 c0 {3,B} {5,B} {10,S}
5 C u0 p0 c0 {4,B} {6,B} {11,S}
6 C u0 p0 c0 {1,B} {5,B} {12,S}
7 H u0 p0 c0 {1,S}
8 H u0 p0 c0 {2,S}
9 H u0 p0 c0 {3,S}
10 H u0 p0 c0 {4,S}
11 H u0 p0 c0 {5,S}
12 H u0 p0 c0 {6,S}
""")

        h298_predicted = self.predictor.predict(mol_test)
        self.assertAlmostEqual(h298_predicted, 19.5, 0)

    def test_normalize(self):
        """``normalize_output`` scales both arrays by the first one's stats."""
        # 1-D targets
        y1 = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
        y2 = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
        y1_norm_expected = [-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5]
        y2_norm_expected = [-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]

        y1_norm, y2_norm = self.predictor.normalize_output(y1, y2)

        self.assertAlmostEqual(self.predictor.y_mean, 4.0)
        self.assertAlmostEqual(self.predictor.y_std, 2.0)
        self.assertTrue(np.allclose(y1_norm, y1_norm_expected))
        self.assertTrue(np.allclose(y2_norm, y2_norm_expected))

        # 2-D targets: statistics are expected per column
        y1 = [[1.0, 2.0], [3.0, 4.0]]
        y2 = [[2.0, 3.0], [4.0, 5.0]]
        mean_expected = [2.0, 3.0]
        std_expected = [1.0, 1.0]
        y1_norm_expected = [[-1.0, -1.0], [1.0, 1.0]]
        y2_norm_expected = [[0.0, 0.0], [2.0, 2.0]]

        y1_norm, y2_norm = self.predictor.normalize_output(y1, y2)

        self.assertTrue(np.allclose(self.predictor.y_mean, mean_expected))
        self.assertTrue(np.allclose(self.predictor.y_std, std_expected))
        self.assertTrue(np.allclose(y1_norm, y1_norm_expected))
        self.assertTrue(np.allclose(y2_norm, y2_norm_expected))

        self.predictor.y_mean = None
        self.predictor.y_std = None

    def test_kfcv_train(self):
        """Two-fold cross-validation writes per-fold and summary files."""
        self.predictor.load_input(self._minimal_path('predictor_input.py'))
        self.predictor.load_parameters(self._minimal_path('weights.h5'))

        out_dir, save_model_path = self._use_datafile_and_scratch_dir()

        lr_func = "float({0} * np.exp(- epoch / {1}))".format(0.0007, 30.0)
        self.predictor.kfcv_train(2, lr_func, save_model_path, nb_epoch=2,
                                  patience=-1, testing_ratio=0.1)

        best_model = os.path.join(save_model_path, 'best_model.h5')
        self.assertFalse(os.path.exists(best_model),
                         'unexpected output file: ' + best_model)
        self._assert_files_exist(
            save_model_path,
            'current_model.h5',
            'fold_0.h5',
            'fold_0.hist',
            'fold_0.json',
            'fold_0.png',
            'fold_0_loss_report.txt',
            'fold_1.h5',
            'fold_1.hist',
            'fold_1.json',
            'fold_1.png',
            'fold_1_loss_report.txt',
            'full_folds_loss_report.txt',
        )

        self._reset_run_settings()

    def test_full_train(self):
        """Full training with normalization writes model and stats files."""
        self.predictor.normalize = True
        self.predictor.load_input(self._minimal_path('predictor_input.py'))

        out_dir, save_model_path = self._use_datafile_and_scratch_dir()

        lr_func = "float({0} * np.exp(- epoch / {1}))".format(0.0007, 30.0)
        self.predictor.full_train(lr_func, save_model_path, nb_epoch=2,
                                  training_ratio=1.0, testing_ratio=0.0)

        self._assert_files_exist(
            out_dir,
            'identifiers_test.txt',
            'identifiers_train.txt',
        )
        self._assert_files_exist(
            save_model_path,
            'current_model.h5',
            'full_train.h5',
            'full_train.hist',
            'full_train.json',
            'full_train.png',
            'full_train_loss_report.txt',
            'full_train_mean_std.npz',
        )

        self._reset_run_settings()
        self.predictor.normalize = False

    def test_kfcv_batch_train(self):
        """Three-fold batch training writes one history file per fold."""
        self.predictor.load_input(self._minimal_path('predictor_input.py'))

        out_dir, save_model_path = self._use_datafile_and_scratch_dir()

        weights_file = self._minimal_path('weights.h5')
        self.predictor.kfcv_batch_train(3, pretrained_weights=weights_file,
                                        batch_size=2, nb_epoch=2,
                                        training_ratio=0.8, testing_ratio=0.1)

        self._assert_files_exist(
            out_dir,
            'history.json_fold_0',
            'history.json_fold_1',
            'history.json_fold_2',
        )

        self._reset_run_settings()

    def test_ensemble_predictor(self):
        """Prediction with ``sigma=True`` returns the expected mean and std."""
        test_predictor_input = self._test_data_path(
            'ensemble_predictor', 'predictor_input.py')
        test_predictor_architecture = self._test_data_path(
            'ensemble_predictor', 'fold_0.json')
        test_predictor_parameters = self._test_data_path(
            'ensemble_predictor', 'fold_0.h5')

        self.predictor.load_input(test_predictor_input)
        self.predictor.load_architecture(test_predictor_architecture)
        self.predictor.load_parameters(test_predictor_parameters)

        y_avg, y_std = self.predictor.predict(
            Molecule().fromSMILES('CCC1CC(C)(C)C1'), sigma=True)

        expected_y_avg = -21.4971179962
        expected_y_std = 0.89726549387
        self.assertAlmostEqual(expected_y_avg, y_avg, 2)
        self.assertAlmostEqual(expected_y_std, y_std, 2)