def featurize_input(
    input_file,
    feature_dir,
    user_specified_features,
    tasks,
    smiles_field,
    split_field,
    id_field,
    threshold,
    protein_pdb_field,
    ligand_pdb_field,
    ligand_mol2_field,
):
    """Featurize a raw input file.

    A ``DataFeaturizer`` is configured from the task/field arguments (always
    with ``verbose=True``) and its ``featurize`` method is called with
    ``input_file``, ``FeaturizedSamples.feature_types`` and an output path of
    the form ``<feature_dir>/<stem>.joblib``, where ``<stem>`` is the base
    name of ``input_file`` with its last extension stripped.

    This function has no return statement, so it returns ``None``.
    """
    featurizer_options = dict(
        tasks=tasks,
        smiles_field=smiles_field,
        split_field=split_field,
        id_field=id_field,
        threshold=threshold,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        ligand_mol2_field=ligand_mol2_field,
        user_specified_features=user_specified_features,
        verbose=True,
    )
    featurizer = DataFeaturizer(**featurizer_options)

    # Derive the output file name from the input file name: keep the stem,
    # swap whatever extension it had for ".joblib".
    file_name = os.path.basename(input_file)
    stem = os.path.splitext(file_name)[0]
    out_path = os.path.join(feature_dir, "%s.joblib" % stem)

    featurizer.featurize(input_file, FeaturizedSamples.feature_types, out_path)
def featurize_input(
    input_file,
    feature_dir,
    user_specified_features,
    tasks,
    smiles_field,
    split_field,
    id_field,
    threshold,
    protein_pdb_field,
    ligand_pdb_field,
    ligand_mol2_field,
):
    """Featurize a raw input file.

    Builds a ``DataFeaturizer`` from the task/field arguments (always with
    ``verbose=True``) and calls its ``featurize`` method with ``input_file``,
    ``FeaturizedSamples.feature_types`` and ``feature_dir`` unchanged.

    This function has no return statement, so it returns ``None``; whatever
    ``featurize`` returns is discarded.
    """
    # Collect the featurizer configuration in one place so the constructor
    # call below stays short.
    settings = {
        "tasks": tasks,
        "smiles_field": smiles_field,
        "split_field": split_field,
        "id_field": id_field,
        "threshold": threshold,
        "protein_pdb_field": protein_pdb_field,
        "ligand_pdb_field": ligand_pdb_field,
        "ligand_mol2_field": ligand_mol2_field,
        "user_specified_features": user_specified_features,
        "verbose": True,
    }
    requested_features = FeaturizedSamples.feature_types
    DataFeaturizer(**settings).featurize(
        input_file, requested_features, feature_dir)
def test_API(self):
    """Exercise the multitask classification API from raw input to evaluation.

    The steps are: featurize ``self.input_file`` with ECFP features, build
    ``FeaturizedSamples``, do a scaffold train/test split, apply the (empty)
    input/output transform lists, fit and save a
    ``multitask_deep_classifier``, and finally run the evaluator on the test
    dataset.  The test passes if none of these steps raises.
    """
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    hyperparams = dict(
        nb_hidden=10,
        activation="relu",
        dropout=0.5,
        learning_rate=0.01,
        momentum=0.9,
        nesterov=False,
        decay=1e-4,
        batch_size=5,
        nb_epoch=2,
    )
    features = ["ECFP"]
    x_transforms = []
    y_transforms = []

    # Raw input -> feature files.
    data_featurizer = DataFeaturizer(
        tasks=self.tasks, smiles_field=self.smiles_field, verbose=True)
    feature_files = data_featurizer.featurize(
        self.input_file, features, self.feature_dir)

    # Feature files -> samples -> scaffold train/test split.
    all_samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)
    train_part, test_part = all_samples.train_test_split(
        "scaffold", self.train_dir, self.test_dir)

    # Wrap each split in a Dataset and apply the transforms, train first.
    train_dataset = Dataset(self.train_dir, train_part, features)
    test_dataset = Dataset(self.test_dir, test_part, features)
    for dataset in (train_dataset, test_dataset):
        dataset.transform(x_transforms, y_transforms)

    # Every task is treated as a classification task.
    task_types = dict.fromkeys(self.tasks, "classification")
    hyperparams["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(
        "multitask_deep_classifier", task_types, hyperparams)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Evaluate the fitted model on the test dataset; outputs go to
    # throwaway temporary files.
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as csv_out, \
            tempfile.NamedTemporaryFile() as stats_out:
        evaluator.compute_model_performance(csv_out, stats_out)
def _create_model(self, splittype, feature_types, input_transforms,
                  output_transforms, task_type, model_params, model_name,
                  input_file, tasks, protein_pdb_field=None,
                  ligand_pdb_field=None):
    """Helper that featurizes, splits, fits, saves and evaluates a model.

    ``input_file`` is resolved relative to ``self.current_dir``.  The data is
    featurized with ``feature_types``, split with ``splittype`` into
    ``self.train_dir`` / ``self.test_dir``, transformed with the given
    transform lists, and used to fit the model named ``model_name``, which is
    then saved to ``self.model_dir`` and evaluated on the test dataset.

    Note that ``model_params`` is modified in place: a ``"data_shape"`` entry
    taken from the training dataset is written into it before the model is
    built.

    Nothing is returned.
    """
    # Raw input -> feature files.
    input_path = os.path.join(self.current_dir, input_file)
    data_featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        verbose=True,
    )
    feature_files = data_featurizer.featurize(
        input_path, feature_types, self.feature_dir)

    # Feature files -> samples -> train/test split.
    all_samples = FeaturizedSamples(
        self.samplesdir, feature_files, reload_data=False)
    split = all_samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_part, test_part = split

    # Wrap each split in a Dataset, then apply the transforms (train first).
    train_dataset = Dataset(self.train_dir, train_part, feature_types)
    test_dataset = Dataset(self.test_dir, test_part, feature_types)
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # All tasks share the same task type.
    task_types = {name: task_type for name in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    fitted = Model.model_builder(model_name, task_types, model_params)
    fitted.fit(train_dataset)
    fitted.save(self.model_dir)

    # Evaluate on the test dataset, writing to throwaway temporary files.
    # The two-element unpacking is kept so a differently-shaped return value
    # still fails here.
    evaluator = Evaluator(fitted, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as csv_out, \
            tempfile.NamedTemporaryFile() as stats_out:
        _, _ = evaluator.compute_model_performance(csv_out, stats_out)