def fit_model(model_name, model_params, model_dir, data_dir):
    """Builds model from featurized data."""
    # The train split produced by train_test_split lives under "train-data".
    train = Dataset(os.path.join(data_dir, "train-data"))
    # Every task in the training data shares the same task type for this model.
    task_type = Model.get_task_type(model_name)
    task_types = dict((name, task_type) for name in train.get_task_names())
    # The model builder needs the input feature shape up front.
    model_params["data_shape"] = train.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train)
    model.save(model_dir)
def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None):
    """Featurize inputs with NNScore and do train-test split."""
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        protein_pdb_field="protein_pdb",
                        ligand_pdb_field="ligand_pdb",
                        verbose=True)
    # Either featurize from scratch, or reload previously computed features.
    if feature_files is None:
        print("About to featurize.")
        samples = loader.featurize(input_file, feature_dir, samples_dir,
                                   shard_size=8)
        print("Completed Featurization")
    else:
        # Transform data into arrays for ML
        samples = FeaturizedSamples(samples_dir, feature_files,
                                    reload_data=False)
    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, train_dir, test_dir)
    print("Finished train test split.")
    train_dataset = Dataset(train_dir, train_samples, feature_types)
    test_dataset = Dataset(test_dir, test_samples, feature_types)
    print("Finished creating train test datasets")
    # Apply the requested input/output transforms to each split in turn.
    for split in (train_dataset, test_dataset):
        split.transform(input_transforms, output_transforms)
    print("Finished Transforming train test data.")
    return train_dataset, test_dataset
def _featurize_train_test_split(self, splittype, compound_featurizers,
                                complex_featurizers, input_transforms,
                                output_transforms, input_file, tasks,
                                protein_pdb_field=None, ligand_pdb_field=None,
                                user_specified_features=None, shard_size=100):
    """Featurize the input file, split train/test, and transform both splits."""
    # Featurize input
    featurizers = compound_featurizers + complex_featurizers
    input_path = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        compound_featurizers=compound_featurizers,
        complex_featurizers=complex_featurizers,
        user_specified_features=user_specified_features,
        verbose=True)
    # Featurizes samples and transforms them into NumPy arrays suitable for
    # ML; returns an instance of class FeaturizedSamples().
    samples = featurizer.featurize(input_path, self.feature_dir,
                                   self.samples_dir, shard_size=shard_size)
    # Splits featurized samples into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    # Datasets only consult user-specified features when some were supplied.
    use_user_specified_features = user_specified_features is not None
    train_dataset = Dataset(
        data_dir=self.train_dir,
        samples=train_samples,
        featurizers=featurizers,
        tasks=tasks,
        use_user_specified_features=use_user_specified_features)
    test_dataset = Dataset(
        data_dir=self.test_dir,
        samples=test_samples,
        featurizers=featurizers,
        tasks=tasks,
        use_user_specified_features=use_user_specified_features)
    # Transforming train/test data
    for dataset in (train_dataset, test_dataset):
        dataset.transform(input_transforms, output_transforms)
    return train_dataset, test_dataset
def train_test_split(input_transforms, output_transforms, feature_types,
                     splittype, data_dir):
    """Split featurized samples into transformed train/test datasets.

    Loads previously featurized samples from ``data_dir/samples``, splits
    them per ``splittype``, materializes train/test ``Dataset`` objects
    under ``data_dir/train-data`` and ``data_dir/test-data``, and applies
    the given input/output transforms to both.

    Note: the old docstring claimed this "Saves transformed model"; no
    model is involved — it only builds the split datasets on disk.

    Args:
      input_transforms: transforms applied to dataset inputs.
      output_transforms: transforms applied to dataset outputs (labels).
      feature_types: feature types used to construct each Dataset.
      splittype: split strategy forwarded to
        FeaturizedSamples.train_test_split (e.g. "scaffold").
      data_dir: base directory containing "samples"; receives
        "train-samples"/"test-samples" and "train-data"/"test-data".

    Returns:
      (train_dataset, test_dataset) tuple, matching featurize_and_split.
      (Previously returned None; callers ignoring the return value are
      unaffected.)
    """
    samples_dir = os.path.join(data_dir, "samples")
    samples = FeaturizedSamples(samples_dir, reload_data=True)
    print("Split data into train/test")
    train_samples_dir = os.path.join(data_dir, "train-samples")
    test_samples_dir = os.path.join(data_dir, "test-samples")
    train_samples, test_samples = samples.train_test_split(
        splittype, train_samples_dir, test_samples_dir)
    train_data_dir = os.path.join(data_dir, "train-data")
    test_data_dir = os.path.join(data_dir, "test-data")
    print("Generating train dataset.")
    train_dataset = Dataset(train_data_dir, train_samples, feature_types)
    print("Generating test dataset.")
    test_dataset = Dataset(test_data_dir, test_samples, feature_types)
    print("Transforming train data.")
    train_dataset.transform(input_transforms, output_transforms)
    print("Transforming test data.")
    test_dataset.transform(input_transforms, output_transforms)
    # Return the built datasets for consistency with featurize_and_split.
    return train_dataset, test_dataset
def test_API(self):
    """Straightforward test of multitask deepchem classification API."""
    split_strategy = "scaffold"
    feature_types = ["ECFP"]
    input_transforms = []
    output_transforms = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {
        "nb_hidden": 10,
        "activation": "relu",
        "dropout": .5,
        "learning_rate": .01,
        "momentum": .9,
        "nesterov": False,
        "decay": 1e-4,
        "batch_size": 5,
        "nb_epoch": 2,
    }
    model_name = "multitask_deep_classifier"
    # Featurize input
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_files = featurizer.featurize(self.input_file, feature_types,
                                         self.feature_dir)
    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)
    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        split_strategy, self.train_dir, self.test_dir)
    train_data = Dataset(self.train_dir, train_samples, feature_types)
    test_data = Dataset(self.test_dir, test_samples, feature_types)
    # Transforming train/test data
    for split in (train_data, test_data):
        split.transform(input_transforms, output_transforms)
    # Fit model
    task_types = dict((task, task_type) for task in self.tasks)
    model_params["data_shape"] = train_data.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_data)
    model.save(self.model_dir)
    # Eval model on the held-out split.
    evaluator = Evaluator(model, test_data, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out, \
            tempfile.NamedTemporaryFile() as test_stats_out:
        evaluator.compute_model_performance(test_csv_out, test_stats_out)
def _create_model(self, splittype, feature_types, input_transforms,
                  output_transforms, task_type, model_params, model_name,
                  input_file, tasks, protein_pdb_field=None,
                  ligand_pdb_field=None):
    """Helper method to create model for test."""
    # Featurize input
    input_path = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                verbose=True)
    feature_files = featurizer.featurize(input_path, feature_types,
                                         self.feature_dir)
    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)
    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_data = Dataset(self.train_dir, train_samples, feature_types)
    test_data = Dataset(self.test_dir, test_samples, feature_types)
    # Transforming train/test data
    for split in (train_data, test_data):
        split.transform(input_transforms, output_transforms)
    # Fit model
    task_types = dict((task, task_type) for task in tasks)
    model_params["data_shape"] = train_data.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_data)
    model.save(self.model_dir)
    # Eval model on the held-out split.
    evaluator = Evaluator(model, test_data, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out, \
            tempfile.NamedTemporaryFile() as test_stats_out:
        _, _ = evaluator.compute_model_performance(
            test_csv_out, test_stats_out)
def create_model(args):
    """Creates a model"""
    model_name = args.model
    # Resolve working directories: either derived from a single base dir,
    # or each supplied individually.
    if args.base_dir is not None:
        feature_dir = os.path.join(args.base_dir, "features")
        data_dir = os.path.join(args.base_dir, "data")
        model_dir = os.path.join(args.base_dir, "model")
        ensure_exists([args.base_dir, feature_dir, data_dir, model_dir])
    else:
        explicit_dirs = (args.feature_dir, args.model_dir, args.data_dir)
        if any(d is None for d in explicit_dirs):
            raise ValueError("If base-dir not specified, must specify "
                             "feature-dir, data-dir, model-dir.")
        feature_dir, model_dir, data_dir = (args.feature_dir, args.model_dir,
                                            args.data_dir)
        ensure_exists([feature_dir, data_dir, model_dir])
    banner = "+++++++++++++++++++++++++++++++++"
    if args.featurize:
        print(banner)
        print("Perform featurization")
        featurize_inputs(
            feature_dir, data_dir, args.input_files,
            args.user_specified_features, args.tasks, args.smiles_field,
            args.split_field, args.id_field, args.threshold,
            args.protein_pdb_field, args.ligand_pdb_field,
            args.ligand_mol2_field)
    if args.generate_dataset:
        print(banner)
        print("Generate dataset for featurized samples")
        samples_dir = os.path.join(data_dir, "samples")
        samples = FeaturizedSamples(samples_dir, reload_data=True)
        print("Generating dataset.")
        full_data_dir = os.path.join(data_dir, "full-data")
        full_dataset = Dataset(full_data_dir, samples, args.feature_types)
        print("Transform data.")
        full_dataset.transform(args.input_transforms, args.output_transforms)
    if args.train_test_split:
        print(banner)
        print("Perform train-test split")
        train_test_split(args.input_transforms, args.output_transforms,
                         args.feature_types, args.splittype, data_dir)
    if args.fit:
        print(banner)
        print("Fit model")
        model_params = extract_model_params(args)
        fit_model(model_name, model_params, model_dir, data_dir)
    if args.eval:
        print(banner)
        print("Eval Model on Train")
        print("-------------------")
        train_dir = os.path.join(data_dir, "train-data")
        csv_out_train = os.path.join(data_dir, "train.csv")
        stats_out_train = os.path.join(data_dir, "train-stats.txt")
        eval_trained_model(model_name, model_dir, train_dir,
                           csv_out_train, stats_out_train)
        print("Eval Model on Test")
        print("------------------")
        test_dir = os.path.join(data_dir, "test-data")
        csv_out_test = os.path.join(data_dir, "test.csv")
        stats_out_test = os.path.join(data_dir, "test-stats.txt")
        eval_trained_model(model_name, model_dir, test_dir,
                           csv_out_test, stats_out_test)
    if args.eval_full:
        print(banner)
        print("Eval Model on Full Dataset")
        print("--------------------------")
        full_data_dir = os.path.join(data_dir, "full-data")
        csv_out_full = os.path.join(data_dir, "full.csv")
        stats_out_full = os.path.join(data_dir, "full-stats.txt")
        eval_trained_model(model_name, model_dir, full_data_dir,
                           csv_out_full, stats_out_full)