def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None):
    """Obtain featurized samples, split them, and build transformed datasets.

    Args:
        input_file: Passed to ``loader.featurize`` when featurization runs.
        feature_dir: Passed to ``loader.featurize`` when featurization runs.
        samples_dir: Passed to ``loader.featurize`` or, when
            ``feature_files`` is given, to ``FeaturizedSamples``.
        train_dir: Passed to ``train_test_split`` and to the train ``Dataset``.
        test_dir: Passed to ``train_test_split`` and to the test ``Dataset``.
        splittype: Passed through to ``samples.train_test_split``.
        feature_types: Third positional argument of both ``Dataset`` calls.
        input_transforms: First argument of ``Dataset.transform``.
        output_transforms: Second argument of ``Dataset.transform``.
        tasks: Passed to ``DataLoader`` as ``tasks``.
        feature_files: When ``None`` (the default) the input is featurized
            with a shard size of 8; otherwise featurization is skipped and a
            ``FeaturizedSamples`` is built from these files with
            ``reload_data=False``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple, both already transformed.
    """
    # The loader is constructed unconditionally, even though it is only used
    # when no precomputed feature files were supplied.
    data_loader = DataLoader(
        tasks=tasks,
        smiles_field="smiles",
        protein_pdb_field="protein_pdb",
        ligand_pdb_field="ligand_pdb",
        verbose=True)

    if feature_files is not None:
        # Precomputed features: wrap them instead of featurizing again.
        featurized = FeaturizedSamples(
            samples_dir, feature_files, reload_data=False)
    else:
        print("About to featurize.")
        featurized = data_loader.featurize(
            input_file, feature_dir, samples_dir, shard_size=8)
        print("Completed Featurization")

    # Partition the featurized samples into train and test subsets.
    train_split, test_split = featurized.train_test_split(
        splittype, train_dir, test_dir)
    print("Finished train test split.")

    train_dataset = Dataset(train_dir, train_split, feature_types)
    test_dataset = Dataset(test_dir, test_split, feature_types)
    print("Finished creating train test datasets")

    # Apply the same transforms to train first, then test.
    for dataset in (train_dataset, test_dataset):
        dataset.transform(input_transforms, output_transforms)
    print("Finished Transforming train test data.")

    return train_dataset, test_dataset
def _featurize_train_test_split(self, splittype, compound_featurizers,
                                complex_featurizers, input_transforms,
                                output_transforms, input_file, tasks,
                                protein_pdb_field=None, ligand_pdb_field=None,
                                user_specified_features=None, shard_size=100):
    """Featurize an input file, split it, and return transformed datasets.

    Args:
        splittype: Passed through to ``samples.train_test_split``.
        compound_featurizers: Given to ``DataFeaturizer``; also concatenated
            with ``complex_featurizers`` for the ``Dataset`` constructors.
        complex_featurizers: Given to ``DataFeaturizer``; see above.
        input_transforms: First argument of ``Dataset.transform``.
        output_transforms: Second argument of ``Dataset.transform``.
        input_file: Path joined onto ``self.current_dir`` before featurizing.
        tasks: Passed to ``DataFeaturizer`` and to both ``Dataset`` objects.
        protein_pdb_field: Forwarded to ``DataFeaturizer``.
        ligand_pdb_field: Forwarded to ``DataFeaturizer``.
        user_specified_features: Forwarded to ``DataFeaturizer``; whether it
            is ``None`` decides the ``use_user_specified_features`` flag
            given to ``Dataset``.
        shard_size: Forwarded to ``featurizer.featurize`` (default 100).

    Returns:
        A ``(train_dataset, test_dataset)`` tuple, both already transformed.
    """
    combined_featurizers = compound_featurizers + complex_featurizers
    input_path = os.path.join(self.current_dir, input_file)

    data_featurizer = DataFeaturizer(
        tasks=tasks,
        smiles_field=self.smiles_field,
        protein_pdb_field=protein_pdb_field,
        ligand_pdb_field=ligand_pdb_field,
        compound_featurizers=compound_featurizers,
        complex_featurizers=complex_featurizers,
        user_specified_features=user_specified_features,
        verbose=True)

    # Featurize the input; the original comment states this yields a
    # FeaturizedSamples instance holding ML-ready arrays (not verifiable
    # from this block alone).
    featurized = data_featurizer.featurize(
        input_path, self.feature_dir, self.samples_dir,
        shard_size=shard_size)

    # Partition the featurized samples into train and test subsets.
    train_split, test_split = featurized.train_test_split(
        splittype, self.train_dir, self.test_dir)

    # Keyword arguments common to the train and test Dataset constructors.
    shared_kwargs = dict(
        featurizers=combined_featurizers,
        tasks=tasks,
        use_user_specified_features=user_specified_features is not None)

    train_dataset = Dataset(
        data_dir=self.train_dir, samples=train_split, **shared_kwargs)
    test_dataset = Dataset(
        data_dir=self.test_dir, samples=test_split, **shared_kwargs)

    # Apply the same transforms to train first, then test.
    for dataset in (train_dataset, test_dataset):
        dataset.transform(input_transforms, output_transforms)

    return train_dataset, test_dataset