Esempio n. 1
0
def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None):
  """Produce transformed train and test datasets from an input file.

  Steps, in order:
    1. Obtain samples: when ``feature_files`` is None, ``input_file`` is
       featurized through a DataLoader (using ``feature_dir``,
       ``samples_dir`` and a shard size of 8); otherwise a FeaturizedSamples
       object is built from ``samples_dir`` and ``feature_files`` with
       ``reload_data=False``.
    2. Split the samples with ``train_test_split(splittype, train_dir,
       test_dir)``.
    3. Wrap each half in a Dataset together with ``feature_types``.
    4. Apply ``input_transforms`` / ``output_transforms`` to both datasets.

  Progress messages are printed to stdout along the way.

  Args:
    input_file: Passed to the loader's ``featurize`` call.
    feature_dir: Passed to the loader's ``featurize`` call.
    samples_dir: Passed to ``featurize`` or to FeaturizedSamples.
    train_dir: Passed to the split and to the train Dataset.
    test_dir: Passed to the split and to the test Dataset.
    splittype: Forwarded unchanged to ``train_test_split``.
    feature_types: Forwarded unchanged to both Dataset constructors.
    input_transforms: First argument of each ``Dataset.transform`` call.
    output_transforms: Second argument of each ``Dataset.transform`` call.
    tasks: Forwarded to the DataLoader constructor.
    feature_files: Optional; when given, featurization is skipped and these
      files are handed to FeaturizedSamples instead.

  Returns:
    A ``(train_dataset, test_dataset)`` tuple.
  """
  # The loader is constructed unconditionally, even when feature_files is
  # supplied and the loader ends up unused.
  loader = DataLoader(
      tasks=tasks,
      smiles_field="smiles",
      protein_pdb_field="protein_pdb",
      ligand_pdb_field="ligand_pdb",
      verbose=True,
  )

  if feature_files is not None:
    # Features were supplied by the caller: wrap them without featurizing.
    samples = FeaturizedSamples(
        samples_dir, feature_files, reload_data=False)
  else:
    print("About to featurize.")
    samples = loader.featurize(
        input_file, feature_dir, samples_dir, shard_size=8)
    print("Completed Featurization")

  # Split into train/test.
  train_part, test_part = samples.train_test_split(
      splittype, train_dir, test_dir)
  print("Finished train test split.")

  # Train dataset is created first, then test.
  train_dataset, test_dataset = [
      Dataset(data_dir, part, feature_types)
      for data_dir, part in ((train_dir, train_part), (test_dir, test_part))
  ]
  print("Finished creating train test datasets")

  # Same transforms for both halves, train first.
  for dataset in (train_dataset, test_dataset):
    dataset.transform(input_transforms, output_transforms)
  print("Finished Transforming train test data.")

  return train_dataset, test_dataset
Esempio n. 2
0
    def _featurize_train_test_split(self,
                                    splittype,
                                    compound_featurizers,
                                    complex_featurizers,
                                    input_transforms,
                                    output_transforms,
                                    input_file,
                                    tasks,
                                    protein_pdb_field=None,
                                    ligand_pdb_field=None,
                                    user_specified_features=None,
                                    shard_size=100):
        """Featurize an input file, split it, and return transformed datasets.

        ``input_file`` is resolved relative to ``self.current_dir`` and
        featurized with a DataFeaturizer into ``self.feature_dir`` /
        ``self.samples_dir``. The result is split via ``train_test_split``
        into ``self.train_dir`` / ``self.test_dir``, each half is wrapped in
        a Dataset, and both datasets are transformed.

        Args:
            splittype: Forwarded unchanged to ``train_test_split``.
            compound_featurizers: Passed to DataFeaturizer; also concatenated
                with ``complex_featurizers`` for the Dataset constructors.
            complex_featurizers: See ``compound_featurizers``.
            input_transforms: First argument of each ``Dataset.transform``.
            output_transforms: Second argument of each ``Dataset.transform``.
            input_file: Path joined onto ``self.current_dir``.
            tasks: Forwarded to DataFeaturizer and to both Datasets.
            protein_pdb_field: Forwarded to DataFeaturizer.
            ligand_pdb_field: Forwarded to DataFeaturizer.
            user_specified_features: Forwarded to DataFeaturizer; whether it
                is None also decides the ``use_user_specified_features``
                flag given to both Datasets.
            shard_size: Forwarded to ``featurize``.

        Returns:
            A ``(train_dataset, test_dataset)`` tuple.
        """
        # Concatenate first so a bad featurizer argument fails before any
        # featurization work is started.
        featurizers = compound_featurizers + complex_featurizers
        has_user_features = user_specified_features is not None

        source_path = os.path.join(self.current_dir, input_file)
        data_featurizer = DataFeaturizer(
            tasks=tasks,
            smiles_field=self.smiles_field,
            protein_pdb_field=protein_pdb_field,
            ligand_pdb_field=ligand_pdb_field,
            compound_featurizers=compound_featurizers,
            complex_featurizers=complex_featurizers,
            user_specified_features=user_specified_features,
            verbose=True)

        # Per the original author's note, featurize() yields a
        # FeaturizedSamples instance holding arrays suitable for ML.
        samples = data_featurizer.featurize(
            source_path,
            self.feature_dir,
            self.samples_dir,
            shard_size=shard_size)

        # Split the featurized samples into train/test.
        train_part, test_part = samples.train_test_split(
            splittype, self.train_dir, self.test_dir)

        # Keyword arguments common to both Dataset constructions.
        shared_kwargs = dict(
            featurizers=featurizers,
            tasks=tasks,
            use_user_specified_features=has_user_features)
        train_dataset = Dataset(
            data_dir=self.train_dir, samples=train_part, **shared_kwargs)
        test_dataset = Dataset(
            data_dir=self.test_dir, samples=test_part, **shared_kwargs)

        # Same transforms for both halves, train first.
        for dataset in (train_dataset, test_dataset):
            dataset.transform(input_transforms, output_transforms)

        return train_dataset, test_dataset