Example #1
def create_dataset(self) -> Dataset:
  # Download the raw HPPB CSV once, then featurize the "smile" column into a Dataset.
  dataset_file = os.path.join(self.data_dir, "hppb.csv")
  if not os.path.exists(dataset_file):
    dc.utils.data_utils.download_url(url=HPPB_URL, dest_dir=self.data_dir)
  loader = dc.data.CSVLoader(
      tasks=self.tasks, feature_field="smile", featurizer=self.featurizer)
  dataset = loader.create_dataset(dataset_file, shard_size=2000)
  remove_missing_entries(dataset)
  return dataset
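In DeepChem, a `create_dataset` method like this lives on a MoleculeNet loader class and is normally reached through the `dc.molnet.load_hppb` entry point rather than called directly. A minimal usage sketch, assuming a recent DeepChem release; the exact keyword arguments are assumptions, not a fixed API:

import deepchem as dc

# Hedged sketch: load_hppb downloads hppb.csv if needed, featurizes the "smile"
# column with a CSVLoader (as in the method above), and returns the task names,
# the dataset splits, and the fitted transformers.
tasks, (train, valid, test), transformers = dc.molnet.load_hppb(
    featurizer=dc.feat.CircularFingerprint(size=1024))
print(tasks, train.X.shape)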
Example #2
def gen_kaggle(KAGGLE_tasks,
               train_dir,
               valid_dir,
               test_dir,
               data_dir,
               shard_size=2000):
  """Load KAGGLE datasets. Does not do train/test split"""
  # TIMING
  time1 = time.time()
  # TIMING
  # Set some global variables up top
  train_files = os.path.join(data_dir,
                             "KAGGLE_training_disguised_combined_full.csv.gz")
  valid_files = os.path.join(data_dir,
                             "KAGGLE_test1_disguised_combined_full.csv.gz")
  test_files = os.path.join(data_dir,
                            "KAGGLE_test2_disguised_combined_full.csv.gz")
  if not os.path.exists(train_files):
    deepchem.utils.data_utils.download_url(
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz",
        dest_dir=data_dir)
    deepchem.utils.data_utils.download_url(
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz",
        dest_dir=data_dir)
    deepchem.utils.data_utils.download_url(
        "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz",
        dest_dir=data_dir)

  # Featurize KAGGLE dataset
  logger.info("About to featurize KAGGLE dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)

  loader = deepchem.data.UserCSVLoader(
      tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing train datasets")
  train_dataset = loader.featurize(train_files, shard_size=shard_size)

  logger.info("Featurizing valid datasets")
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

  logger.info("Featurizing test datasets")
  test_dataset = loader.featurize(test_files, shard_size=shard_size)

  logger.info("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  logger.info("Shuffling order of train dataset.")
  train_dataset.sparse_shuffle()

  logger.info("Transforming datasets with transformers.")
  transformers = get_transformers(train_dataset)

  for transformer in transformers:
    logger.info(
        "Performing transformations with %s" % transformer.__class__.__name__)
    logger.info("Transforming datasets")
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  logger.info("Moving directories")
  train_dataset.move(train_dir)
  valid_dataset.move(valid_dir)
  test_dataset.move(test_dir)

  # TIMING
  time2 = time.time()
  logger.info("TIMING: KAGGLE fitting took %0.3f s" % (time2 - time1))
  # TIMING

  return train_dataset, valid_dataset, test_dataset
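A hedged driver for `gen_kaggle`, showing one way the directory arguments could be wired up. The temporary-directory layout and the task-name subset below are illustrative placeholders, not values taken from the function itself:

import os
import tempfile

base_dir = tempfile.mkdtemp()
data_dir = os.path.join(base_dir, "raw")
train_dir = os.path.join(base_dir, "train")
valid_dir = os.path.join(base_dir, "valid")
test_dir = os.path.join(base_dir, "test")
for d in (data_dir, train_dir, valid_dir, test_dir):
  os.makedirs(d, exist_ok=True)

# Placeholder task names -- the real KAGGLE_tasks list is defined by the calling module.
KAGGLE_tasks = ["3A4", "CB1", "DPP4"]
train, valid, test = gen_kaggle(
    KAGGLE_tasks, train_dir, valid_dir, test_dir, data_dir, shard_size=2000)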
Example #3
def gen_kinase(KINASE_tasks,
               train_dir,
               valid_dir,
               test_dir,
               data_dir,
               shard_size=2000):
  """Load KINASE train/valid/test sets, featurize, transform, and move them to the given directories."""
  time1 = time.time()

  train_files = os.path.join(data_dir, TRAIN_FILENAME)
  valid_files = os.path.join(data_dir, VALID_FILENAME)
  test_files = os.path.join(data_dir, TEST_FILENAME)

  # Download the raw files if they don't already exist
  if not os.path.exists(train_files):
    logger.info("Downloading training file...")
    deepchem.utils.data_utils.download_url(url=TRAIN_URL, dest_dir=data_dir)
    logger.info("Training file download complete.")

    logger.info("Downloading validation file...")
    deepchem.utils.data_utils.download_url(url=VALID_URL, dest_dir=data_dir)
    logger.info("Validation file download complete.")

    logger.info("Downloading test file...")
    deepchem.utils.data_utils.download_url(url=TEST_URL, dest_dir=data_dir)
    logger.info("Test file download complete")

  # Featurize the KINASE dataset
  logger.info("About to featurize KINASE dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)

  loader = deepchem.data.UserCSVLoader(
      tasks=KINASE_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing train datasets...")
  train_dataset = loader.featurize(
      input_files=train_files, shard_size=shard_size)

  logger.info("Featurizing validation datasets...")
  valid_dataset = loader.featurize(
      input_files=valid_files, shard_size=shard_size)

  logger.info("Featurizing test datasets....")
  test_dataset = loader.featurize(input_files=test_files, shard_size=shard_size)

  logger.info("Remove missing entries from dataset")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  # Shuffle the training data
  logger.info("Shuffling the training dataset")
  train_dataset.sparse_shuffle()

  # Apply transformations
  logger.info("Transformating datasets with transformers")
  transformers = get_transformers(train_dataset)

  for transformer in transformers:
    logger.info("Performing transformations with {}".format(
        transformer.__class__.__name__))

    logger.info("Transforming the training dataset...")
    train_dataset = transformer.transform(train_dataset)

    logger.info("Transforming the validation dataset...")
    valid_dataset = transformer.transform(valid_dataset)

    logger.info("Transforming the test dataset...")
    test_dataset = transformer.transform(test_dataset)

  logger.info("Transformations complete.")
  logger.info("Moving datasets to corresponding directories")

  train_dataset.move(train_dir)
  logger.info("Train dataset moved.")

  valid_dataset.move(valid_dir)
  logger.info("Validation dataset moved.")

  test_dataset.move(test_dir)
  logger.info("Test dataset moved.")

  time2 = time.time()

  # TIMING

  logger.info("TIMING: KINASE fitting took %0.3f s" % (time2 - time1))

  return train_dataset, valid_dataset, test_dataset
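Both `gen_kaggle` and `gen_kinase` depend on helpers defined elsewhere, notably `get_transformers`. A minimal sketch of what it could look like, assuming its only job is to normalize the regression labels with statistics fit on the training split; the actual helper may configure more:

import deepchem

def get_transformers(train_dataset):
  # Assumption: the only transformation applied in the loops above is label
  # normalization fit on the training split.
  return [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]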