def create_dataset(self) -> Dataset:
    """Build the HPPB dataset, downloading the raw CSV on a cold cache.

    Looks for ``hppb.csv`` under ``self.data_dir`` and fetches it from
    ``HPPB_URL`` when absent, then featurizes the "smile" column with
    ``self.featurizer`` and strips entries with missing values.

    Returns
    -------
    Dataset
        The featurized HPPB dataset with missing entries removed.
    """
    csv_path = os.path.join(self.data_dir, "hppb.csv")
    # Only hit the network when the raw file is not already cached locally.
    if not os.path.exists(csv_path):
        dc.utils.data_utils.download_url(url=HPPB_URL, dest_dir=self.data_dir)
    csv_loader = dc.data.CSVLoader(
        tasks=self.tasks, feature_field="smile", featurizer=self.featurizer)
    dataset = csv_loader.create_dataset(csv_path, shard_size=2000)
    # Drops rows containing missing values in place.
    remove_missing_entries(dataset)
    return dataset
def gen_kaggle(KAGGLE_tasks, train_dir, valid_dir, test_dir, data_dir,
               shard_size=2000):
    """Load KAGGLE datasets. Does not do train/test split

    Downloads the Merck KAGGLE train/valid/test CSVs into ``data_dir`` if
    absent, featurizes them with the Merck descriptors, removes rows with
    missing values, shuffles the training split, applies the training-set
    transformers to all three splits, and moves the resulting datasets to
    ``train_dir``/``valid_dir``/``test_dir``.

    Parameters
    ----------
    KAGGLE_tasks: list of str
        Task (column) names to load.
    train_dir, valid_dir, test_dir: str
        Destination directories for the three splits.
    data_dir: str
        Directory holding (or receiving) the raw ``.csv.gz`` files.
    shard_size: int, optional (default 2000)
        Number of rows per dataset shard.

    Returns
    -------
    tuple
        ``(train_dataset, valid_dataset, test_dataset)``.
    """
    # TIMING
    time1 = time.time()
    # TIMING

    # Set some global variables up top
    train_files = os.path.join(
        data_dir, "KAGGLE_training_disguised_combined_full.csv.gz")
    valid_files = os.path.join(
        data_dir, "KAGGLE_test1_disguised_combined_full.csv.gz")
    test_files = os.path.join(
        data_dir, "KAGGLE_test2_disguised_combined_full.csv.gz")

    # BUG FIX: the original gated all three downloads on the existence of the
    # training file alone, so a partially-populated data_dir (train present,
    # valid/test missing) silently skipped required downloads. Guard each
    # file independently.
    if not os.path.exists(train_files):
        deepchem.utils.data_utils.download_url(
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz",
            dest_dir=data_dir)
    if not os.path.exists(valid_files):
        deepchem.utils.data_utils.download_url(
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz",
            dest_dir=data_dir)
    if not os.path.exists(test_files):
        deepchem.utils.data_utils.download_url(
            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz",
            dest_dir=data_dir)

    # Featurize KAGGLE dataset
    logger.info("About to featurize KAGGLE dataset.")
    featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
    loader = deepchem.data.UserCSVLoader(
        tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)

    logger.info("Featurizing train datasets")
    train_dataset = loader.featurize(train_files, shard_size=shard_size)
    logger.info("Featurizing valid datasets")
    valid_dataset = loader.featurize(valid_files, shard_size=shard_size)
    logger.info("Featurizing test datasets")
    test_dataset = loader.featurize(test_files, shard_size=shard_size)

    logger.info("Remove missing entries from datasets.")
    remove_missing_entries(train_dataset)
    remove_missing_entries(valid_dataset)
    remove_missing_entries(test_dataset)

    # Only the training split is shuffled; valid/test order does not affect
    # fitting and a fixed order keeps evaluation reproducible.
    logger.info("Shuffling order of train dataset.")
    train_dataset.sparse_shuffle()

    # Transformers are fit on the training split, then applied to all three.
    logger.info("Transforming datasets with transformers.")
    transformers = get_transformers(train_dataset)
    for transformer in transformers:
        logger.info("Performing transformations with %s" %
                    transformer.__class__.__name__)
        logger.info("Transforming datasets")
        train_dataset = transformer.transform(train_dataset)
        valid_dataset = transformer.transform(valid_dataset)
        test_dataset = transformer.transform(test_dataset)

    logger.info("Moving directories")
    train_dataset.move(train_dir)
    valid_dataset.move(valid_dir)
    test_dataset.move(test_dir)

    # TIMING
    time2 = time.time()
    logger.info("TIMING: KAGGLE fitting took %0.3f s" % (time2 - time1))
    # TIMING

    return train_dataset, valid_dataset, test_dataset
def gen_kinase(KINASE_tasks, train_dir, valid_dir, test_dir, data_dir,
               shard_size=2000):
    """Load the KINASE datasets. Does not do train/test split.

    Downloads the KINASE train/valid/test CSVs into ``data_dir`` if absent,
    featurizes them with the Merck descriptors, removes rows with missing
    values, shuffles the training split, applies the training-set
    transformers to all three splits, and moves the resulting datasets to
    ``train_dir``/``valid_dir``/``test_dir``.

    Parameters
    ----------
    KINASE_tasks: list of str
        Task (column) names to load.
    train_dir, valid_dir, test_dir: str
        Destination directories for the three splits.
    data_dir: str
        Directory holding (or receiving) the raw CSV files.
    shard_size: int, optional (default 2000)
        Number of rows per dataset shard.

    Returns
    -------
    tuple
        ``(train_dataset, valid_dataset, test_dataset)``.
    """
    time1 = time.time()

    train_files = os.path.join(data_dir, TRAIN_FILENAME)
    valid_files = os.path.join(data_dir, VALID_FILENAME)
    test_files = os.path.join(data_dir, TEST_FILENAME)

    # BUG FIX: the original gated all three downloads on the existence of the
    # training file alone, so a partially-populated data_dir (train present,
    # valid/test missing) silently skipped required downloads. Each file is
    # now checked and fetched independently.
    if not os.path.exists(train_files):
        logger.info("Downloading training file...")
        deepchem.utils.data_utils.download_url(url=TRAIN_URL, dest_dir=data_dir)
        logger.info("Training file download complete.")
    if not os.path.exists(valid_files):
        logger.info("Downloading validation file...")
        deepchem.utils.data_utils.download_url(url=VALID_URL, dest_dir=data_dir)
        logger.info("Validation file download complete.")
    if not os.path.exists(test_files):
        logger.info("Downloading test file...")
        deepchem.utils.data_utils.download_url(url=TEST_URL, dest_dir=data_dir)
        logger.info("Test file download complete")

    # Featurize the KINASE dataset
    logger.info("About to featurize KINASE dataset.")
    featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
    loader = deepchem.data.UserCSVLoader(
        tasks=KINASE_tasks, id_field="Molecule", featurizer=featurizer)

    logger.info("Featurizing train datasets...")
    train_dataset = loader.featurize(
        input_files=train_files, shard_size=shard_size)
    logger.info("Featurizing validation datasets...")
    valid_dataset = loader.featurize(
        input_files=valid_files, shard_size=shard_size)
    logger.info("Featurizing test datasets....")
    test_dataset = loader.featurize(input_files=test_files,
                                    shard_size=shard_size)

    logger.info("Remove missing entries from dataset")
    remove_missing_entries(train_dataset)
    remove_missing_entries(valid_dataset)
    remove_missing_entries(test_dataset)

    # Shuffle the training data only; valid/test order does not affect
    # fitting and a fixed order keeps evaluation reproducible.
    logger.info("Shuffling the training dataset")
    train_dataset.sparse_shuffle()

    # Transformers are fit on the training split, then applied to all three.
    # (Typo fix: was "Transformating".)
    logger.info("Transforming datasets with transformers")
    transformers = get_transformers(train_dataset)
    for transformer in transformers:
        logger.info("Performing transformations with {}".format(
            transformer.__class__.__name__))
        logger.info("Transforming the training dataset...")
        train_dataset = transformer.transform(train_dataset)
        logger.info("Transforming the validation dataset...")
        valid_dataset = transformer.transform(valid_dataset)
        logger.info("Transforming the test dataset...")
        test_dataset = transformer.transform(test_dataset)
    logger.info("Transformations complete.")

    logger.info("Moving datasets to corresponding directories")
    train_dataset.move(train_dir)
    logger.info("Train dataset moved.")
    valid_dataset.move(valid_dir)
    logger.info("Validation dataset moved.")
    test_dataset.move(test_dir)
    logger.info("Test dataset moved.")

    time2 = time.time()
    # TIMING
    logger.info("TIMING: KINASE fitting took %0.3f s" % (time2 - time1))

    return train_dataset, valid_dataset, test_dataset