def test_move_load(self):
  """Test that datasets can be moved and loaded."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X, y, w, ids = dataset.to_numpy()

  # Move the featurized data on disk, then reload it from its new location.
  shutil.move(data_dir, moved_data_dir)
  moved_dataset = Dataset(moved_data_dir, reload=True)
  X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

  np.testing.assert_allclose(X, X_moved)
  np.testing.assert_allclose(y, y_moved)
  np.testing.assert_allclose(w, w_moved)
  np.testing.assert_array_equal(ids, ids_moved)
def test_samples_move(self):
  """Test that featurized samples can be moved and reloaded."""
  verbosity = "high"
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(self.current_dir, "example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  featurized_dataset = loader.featurize(dataset_file, data_dir)
  n_dataset = len(featurized_dataset)

  # Now perform move
  shutil.move(data_dir, moved_data_dir)
  moved_featurized_dataset = Dataset(data_dir=moved_data_dir, reload=True)
  assert len(moved_featurized_dataset) == n_dataset
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split."""
  # Set some parameters up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directory in which to store the featurized dataset.
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/muv.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859',
                          'MUV-644', 'MUV-548', 'MUV-852', 'MUV-600',
                          'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
                          'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466',
                          'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_MUV_tasks, dataset, transformers
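# A minimal usage sketch for load_muv (not part of the original module; the
# temporary directory stands in for a real analysis directory). It exercises
# the (tasks, dataset, transformers) return convention shared by the sibling
# loader functions below.
if __name__ == "__main__":
  import tempfile
  muv_tasks, muv_dataset, muv_transformers = load_muv(tempfile.mkdtemp())
  # Featurized fingerprints, labels, weights, and compound ids as arrays.
  X, y, w, ids = muv_dataset.to_numpy()
  print("Featurized %d MUV compounds across %d tasks"
        % (len(muv_dataset), len(muv_tasks)))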
def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split."""
  # Set some parameters up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directory in which to store the featurized dataset.
  data_dir = os.path.join(base_dir, "dataset")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase',
                     'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE',
                     'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
    regen = True
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)

  # Initialize transformers. Only transform when the data was regenerated,
  # matching the other dataset loaders.
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_tox21_tasks, dataset, transformers
import os

import pandas as pd
from flask import Flask

from deepchem.datasets import Dataset
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier

from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.embed import components
from bokeh.palettes import Plasma256

app = Flask(__name__)
# Mirror the palette so colors ramp up and back down.
Plasma256.extend(Plasma256[::-1])

cur_dir = os.path.dirname(__file__)
data_dir = os.path.join(cur_dir, 'data')
base_dir = '/data/ballen/ML/kinaseDeepLearningAllKinase_081516'
test_dir = os.path.join(base_dir, 'test_dataset_random')

kinase_tasks = Dataset(test_dir, reload=True).get_task_names()
kinase_task_types = {task: 'classification' for task in kinase_tasks}

params_dict = {
    "activation": "relu",
    "momentum": .9,
    "batch_size": 128,
    "init": "glorot_uniform",
    "data_shape": (1024,),
    "learning_rate": 1e-3,
    "decay": 1e-6,
    "nb_hidden": (2000, 500),
    "nb_epoch": 100,
    "nesterov": False,
    "dropouts": (.5, .5),
    "nb_layers": 2,
    "batchnorm": False,
}
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split."""
  # Set some parameters up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directory in which to store the featurized dataset.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030', 'PCBA-1379', 'PCBA-1452', 'PCBA-1454', 'PCBA-1457',
      'PCBA-1458', 'PCBA-1460', 'PCBA-1461', 'PCBA-1468', 'PCBA-1469',
      'PCBA-1471', 'PCBA-1479', 'PCBA-1631', 'PCBA-1634', 'PCBA-1688',
      'PCBA-1721', 'PCBA-2100', 'PCBA-2101', 'PCBA-2147', 'PCBA-2242',
      'PCBA-2326', 'PCBA-2451', 'PCBA-2517', 'PCBA-2528', 'PCBA-2546',
      'PCBA-2549', 'PCBA-2551', 'PCBA-2662', 'PCBA-2675', 'PCBA-2676',
      'PCBA-411', 'PCBA-463254', 'PCBA-485281', 'PCBA-485290', 'PCBA-485294',
      'PCBA-485297', 'PCBA-485313', 'PCBA-485314', 'PCBA-485341', 'PCBA-485349',
      'PCBA-485353', 'PCBA-485360', 'PCBA-485364', 'PCBA-485367', 'PCBA-492947',
      'PCBA-493208', 'PCBA-504327', 'PCBA-504332', 'PCBA-504333', 'PCBA-504339',
      'PCBA-504444', 'PCBA-504466', 'PCBA-504467', 'PCBA-504706', 'PCBA-504842',
      'PCBA-504845', 'PCBA-504847', 'PCBA-504891', 'PCBA-540276', 'PCBA-540317',
      'PCBA-588342', 'PCBA-588453', 'PCBA-588456', 'PCBA-588579', 'PCBA-588590',
      'PCBA-588591', 'PCBA-588795', 'PCBA-588855', 'PCBA-602179', 'PCBA-602233',
      'PCBA-602310', 'PCBA-602313', 'PCBA-602332', 'PCBA-624170', 'PCBA-624171',
      'PCBA-624173', 'PCBA-624202', 'PCBA-624246', 'PCBA-624287', 'PCBA-624288',
      'PCBA-624291', 'PCBA-624296', 'PCBA-624297', 'PCBA-624417', 'PCBA-651635',
      'PCBA-651644', 'PCBA-651768', 'PCBA-651965', 'PCBA-652025', 'PCBA-652104',
      'PCBA-652105', 'PCBA-652106', 'PCBA-686970', 'PCBA-686978', 'PCBA-686979',
      'PCBA-720504', 'PCBA-720532', 'PCBA-720542', 'PCBA-720551', 'PCBA-720553',
      'PCBA-720579', 'PCBA-720580', 'PCBA-720707', 'PCBA-720708', 'PCBA-720709',
      'PCBA-720711', 'PCBA-743255', 'PCBA-743266', 'PCBA-875', 'PCBA-881',
      'PCBA-883', 'PCBA-884', 'PCBA-885', 'PCBA-887', 'PCBA-891', 'PCBA-899',
      'PCBA-902', 'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914', 'PCBA-915',
      'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927', 'PCBA-938', 'PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_PCBA_tasks, dataset, transformers
def featurize(self, input_files, data_dir, shard_size=8192,
              num_shards_per_batch=24, worker_pool=None,
              logging=True, debug=False):
  """Featurize provided files and write to specified location."""
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  log("Loading raw samples now.", self.verbosity)
  log("shard_size: %d" % shard_size, self.verbosity)
  log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

  # Allow users to specify a single file for featurization
  if not isinstance(input_files, list):
    input_files = [input_files]
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  if not input_files:
    return None
  input_type = get_input_type(input_files[0])

  if logging:
    mp.log_to_stderr()
  if worker_pool is None:
    if logging:
      worker_pool = LoggingPool(processes=1)
    else:
      worker_pool = mp.Pool(processes=1)
  log("Spawning workers now.", self.verbosity)
  metadata_rows = []
  data_iterator = it.izip(
      it.repeat((self, shard_size, input_type, data_dir)),
      enumerate(load_data(input_files, shard_size, self.verbosity)))
  # Turns out Python's map is terrible and exhausts the generator as given.
  # The solution seems to be to manually pull out N elements from the
  # iterator, then map over only those N elements. BLECH. Python should do
  # a better job here.
  num_batches = 0
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  while True:
    log("About to start processing next batch of shards", self.verbosity)
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    iterator = it.islice(data_iterator, num_shards_per_batch)
    if not debug:
      batch_metadata = worker_pool.map(featurize_map_function, iterator)
    else:
      batch_metadata = []
      for elt in iterator:
        batch_metadata.append(featurize_map_function(elt))
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
        self.verbosity)
    ############################################################## TIMING
    if batch_metadata:
      metadata_rows.extend(
          [elt for elt in batch_metadata if elt is not None])
      num_batches += 1
      log("Featurized %d datapoints\n"
          % (shard_size * num_shards_per_batch * num_batches),
          self.verbosity)
    else:
      break

  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
  # creating a Dataset. Is there a more elegant solution?
  dataset = Dataset(data_dir=data_dir,
                    metadata_rows=metadata_rows,
                    reload=True,
                    verbosity=self.verbosity)
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  return dataset
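  # A hedged usage sketch of featurize (illustrative names and paths, not
  # from the original module). Given a DataLoader constructed as in the
  # dataset scripts above, a call might look like:
  #
  #   loader = DataLoader(tasks=["task0"], smiles_field="smiles",
  #                       featurizer=CircularFingerprint(size=1024),
  #                       verbosity="high")
  #   dataset = loader.featurize(["molecules.csv"], "/tmp/featurized",
  #                              shard_size=4096, num_shards_per_batch=8)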
def load_nci(base_dir, reload=True, force_transform=False,
             shard_size=1000, num_shards_per_batch=4):
  """Load NCI datasets. Does not do train/test split."""
  # Set some parameters up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directory in which to store the featurized dataset.
  data_dir = os.path.join(base_dir, "dataset")

  # Load NCI dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(current_dir, "../../datasets/nci_2.csv.gz")
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize NCI dataset
  print("About to featurize NCI dataset.")
  featurizer = CircularFingerprint(size=1024)
  # This list was sorted in muv_datasets.py, but the NCI CSV is already
  # ordered, so the sort is omitted here.
  all_nci_tasks = [
      'CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226', 'SR',
      'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226', 'NCI-H23',
      'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205', 'HCC-2998',
      'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620', 'SF-268', 'SF-295',
      'SF-539', 'SNB-19', 'SNB-75', 'U251', 'LOX IMVI', 'MALME-3M', 'M14',
      'MDA-MB-435', 'SK-MEL-2', 'SK-MEL-28', 'SK-MEL-5', 'UACC-257',
      'UACC-62', 'IGR-OV1', 'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8',
      'NCI/ADR-RES', 'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1',
      'RXF 393', 'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
      'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D']

  loader = DataLoader(tasks=all_nci_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir,
                               shard_size=shard_size,
                               num_shards_per_batch=num_shards_per_batch)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
    print("About to transform data")
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=dataset)
    ]
    for transformer in transformers:
      transformer.transform(dataset)

  return all_nci_tasks, dataset, transformers
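# A hedged sketch of the force_transform flag (the path is illustrative, not
# from the original module): when the featurized dataset already exists on
# disk, force_transform=True re-applies the NormalizationTransformer without
# re-featurizing.
if __name__ == "__main__":
  nci_tasks, nci_dataset, nci_transformers = load_nci(
      "/tmp/nci_analysis", reload=True, force_transform=True)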
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = (
      "[" + ",".join(["'%s'" % column
                      for column in dataset.columns.values[:num_display]])
      + ",...]")

  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)

  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s"
        % str(crystal_dataset.shape[0]))

  # Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")

  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)

  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(tasks=bace_tasks,
                      smiles_field="mol",
                      id_field="CID",
                      featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)

  if (not reload or not os.path.exists(train_dir)
      or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  # NOTE THE RENAMING: for the 20-80 split, the validation and test sets
  # are swapped.
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset

  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)
    ]
    output_transformers = []
    if mode == "regression":
      output_transformers = [
          NormalizationTransformer(transform_y=True, dataset=train_dataset)
      ]
  else:
    input_transformers, output_transformers = [], []
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset, crystal_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
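# A hedged usage sketch (not part of the original module): load the 20-80
# split as a regression problem and unpack all six return values, keeping
# the output transformers so the y-normalization can be undone on
# predictions later.
if __name__ == "__main__":
  (bace_tasks, train_dataset, valid_dataset, test_dataset,
   crystal_dataset, output_transformers) = load_bace(
       mode="regression", transform=True, split="20-80")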
def test_multiload(self):
  """Check can re-use featurization for multiple task selections.

  TODO(rbharath): This test seems silly after the recent round of
  refactoring. Can it be removed?
  """
  # Only for debug!
  np.random.seed(123)

  # Set some parameters up top
  reload = True
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(self.base_dir, "dataset")
  train_dir = os.path.join(self.base_dir, "train_dataset")
  valid_dir = os.path.join(self.base_dir, "valid_dataset")
  test_dir = os.path.join(self.base_dir, "test_dataset")
  model_dir = os.path.join(self.base_dir, "model")

  # Load dataset
  print("About to load dataset.")
  dataset_file = os.path.join(
      current_dir, "../../models/tests/multitask_example.csv")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize multitask dataset
  print("About to featurize dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tasks = ["task%d" % i for i in range(17)]

  ####### Do featurization
  loader = DataLoader(tasks=all_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()

  ####### Do singletask load
  y_tasks, w_tasks = [], []
  for ind, task in enumerate(all_tasks):
    print("Processing task %s" % task)
    dataset = Dataset(data_dir, verbosity=verbosity, reload=reload)
    X_task, y_task, w_task, ids_task = dataset.to_numpy()
    y_tasks.append(y_task[:, ind])
    w_tasks.append(w_task[:, ind])

  ################## Do comparison
  for ind, task in enumerate(all_tasks):
    y_multi_task = y_multi[:, ind]
    w_multi_task = w_multi[:, ind]
    y_task = y_tasks[ind]
    w_task = w_tasks[ind]
    np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
    np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())