def _to_singletask(dataset, task_dirs):
  """Transforms a multitask dataset to a collection of singletask datasets."""
  tasks = dataset.get_task_names()
  assert len(tasks) == len(task_dirs)
  log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
  task_metadata_rows = {task: [] for task in tasks}
  for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
    log("Processing shard %d" % shard_num, dataset.verbosity)
    basename = "dataset-%d" % shard_num
    for task_num, task in enumerate(tasks):
      log("\tTask %s" % task, dataset.verbosity)
      w_task = w[:, task_num]
      y_task = y[:, task_num]

      # Extract those datapoints which are present for this task
      X_nonzero = X[w_task != 0]
      num_datapoints = X_nonzero.shape[0]
      y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
      w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
      ids_nonzero = ids[w_task != 0]

      if X_nonzero.size > 0:
        task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))

  task_datasets = [
      DiskDataset(data_dir=task_dirs[task_num],
                  metadata_rows=task_metadata_rows[task],
                  verbosity=dataset.verbosity)
      for (task_num, task) in enumerate(tasks)]
  return task_datasets
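# A minimal usage sketch of _to_singletask (hedged: `dataset` stands for any
# featurized multitask DiskDataset; the temporary directories are
# hypothetical):
#
#   import tempfile
#   tasks = dataset.get_task_names()
#   task_dirs = [tempfile.mkdtemp() for _ in tasks]
#   singletask_datasets = _to_singletask(dataset, task_dirs)
#   # Each returned dataset keeps only the datapoints whose weight for its
#   # task is nonzero.
#   assert len(singletask_datasets) == len(tasks)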
def test_move_load(self):
  """Test that datasets can be moved and loaded."""
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

  shutil.move(data_dir, moved_data_dir)
  moved_dataset = DiskDataset(moved_data_dir, reload=True)
  X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X, moved_dataset.y,
                                          moved_dataset.w, moved_dataset.ids)

  np.testing.assert_allclose(X, X_moved)
  np.testing.assert_allclose(y, y_moved)
  np.testing.assert_allclose(w, w_moved)
  np.testing.assert_array_equal(ids, ids_moved)
def test_samples_move(self):
  """Test that featurized samples can be moved and reloaded."""
  verbosity = "high"
  data_dir = os.path.join(self.base_dir, "data")
  moved_data_dir = os.path.join(self.base_dir, "moved_data")
  dataset_file = os.path.join(self.current_dir, "example.csv")

  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  featurized_dataset = loader.featurize(dataset_file, data_dir)
  n_dataset = len(featurized_dataset)

  # Now perform move
  shutil.move(data_dir, moved_data_dir)

  moved_featurized_dataset = DiskDataset(data_dir=moved_data_dir, reload=True)
  assert len(moved_featurized_dataset) == n_dataset
def test_power_X_transformer(self):
  """Test Power transformer on Gaussian normal dataset."""
  gaussian_dataset = self.load_gaussian_cdf_data()
  powers = [1, 2, 0.5]
  power_transformer = PowerTransformer(transform_X=True, powers=powers)
  X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                  gaussian_dataset.w, gaussian_dataset.ids)
  power_transformer.transform(gaussian_dataset)
  gaussian_dataset = DiskDataset(data_dir=gaussian_dataset.data_dir,
                                 reload=True)
  X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                          gaussian_dataset.w, gaussian_dataset.ids)

  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check y is unchanged since this is an X transformer.
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since this is an X transformer.
  np.testing.assert_allclose(w, w_t)
  # Check X is now holding the proper values in each column.
  np.testing.assert_allclose(X, X_t[:, :2])
  np.testing.assert_allclose(np.power(X, 2), X_t[:, 2:4])
  np.testing.assert_allclose(np.power(X, 0.5), X_t[:, 4:])
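# The column layout asserted above generalizes: with n input features and k
# powers, the transformed X stacks one n-column block per power. A sketch of
# the expected output under that assumption (positive inputs avoid NaNs from
# the fractional power):
#
#   import numpy as np
#   X = np.random.rand(10, 2)
#   powers = [1, 2, 0.5]
#   X_expected = np.hstack([np.power(X, p) for p in powers])
#   # X_expected.shape == (10, 6): columns 0:2 hold X, 2:4 hold X**2,
#   # and 4:6 hold sqrt(X), matching the slices checked in the test.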
def test_cdf_X_transformer(self):
  """Test CDF transformer on Gaussian normal dataset."""
  # After transformation, each sorted feature column should be an even grid
  # on [0, 1].
  target = np.linspace(0., 1., 1001)
  target = np.vstack([target, target]).T
  gaussian_dataset = self.load_gaussian_cdf_data()
  bins = 1001
  cdf_transformer = CDFTransformer(transform_X=True, bins=bins)
  X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                  gaussian_dataset.w, gaussian_dataset.ids)
  cdf_transformer.transform(gaussian_dataset, bins=bins)
  gaussian_dataset = DiskDataset(data_dir=gaussian_dataset.data_dir,
                                 reload=True)
  X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                          gaussian_dataset.w, gaussian_dataset.ids)

  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check y is unchanged since this is an X transformer.
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since this is an X transformer.
  np.testing.assert_allclose(w, w_t)
  # Check X is now holding the proper values when sorted.
  sorted_X = np.sort(X_t, axis=0)
  np.testing.assert_allclose(sorted_X, target)
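# Why the final assertion holds: the CDF transform replaces each value with
# its position under the empirical CDF, so sorting each transformed column
# recovers an even grid on [0, 1]. A standalone sketch of that equivalence
# (hedged: this assumes a rank / (bins - 1) convention, with bins equal to
# the number of samples):
#
#   import numpy as np
#   x = np.random.randn(1001)
#   ranks = np.argsort(np.argsort(x))
#   x_cdf = ranks / (len(x) - 1.0)
#   np.testing.assert_allclose(np.sort(x_cdf), np.linspace(0., 1., 1001))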
def test_multiload(self):
  """Check that featurization can be re-used for multiple task selections.

  TODO(rbharath): This test seems silly after the recent round of
  refactoring. Can it be removed?
  """
  # Only for debug!
  np.random.seed(123)

  # Set some global variables up top
  reload = True
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(self.base_dir, "dataset")
  train_dir = os.path.join(self.base_dir, "train_dataset")
  valid_dir = os.path.join(self.base_dir, "valid_dataset")
  test_dir = os.path.join(self.base_dir, "test_dataset")
  model_dir = os.path.join(self.base_dir, "model")

  # Load dataset
  print("About to load dataset.")
  dataset_file = os.path.join(
      current_dir, "../../models/tests/multitask_example.csv")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize the multitask dataset
  print("About to featurize dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tasks = ["task%d" % i for i in range(17)]

  ####### Do featurization
  loader = DataLoader(tasks=all_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)

  # Grab the multitask view of the data.
  X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)

  ####### Do singletask load
  y_tasks, w_tasks = [], []
  for ind, task in enumerate(all_tasks):
    print("Processing task %s" % task)
    dataset = DiskDataset(data_dir, verbosity=verbosity, reload=reload)
    X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w,
                                        dataset.ids)
    y_tasks.append(y_task[:, ind])
    w_tasks.append(w_task[:, ind])

  ################## Do comparison
  for ind, task in enumerate(all_tasks):
    y_multi_task = y_multi[:, ind]
    w_multi_task = w_multi[:, ind]
    y_task = y_tasks[ind]
    w_task = w_tasks[ind]
    np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
    np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA dataset, apply balancing transformers, and write a simple
  train/valid split to disk. Returns the full (unsplit) dataset."""
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']
  loader = DataLoader(tasks=PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  print("About to perform train/valid/test split.")
  num_train = int(frac_train * len(dataset))
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
  print(PCBA_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, PCBA_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, PCBA_tasks)

  return PCBA_tasks, dataset, transformers
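# A minimal usage sketch (hedged: the base directory is hypothetical):
#
#   pcba_tasks, pcba_dataset, transformers = load_pcba("/tmp/pcba")
#   # The train/valid split is written to base_dir/train_dataset and
#   # base_dir/valid_dataset as a side effect; the full featurized dataset
#   # is what gets returned.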
def load_tox21(base_dir, reload=True, num_train=7200):
  """Load Tox21 dataset, apply balancing transformers, and return a simple
  train/valid split."""
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train")
  valid_dir = os.path.join(base_dir, "valid")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
    regen = True
  else:
    dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train], y[num_train:]
  w_train, w_valid = w[:num_train], w[num_train:]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, tox21_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, tox21_tasks)

  return tox21_tasks, (train_dataset, valid_dataset), transformers
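# A minimal usage sketch (hedged: the base directory is hypothetical):
#
#   tox21_tasks, (train, valid), transformers = load_tox21("/tmp/tox21")
#   # With the default num_train=7200, `train` holds the first 7200 rows
#   # and `valid` holds the remainder.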
def featurize(self, input_files, data_dir, shard_size=8192,
              num_shards_per_batch=24, worker_pool=None,
              logging=True, debug=False):
  """Featurize provided files and write to specified location."""
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  log("Loading raw samples now.", self.verbosity)
  log("shard_size: %d" % shard_size, self.verbosity)
  log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

  # Allow users to specify a single file for featurization
  if not isinstance(input_files, list):
    input_files = [input_files]

  if not os.path.exists(data_dir):
    os.makedirs(data_dir)

  # Construct partial function to write datasets.
  if not len(input_files):
    return None
  input_type = get_input_type(input_files[0])

  if logging:
    mp.log_to_stderr()
  if worker_pool is None:
    if logging:
      worker_pool = LoggingPool(processes=1)
    else:
      worker_pool = mp.Pool(processes=1)
  log("Spawning workers now.", self.verbosity)

  metadata_rows = []

  def wrap_with_shard_metadata(iterator):
    for item in iterator:
      yield ((self, shard_size, input_type, data_dir), item)

  data_iterator = wrap_with_shard_metadata(
      enumerate(load_data(input_files, shard_size, self.verbosity)))

  # Turns out python map is terrible and exhausts the generator as given.
  # Solution seems to be to manually pull out N elements from the iterator,
  # then to map on only those N elements. BLECH. Python should do a better
  # job here.
  num_batches = 0
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  while True:
    log("About to start processing next batch of shards", self.verbosity)
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    iterator = itertools.islice(data_iterator, num_shards_per_batch)
    if not debug:
      batch_metadata = worker_pool.map(featurize_map_function, iterator)
    else:
      batch_metadata = []
      for elt in iterator:
        batch_metadata.append(featurize_map_function(elt))
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
        self.verbosity)
    ############################################################## TIMING
    if batch_metadata:
      metadata_rows.extend(
          [elt for elt in batch_metadata if elt is not None])
      num_batches += 1
      log("Featurized %d datapoints\n"
          % (shard_size * num_shards_per_batch * num_batches),
          self.verbosity)
    else:
      break

  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
  # creating a Dataset. Is there a more elegant solution?
  dataset = DiskDataset(data_dir=data_dir,
                        metadata_rows=metadata_rows,
                        reload=True,
                        verbosity=self.verbosity)
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
      self.verbosity)
  ############################################################## TIMING
  return dataset
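# A minimal usage sketch (hedged: the file path, task name, and output
# directory are illustrative):
#
#   loader = DataLoader(tasks=["task0"], smiles_field="smiles",
#                       featurizer=CircularFingerprint(size=1024),
#                       verbosity="high")
#   dataset = loader.featurize("examples.csv", "/tmp/featurized",
#                              shard_size=4096)
#   # Pass debug=True to process shards serially in the calling process,
#   # which makes failures inside featurize_map_function easier to trace.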
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load sweet dataset, apply balancing transformers, and return a simple
  train/valid split."""
  verbosity = "high"
  regen = False

  # Create some directories for analysis. The base_dir holds the results
  # of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()
  loader = DataLoader(tasks=SWEET_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SWEET_tasks)

  return SWEET_tasks, (train_dataset, valid_dataset), transformers