import itertools
import logging
import multiprocessing as mp
import os
import tempfile
import time
from typing import List, Tuple

import numpy as np
import pandas as pd

import deepchem
import deepchem as dc
from deepchem.data import Dataset, DiskDataset

logger = logging.getLogger(__name__)

# Helpers referenced below (log, load_data, get_input_type, LoggingPool,
# featurize_map_function, tf_descriptors, feature_field, Harmonious_positive,
# OnePositiveSplit) are defined in the modules these functions come from and
# are assumed to be in scope.


def load_tox21(featurizer='ECFP', split='index'):
    """Load the Tox21 dataset, featurize it, and split it into train/valid/test."""
    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    current_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    data_dir = deepchem.utils.get_data_dir()
    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]
    dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
    train = os.path.join(dataset_dir, 'train')
    valid = os.path.join(dataset_dir, 'valid')
    test = os.path.join(dataset_dir, 'test')

    # Reload previously featurized and split datasets if they are on disk.
    if os.path.isdir(dataset_dir):
        train, valid, test = (DiskDataset(data_dir=train),
                              DiskDataset(data_dir=valid),
                              DiskDataset(data_dir=test))
        transformers = [
            dc.trans.BalancingTransformer(transform_w=True, dataset=train)
        ]
        return tox21_tasks, (train, valid, test), transformers

    if featurizer == 'ECFP':
        featurizer_func = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer_func = dc.feat.ConvMolFeaturizer()
    elif featurizer == 'AdjMatrix':
        featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
    else:
        raise ValueError("Unsupported featurizer: %s" % featurizer)
    loader = dc.data.CSVLoader(tasks=tox21_tasks,
                               smiles_field="smiles",
                               featurizer=featurizer_func)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    # Initialize transformers
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    print("About to transform data")
    for transformer in transformers:
        dataset = transformer.transform(dataset)

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'scaffold': dc.splits.ScaffoldSplitter(),
        'butina': dc.splits.ButinaSplitter()
    }
    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         train_dir=train,
                                                         valid_dir=valid,
                                                         test_dir=test)
    return tox21_tasks, (train, valid, test), transformers
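# A minimal usage sketch (not from the original source) showing how
# load_tox21 is typically called; it assumes DeepChem and the bundled
# tox21.csv.gz are available.
def _example_load_tox21():
    tasks, (train, valid, test), transformers = load_tox21(
        featurizer='GraphConv', split='scaffold')
    print("Tasks:", tasks)
    print("Train/valid/test sizes:", len(train), len(valid), len(test))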
def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets",
        dataset.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
        log("Processing shard %d" % shard_num, dataset.verbosity)
        basename = "dataset-%d" % shard_num
        for task_num, task in enumerate(tasks):
            log("\tTask %s" % task, dataset.verbosity)
            w_task = w[:, task_num]
            y_task = y[:, task_num]
            # Extract those datapoints which are present for this task
            X_nonzero = X[w_task != 0]
            num_datapoints = X_nonzero.shape[0]
            y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
            w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
            ids_nonzero = ids[w_task != 0]
            if X_nonzero.size > 0:
                task_metadata_rows[task].append(
                    DiskDataset.write_data_to_disk(task_dirs[task_num],
                                                   basename, [task],
                                                   X_nonzero, y_nonzero,
                                                   w_nonzero, ids_nonzero))
    task_datasets = [
        DiskDataset(data_dir=task_dirs[task_num],
                    metadata_rows=task_metadata_rows[task],
                    verbosity=dataset.verbosity)
        for (task_num, task) in enumerate(tasks)
    ]
    return task_datasets
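# A hypothetical usage sketch for _to_singletask (not part of the original
# module): given an already-featurized multitask DiskDataset from the older
# DeepChem API this helper targets (it reads dataset.verbosity), it creates
# one output directory per task and collects the resulting singletask
# datasets.
def _example_to_singletask(multitask_dataset):
    task_dirs = [
        tempfile.mkdtemp() for _ in multitask_dataset.get_task_names()
    ]
    return _to_singletask(multitask_dataset, task_dirs)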
def load_dataset(
    self, name: str, reload: bool
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load the dataset.

    Parameters
    ----------
    name: str
        the name of the dataset, used to identify the directory on disk
    reload: bool
        if True, the first call for a particular featurizer and splitter will
        cache the datasets to disk, and subsequent calls will reload the
        cached datasets.
    """
    # Build the path to the dataset on disk.
    featurizer_name = str(self.featurizer)
    splitter_name = 'None' if self.splitter is None else str(self.splitter)
    save_folder = os.path.join(self.save_dir, name + "-featurized",
                               featurizer_name, splitter_name)
    if len(self.transformers) > 0:
        transformer_name = '_'.join(
            t.get_directory_name() for t in self.transformers)
        save_folder = os.path.join(save_folder, transformer_name)

    # Try to reload cached datasets.
    if reload:
        if self.splitter is None:
            if os.path.exists(save_folder):
                transformers = dc.utils.data_utils.load_transformers(
                    save_folder)
                return self.tasks, (DiskDataset(save_folder),), transformers
        else:
            loaded, all_dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(
                save_folder)
            if all_dataset is not None:
                return self.tasks, all_dataset, transformers

    # Create the dataset
    logger.info("About to featurize %s dataset." % name)
    dataset = self.create_dataset()

    # Split and transform the dataset.
    if self.splitter is None:
        transformer_dataset: Dataset = dataset
    else:
        logger.info("About to split dataset with {} splitter.".format(
            self.splitter.__class__.__name__))
        train, valid, test = self.splitter.train_valid_test_split(dataset)
        transformer_dataset = train
    transformers = [
        t.create_transformer(transformer_dataset) for t in self.transformers
    ]
    logger.info("About to transform data.")
    if self.splitter is None:
        for transformer in transformers:
            dataset = transformer.transform(dataset)
        if reload and isinstance(dataset, DiskDataset):
            dataset.move(save_folder)
            dc.utils.data_utils.save_transformers(save_folder, transformers)
        return self.tasks, (dataset,), transformers

    for transformer in transformers:
        train = transformer.transform(train)
        valid = transformer.transform(valid)
        test = transformer.transform(test)
    if reload and isinstance(train, DiskDataset) and isinstance(
            valid, DiskDataset) and isinstance(test, DiskDataset):
        dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                 test, transformers)
    return self.tasks, (train, valid, test), transformers
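# A hedged usage sketch for load_dataset (assumed, not from the source):
# `loader` stands for an instance of the loader class this method belongs to,
# i.e. an object that already carries featurizer, splitter, transformers,
# tasks, save_dir, and a create_dataset() implementation; the dataset name is
# a placeholder.
def _example_load_dataset(loader):
    tasks, datasets, transformers = loader.load_dataset(
        "my_dataset", reload=True)  # hypothetical dataset name
    if len(datasets) == 3:
        train, valid, test = datasets
    else:
        (dataset,) = datasets
    return tasks, datasets, transformers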
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5,
              frac_train=0.8,
              frac_valid=0.1,
              frac_test=0.1):
    # data_dir = os.path.dirname(dataset_file)
    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
        "split")
    train = os.path.join(save_dir, 'train')
    valid = os.path.join(save_dir, 'valid')
    test = os.path.join(save_dir, 'test')

    # Read the task names from the CSV header (every column except SMILES).
    with open(dataset_file, "r") as fopen:
        ss = fopen.readlines()
    m = ss[0].strip('\n').split(',')
    m.remove('SMILES')

    # Reload previously featurized and split datasets if they are on disk.
    if os.path.isdir(save_dir):
        if reload:
            dataset = DiskDataset(data_dir=save_dir)
            train_dataset = DiskDataset(data_dir=train)
            valid_dataset = DiskDataset(data_dir=valid)
            test_dataset = DiskDataset(data_dir=test)
            transformers = [
                deepchem.trans.NormalizationTransformer(
                    transform_w=True, dataset=train_dataset)
            ]
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return m, all_dataset, transformers

    if featurizer == 'ECFP':
        featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = deepchem.feat.RawFeaturizer()
    elif featurizer == 'AdjacencyConv':
        featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                        max_valence=6)
    elif featurizer == 'SelfDefine':
        # feature_field is expected to be defined at module level in the
        # original source.
        featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)
    loader = deepchem.data.CSVLoader(tasks=m,
                                     smiles_field="SMILES",
                                     featurizer=featurizer)
    dataset = loader.featurize(dataset_file,
                               data_dir=save_dir,
                               shard_size=8192)
    # dataset = loader.featurize(dataset_file, shard_size=8192)

    # Initialize transformers
    if transformers:
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'butina': deepchem.splits.ButinaSplitter(),
        'task': deepchem.splits.TaskSplitter(),
        'Harmonious_positive': Harmonious_positive(),
        'OnePositiveSplit': OnePositiveSplit()
    }
    splitter = splitters[sep]
    if sep == 'task':
        fold_datasets = splitter.k_fold_split(dataset, K)
        all_dataset = fold_datasets
    elif sep == 'Harmonious_positive':
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset)
        train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                               train_dataset.y,
                                               train_dataset.w,
                                               train_dataset.ids,
                                               dataset.tasks,
                                               data_dir=train)
        valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                               valid_dataset.y,
                                               valid_dataset.w,
                                               valid_dataset.ids,
                                               dataset.tasks,
                                               data_dir=valid)
        test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                              test_dataset.y,
                                              test_dataset.w,
                                              test_dataset.ids,
                                              dataset.tasks,
                                              data_dir=test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    elif sep == 'Harmonious_positive' and K:
        # Note: this branch is shadowed by the previous elif and is never
        # reached as written.
        # train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        #     dataset,
        #     frac_train=frac_train,
        #     frac_valid=0,
        #     frac_test=1 - frac_train)
        # train_dataset = DiskDataset.from_numpy(train_dataset.X, train_dataset.y,
        #                                        train_dataset.w, train_dataset.ids,
        #                                        dataset.tasks, data_dir=train)
        # train_dataset.reshard(8192)
        # test_dataset = DiskDataset.from_numpy(test_dataset.X, test_dataset.y,
        #                                       test_dataset.w, test_dataset.ids,
        #                                       dataset.tasks, data_dir=test)
        # test_dataset.reshard(8192)
        # fold_dataset = splitter.k_fold_split(
        #     train_dataset, K,
        #     directories=[os.path.join(valid, str(i)) for i in range(K)],
        #     verbose=True)
        fold_dataset = splitter.k_fold_split(
            dataset,
            K,
            directories=[os.path.join(valid, str(i)) for i in range(K)],
            verbose=True)
        folds = []
        for i in range(K):
            print('merge fold dataset {}...'.format(i))
            train_fold = DiskDataset.merge(
                [fold_dataset[j] for j in range(K) if j != i],
                merge_dir=os.path.join(valid, str(i), 'train_fold'))
            test_fold = DiskDataset.merge(
                [fold_dataset[i]],
                merge_dir=os.path.join(valid, str(i), 'valid_fold'))
            folds.append([train_fold, test_fold])
        all_dataset = (dataset, [], folds, [])
    else:
        # Generic train/valid/test split using the frac_* arguments.
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset,
            train_dir=train,
            valid_dir=valid,
            test_dir=test,
            frac_train=frac_train,
            frac_valid=frac_valid,
            frac_test=frac_test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # else:
    #     train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    #         dataset, train_dir=train, valid_dir=valid, test_dir=test)
    #     all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # if reload:
    #     deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid,
    #                                              test, transformers)
    return m, all_dataset, transformers
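# A hedged usage sketch for load_gpcr (not from the original source): the CSV
# path is a placeholder, and the 'Harmonious_positive' splitter is one of the
# custom splitters registered inside load_gpcr.
def _example_load_gpcr():
    tasks, all_dataset, transformers = load_gpcr(
        "gpcr_activity.csv",  # hypothetical CSV with a SMILES column plus task columns
        featurizer='ECFP',
        sep='Harmonious_positive')
    dataset, train_dataset, valid_dataset, test_dataset = all_dataset
    return tasks, train_dataset, valid_dataset, test_dataset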
def load_tf(samp_num=0,
            reload=True,
            split='random',
            frac_train_and_valid=0.9,
            data_time=10,
            data_num=1,
            data_dir='/home/hdd2/lifei/sam/script/tmm/data'):
    tf_tasks = ['values']
    dataset_file = data_dir + "/fingerprint_" + str(samp_num) + '.csv'
    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]))
    dataset_dir = os.path.join(save_dir, 'dataset')
    test_dir = os.path.join(save_dir, 'test')

    # Reload previously featurized and split datasets if they are on disk.
    # Directory names keep the 'train_vaild_' spelling used by the on-disk
    # layout this loader expects.
    if os.path.isdir(save_dir):
        if reload:
            dataset = DiskDataset(data_dir=dataset_dir)
            train_dataset = DiskDataset(data_dir=os.path.join(
                save_dir, 'train_vaild_' + str(data_num), 'train'))
            valid_dataset = DiskDataset(data_dir=os.path.join(
                save_dir, 'train_vaild_' + str(data_num), 'valid'))
            test_dataset = DiskDataset(data_dir=test_dir)
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return all_dataset
    else:
        print("About to featurize TF dataset.")
        # tf_descriptors is a module-level list of descriptor names in the
        # original source.
        featurizer = dc.feat.UserDefinedFeaturizer(tf_descriptors)
        loader = dc.data.UserCSVLoader(tasks=tf_tasks,
                                       id_field="compounds",
                                       featurizer=featurizer)
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        dataset = loader.featurize(dataset_file,
                                   data_dir=dataset_dir,
                                   shard_size=8192)
        splitters = {
            'index': dc.splits.IndexSplitter(),
            'random': dc.splits.RandomSplitter(),
            'scaffold': dc.splits.ScaffoldSplitter()
        }
        splitter = splitters[split]
        if not os.path.exists(test_dir):
            os.makedirs(test_dir)
        # Hold out a fixed test set, then repeatedly re-split the remainder
        # into train/valid folds.
        train_valid, test_dataset = splitter.train_test_split(
            dataset, test_dir=test_dir, frac_train=frac_train_and_valid)
        test_df = pd.DataFrame(test_dataset.ids)
        test_df.to_csv(os.path.join(test_dir, 'test.csv'))
        for i in range(data_time):
            train_valid_dir = os.path.join(save_dir,
                                           'train_vaild_' + str(i + 1))
            train_dir = os.path.join(train_valid_dir, 'train')
            valid_dir = os.path.join(train_valid_dir, 'valid')
            for d in (train_dir, valid_dir):
                if not os.path.exists(d):
                    os.makedirs(d)
            train_dataset_t, vaild_dataset_t = splitter.train_test_split(
                train_valid,
                train_dir=train_dir,
                test_dir=valid_dir,
                frac_train=8.0 / 9)
            train_df = pd.DataFrame(train_dataset_t.ids)
            vaild_df = pd.DataFrame(vaild_dataset_t.ids)
            train_df.to_csv(train_dir + '/train.csv')
            vaild_df.to_csv(valid_dir + '/valid.csv')
        train_dataset = DiskDataset(data_dir=os.path.join(
            save_dir, 'train_vaild_' + str(data_num), 'train'))
        valid_dataset = DiskDataset(data_dir=os.path.join(
            save_dir, 'train_vaild_' + str(data_num), 'valid'))
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
        return all_dataset
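# A minimal usage sketch for load_tf (not from the original source): it
# assumes the fingerprint CSV files and the module-level tf_descriptors
# feature list exist under the configured data directory.
def _example_load_tf():
    dataset, train_dataset, valid_dataset, test_dataset = load_tf(
        samp_num=0, reload=True, split='random', data_num=1)
    print("Train/valid/test sizes:", len(train_dataset), len(valid_dataset),
          len(test_dataset))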
def featurize(self,
              input_files,
              data_dir=None,
              shard_size=8192,
              num_shards_per_batch=24,
              worker_pool=None,
              logging=True,
              debug=False):
    """Featurize provided files and write to specified location."""
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    log("Loading raw samples now.", self.verbosity)
    log("shard_size: %d" % shard_size, self.verbosity)
    log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

    # Allow users to specify a single file for featurization
    if not isinstance(input_files, list):
        input_files = [input_files]
    if data_dir is not None:
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
    else:
        data_dir = tempfile.mkdtemp()

    # Construct partial function to write datasets.
    if not len(input_files):
        return None
    input_type = get_input_type(input_files[0])

    if logging:
        mp.log_to_stderr()
    if worker_pool is None:
        if logging:
            worker_pool = LoggingPool(processes=1)
        else:
            worker_pool = mp.Pool(processes=1)
    log("Spawning workers now.", self.verbosity)
    metadata_rows = []

    def wrap_with_shard_metadata(iterator):
        for item in iterator:
            yield ((self, shard_size, input_type, data_dir), item)

    data_iterator = wrap_with_shard_metadata(
        enumerate(load_data(input_files, shard_size, self.verbosity)))

    # Python's map exhausts the generator it is given, so manually pull
    # num_shards_per_batch elements from the iterator at a time and map over
    # only those elements.
    num_batches = 0
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
        self.verbosity)
    ############################################################## TIMING
    while True:
        log("About to start processing next batch of shards", self.verbosity)
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        iterator = itertools.islice(data_iterator, num_shards_per_batch)
        if not debug:
            batch_metadata = worker_pool.map(featurize_map_function, iterator)
        else:
            batch_metadata = []
            for elt in iterator:
                batch_metadata.append(featurize_map_function(elt))
        ############################################################## TIMING
        time2 = time.time()
        log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
            self.verbosity)
        ############################################################## TIMING
        if batch_metadata:
            metadata_rows.extend(
                [elt for elt in batch_metadata if elt is not None])
            num_batches += 1
            log("Featurized %d datapoints\n" %
                (shard_size * num_shards_per_batch * num_batches),
                self.verbosity)
        else:
            break

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
    # creating a Dataset. Is there a more elegant solution?
    dataset = DiskDataset(data_dir=data_dir,
                          metadata_rows=metadata_rows,
                          reload=True,
                          verbosity=self.verbosity)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
        self.verbosity)
    ############################################################## TIMING
    return dataset
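# A hedged usage sketch for the featurize method (assumed, not from the
# source): `loader` stands for an instance of the loader class that defines
# featurize, and the input path is a placeholder.
def _example_featurize(loader):
    return loader.featurize(
        "molecules.csv",  # hypothetical input CSV
        shard_size=4096,
        num_shards_per_batch=8)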