def transform(self, dataset):
  """Performs power transform on data."""
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  w_t = w
  ids_t = ids
  n_powers = len(self.powers)
  if self.transform_X:
    X_t = np.power(X, self.powers[0])
    for i in range(1, n_powers):
      X_t = np.hstack((X_t, np.power(X, self.powers[i])))
    y_t = y
  if self.transform_y:
    print("y will not be transformed by PowerTransformer, for now.")
    """
    y_t = np.power(y, self.powers[0])
    for i in range(1, n_powers):
      y_t = np.hstack((y_t, np.power(y, self.powers[i])))
    X_t = X
    """

  # TODO (rbharath): Find a more elegant solution to saving the data?
  shutil.rmtree(dataset.data_dir)
  os.makedirs(dataset.data_dir)
  DiskDataset.from_numpy(X_t, y_t, w_t, ids_t, data_dir=dataset.data_dir)
  return dataset
def _to_singletask(dataset, task_dirs):
  """Transforms a multitask dataset to a collection of singletask datasets."""
  tasks = dataset.get_task_names()
  assert len(tasks) == len(task_dirs)
  log("Splitting multitask dataset into singletask datasets",
      dataset.verbosity)
  task_metadata_rows = {task: [] for task in tasks}
  for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
    log("Processing shard %d" % shard_num, dataset.verbosity)
    basename = "dataset-%d" % shard_num
    for task_num, task in enumerate(tasks):
      log("\tTask %s" % task, dataset.verbosity)
      w_task = w[:, task_num]
      y_task = y[:, task_num]

      # Extract those datapoints which are present for this task
      X_nonzero = X[w_task != 0]
      num_datapoints = X_nonzero.shape[0]
      y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
      w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
      ids_nonzero = ids[w_task != 0]
      if X_nonzero.size > 0:
        task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))

  task_datasets = [
      DiskDataset(data_dir=task_dirs[task_num],
                  metadata_rows=task_metadata_rows[task],
                  verbosity=dataset.verbosity)
      for (task_num, task) in enumerate(tasks)
  ]
  return task_datasets
def load_tox21(featurizer='ECFP', split='index'):
  """Load Tox21 datasets. Does not do train/test split"""
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  data_dir = deepchem.utils.get_data_dir()

  tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
  ]

  dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
  train, valid, test = os.path.join(dataset_dir, 'train'), os.path.join(
      dataset_dir, 'valid'), os.path.join(dataset_dir, 'test')

  if os.path.isdir(dataset_dir):
    train, valid, test = DiskDataset(data_dir=train), DiskDataset(
        data_dir=valid), DiskDataset(data_dir=test)
    transformers = [
        dc.trans.BalancingTransformer(transform_w=True, dataset=train)
    ]
    return tox21_tasks, (train, valid, test), transformers

  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  elif featurizer == 'AdjMatrix':
    featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
  loader = dc.data.CSVLoader(
      tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer_func)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'butina': dc.splits.ButinaSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(
      dataset, train_dir=train, valid_dir=valid, test_dir=test)
  return tox21_tasks, (train, valid, test), transformers
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """
  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to do a k-fold split
  k: int
    Number of folds to split `dataset` into.
  directories: list[str]
    list of length 2*k filepaths to save the result disk-datasets

  Returns
  -------
  list of length k tuples of (train, cv) where `train` and `cv` are both
  lists of `Dataset`s.
  """
  logger.info("Computing K-fold split")
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(2 * k)]
  else:
    assert len(directories) == 2 * k
  cv_datasets = []
  train_ds_base = None
  train_datasets = []
  # rem_dataset is remaining portion of dataset
  if isinstance(dataset, DiskDataset):
    rem_dataset = dataset
  else:
    rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
  for fold in range(k):
    # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
    # to k-1.
    frac_fold = 1. / (k - fold)
    train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
    fold_inds, rem_inds, _ = self.split(rem_dataset,
                                        frac_train=frac_fold,
                                        frac_valid=1 - frac_fold,
                                        frac_test=0,
                                        **kwargs)
    cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
    cv_datasets.append(cv_dataset)
    rem_dataset = rem_dataset.select(rem_inds)

    train_ds_to_merge = filter(lambda x: x is not None,
                               [train_ds_base, rem_dataset])
    train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
    train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
    train_datasets.append(train_dataset)

    update_train_base_merge = filter(lambda x: x is not None,
                                     [train_ds_base, cv_dataset])
    train_ds_base = DiskDataset.merge(update_train_base_merge)
  return list(zip(train_datasets, cv_datasets))
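# A minimal usage sketch of the k-fold API documented above, assuming DeepChem
# is importable as `deepchem` and that a concrete splitter such as
# dc.splits.RandomSplitter inherits this k_fold_split. The toy array sizes
# below are arbitrary.
import numpy as np
import deepchem as dc

# Toy dataset: 20 samples, 4 features, 1 regression task, stored on disk.
X = np.random.rand(20, 4)
y = np.random.rand(20, 1)
dataset = dc.data.DiskDataset.from_numpy(X, y)

splitter = dc.splits.RandomSplitter()
# Returns a list of k (train, cv) tuples of datasets, as described above.
folds = splitter.k_fold_split(dataset, k=4)
for i, (train, cv) in enumerate(folds):
  print("fold %d: %d train / %d cv samples" % (i, len(train), len(cv)))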
def _to_singletask(dataset, task_dirs):
  """Transforms a multitask dataset to a collection of singletask datasets."""
  tasks = dataset.get_task_names()
  assert len(tasks) == len(task_dirs)
  logger.info("Splitting multitask dataset into singletask datasets")
  task_datasets = [
      DiskDataset.create_dataset([], task_dirs[task_num], [task.item()])
      for (task_num, task) in enumerate(tasks)
  ]
  #task_metadata_rows = {task: [] for task in tasks}
  for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
    logger.info("Processing shard %d" % shard_num)
    basename = "dataset-%d" % shard_num
    for task_num, task in enumerate(tasks):
      logger.info("\tTask %s" % task)
      if len(w.shape) == 1:
        w_task = w
      elif w.shape[1] == 1:
        w_task = w[:, 0]
      else:
        w_task = w[:, task_num]
      y_task = y[:, task_num]

      # Extract those datapoints which are present for this task
      X_nonzero = X[w_task != 0]
      num_datapoints = X_nonzero.shape[0]
      y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
      w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
      ids_nonzero = ids[w_task != 0]
      task_datasets[task_num].add_shard(X_nonzero, y_nonzero, w_nonzero,
                                        ids_nonzero)
  return task_datasets
def featurize(self, input_files, data_dir=None, shard_size=8192):
  """Featurize provided files and write to specified location."""
  log("Loading raw samples now.", self.verbose)
  log("shard_size: %d" % shard_size, self.verbose)

  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for shard_num, shard in enumerate(
        self.get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self.featurize_shard(shard)
      ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
      # Filter out examples where featurization failed.
      ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
      assert len(X) == len(ids) == len(y) == len(w)
      time2 = time.time()
      log("TIMING: featurizing shard %d took %0.3f s" %
          (shard_num, time2 - time1), self.verbose)
      yield X, y, w, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
def test_fit(self):
  tf_enc = TensorflowMoleculeEncoder.zinc_encoder()

  smiles = [
      "Cn1cnc2c1c(=O)n(C)c(=O)n2C", "O=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N21",
      "Cn1c2nncnc2c(=O)n(C)c1=O", "Cn1cnc2c1c(=O)[nH]c(=O)n2C",
      "NC(=O)c1ncc[nH]c1=O", "O=C1OCc2c1[nH]c(=O)[nH]c2=O",
      "Cn1c(N)c(N)c(=O)n(C)c1=O", "CNc1nc2c([nH]1)c(=O)[nH]c(=O)n2C",
      "CC(=O)N1CN(C(C)=O)[C@@H](O)[C@@H]1O",
      "CC(=O)N1CN(C(C)=O)[C@H](O)[C@H]1O", "Cc1[nH]c(=O)[nH]c(=O)c1CO",
      "O=C1NCCCc2c1no[n+]2[O-]", "Cc1nc(C(N)=O)c(N)n1CCO",
      "O=c1[nH]cc(N2CCOCC2)c(=O)[nH]1"
  ]
  featurizer = dc.feat.one_hot.OneHotFeaturizer(zinc_charset, 120)
  mols = [Chem.MolFromSmiles(x) for x in smiles]
  features = featurizer.featurize(mols)

  dataset = DiskDataset.from_numpy(features, features)
  prediction = tf_enc.predict_on_batch(dataset.X)
  tf_de = TensorflowMoleculeDecoder.zinc_decoder()
  one_hot_decoded = tf_de.predict_on_batch(prediction)
  decoded_smiles = featurizer.untransform(one_hot_decoded)
  assert_equals(len(decoded_smiles), len(smiles))
def create_dataset(self,
                   input_files: OneOrMany[str],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = None) -> DiskDataset:
  """Creates a `Dataset` from input FASTA files.

  At present, FASTA support is limited and only allows for one-hot
  featurization, and doesn't allow for sharding.

  Parameters
  ----------
  input_files: list
    List of fasta files.
  data_dir: str, optional
    Name of directory where featurized data is stored.
  shard_size: int, optional
    For now, this argument is ignored and each FASTA file gets its own shard.

  Returns
  -------
  A `Dataset` object containing a featurized representation of data
  from `input_files`.
  """
  if isinstance(input_files, str):
    input_files = [input_files]

  def shard_generator():
    for input_file in input_files:
      X = encode_fasta_sequence(input_file)
      ids = np.ones(len(X))
      # (X, y, w, ids)
      yield X, None, None, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir)
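# A rough usage sketch, assuming the method above lives on DeepChem's FASTA
# loader (exposed as dc.data.FASTALoader in recent releases); "sequences.fasta"
# is a hypothetical local file containing a few sequences.
import deepchem as dc

loader = dc.data.FASTALoader()
# Each input file becomes one shard of one-hot encoded sequences, per the
# method above; y and w are left empty.
dataset = loader.create_dataset("sequences.fasta")
print(dataset.X.shape)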
def featurize(self, input_files, data_dir=None, shard_size=8192):
  """Featurize provided files and write to specified location."""
  log("Loading raw samples now.", self.verbose)
  log("shard_size: %d" % shard_size, self.verbose)

  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for shard_num, shard in enumerate(
        self.get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self.featurize_shard(shard)
      ids = shard[self.id_field].values
      ids = ids[valid_inds]
      if len(self.tasks) > 0:
        # Featurize task results iff they exist.
        y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
        # Filter out examples where featurization failed.
        y, w = (y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
      else:
        # For prospective data where results are unknown, it makes
        # no sense to have y values or weights.
        y, w = (None, None)
        assert len(X) == len(ids)
      time2 = time.time()
      log("TIMING: featurizing shard %d took %0.3f s" %
          (shard_num, time2 - time1), self.verbose)
      yield X, y, w, ids

  return DiskDataset.create_dataset(
      shard_generator(), data_dir, self.tasks, verbose=self.verbose)
def create_dataset(self, input_files, data_dir=None, shard_size=8192):
  """Creates and returns a `Dataset` object by featurizing provided files.

  Reads in `input_files` and uses `self.featurizer` to featurize the
  data in these input files.  For large files, automatically shards
  into smaller chunks of `shard_size` datapoints for convenience.
  Returns a `Dataset` object that contains the featurized dataset.

  This implementation assumes that the helper methods `_get_shards`
  and `_featurize_shard` are implemented and that each shard returned
  by `_get_shards` is a pandas dataframe.  You may choose to reuse or
  override this method in your subclass implementations.

  Parameters
  ----------
  input_files: list
    List of input filenames.
  data_dir: str, optional
    Directory to store featurized dataset.
  shard_size: int, optional
    Number of examples stored in each shard.

  Returns
  -------
  A `Dataset` object containing a featurized representation of data
  from `input_files`.
  """
  logger.info("Loading raw samples now.")
  logger.info("shard_size: %d" % shard_size)

  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for shard_num, shard in enumerate(
        self._get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self._featurize_shard(shard)
      ids = shard[self.id_field].values
      ids = ids[valid_inds]
      if len(self.tasks) > 0:
        # Featurize task results iff they exist.
        y, w = _convert_df_to_numpy(shard, self.tasks)
        # Filter out examples where featurization failed.
        y, w = (y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
      else:
        # For prospective data where results are unknown, it
        # makes no sense to have y values or weights.
        y, w = (None, None)
        assert len(X) == len(ids)
      time2 = time.time()
      logger.info("TIMING: featurizing shard %d took %0.3f s" %
                  (shard_num, time2 - time1))
      yield X, y, w, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
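# A short sketch of how a loader built on this method is typically driven,
# assuming the CSV-backed subclass exposed as dc.data.CSVLoader. The file
# "assays.csv" and its "smiles"/"activity" columns are hypothetical, and the
# smiles_field keyword follows the older spelling used elsewhere in this file
# (newer DeepChem releases call it feature_field).
import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(tasks=["activity"],
                           smiles_field="smiles",
                           featurizer=featurizer)
# Shards of 8192 rows are featurized one at a time and written to disk.
dataset = loader.create_dataset("assays.csv", shard_size=8192)
print(len(dataset), dataset.get_task_names())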
def featurize_complexes(self, mol_files, protein_files):
  pool = multiprocessing.Pool()
  results = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    log_message = "Featurizing %d / %d" % (i, len(mol_files))
    results.append(
        pool.apply_async(_featurize_complex,
                         (self, mol_file, protein_pdb, log_message)))
  pool.close()

  features = []
  failures = []
  for ind, result in enumerate(results):
    new_features = result.get()
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(ind)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = list()
  for layer in self.atomic_conv_model.layers.values():
    if isinstance(layer,
                  dc.models.tensorgraph.models.atomic_conv.AtomicConvolution):
      layers_to_fetch.append(layer)

  # Extract the atomic convolution features
  atomic_conv_features = list()
  feed_dict_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for feed_dict in self.atomic_conv_model._create_feed_dicts(
      feed_dict_generator, training=False):
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
        outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
    concatenated = np.concatenate([frag1_conv, frag2_conv, complex_conv],
                                  axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)
  return atomic_conv_features, failures
def featurize_complexes(self, mol_files, protein_files):
  pool = multiprocessing.Pool()
  results = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    log_message = "Featurizing %d / %d" % (i, len(mol_files))
    results.append(
        pool.apply_async(_featurize_complex,
                         (self, mol_file, protein_pdb, log_message)))
  pool.close()

  features = []
  failures = []
  for ind, result in enumerate(results):
    new_features = result.get()
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(ind)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = list()
  for layer in self.atomic_conv_model.layers.values():
    if isinstance(layer, dc.models.atomic_conv.AtomicConvolution):
      layers_to_fetch.append(layer)

  # Extract the atomic convolution features
  atomic_conv_features = list()
  feed_dict_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for feed_dict in self.atomic_conv_model._create_feed_dicts(
      feed_dict_generator, training=False):
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
        outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
    concatenated = np.concatenate([frag1_conv, frag2_conv, complex_conv],
                                  axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)
  return atomic_conv_features, failures
def create_dataset(self,
                   inputs: Sequence[Any],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192) -> DiskDataset:
  """Creates and returns a `Dataset` object by featurizing provided files.

  Reads in `inputs` and uses `self.featurizer` to featurize the data in
  these input files.  For large files, automatically shards into smaller
  chunks of `shard_size` datapoints for convenience.  Returns a `Dataset`
  object that contains the featurized dataset.

  This implementation assumes that the helper methods `_get_shards` and
  `_featurize_shard` are implemented and that each shard returned by
  `_get_shards` is a pandas dataframe.  You may choose to reuse or override
  this method in your subclass implementations.

  Parameters
  ----------
  inputs: Sequence[Any]
    List of inputs to process. Entries can be arbitrary objects so long as
    they are understood by `self.featurizer`
  data_dir: str, optional (default None)
    Directory to store featurized dataset.
  shard_size: int, optional (default 8192)
    Number of examples stored in each shard.

  Returns
  -------
  DiskDataset
    A `DiskDataset` object containing a featurized representation of data
    from `inputs`.
  """
  logger.info("Loading raw samples now.")
  logger.info("shard_size: %s" % str(shard_size))

  if not isinstance(inputs, list):
    try:
      inputs = list(inputs)
    except TypeError:
      inputs = [inputs]

  def shard_generator():
    global_index = 0
    for shard_num, shard in enumerate(self._get_shards(inputs, shard_size)):
      time1 = time.time()
      X, y, w, ids = self._featurize_shard(shard, global_index)
      global_index += len(shard)
      time2 = time.time()
      logger.info("TIMING: featurizing shard %d took %0.3f s" %
                  (shard_num, time2 - time1))
      yield X, y, w, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir, self.tasks)
def test_select_attrs_by_dset_smiles():
  # Test that the method can split an attr DataFrame according to a disk
  # dataset. In this case, the attr_ids need to be converted back to SMILES
  # to match the input dataset.
  dataset = DiskDataset.from_numpy(
      test_scaffold.X,
      test_scaffold.y,
      ids=test_scaffold_attr[data_obj_scaffold.params.smiles_col].values)

  newDD = split.select_attrs_by_dset_smiles(
      dataset, data_obj_scaffold.attr, data_obj_scaffold.params.smiles_col)

  assert newDD.equals(test_scaffold_attr)
def test_select_dset_by_attr_ids_using_smiles():
  # Test that the method can split a dataset according to its attr ids into
  # the correct DeepChem DiskDataset. In this case, the attr_ids are
  # converted back to SMILES to match the input dataset.
  dataset = DiskDataset.from_numpy(
      data_obj_scaffold.dataset.X,
      data_obj_scaffold.dataset.y,
      ids=data_obj_scaffold.attr[data_obj_scaffold.params.smiles_col].values)
  newdf = pd.DataFrame({'compound_ids': test_scaffold_attr.index.tolist()},
                       index=test_scaffold_attr.smiles)

  newDD = split.select_dset_by_attr_ids(dataset, newdf)

  assert (newDD.y == test_scaffold.y).all()
def split(self, dataset, frac_split, split_dirs=None):
  """Method that does bulk of splitting dataset."""
  if split_dirs is not None:
    assert len(split_dirs) == 2
  else:
    split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

  # Handle edge case where frac_split is 1
  if frac_split == 1:
    dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                       dataset.ids)
    dataset_2 = None
    return dataset_1, dataset_2
  X, y, w, ids = randomize_arrays(
      (dataset.X, dataset.y, dataset.w, dataset.ids))
  if len(y.shape) == 1:
    y = np.expand_dims(y, 1)
  if len(w.shape) == 1:
    w = np.expand_dims(w, 1)
  split_indices = self.get_task_split_indices(y, w, frac_split)

  # Create weight matrices for the two halves.
  w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
  for task, split_index in enumerate(split_indices):
    # Copy over weights up to the required index for the first split.
    w_1[:split_index, task] = w[:split_index, task]
    w_2[split_index:, task] = w[split_index:, task]

  # Check whether any rows in either w_1 or w_2 are just zeros.
  rows_1 = w_1.any(axis=1)
  X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
  dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

  rows_2 = w_2.any(axis=1)
  X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
  dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)
  return dataset_1, dataset_2
def target_4_dataset_save(self, dataset, file):
  compound = dataset.ids.tolist()
  target = dataset.get_task_names()
  print(target)
  w = dataset.w
  print('w.shape')
  print(w.shape)
  compound_4_target = []
  target_4 = ['P21728', 'P14416', 'P08908', 'P28223']
  target_4 = sorted(target_4, key=lambda x: target.index(x))
  target_4_index = [target.index(i) for i in target_4]
  print('target_4')
  print(target_4_index)

  # Keep only compounds that have a weight for at least one of the 4 targets.
  for i in range(len(compound)):
    z = 0
    for j in target_4_index:
      if w[i, j] > 0:
        z = z + 1
    if z > 0:
      compound_4_target.append(i)

  compound_shard = []
  dataset1 = dataset.select(compound_4_target)
  print(compound_4_target)
  cpd = compound_4_target

  metadata_rows = []
  shard_generator = self.shard_generator(cpd, target_4_index, dataset1)
  for shard_num, (X, y, w, ids) in enumerate(shard_generator):
    basename = "shard-%d" % shard_num
    metadata_rows.append(
        DiskDataset.write_data_to_disk(file, basename, target_4, X, y, w, ids))
  metadata_df = DiskDataset._construct_metadata(metadata_rows)
  self.save_metadata(target_4, metadata_df, file)
  time2 = time.time()
def featurize_complexes(self, mol_files, protein_files):
  features = []
  failures = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    logging.info("Featurizing %d / %d" % (i, len(mol_files)))
    new_features = self._featurize_complex(mol_file, protein_pdb)
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(i)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = [
      self.atomic_conv_model._frag1_conv, self.atomic_conv_model._frag2_conv,
      self.atomic_conv_model._complex_conv
  ]

  # Extract the atomic convolution features
  atomic_conv_features = list()
  batch_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for X, y, w in batch_generator:
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model.predict_on_generator(
        [(X, y, w)], outputs=layers_to_fetch)
    concatenated = np.concatenate([frag1_conv, frag2_conv, complex_conv],
                                  axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)
  return atomic_conv_features, failures
def featurize(self, input_files, data_dir=None, shard_size=8192):
  """Featurize provided files and write to specified location.

  For large datasets, automatically shards into smaller chunks
  for convenience.

  Parameters
  ----------
  input_files: list
    List of input filenames.
  data_dir: str
    (Optional) Directory to store featurized dataset.
  shard_size: int
    (Optional) Number of examples stored in each shard.
  """
  log("Loading raw samples now.", self.verbose)
  log("shard_size: %d" % shard_size, self.verbose)

  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for shard_num, shard in enumerate(
        self.get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self.featurize_shard(shard)
      ids = shard[self.id_field].values
      ids = ids[valid_inds]
      if len(self.tasks) > 0:
        # Featurize task results iff they exist.
        y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
        # Filter out examples where featurization failed.
        y, w = (y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
      else:
        # For prospective data where results are unknown, it makes
        # no sense to have y values or weights.
        y, w = (None, None)
        assert len(X) == len(ids)
      time2 = time.time()
      log("TIMING: featurizing shard %d took %0.3f s" %
          (shard_num, time2 - time1), self.verbose)
      yield X, y, w, ids

  return DiskDataset.create_dataset(
      shard_generator(), data_dir, self.tasks, verbose=self.verbose)
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  neighbor_cutoff = 4
  max_num_neighbors = 10

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors,
                                                    neighbor_cutoff)

  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  X = np.array(features, dtype=object)
  w = np.ones_like(y)

  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
def featurize(self, input_files, data_dir=None):
  """Featurizes fasta files.

  Parameters
  ----------
  input_files: list
    List of fasta files.
  data_dir: str
    (Optional) Name of directory where featurized data is stored.
  """
  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for input_file in input_files:
      X = encode_fasta_sequence(input_file)
      ids = np.ones(len(X))
      # (X, y, w, ids)
      yield X, None, None, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir)
def create_dataset(self,
                   input_files: OneOrMany[str],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192) -> DiskDataset:
  """Creates a `Dataset` from input JSON files.

  Parameters
  ----------
  input_files: OneOrMany[str]
    List of JSON filenames.
  data_dir: Optional[str], default None
    Name of directory where featurized data is stored.
  shard_size: Optional[int], default 8192
    Shard size when loading data.

  Returns
  -------
  dataset: dc.data.Dataset
    A `Dataset` object containing a featurized representation of data
    from `input_files`.
  """
  if not isinstance(input_files, list):
    try:
      if isinstance(input_files, str):
        input_files = [input_files]
      else:
        input_files = list(input_files)
    except TypeError:
      raise ValueError(
          "input_files is of an unrecognized form. Must be one filename or a list of filenames."
      )

  def shard_generator():
    """Yield X, y, w, and ids for shards."""
    for shard_num, shard in enumerate(
        self._get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self._featurize_shard(shard)
      if self.id_field:
        ids = shard[self.id_field].values
      else:
        ids = np.ones(len(X))
      ids = ids[valid_inds]

      if len(self.tasks) > 0:
        # Featurize task results if they exist.
        y, w = _convert_df_to_numpy(shard, self.tasks)

        if self.label_field:
          y = shard[self.label_field]
        if self.weight_field:
          w = shard[self.weight_field]

        # Filter out examples where featurization failed.
        y, w = (y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
      else:
        # For prospective data where results are unknown, it
        # makes no sense to have y values or weights.
        y, w = (None, None)
        assert len(X) == len(ids)

      time2 = time.time()
      logger.info("TIMING: featurizing shard %d took %0.3f s" %
                  (shard_num, time2 - time1))
      yield X, y, w, ids

  return DiskDataset.create_dataset(shard_generator(), data_dir)
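# A sketch of feeding this method, assuming the surrounding class is DeepChem's
# dc.data.JsonLoader and that it expects records-oriented, line-delimited JSON.
# The file name and column names below are made up for illustration.
import pandas as pd
import deepchem as dc

# Write a tiny records-oriented JSON file so we can load it back in.
df = pd.DataFrame({
    "smiles": ["CCO", "CCC", "c1ccccc1"],
    "activity": [1.2, 0.7, 2.3],
})
df.to_json("toy.json", orient="records", lines=True)

loader = dc.data.JsonLoader(tasks=["activity"],
                            feature_field="smiles",
                            label_field="activity",
                            featurizer=dc.feat.CircularFingerprint(size=1024))
dataset = loader.create_dataset("toy.json")
print(dataset.X.shape, dataset.y.shape)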
def split_dataset(self, dataset, attr_df, smiles_col):  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  dataset_dup = False
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    log.info("Duplicate ids or smiles in the dataset, will deduplicate first "
             "and assign all records per compound ID to same partition")
    dataset_dup = True
    dataset_ori = copy.deepcopy(dataset)
    id_df = pd.DataFrame({
        'indices': np.arange(len(dataset.ids), dtype=np.int32),
        "compound_id": [str(e) for e in dataset.ids]
    })
    sel_df = id_df.drop_duplicates(subset="compound_id")
    dataset = dataset.select(sel_df.indices.values)

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(
        dataset.X, dataset.y, w=dataset.w,
        ids=attr_df.drop_duplicates(subset=smiles_col)[smiles_col].values)
    if dataset_dup:
      dataset_ori = DiskDataset.from_numpy(
          dataset_ori.X, dataset_ori.y, w=dataset_ori.w,
          ids=attr_df[smiles_col].values)

  # Under k-fold CV, the training/validation splits are determined by num_folds; only the test set fraction
  # is directly specified through command line parameters. If we use Butina splitting, we can't control
  # the test set size either.
  train_frac = 1.0 - self.params.split_test_frac

  # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
  # training set to split it into training/validation folds.
  if self.split == 'butina':
    train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
  else:
    # TODO: Add special handling for AVE splitter
    train_cv, test = self.splitter.train_test_split(
        dataset, seed=np.random.seed(123), frac_train=train_frac)
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

  train_valid_dsets = []
  train_valid_attr = []

  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    for train, valid in train_cv_pairs:
      # Assign the subsets to the original dataset if duplicated compounds exist.
      if dataset_dup:
        train = select_dset_by_id_list(dataset_ori, train.ids)
        valid = select_dset_by_id_list(dataset_ori, valid.ids)
      train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
      train = DiskDataset.from_numpy(train.X, train.y, w=train.w,
                                     ids=train_attr.index.values)

      valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
      valid = DiskDataset.from_numpy(valid.X, valid.y, w=valid.w,
                                     ids=valid_attr.index.values)

      train_valid_dsets.append((train, valid))
      train_valid_attr.append((train_attr, valid_attr))

    if dataset_dup:
      test = select_dset_by_id_list(dataset_ori, test.ids)
    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(test.X, test.y, w=test.w,
                                  ids=test_attr.index.values)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    for train, valid in train_cv_pairs:
      if dataset_dup:
        train = select_dset_by_id_list(dataset_ori, train.ids)
        valid = select_dset_by_id_list(dataset_ori, valid.ids)
      train_attr = select_attrs_by_dset_ids(train, attr_df)
      valid_attr = select_attrs_by_dset_ids(valid, attr_df)
      train_valid_attr.append((train_attr, valid_attr))
    train_valid_dsets = train_cv_pairs

    if dataset_dup:
      test = select_dset_by_id_list(dataset_ori, test.ids)
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  return train_valid_dsets, test, train_valid_attr, test_attr
def split_dataset(self, dataset, attr_df, smiles_col):  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  For ave_min, random, scaffold, and index splits, self.params.split_valid_frac and
  self.params.split_test_frac should be defined and
  train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac.

  For the butina split, test size is not user defined, and depends on available clusters
  that qualify for placement in the test set;
  train_frac = 1.0 - self.params.split_valid_frac.

  For the temporal split, test size is also not user defined, and depends on the number of
  compounds with dates after the cutoff date;
  train_frac = 1.0 - self.params.split_valid_frac.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    raise Exception("Duplicate ids or smiles in the dataset")

  log.warning("Splitting data by %s" % self.params.splitter)

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(
        dataset.X, dataset.y, ids=attr_df[smiles_col].values, verbose=False)

  if self.split == 'butina':
    #train_valid, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
    # Can't use train_test_split with Butina because Butina splits into train and valid sets only.
    train_valid, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    # With Butina splitting, we don't have control over the size of the test set
    train_frac = 1.0 - self.params.split_valid_frac
    train, valid = self.splitter.train_test_split(
        train_valid, seed=np.random.seed(123), frac_train=train_frac)
  elif self.split == 'ave_min':
    # AVEMinSplitter also only does train-valid splits, but at least nested splits seem to work.
    # TODO: Change this if we modify AVE splitter to do 3-way splits internally.
    train_valid_frac = 1.0 - self.params.split_test_frac
    train_frac = train_valid_frac - self.params.split_valid_frac
    log.info("Performing split for test set")
    train_valid, test, _ = self.splitter.train_valid_test_split(
        dataset, frac_train=train_valid_frac,
        frac_valid=self.params.split_test_frac, frac_test=0.0)
    log.info("Performing split of training and validation sets")
    train, valid, _ = self.splitter.train_valid_test_split(
        train_valid, frac_train=train_frac / train_valid_frac,
        frac_valid=self.params.split_valid_frac / train_valid_frac,
        frac_test=0.0)
    log.info("Results of 3-way split: %d training, %d validation, %d test compounds" %
             (train.X.shape[0], valid.X.shape[0], test.X.shape[0]))
  elif self.split == 'temporal':
    # TemporalSplitter requires that we pass attr_df so it can get the dates for each compound
    train_frac = 1.0 - self.params.split_valid_frac
    train, valid, test = self.splitter.train_valid_test_split(
        dataset, attr_df, frac_train=train_frac,
        frac_valid=self.params.split_valid_frac)
  else:
    train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac
    train, valid, test = self.splitter.train_valid_test_split(
        dataset, frac_train=train_frac,
        frac_valid=self.params.split_valid_frac,
        frac_test=self.params.split_test_frac, seed=np.random.seed(123))

  # Extract the ID-to-SMILES maps from attr_df for each subset.
  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
    train = DiskDataset.from_numpy(
        train.X, train.y, ids=train_attr.index.values, verbose=False)

    valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
    valid = DiskDataset.from_numpy(
        valid.X, valid.y, ids=valid_attr.index.values, verbose=False)

    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(
        test.X, test.y, ids=test_attr.index.values, verbose=False)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    train_attr = select_attrs_by_dset_ids(train, attr_df)
    valid_attr = select_attrs_by_dset_ids(valid, attr_df)
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  # Note grouping of train/valid return values as tuple lists, to match format of
  # KFoldSplitting.split_dataset().
  return [(train, valid)], test, [(train_attr, valid_attr)], test_attr
def split_dataset(self, dataset, attr_df, smiles_col):  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs,
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    raise Exception("Duplicate ids or smiles in the dataset")

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(
        dataset.X, dataset.y, ids=attr_df[smiles_col].values, verbose=False)

  # Under k-fold CV, the training/validation splits are determined by num_folds; only the test set fraction
  # is directly specified through command line parameters. If we use Butina splitting, we can't control
  # the test set size either.
  train_frac = 1.0 - self.params.split_test_frac

  # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
  # training set to split it into training/validation folds.
  if self.split == 'butina':
    # TODO: Splitter.train_test_split() doesn't provide a way to pass the cutoff parameter
    # through to the ButinaSplitter.split() function. Simple fix would be to reimplement
    # train_test_split() here (it's not a complicated function). For now, allow cutoff to default.
    #train_cv, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
    train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
  else:
    # TODO: Add special handling for AVE splitter
    train_cv, test = self.splitter.train_test_split(
        dataset, seed=np.random.seed(123), frac_train=train_frac)
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

  train_valid_dsets = []
  train_valid_attr = []

  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    for train, valid in train_cv_pairs:
      train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
      train = DiskDataset.from_numpy(
          train.X, train.y, ids=train_attr.index.values, verbose=False)

      valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
      valid = DiskDataset.from_numpy(
          valid.X, valid.y, ids=valid_attr.index.values, verbose=False)

      train_valid_dsets.append((train, valid))
      train_valid_attr.append((train_attr, valid_attr))

    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(
        test.X, test.y, ids=test_attr.index.values, verbose=False)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    for train, valid in train_cv_pairs:
      train_attr = select_attrs_by_dset_ids(train, attr_df)
      valid_attr = select_attrs_by_dset_ids(valid, attr_df)
      train_valid_attr.append((train_attr, valid_attr))
    train_valid_dsets = train_cv_pairs
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  return train_valid_dsets, test, train_valid_attr, test_attr
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """
  Parameters
  ----------
  dataset: Dataset
    Dataset to do a k-fold split
  k: int
    number of folds
  directories: list of str
    list of length 2*k filepaths to save the result disk-datasets
  kwargs

  Returns
  -------
  list of length k tuples of (train, cv)
  """
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(2 * k)]
  else:
    assert len(directories) == 2 * k
  cv_datasets = []
  train_ds_base = None
  train_datasets = []
  # rem_dataset is remaining portion of dataset
  if isinstance(dataset, DiskDataset):
    rem_dataset = dataset
  else:
    rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
  for fold in range(k):
    # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
    # to k-1.
    frac_fold = 1. / (k - fold)
    train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
    fold_inds, rem_inds, _ = self.split(rem_dataset,
                                        frac_train=frac_fold,
                                        frac_valid=1 - frac_fold,
                                        frac_test=0,
                                        **kwargs)
    cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
    cv_datasets.append(cv_dataset)
    rem_dataset = rem_dataset.select(rem_inds)

    train_ds_to_merge = filter(lambda x: x is not None,
                               [train_ds_base, rem_dataset])
    train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
    train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
    train_datasets.append(train_dataset)

    update_train_base_merge = filter(lambda x: x is not None,
                                     [train_ds_base, cv_dataset])
    train_ds_base = DiskDataset.merge(update_train_base_merge)
  return list(zip(train_datasets, cv_datasets))
def load_dataset(
    self, name: str, reload: bool
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load the dataset.

  Parameters
  ----------
  name: str
    the name of the dataset, used to identify the directory on disk
  reload: bool
    if True, the first call for a particular featurizer and splitter will
    cache the datasets to disk, and subsequent calls will reload the cached
    datasets.
  """
  # Build the path to the dataset on disk.
  featurizer_name = str(self.featurizer)
  splitter_name = 'None' if self.splitter is None else str(self.splitter)
  save_folder = os.path.join(self.save_dir, name + "-featurized",
                             featurizer_name, splitter_name)
  if len(self.transformers) > 0:
    transformer_name = '_'.join(
        t.get_directory_name() for t in self.transformers)
    save_folder = os.path.join(save_folder, transformer_name)

  # Try to reload cached datasets.
  if reload:
    if self.splitter is None:
      if os.path.exists(save_folder):
        transformers = dc.utils.data_utils.load_transformers(save_folder)
        return self.tasks, (DiskDataset(save_folder),), transformers
    else:
      loaded, all_dataset, transformers = dc.utils.data_utils.load_dataset_from_disk(
          save_folder)
      if all_dataset is not None:
        return self.tasks, all_dataset, transformers

  # Create the dataset
  logger.info("About to featurize %s dataset." % name)
  dataset = self.create_dataset()

  # Split and transform the dataset.
  if self.splitter is None:
    transformer_dataset: Dataset = dataset
  else:
    logger.info("About to split dataset with {} splitter.".format(
        self.splitter.__class__.__name__))
    train, valid, test = self.splitter.train_valid_test_split(dataset)
    transformer_dataset = train
  transformers = [
      t.create_transformer(transformer_dataset) for t in self.transformers
  ]
  logger.info("About to transform data.")
  if self.splitter is None:
    for transformer in transformers:
      dataset = transformer.transform(dataset)
    if reload and isinstance(dataset, DiskDataset):
      dataset.move(save_folder)
      dc.utils.data_utils.save_transformers(save_folder, transformers)
    return self.tasks, (dataset,), transformers

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
  if reload and isinstance(train, DiskDataset) and isinstance(
      valid, DiskDataset) and isinstance(test, DiskDataset):
    dc.utils.data_utils.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return self.tasks, (train, valid, test), transformers
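# For context, a hedged sketch of how this caching path is typically exercised
# through the MolNet wrappers, assuming a wrapper such as dc.molnet.load_tox21
# is built on this load_dataset (keyword names vary across DeepChem releases).
import deepchem as dc

# The first call featurizes, splits, transforms, and caches under save_dir;
# later calls with reload=True short-circuit to the cached DiskDatasets.
tasks, (train, valid, test), transformers = dc.molnet.load_tox21(
    featurizer="ECFP", splitter="scaffold", reload=True)
print(tasks)
print(train.get_shape())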
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0,
      feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      ecfp_power=9,
      splif_power=9,
      parallel=True,
      flatten=True,
      verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)

  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
def load_uspto(featurizer="plain",
               split=None,
               num_to_load=10000,
               reload=True,
               verbose=False):
  """Load USPTO dataset.

  For now, only loads the subset of data for 2008-2011 reactions. See
  https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
  for more details. The full dataset contains some 400K reactions. This causes
  an out-of-memory error on a development laptop if the full dataset is
  featurized. For now, return a truncated subset of the dataset.

  Reloading is not entirely supported for this dataset.
  """
  # Most reaction dataset ML tasks train the prediction of products from
  # reactants. Both of these are contained in the rxn object that is output,
  # so there is no "tasks" field.
  uspto_tasks = []
  if split is not None:
    raise ValueError("Train/valid/test not yet supported.")
  # Download USPTO dataset
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "uspto/" + featurizer + "/")
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return uspto_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir,
                              "2008-2011_USPTO_reactionSmiles_filtered.zip")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        "https://bitbucket.org/dan2097/patent-reaction-extraction/downloads/2008-2011_USPTO_reactionSmiles_filtered.zip"
    )

  # Unzip
  unzip_dir = os.path.join(data_dir,
                           "2008-2011_USPTO_reactionSmiles_filtered")
  if not os.path.exists(unzip_dir):
    deepchem.utils.unzip_file(dataset_file, dest_dir=unzip_dir)

  # Unzipped file is a tab-separated values file (despite the .txt)
  filename = os.path.join(unzip_dir,
                          "2008-2011_USPTO_reactionSmiles_filtered.txt")
  rxns = []
  from rdkit.Chem import rdChemReactions
  with open(filename) as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t")
    for ind, row in enumerate(reader):
      if ind > num_to_load:
        break
      if verbose:
        print("Loading reaction %d" % ind)
      # The first element in the row is the reaction smarts
      smarts = row[0]
      # Sometimes smarts have extraneous information at end of form
      # " |f:0" that causes parsing to fail. Not sure what this information
      # is, but just ignoring for now.
      smarts = smarts.split(" ")[0]
      rxn = rdChemReactions.ReactionFromSmarts(smarts)
      rxns.append(rxn)
  rxn_array = np.array(rxns)
  # Make up dummy labels since DiskDataset.from_numpy doesn't allow
  # creation from just features for now.
  y = np.ones(len(rxn_array))
  # TODO: This dataset isn't saved to disk so reload doesn't happen.
  rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
  transformers = []
  return uspto_tasks, (rxn_dataset, None, None), transformers
def featurize(self, input_files, in_memory=True):
  """Featurizes image files.

  Parameters
  ----------
  input_files: list
    Each file in this list should either be of a supported image format
    (.png, .tif only for now) or of a compressed folder of image files
    (only .zip for now).
  in_memory: bool
    If true, return in-memory NumpyDataset. Else return DiskDataset.
  """
  if not isinstance(input_files, list):
    input_files = [input_files]

  image_files = []
  # Sometimes zip files contain directories within. Traverse directories
  while len(input_files) > 0:
    remainder = []
    for input_file in input_files:
      filename, extension = os.path.splitext(input_file)
      # TODO(rbharath): Add support for more extensions
      if os.path.isdir(input_file):
        dirfiles = [
            os.path.join(input_file, subfile)
            for subfile in os.listdir(input_file)
        ]
        remainder += dirfiles
      elif extension == ".zip":
        zip_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(input_file, 'r')
        zip_ref.extractall(path=zip_dir)
        zip_ref.close()
        zip_files = [
            os.path.join(zip_dir, name) for name in zip_ref.namelist()
        ]
        for zip_file in zip_files:
          _, extension = os.path.splitext(zip_file)
          if extension in [".png", ".tif"]:
            image_files.append(zip_file)
      elif extension in [".png", ".tif"]:
        image_files.append(input_file)
      else:
        raise ValueError("Unsupported file format")
    input_files = remainder

  images = []
  for image_file in image_files:
    _, extension = os.path.splitext(image_file)
    if extension == ".png":
      image = misc.imread(image_file)
      images.append(image)
    elif extension == ".tif":
      im = Image.open(image_file)
      imarray = np.array(im)
      images.append(imarray)
    else:
      raise ValueError("Unsupported image filetype for %s" % image_file)
  images = np.array(images)

  if in_memory:
    return NumpyDataset(images)
  else:
    # from_numpy currently requires labels. Make dummy labels
    labels = np.zeros(len(images))
    return DiskDataset.from_numpy(images, labels)
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split."""
  # Set some global variables up top
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0,
      feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #                     "salt_bridge"],
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      ecfp_power=9,
      splif_power=9,
      parallel=True,
      flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  # Keep ids aligned with the rows that featurized successfully.
  ids = ids[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  return tasks, dataset, transformers
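# Hypothetical usage sketch (not part of the original loader): split the
# grid-featurized PDBBind dataset with a DeepChem splitter instead of the
# manual index slicing used further below. Assumes `import deepchem as dc`
# and that pdbbind_dir/base_dir point at a downloaded PDBBind core set.
tasks, dataset, transformers = load_core_pdbbind_grid(pdbbind_dir, base_dir)
splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=.8, frac_valid=.1, frac_test=.1)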
    train_attr.smiles.tolist(),
    valid_attr.smiles.tolist(),
    test_attr.smiles.tolist()
], [])
test.append(len(full_dataset_smiles) == len(set(full_dataset_smiles)))
assert all(test)


#***********************************************************************************
([(train, valid)], test_scaffold, [(train_attr, valid_attr)],
 test_scaffold_attr) = splitter_scaffold.split_dataset(
     data_obj_scaffold.dataset, data_obj_scaffold.attr,
     data_obj_scaffold.params.smiles_col)

dataset_scaffold = DiskDataset.from_numpy(data_obj_scaffold.dataset.X,
                                          data_obj_scaffold.dataset.y,
                                          ids=data_obj_scaffold.attr.index)


def test_select_dset_by_attr_ids_using_smiles():
  # Test that the method can split a dataset according to its attr ids into
  # the correct DeepChem DiskDataset. In this case, the attr_ids are converted
  # back to SMILES to match the input dataset.
  dataset = DiskDataset.from_numpy(
      data_obj_scaffold.dataset.X,
      data_obj_scaffold.dataset.y,
      ids=data_obj_scaffold.attr[data_obj_scaffold.params.smiles_col].values)
  newdf = pd.DataFrame({'compound_ids': test_scaffold_attr.index.tolist()},
                       index=test_scaffold_attr.smiles)
  newDD = split.select_dset_by_attr_ids(dataset, newdf)
  assert (newDD.y == test_scaffold.y).all()
# REPLACE WITH DOWNLOADED PDBBIND EXAMPLE
pdbbind_dir = "/tmp/deep-docking/datasets/pdbbind"
pdbbind_tasks, dataset, transformers = load_core_pdbbind_grid(
    pdbbind_dir, base_dir)

print("About to perform train/valid/test split.")
# Cast to int so the value can be used as a slice index.
num_train = int(.8 * len(dataset))
X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train, w_train,
                                       ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                       ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score,
                               verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks),
    n_features,
    model_dir,
    dropouts=[.25],
    learning_rate=0.0003,
    weight_init_stddevs=[.1],
    batch_size=64,
    verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
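# Hypothetical follow-up sketch (not in the original script): score the fitted
# model on the held-out split with the Pearson R^2 metric defined above,
# assuming the usual Model.evaluate(dataset, metrics, transformers) signature.
valid_scores = model.evaluate(valid_dataset, [classification_metric],
                              transformers)
print("Validation scores: %s" % str(valid_scores))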
def create_dataset(self,
                   inputs: Union[OneOrMany[str], Tuple[Any]],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192,
                   in_memory: bool = False) -> Dataset:
  """Creates and returns a `Dataset` object by featurizing provided image
  files and labels/weights.

  Parameters
  ----------
  inputs: `Union[OneOrMany[str], Tuple[Any]]`
    The inputs provided should be one of the following

    - filename
    - list of filenames
    - Tuple (list of filenames, labels)
    - Tuple (list of filenames, labels, weights)

    Each file in a given list of filenames should either be of a supported
    image format (.png, .tif only for now) or of a compressed folder of
    image files (only .zip for now). If `labels` or `weights` are provided,
    they must correspond to the sorted order of all filenames provided, with
    one label/weight per file.
  data_dir: str, optional
    Directory to store featurized dataset.
  shard_size: int, optional (default 8192)
    Shard size used to reshard the dataset when it is written to disk.
  in_memory: bool
    If True, return an in-memory NumpyDataset (or a DiskDataset when
    `data_dir` is given). Otherwise return an ImageDataset.

  Returns
  -------
  A `Dataset` object containing a featurized representation of data
  from `input_files`, `labels`, and `weights`.
  """
  labels, weights = None, None
  if isinstance(inputs, tuple):
    if len(inputs) == 1:
      input_files = inputs[0]
      # The original checked isinstance(inputs, str) here, which can never be
      # true for a tuple; check the unpacked value instead.
      if isinstance(input_files, str):
        input_files = [input_files]
    elif len(inputs) == 2:
      input_files, labels = inputs
    elif len(inputs) == 3:
      input_files, labels, weights = inputs
    else:
      raise ValueError("Input must be a tuple of length 1, 2, or 3.")
  else:
    input_files = inputs
  if isinstance(input_files, str):
    input_files = [input_files]

  image_files = []
  # Sometimes zip files contain directories within. Traverse directories.
  while len(input_files) > 0:
    remainder = []
    for input_file in input_files:
      filename, extension = os.path.splitext(input_file)
      extension = extension.lower()
      # TODO(rbharath): Add support for more extensions
      if os.path.isdir(input_file):
        dirfiles = [
            os.path.join(input_file, subfile)
            for subfile in os.listdir(input_file)
        ]
        remainder += dirfiles
      elif extension == ".zip":
        zip_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(input_file, 'r')
        zip_ref.extractall(path=zip_dir)
        # Read the name list before closing the archive.
        zip_files = [
            os.path.join(zip_dir, name) for name in zip_ref.namelist()
        ]
        zip_ref.close()
        for zip_file in zip_files:
          _, extension = os.path.splitext(zip_file)
          extension = extension.lower()
          if extension in [".png", ".tif"]:
            image_files.append(zip_file)
      elif extension in [".png", ".tif"]:
        image_files.append(input_file)
      else:
        raise ValueError("Unsupported file format")
    input_files = remainder

  # Sort image files so labels/weights line up with a deterministic order.
  image_files = sorted(image_files)

  if in_memory:
    if data_dir is None:
      return NumpyDataset(self.load_img(image_files),
                          y=labels,
                          w=weights,
                          ids=image_files)
    else:
      dataset = DiskDataset.from_numpy(self.load_img(image_files),
                                       y=labels,
                                       w=weights,
                                       ids=image_files,
                                       tasks=self.tasks,
                                       data_dir=data_dir)
      if shard_size is not None:
        dataset.reshard(shard_size)
      return dataset
  else:
    return ImageDataset(image_files, y=labels, w=weights, ids=image_files)
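# Hypothetical usage sketch for create_dataset. The ImageLoader class name,
# its constructor arguments, and the file names/labels are assumptions for
# illustration only; the original snippet does not show the enclosing class.
labels = np.array([10., 12., 7.])
loader = ImageLoader(tasks=["cell_count"])
dataset = loader.create_dataset((["img0.png", "img1.png", "img2.png"], labels),
                                in_memory=False)
print(len(dataset), dataset.get_shape())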
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5,
              # The original fragment referenced frac_train/frac_valid/frac_test
              # without defining them; default fractions are assumed here.
              frac_train=0.8,
              frac_valid=0.1,
              frac_test=0.1):
  # data_dir = os.path.dirname(dataset_file)
  save_dir = os.path.join(
      os.path.dirname(dataset_file),
      '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
      "split")
  train, valid, test = os.path.join(save_dir, 'train'), os.path.join(
      save_dir, 'valid'), os.path.join(save_dir, 'test')

  # Read the task names (every column except SMILES) from the CSV header.
  fopen = open(dataset_file, "r")
  ss = fopen.readlines()
  fopen.close()
  m = ss[0].strip('\n').split(',')
  m.remove('SMILES')

  if os.path.isdir(save_dir):
    if reload:
      dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
          data_dir=save_dir), DiskDataset(data_dir=train), DiskDataset(
              data_dir=valid), DiskDataset(data_dir=test)
      transformers = [
          deepchem.trans.NormalizationTransformer(transform_w=True,
                                                  dataset=train_dataset)
      ]
      all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
      return m, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'AdjacencyConv':
    featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                    max_valence=6)
  elif featurizer == 'SelfDefine':
    # NOTE: feature_field must be defined by the calling module for this branch.
    featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)

  loader = deepchem.data.CSVLoader(tasks=m,
                                   smiles_field="SMILES",
                                   featurizer=featurizer)
  dataset = loader.featurize(dataset_file, data_dir=save_dir, shard_size=8192)
  # dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  if transformers:
    transformers = [
        deepchem.trans.NormalizationTransformer(transform_w=True,
                                                dataset=dataset)
    ]
    for transformer in transformers:
      dataset = transformer.transform(dataset)
  else:
    transformers = []

  # Harmonious_positive and OnePositiveSplit are custom splitters assumed to
  # be defined elsewhere in this module.
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter(),
      'task': deepchem.splits.TaskSplitter(),
      'Harmonious_positive': Harmonious_positive(),
      'OnePositiveSplit': OnePositiveSplit()
  }
  splitter = splitters[sep]

  if sep == 'task':
    fold_datasets = splitter.k_fold_split(dataset, K)
    all_dataset = fold_datasets
  elif sep == 'Harmonious_positive':
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                           train_dataset.y,
                                           train_dataset.w,
                                           train_dataset.ids,
                                           dataset.tasks,
                                           data_dir=train)
    valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                           valid_dataset.y,
                                           valid_dataset.w,
                                           valid_dataset.ids,
                                           dataset.tasks,
                                           data_dir=valid)
    test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                          test_dataset.y,
                                          test_dataset.w,
                                          test_dataset.ids,
                                          dataset.tasks,
                                          data_dir=test)
    all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
  elif sep == 'Harmonious_positive' and K:
    # NOTE: this branch is shadowed by the plain 'Harmonious_positive' branch
    # above and is unreachable as written.
    # train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    #     dataset,
    #     frac_train=frac_train,
    #     frac_valid=0,
    #     frac_test=1 - frac_train,
    # )
    # train_dataset = DiskDataset.from_numpy(train_dataset.X, train_dataset.y,
    #                                        train_dataset.w, train_dataset.ids,
    #                                        dataset.tasks, data_dir=train)
    # train_dataset.reshard(8192)
    # test_dataset = DiskDataset.from_numpy(test_dataset.X, test_dataset.y,
    #                                       test_dataset.w, test_dataset.ids,
    #                                       dataset.tasks, data_dir=test)
    # test_dataset.reshard(8192)
    # fold_dataset = splitter.k_fold_split(
    #     train_dataset, K,
    #     directories=[os.path.join(valid, str(i)) for i in range(K)],
    #     verbose=True)
    fold_dataset = splitter.k_fold_split(
        dataset,
        K,
        directories=[os.path.join(valid, str(i)) for i in range(K)],
        verbose=True)
    folds = []
    for i in range(K):
      print('merge fold dataset {}...'.format(i))
      train_fold = DiskDataset.merge(
          [fold_dataset[j] for j in range(K) if j != i],
          merge_dir=os.path.join(valid, str(i), 'train_fold'))
      test_fold = DiskDataset.merge(
          [fold_dataset[i]],
          merge_dir=os.path.join(valid, str(i), 'valid_fold'))
      folds.append([train_fold, test_fold])
    all_dataset = (dataset, [], folds, [])
  else:
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset,
        train_dir=train,
        valid_dir=valid,
        test_dir=test,
        frac_train=frac_train,
        frac_valid=frac_valid,
        frac_test=frac_test)
    all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
  # else:
  #   train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
  #       dataset, train_dir=train, valid_dir=valid, test_dir=test)
  #   all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
  # if reload:
  #   deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
  #                                            transformers)
  return m, all_dataset, transformers
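# Hypothetical usage sketch for load_gpcr (the CSV path is made up; assumes a
# file with a SMILES column plus one activity column per GPCR task, and that
# the custom Harmonious_positive splitter is available in this module).
tasks, (full, train_ds, valid_ds, test_ds), transformers = load_gpcr(
    "gpcr_activity.csv", featurizer='ECFP', sep='Harmonious_positive')
print("Tasks:", tasks)
print("Train/valid/test sizes:", len(train_ds), len(valid_ds), len(test_ds))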