def transform(self, dataset):
  """Performs power transform on data."""
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  w_t = w
  ids_t = ids
  n_powers = len(self.powers)
  if self.transform_X:
    X_t = np.power(X, self.powers[0])
    for i in range(1, n_powers):
      X_t = np.hstack((X_t, np.power(X, self.powers[i])))
    y_t = y
  if self.transform_y:
    print("y will not be transformed by PowerTransformer, for now.")
    """
    y_t = np.power(y, self.powers[0])
    for i in range(1, n_powers):
      y_t = np.hstack((y_t, np.power(y, self.powers[i])))
    X_t = X
    """

  # TODO (rbharath): Find a more elegant solution to saving the data?
  shutil.rmtree(dataset.data_dir)
  os.makedirs(dataset.data_dir)
  DiskDataset.from_numpy(X_t, y_t, w_t, ids_t, data_dir=dataset.data_dir)
  return dataset
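# A minimal usage sketch (not part of the source above): applying a power
# transform to a small DiskDataset built with from_numpy. Assumes a
# deepchem-style PowerTransformer taking `transform_X` and `powers`
# arguments; adjust the constructor to match the installed version.
import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset

X = np.random.rand(4, 3)
y = np.random.rand(4, 1)
dataset = DiskDataset.from_numpy(X, y)

transformer = dc.trans.PowerTransformer(transform_X=True, powers=[1, 2, 0.5])
transformed = transformer.transform(dataset)
# Each feature column appears once per power, so the width triples here.
print(transformed.X.shape)  # expected: (4, 9)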
def test_fit(self):
  tf_enc = TensorflowMoleculeEncoder.zinc_encoder()

  smiles = [
      "Cn1cnc2c1c(=O)n(C)c(=O)n2C", "O=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N21",
      "Cn1c2nncnc2c(=O)n(C)c1=O", "Cn1cnc2c1c(=O)[nH]c(=O)n2C",
      "NC(=O)c1ncc[nH]c1=O", "O=C1OCc2c1[nH]c(=O)[nH]c2=O",
      "Cn1c(N)c(N)c(=O)n(C)c1=O", "CNc1nc2c([nH]1)c(=O)[nH]c(=O)n2C",
      "CC(=O)N1CN(C(C)=O)[C@@H](O)[C@@H]1O",
      "CC(=O)N1CN(C(C)=O)[C@H](O)[C@H]1O", "Cc1[nH]c(=O)[nH]c(=O)c1CO",
      "O=C1NCCCc2c1no[n+]2[O-]", "Cc1nc(C(N)=O)c(N)n1CCO",
      "O=c1[nH]cc(N2CCOCC2)c(=O)[nH]1"
  ]

  featurizer = dc.feat.one_hot.OneHotFeaturizer(zinc_charset, 120)
  mols = [Chem.MolFromSmiles(x) for x in smiles]
  features = featurizer.featurize(mols)

  dataset = DiskDataset.from_numpy(features, features)
  prediction = tf_enc.predict_on_batch(dataset.X)
  tf_de = TensorflowMoleculeDecoder.zinc_decoder()
  one_hot_decoded = tf_de.predict_on_batch(prediction)
  decoded_smiles = featurizer.untransform(one_hot_decoded)
  assert_equals(len(decoded_smiles), len(smiles))
def featurize_complexes(self, mol_files, protein_files):
  pool = multiprocessing.Pool()
  results = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    log_message = "Featurizing %d / %d" % (i, len(mol_files))
    results.append(
        pool.apply_async(_featurize_complex,
                         (self, mol_file, protein_pdb, log_message)))
  pool.close()
  features = []
  failures = []
  for ind, result in enumerate(results):
    new_features = result.get()
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(ind)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = list()
  for layer in self.atomic_conv_model.layers.values():
    if isinstance(layer, dc.models.atomic_conv.AtomicConvolution):
      layers_to_fetch.append(layer)

  # Extract the atomic convolution features
  atomic_conv_features = list()
  feed_dict_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for feed_dict in self.atomic_conv_model._create_feed_dicts(
      feed_dict_generator, training=False):
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
        outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
    concatenated = np.concatenate(
        [frag1_conv, frag2_conv, complex_conv], axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)

  return atomic_conv_features, failures
def featurize_complexes(self, mol_files, protein_files):
  pool = multiprocessing.Pool()
  results = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    log_message = "Featurizing %d / %d" % (i, len(mol_files))
    results.append(
        pool.apply_async(_featurize_complex,
                         (self, mol_file, protein_pdb, log_message)))
  pool.close()
  features = []
  failures = []
  for ind, result in enumerate(results):
    new_features = result.get()
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(ind)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = list()
  for layer in self.atomic_conv_model.layers.values():
    if isinstance(layer,
                  dc.models.tensorgraph.models.atomic_conv.AtomicConvolution):
      layers_to_fetch.append(layer)

  # Extract the atomic convolution features
  atomic_conv_features = list()
  feed_dict_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for feed_dict in self.atomic_conv_model._create_feed_dicts(
      feed_dict_generator, training=False):
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model._run_graph(
        outputs=layers_to_fetch, feed_dict=feed_dict, training=False)
    concatenated = np.concatenate(
        [frag1_conv, frag2_conv, complex_conv], axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)

  return atomic_conv_features, failures
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """
  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to do a k-fold split
  k: int
    Number of folds to split `dataset` into.
  directories: list[str]
    list of length 2*k filepaths to save the result disk-datasets

  Returns
  -------
  list of length k tuples of (train, cv) where `train` and `cv` are both
  `Dataset`s.
  """
  logger.info("Computing K-fold split")
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(2 * k)]
  else:
    assert len(directories) == 2 * k
  cv_datasets = []
  train_ds_base = None
  train_datasets = []
  # rem_dataset is remaining portion of dataset
  if isinstance(dataset, DiskDataset):
    rem_dataset = dataset
  else:
    rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
  for fold in range(k):
    # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
    # to k-1.
    frac_fold = 1. / (k - fold)
    train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
    fold_inds, rem_inds, _ = self.split(rem_dataset,
                                        frac_train=frac_fold,
                                        frac_valid=1 - frac_fold,
                                        frac_test=0,
                                        **kwargs)
    cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
    cv_datasets.append(cv_dataset)
    rem_dataset = rem_dataset.select(rem_inds)

    train_ds_to_merge = filter(lambda x: x is not None,
                               [train_ds_base, rem_dataset])
    train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
    train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
    train_datasets.append(train_dataset)

    update_train_base_merge = filter(lambda x: x is not None,
                                     [train_ds_base, cv_dataset])
    train_ds_base = DiskDataset.merge(update_train_base_merge)
  return list(zip(train_datasets, cv_datasets))
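# A short usage sketch (not part of the source above): exercising
# k_fold_split() through deepchem's stock RandomSplitter on a small
# DiskDataset; each fold is a (train, cv) pair of Datasets.
import numpy as np
import deepchem as dc
from deepchem.data import DiskDataset

X = np.random.rand(20, 5)
y = np.random.rand(20, 1)
dataset = DiskDataset.from_numpy(X, y)

splitter = dc.splits.RandomSplitter()
folds = splitter.k_fold_split(dataset, k=4)
for train, cv in folds:
  print(len(train), len(cv))  # roughly 15 / 5 per fold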
def test_select_attrs_by_dset_smiles():
  # Testing that the method can split an attr according to a disk dataset. In
  # this case, the attr_ids need to be converted back to smiles to match the
  # input dataset.
  dataset = DiskDataset.from_numpy(
      test_scaffold.X,
      test_scaffold.y,
      ids=test_scaffold_attr[data_obj_scaffold.params.smiles_col].values)
  newDD = split.select_attrs_by_dset_smiles(
      dataset, data_obj_scaffold.attr, data_obj_scaffold.params.smiles_col)
  assert newDD.equals(test_scaffold_attr)
def test_select_dset_by_attr_ids_using_smiles():
  # Testing that the method can split a dataset according to its attr ids into
  # the correct deepchem DiskDataset. In this case, the attr_ids are converted
  # back to smiles to match the input dataset.
  dataset = DiskDataset.from_numpy(
      data_obj_scaffold.dataset.X,
      data_obj_scaffold.dataset.y,
      ids=data_obj_scaffold.attr[data_obj_scaffold.params.smiles_col].values)
  newdf = pd.DataFrame({'compound_ids': test_scaffold_attr.index.tolist()},
                       index=test_scaffold_attr.smiles)
  newDD = split.select_dset_by_attr_ids(dataset, newdf)
  assert (newDD.y == test_scaffold.y).all()
def split(self, dataset, frac_split, split_dirs=None):
  """
  Method that does bulk of splitting dataset.
  """
  if split_dirs is not None:
    assert len(split_dirs) == 2
  else:
    split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

  # Handle edge case where frac_split is 1
  if frac_split == 1:
    dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                       dataset.ids)
    dataset_2 = None
    return dataset_1, dataset_2
  X, y, w, ids = randomize_arrays((dataset.X, dataset.y, dataset.w,
                                   dataset.ids))
  if len(y.shape) == 1:
    y = np.expand_dims(y, 1)
  if len(w.shape) == 1:
    w = np.expand_dims(w, 1)
  split_indices = self.get_task_split_indices(y, w, frac_split)

  # Create weight matrices for the two halves.
  w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
  for task, split_index in enumerate(split_indices):
    # Copy over up to required index for weight first_split
    w_1[:split_index, task] = w[:split_index, task]
    w_2[split_index:, task] = w[split_index:, task]

  # Check whether any rows in either w_1 or w_2 are just zeros
  rows_1 = w_1.any(axis=1)
  X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
  dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

  rows_2 = w_2.any(axis=1)
  X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
  dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)

  return dataset_1, dataset_2
def featurize_complexes(self, mol_files, protein_files):
  features = []
  failures = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_files)):
    logging.info("Featurizing %d / %d" % (i, len(mol_files)))
    new_features = self._featurize_complex(mol_file, protein_pdb)
    # Handle loading failures which return None
    if new_features is not None:
      features.append(new_features)
    else:
      failures.append(i)
  features = np.asarray(features)
  labels = np.delete(self.labels, failures)
  dataset = DiskDataset.from_numpy(features, labels)

  # Fit atomic conv model
  self.atomic_conv_model.fit(dataset, nb_epoch=self.epochs)

  # Add the Atomic Convolution layers to fetches
  layers_to_fetch = [
      self.atomic_conv_model._frag1_conv, self.atomic_conv_model._frag2_conv,
      self.atomic_conv_model._complex_conv
  ]

  # Extract the atomic convolution features
  atomic_conv_features = list()
  batch_generator = self.atomic_conv_model.default_generator(
      dataset=dataset, epochs=1)
  for X, y, w in batch_generator:
    frag1_conv, frag2_conv, complex_conv = self.atomic_conv_model.predict_on_generator(
        [(X, y, w)], outputs=layers_to_fetch)
    concatenated = np.concatenate(
        [frag1_conv, frag2_conv, complex_conv], axis=1)
    atomic_conv_features.append(concatenated)

  batch_size = self.atomic_conv_model.batch_size
  if len(features) % batch_size != 0:
    num_batches = (len(features) // batch_size) + 1
    num_to_skip = num_batches * batch_size - len(features)
  else:
    num_to_skip = 0

  atomic_conv_features = np.asarray(atomic_conv_features)
  atomic_conv_features = atomic_conv_features[-num_to_skip:]
  atomic_conv_features = np.squeeze(atomic_conv_features)

  return atomic_conv_features, failures
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
  neighbor_cutoff = 4
  max_num_neighbors = 10

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors,
                                                    neighbor_cutoff)

  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  X = np.array(features, dtype=object)
  w = np.ones_like(y)

  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
def split_dataset(self, dataset, attr_df, smiles_col):
  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold
      and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    raise Exception("Duplicate ids or smiles in the dataset")

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(dataset.X,
                                     dataset.y,
                                     ids=attr_df[smiles_col].values,
                                     verbose=False)

  # Under k-fold CV, the training/validation splits are determined by num_folds; only the test
  # set fraction is directly specified through command line parameters. If we use Butina
  # splitting, we can't control the test set size either.
  train_frac = 1.0 - self.params.split_test_frac

  # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
  # training set to split it into training/validation folds.
  if self.split == 'butina':
    # TODO: Splitter.train_test_split() doesn't provide a way to pass the cutoff parameter
    # through to the ButinaSplitter.split() function. Simple fix would be to reimplement
    # train_test_split() here (it's not a complicated function). For now, allow cutoff to default.
    #train_cv, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
    train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
  else:
    # TODO: Add special handling for AVE splitter
    train_cv, test = self.splitter.train_test_split(
        dataset, seed=np.random.seed(123), frac_train=train_frac)
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

  train_valid_dsets = []
  train_valid_attr = []

  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    for train, valid in train_cv_pairs:
      train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
      train = DiskDataset.from_numpy(train.X,
                                     train.y,
                                     ids=train_attr.index.values,
                                     verbose=False)

      valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
      valid = DiskDataset.from_numpy(valid.X,
                                     valid.y,
                                     ids=valid_attr.index.values,
                                     verbose=False)

      train_valid_dsets.append((train, valid))
      train_valid_attr.append((train_attr, valid_attr))

    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(test.X,
                                  test.y,
                                  ids=test_attr.index.values,
                                  verbose=False)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    for train, valid in train_cv_pairs:
      train_attr = select_attrs_by_dset_ids(train, attr_df)
      valid_attr = select_attrs_by_dset_ids(valid, attr_df)
      train_valid_attr.append((train_attr, valid_attr))
    train_valid_dsets = train_cv_pairs
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  return train_valid_dsets, test, train_valid_attr, test_attr
# REPLACE WITH DOWNLOADED PDBBIND EXAMPLE
pdbbind_dir = "/tmp/deep-docking/datasets/pdbbind"
pdbbind_tasks, dataset, transformers = load_core_pdbbind_grid(
    pdbbind_dir, base_dir)

print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train, w_train,
                                       ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid, w_valid,
                                       ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score,
                               verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(len(pdbbind_tasks),
                                                n_features,
                                                model_dir,
                                                dropouts=[.25],
                                                learning_rate=0.0003,
                                                weight_init_stddevs=[.1],
                                                batch_size=64,
                                                verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
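# An alternative sketch (not part of the original example): the same 80/20
# train/valid split done with a stock deepchem splitter instead of manual
# array slicing; output directories default to temporary folders.
import deepchem as dc

splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset = splitter.train_test_split(dataset,
                                                         frac_train=.8)
print(len(train_dataset), len(valid_dataset))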
def split_dataset(self, dataset, attr_df, smiles_col):
  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold
      and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  dataset_dup = False
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    log.info("Duplicate ids or smiles in the dataset, will deduplicate first "
             "and assign all records per compound ID to same partition")
    dataset_dup = True
    dataset_ori = copy.deepcopy(dataset)
    id_df = pd.DataFrame({
        'indices': np.arange(len(dataset.ids), dtype=np.int32),
        "compound_id": [str(e) for e in dataset.ids]
    })
    sel_df = id_df.drop_duplicates(subset="compound_id")
    dataset = dataset.select(sel_df.indices.values)

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(
        dataset.X,
        dataset.y,
        w=dataset.w,
        ids=attr_df.drop_duplicates(subset=smiles_col)[smiles_col].values)
    if dataset_dup:
      dataset_ori = DiskDataset.from_numpy(dataset_ori.X,
                                           dataset_ori.y,
                                           w=dataset_ori.w,
                                           ids=attr_df[smiles_col].values)

  # Under k-fold CV, the training/validation splits are determined by num_folds; only the test
  # set fraction is directly specified through command line parameters. If we use Butina
  # splitting, we can't control the test set size either.
  train_frac = 1.0 - self.params.split_test_frac

  # Use DeepChem train_test_split() to select held-out test set; then use k_fold_split on the
  # training set to split it into training/validation folds.
  if self.split == 'butina':
    train_cv, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
  else:
    # TODO: Add special handling for AVE splitter
    train_cv, test = self.splitter.train_test_split(
        dataset, seed=np.random.seed(123), frac_train=train_frac)
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

  train_valid_dsets = []
  train_valid_attr = []

  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    for train, valid in train_cv_pairs:
      # Assign the subsets to the original dataset if duplicated compounds exist
      if dataset_dup:
        train = select_dset_by_id_list(dataset_ori, train.ids)
        valid = select_dset_by_id_list(dataset_ori, valid.ids)
      train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
      train = DiskDataset.from_numpy(train.X,
                                     train.y,
                                     w=train.w,
                                     ids=train_attr.index.values)

      valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
      valid = DiskDataset.from_numpy(valid.X,
                                     valid.y,
                                     w=valid.w,
                                     ids=valid_attr.index.values)

      train_valid_dsets.append((train, valid))
      train_valid_attr.append((train_attr, valid_attr))

    if dataset_dup:
      test = select_dset_by_id_list(dataset_ori, test.ids)
    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(test.X,
                                  test.y,
                                  w=test.w,
                                  ids=test_attr.index.values)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    for train, valid in train_cv_pairs:
      if dataset_dup:
        train = select_dset_by_id_list(dataset_ori, train.ids)
        valid = select_dset_by_id_list(dataset_ori, valid.ids)
      train_attr = select_attrs_by_dset_ids(train, attr_df)
      valid_attr = select_attrs_by_dset_ids(valid, attr_df)
      train_valid_attr.append((train_attr, valid_attr))
    train_valid_dsets = train_cv_pairs
    if dataset_dup:
      test = select_dset_by_id_list(dataset_ori, test.ids)
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  return train_valid_dsets, test, train_valid_attr, test_attr
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """
  Parameters
  ----------
  dataset: Dataset
    Dataset to do a k-fold split
  k: int
    number of folds
  directories: list of str
    list of length 2*k filepaths to save the result disk-datasets

  Returns
  -------
  list of length k tuples of (train, cv)
  """
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(2 * k)]
  else:
    assert len(directories) == 2 * k
  cv_datasets = []
  train_ds_base = None
  train_datasets = []
  # rem_dataset is remaining portion of dataset
  if isinstance(dataset, DiskDataset):
    rem_dataset = dataset
  else:
    rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
  for fold in range(k):
    # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
    # to k-1.
    frac_fold = 1. / (k - fold)
    train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
    fold_inds, rem_inds, _ = self.split(
        rem_dataset,
        frac_train=frac_fold,
        frac_valid=1 - frac_fold,
        frac_test=0,
        **kwargs)
    cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
    cv_datasets.append(cv_dataset)
    rem_dataset = rem_dataset.select(rem_inds)

    train_ds_to_merge = filter(lambda x: x is not None,
                               [train_ds_base, rem_dataset])
    train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
    train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
    train_datasets.append(train_dataset)

    update_train_base_merge = filter(lambda x: x is not None,
                                     [train_ds_base, cv_dataset])
    train_ds_base = DiskDataset.merge(update_train_base_merge)
  return list(zip(train_datasets, cv_datasets))
def load_uspto(featurizer="plain",
               split=None,
               num_to_load=10000,
               reload=True,
               verbose=False):
  """Load USPTO dataset.

  For now, only loads the subset of data for 2008-2011 reactions. See
  https://figshare.com/articles/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873
  for more details. The full dataset contains some 400K reactions. This causes
  an out-of-memory error on a development laptop if the full dataset is
  featurized. For now, return a truncated subset of the dataset. Reloading is
  not entirely supported for this dataset.
  """
  # Most reaction dataset ML tasks train the prediction of products from
  # reactants. Both of these are contained in the rxn object that is output,
  # so there is no "tasks" field.
  uspto_tasks = []
  if split is not None:
    raise ValueError("Train/valid/test not yet supported.")
  # Download USPTO dataset
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "uspto/" + featurizer + "/")
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return uspto_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir,
                              "2008-2011_USPTO_reactionSmiles_filtered.zip")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        "https://bitbucket.org/dan2097/patent-reaction-extraction/downloads/2008-2011_USPTO_reactionSmiles_filtered.zip"
    )

  # Unzip
  unzip_dir = os.path.join(data_dir, "2008-2011_USPTO_reactionSmiles_filtered")
  if not os.path.exists(unzip_dir):
    deepchem.utils.unzip_file(dataset_file, dest_dir=unzip_dir)

  # Unzipped file is a tab-separated values file (despite the .txt)
  filename = os.path.join(unzip_dir,
                          "2008-2011_USPTO_reactionSmiles_filtered.txt")
  rxns = []
  from rdkit.Chem import rdChemReactions
  with open(filename) as tsvfile:
    reader = csv.reader(tsvfile, delimiter="\t")
    for ind, row in enumerate(reader):
      if ind > num_to_load:
        break
      if verbose:
        print("Loading reaction %d" % ind)
      # The first element in the row is the reaction smarts
      smarts = row[0]
      # Sometimes smarts have extraneous information at end of form "
      # |f:0" that causes parsing to fail. Not sure what this information
      # is, but just ignoring for now.
      smarts = smarts.split(" ")[0]
      rxn = rdChemReactions.ReactionFromSmarts(smarts)
      rxns.append(rxn)
  rxn_array = np.array(rxns)
  # Make up dummy labels since DiskDataset.from_numpy doesn't allow
  # creation from just features for now.
  y = np.ones(len(rxn_array))
  # TODO: This dataset isn't saved to disk so reload doesn't happen.
  rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
  transformers = []
  return uspto_tasks, (rxn_dataset, None, None), transformers
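# A minimal sketch (not part of the loader above) of the same pattern on a
# single hand-written reaction SMARTS: parse with RDKit, then wrap the
# reaction objects in a DiskDataset with dummy labels. The SMARTS string here
# is illustrative only.
import numpy as np
from rdkit.Chem import rdChemReactions
from deepchem.data import DiskDataset

smarts = "[CH3:1][OH:2]>>[CH3:1][O-:2]"  # illustrative reaction SMARTS
rxn = rdChemReactions.ReactionFromSmarts(smarts)
rxn_array = np.array([rxn])
y = np.ones(len(rxn_array))  # dummy labels, as in load_uspto above
rxn_dataset = DiskDataset.from_numpy(rxn_array, y)
print(len(rxn_dataset))  # 1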
def featurize(self, input_files, in_memory=True):
  """Featurizes image files.

  Parameters
  ----------
  input_files: list
    Each file in this list should either be of a supported image format
    (.png, .tif only for now) or of a compressed folder of image files
    (only .zip for now).
  in_memory: bool
    If true, return in-memory NumpyDataset. Else return DiskDataset.
  """
  if not isinstance(input_files, list):
    input_files = [input_files]

  image_files = []
  # Sometimes zip files contain directories within. Traverse directories
  while len(input_files) > 0:
    remainder = []
    for input_file in input_files:
      filename, extension = os.path.splitext(input_file)
      # TODO(rbharath): Add support for more extensions
      if os.path.isdir(input_file):
        dirfiles = [
            os.path.join(input_file, subfile)
            for subfile in os.listdir(input_file)
        ]
        remainder += dirfiles
      elif extension == ".zip":
        zip_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(input_file, 'r')
        zip_ref.extractall(path=zip_dir)
        zip_ref.close()
        zip_files = [
            os.path.join(zip_dir, name) for name in zip_ref.namelist()
        ]
        for zip_file in zip_files:
          _, extension = os.path.splitext(zip_file)
          if extension in [".png", ".tif"]:
            image_files.append(zip_file)
      elif extension in [".png", ".tif"]:
        image_files.append(input_file)
      else:
        raise ValueError("Unsupported file format")
    input_files = remainder

  images = []
  for image_file in image_files:
    _, extension = os.path.splitext(image_file)
    if extension == ".png":
      image = misc.imread(image_file)
      images.append(image)
    elif extension == ".tif":
      im = Image.open(image_file)
      imarray = np.array(im)
      images.append(imarray)
    else:
      raise ValueError("Unsupported image filetype for %s" % image_file)
  images = np.array(images)

  if in_memory:
    return NumpyDataset(images)
  else:
    # from_numpy currently requires labels. Make dummy labels
    labels = np.zeros(len(images))
    return DiskDataset.from_numpy(images, labels)
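# A short usage sketch (not part of the method above): loading a couple of
# image files by hand and wrapping them in a DiskDataset with dummy labels.
# The file names are placeholders, and Pillow stands in for the deprecated
# scipy.misc.imread used above.
import numpy as np
from PIL import Image
from deepchem.data import DiskDataset

image_files = ["cell_01.png", "cell_02.png"]  # hypothetical inputs
images = np.array([np.array(Image.open(f)) for f in image_files])
labels = np.zeros(len(images))  # dummy labels, as in featurize()
image_dataset = DiskDataset.from_numpy(images, labels, ids=image_files)
print(image_dataset.X.shape)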
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5):
  # data_dir = os.path.dirname(dataset_file)
  save_dir = os.path.join(
      os.path.dirname(dataset_file),
      '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
      "split")
  train, valid, test = os.path.join(save_dir, 'train'), os.path.join(
      save_dir, 'valid'), os.path.join(save_dir, 'test')

  fopen = open(dataset_file, "r")
  ss = fopen.readlines()
  m = ss[0].strip('\n').split(',')
  m.remove('SMILES')

  if os.path.isdir(save_dir):
    if reload:
      dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
          data_dir=save_dir), DiskDataset(data_dir=train), DiskDataset(
              data_dir=valid), DiskDataset(data_dir=test)
      transformers = [
          deepchem.trans.NormalizationTransformer(transform_w=True,
                                                  dataset=train_dataset)
      ]
      all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
      return m, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'AdjacencyConv':
    featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                    max_valence=6)
  elif featurizer == 'SelfDefine':
    featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)

  loader = deepchem.data.CSVLoader(tasks=m,
                                   smiles_field="SMILES",
                                   featurizer=featurizer)
  dataset = loader.featurize(dataset_file, data_dir=save_dir, shard_size=8192)
  # dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  if transformers:
    transformers = [
        deepchem.trans.NormalizationTransformer(transform_w=True,
                                                dataset=dataset)
    ]
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter(),
      'task': deepchem.splits.TaskSplitter(),
      'Harmonious_positive': Harmonious_positive(),
      'OnePositiveSplit': OnePositiveSplit()
  }
  splitter = splitters[sep]

  if sep == 'task':
    fold_datasets = splitter.k_fold_split(dataset, K)
    all_dataset = fold_datasets
  elif sep == 'Harmonious_positive':
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)
    train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                           train_dataset.y,
                                           train_dataset.w,
                                           train_dataset.ids,
                                           dataset.tasks,
                                           data_dir=train)
    valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                           valid_dataset.y,
                                           valid_dataset.w,
                                           valid_dataset.ids,
                                           dataset.tasks,
                                           data_dir=valid)
    test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                          test_dataset.y,
                                          test_dataset.w,
                                          test_dataset.ids,
                                          dataset.tasks,
                                          data_dir=test)
    all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
  elif sep == 'Harmonious_positive' and K:
    # train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    #     dataset,
    #     frac_train=frac_train,
    #     frac_valid=0,
    #     frac_test=1 - frac_train,
    # )
    # train_dataset = DiskDataset.from_numpy(train_dataset.X, train_dataset.y, train_dataset.w,
    #                                        train_dataset.ids, dataset.tasks, data_dir=train)
    # train_dataset.reshard(8192)
    # test_dataset = DiskDataset.from_numpy(test_dataset.X, test_dataset.y, test_dataset.w,
    #                                       test_dataset.ids, dataset.tasks, data_dir=test)
    # test_dataset.reshard(8192)
    # fold_dataset = splitter.k_fold_split(
    #     train_dataset, K, directories=[os.path.join(valid, str(i)) for i in range(K)], verbose=True)
    fold_dataset = splitter.k_fold_split(
        dataset,
        K,
        directories=[os.path.join(valid, str(i)) for i in range(K)],
        verbose=True)
    folds = []
    for i in range(K):
      print('merge fold dataset {}...'.format(i))
      train_fold = DiskDataset.merge(
          [fold_dataset[j] for j in range(K) if j != i],
          merge_dir=os.path.join(valid, str(i), 'train_fold'))
      test_fold = DiskDataset.merge([fold_dataset[i]],
                                    merge_dir=os.path.join(
                                        valid, str(i), 'valid_fold'))
      folds.append([train_fold, test_fold])
    all_dataset = (dataset, [], folds, [])
  else:
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset,
        train_dir=train,
        valid_dir=valid,
        test_dir=test,
        frac_train=frac_train,
        frac_valid=frac_valid,
        frac_test=frac_test)
    all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
  # else:
  #   train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
  #       dataset, train_dir=train, valid_dir=valid, test_dir=test)
  #   all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)

  # if reload:
  #   deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test, transformers)
  return m, all_dataset, transformers
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0,
      feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      ecfp_power=9,
      splif_power=9,
      parallel=True,
      flatten=True,
      verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)

  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
def split_dataset(self, dataset, attr_df, smiles_col):
  # smiles_col is a hack for now until deepchem fixes their scaffold and butina splitters
  """Splits dataset into training, testing and validation sets.

  For ave_min, random, scaffold, and index splits, self.params.split_valid_frac and
  self.params.split_test_frac should be defined, and
  train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac.

  For butina split, test size is not user defined, and depends on the available
  clusters that qualify for placement in the test set:
  train_frac = 1.0 - self.params.split_valid_frac.

  For temporal split, test size is also not user defined, and depends on the number
  of compounds with dates after the cutoff date:
  train_frac = 1.0 - self.params.split_valid_frac.

  Args:
    dataset (deepchem Dataset): full featurized dataset
    attr_df (Pandas DataFrame): dataframe containing SMILES strings indexed by compound IDs
    smiles_col (string): name of SMILES column (hack for now until deepchem fixes scaffold
      and butina splitters)

  Returns:
    [(train, valid)], test, [(train_attr, valid_attr)], test_attr:
    train (deepchem Dataset): training dataset.
    valid (deepchem Dataset): validation dataset.
    test (deepchem Dataset): testing dataset.
    train_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for training set.
    valid_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for validation set.
    test_attr (Pandas DataFrame): dataframe of SMILES strings indexed by compound IDs for test set.

  Raises:
    Exception if there are duplicate ids or smiles strings in the dataset or the attr_df
  """
  if check_if_dupe_smiles_dataset(dataset, attr_df, smiles_col):
    raise Exception("Duplicate ids or smiles in the dataset")
  log.warning("Splitting data by %s" % self.params.splitter)

  if self.needs_smiles():
    # Some DeepChem splitters require compound IDs in dataset to be SMILES strings. Swap in the
    # SMILES strings now; we'll reverse this later.
    dataset = DiskDataset.from_numpy(dataset.X,
                                     dataset.y,
                                     ids=attr_df[smiles_col].values,
                                     verbose=False)

  if self.split == 'butina':
    #train_valid, test = self.splitter.train_test_split(dataset, cutoff=self.params.butina_cutoff)
    # Can't use train_test_split with Butina because Butina splits into train and valid sets only.
    train_valid, test, _ = self.splitter.train_valid_test_split(dataset)
    self.splitter = dc.splits.ScaffoldSplitter()
    # With Butina splitting, we don't have control over the size of the test set
    train_frac = 1.0 - self.params.split_valid_frac
    train, valid = self.splitter.train_test_split(
        train_valid, seed=np.random.seed(123), frac_train=train_frac)
  elif self.split == 'ave_min':
    # AVEMinSplitter also only does train-valid splits, but at least nested splits seem to work.
    # TODO: Change this if we modify AVE splitter to do 3-way splits internally.
    train_valid_frac = 1.0 - self.params.split_test_frac
    train_frac = train_valid_frac - self.params.split_valid_frac
    log.info("Performing split for test set")
    train_valid, test, _ = self.splitter.train_valid_test_split(
        dataset,
        frac_train=train_valid_frac,
        frac_valid=self.params.split_test_frac,
        frac_test=0.0)
    log.info("Performing split of training and validation sets")
    train, valid, _ = self.splitter.train_valid_test_split(
        train_valid,
        frac_train=train_frac / train_valid_frac,
        frac_valid=self.params.split_valid_frac / train_valid_frac,
        frac_test=0.0)
    log.info(
        "Results of 3-way split: %d training, %d validation, %d test compounds"
        % (train.X.shape[0], valid.X.shape[0], test.X.shape[0]))
  elif self.split == 'temporal':
    # TemporalSplitter requires that we pass attr_df so it can get the dates for each compound
    train_frac = 1.0 - self.params.split_valid_frac
    train, valid, test = self.splitter.train_valid_test_split(
        dataset,
        attr_df,
        frac_train=train_frac,
        frac_valid=self.params.split_valid_frac)
  else:
    train_frac = 1.0 - self.params.split_valid_frac - self.params.split_test_frac
    train, valid, test = self.splitter.train_valid_test_split(
        dataset,
        frac_train=train_frac,
        frac_valid=self.params.split_valid_frac,
        frac_test=self.params.split_test_frac,
        seed=np.random.seed(123))

  # Extract the ID-to-SMILES maps from attr_df for each subset.
  if self.needs_smiles():
    # Now that DeepChem splitters have done their work, replace the SMILES strings in the split
    # dataset objects with actual compound IDs.
    train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
    train = DiskDataset.from_numpy(train.X,
                                   train.y,
                                   ids=train_attr.index.values,
                                   verbose=False)

    valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
    valid = DiskDataset.from_numpy(valid.X,
                                   valid.y,
                                   ids=valid_attr.index.values,
                                   verbose=False)

    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(test.X,
                                  test.y,
                                  ids=test_attr.index.values,
                                  verbose=False)
  else:
    # Otherwise just subset the ID-to-SMILES maps.
    train_attr = select_attrs_by_dset_ids(train, attr_df)
    valid_attr = select_attrs_by_dset_ids(valid, attr_df)
    test_attr = select_attrs_by_dset_ids(test, attr_df)

  # Note grouping of train/valid return values as tuple lists, to match format of
  # KFoldSplitting.split_dataset().
  return [(train, valid)], test, [(train_attr, valid_attr)], test_attr
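# A minimal sketch (not from AMPL) of the ID-swap pattern used above: rebuild
# a DiskDataset with the same X/y but a new set of ids. The compound IDs and
# SMILES strings here are made up for illustration.
import numpy as np
from deepchem.data import DiskDataset

X = np.random.rand(3, 4)
y = np.random.rand(3, 1)
smiles_ids = ["CCO", "c1ccccc1", "CC(=O)O"]
compound_ids = ["CMPD-001", "CMPD-002", "CMPD-003"]

# Split-time dataset keyed by SMILES...
dset_by_smiles = DiskDataset.from_numpy(X, y, ids=smiles_ids)
# ...then re-keyed by compound ID once splitting is done.
dset_by_id = DiskDataset.from_numpy(dset_by_smiles.X, dset_by_smiles.y,
                                    ids=compound_ids)
print(dset_by_id.ids)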
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split"""
  # Set some global variables up top
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0,
      feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"], ecfp_power=9, splif_power=9,
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      ecfp_power=9,
      splif_power=9,
      parallel=True,
      flatten=True)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)

  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []

  return tasks, dataset, transformers
      train_attr.smiles.tolist(),
      valid_attr.smiles.tolist(),
      test_attr.smiles.tolist()
  ], [])
  test.append(len(full_dataset_smiles) == len(set(full_dataset_smiles)))
  assert all(test)

#***********************************************************************************
([(train, valid)], test_scaffold, [(train_attr, valid_attr)],
 test_scaffold_attr) = splitter_scaffold.split_dataset(
     data_obj_scaffold.dataset, data_obj_scaffold.attr,
     data_obj_scaffold.params.smiles_col)

dataset_scaffold = DiskDataset.from_numpy(data_obj_scaffold.dataset.X,
                                          data_obj_scaffold.dataset.y,
                                          ids=data_obj_scaffold.attr.index)
def create_dataset(self,
                   inputs: Union[OneOrMany[str], Tuple[Any]],
                   data_dir: Optional[str] = None,
                   shard_size: Optional[int] = 8192,
                   in_memory: bool = False) -> Dataset:
  """Creates and returns a `Dataset` object by featurizing provided image
  files and labels/weights.

  Parameters
  ----------
  inputs: `Union[OneOrMany[str], Tuple[Any]]`
    The inputs provided should be one of the following

    - filename
    - list of filenames
    - Tuple (list of filenames, labels)
    - Tuple (list of filenames, labels, weights)

    Each file in a given list of filenames should either be of a supported
    image format (.png, .tif only for now) or of a compressed folder of
    image files (only .zip for now). If `labels` or `weights` are provided,
    they must correspond to the sorted order of all filenames provided, with
    one label/weight per file.
  data_dir: str, optional
    Directory to store featurized dataset.
  in_memory: bool
    If true, return in-memory NumpyDataset. Else return ImageDataset.

  Returns
  -------
  A `Dataset` object containing a featurized representation of data
  from `input_files`, `labels`, and `weights`.
  """
  labels, weights = None, None
  if isinstance(inputs, tuple):
    if len(inputs) == 1:
      input_files = inputs[0]
      if isinstance(inputs, str):
        input_files = [inputs]
    elif len(inputs) == 2:
      input_files, labels = inputs
    elif len(inputs) == 3:
      input_files, labels, weights = inputs
    else:
      raise ValueError("Input must be a tuple of length 1, 2, or 3")
  else:
    input_files = inputs
  if isinstance(input_files, str):
    input_files = [input_files]

  image_files = []
  # Sometimes zip files contain directories within. Traverse directories
  while len(input_files) > 0:
    remainder = []
    for input_file in input_files:
      filename, extension = os.path.splitext(input_file)
      extension = extension.lower()
      # TODO(rbharath): Add support for more extensions
      if os.path.isdir(input_file):
        dirfiles = [
            os.path.join(input_file, subfile)
            for subfile in os.listdir(input_file)
        ]
        remainder += dirfiles
      elif extension == ".zip":
        zip_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(input_file, 'r')
        zip_ref.extractall(path=zip_dir)
        zip_ref.close()
        zip_files = [
            os.path.join(zip_dir, name) for name in zip_ref.namelist()
        ]
        for zip_file in zip_files:
          _, extension = os.path.splitext(zip_file)
          extension = extension.lower()
          if extension in [".png", ".tif"]:
            image_files.append(zip_file)
      elif extension in [".png", ".tif"]:
        image_files.append(input_file)
      else:
        raise ValueError("Unsupported file format")
    input_files = remainder

  # Sort image files
  image_files = sorted(image_files)

  if in_memory:
    if data_dir is None:
      return NumpyDataset(self.load_img(image_files),
                          y=labels,
                          w=weights,
                          ids=image_files)
    else:
      dataset = DiskDataset.from_numpy(self.load_img(image_files),
                                       y=labels,
                                       w=weights,
                                       ids=image_files,
                                       tasks=self.tasks,
                                       data_dir=data_dir)
      if shard_size is not None:
        dataset.reshard(shard_size)
      return dataset
  else:
    return ImageDataset(image_files, y=labels, w=weights, ids=image_files)
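# A short usage sketch (not part of the method above): building an image
# dataset from a pair of PNG files with labels. Assumes this create_dataset()
# lives on deepchem's ImageLoader; the paths and labels are placeholders.
import numpy as np
import deepchem as dc

image_files = ["images/cell_01.png", "images/cell_02.png"]  # hypothetical
labels = np.array([0.0, 1.0])

loader = dc.data.ImageLoader()
dataset = loader.create_dataset((image_files, labels), in_memory=False)
print(len(dataset))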