def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
  """Creates a new DiskDataset

  Parameters
  ----------
  shard_generator: Iterable
    An iterable (either a list or generator) that provides tuples of data
    (X, y, w, ids). Each tuple will be written to a separate shard on disk.
  data_dir: str
    Filename for data directory. Creates a temp directory if none specified.
  tasks: list
    List of tasks for this dataset.
  """
  if data_dir is None:
    data_dir = tempfile.mkdtemp()
  elif not os.path.exists(data_dir):
    os.makedirs(data_dir)
  metadata_rows = []
  time1 = time.time()
  for shard_num, (X, y, w, ids) in enumerate(shard_generator):
    basename = "shard-%d" % shard_num
    metadata_rows.append(
        DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                       ids))
  metadata_df = DiskDataset._construct_metadata(metadata_rows)
  save_metadata(tasks, metadata_df, data_dir)
  time2 = time.time()
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
  return DiskDataset(data_dir, verbose=verbose)
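# Usage sketch (hypothetical, not from the original source): build a small
# two-shard DiskDataset from synthetic (X, y, w, ids) tuples. Assumes
# create_dataset above is exposed as DiskDataset.create_dataset, as the
# featurize methods later in this file suggest.
import numpy as np

def toy_shard_generator():
  for shard_num in range(2):
    X = np.random.rand(10, 4)  # 10 samples, 4 features
    y = np.random.rand(10, 1)  # labels for a single task
    w = np.ones((10, 1))       # uniform example weights
    ids = np.array(["sample-%d-%d" % (shard_num, i) for i in range(10)])
    yield X, y, w, ids

dataset = DiskDataset.create_dataset(toy_shard_generator(), tasks=["toy_task"])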
def _featurize_compounds(self, df, featurizer, parallel=True):
  """Featurize individual compounds.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  dataframe.
  """
  sample_smiles = df["smiles"].tolist()
  if not parallel:
    features = []
    for ind, smiles in enumerate(sample_smiles):
      if ind % self.log_every_n == 0:
        log("Featurizing sample %d" % ind, self.verbose)
      mol = Chem.MolFromSmiles(smiles)
      features.append(featurizer.featurize([mol]))
  else:

    def featurize_wrapper(smiles):
      mol = Chem.MolFromSmiles(smiles)
      return featurizer.featurize([mol])

    features = ProcessingPool(mp.cpu_count()).map(featurize_wrapper,
                                                  sample_smiles)
  df[featurizer.__class__.__name__] = features
def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines):
  tempdir = tempfile.mkdtemp()

  time1 = time.time()
  ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
  with open(ligand_file, "w") as mol_f:
    mol_f.writelines(ligand_lines)
  time2 = time.time()
  log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)

  time1 = time.time()
  protein_pdb_file = os.path.join(tempdir, "protein.pdb")
  with open(protein_pdb_file, "w") as protein_f:
    protein_f.writelines(protein_pdb_lines)
  time2 = time.time()
  log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)

  features_dict = self._transform(protein_pdb_file, ligand_file)
  shutil.rmtree(tempdir)
  # Return a list so callers get a stable sequence under Python 3.
  return list(features_dict.values())
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  """Featurize individual compounds in dataframe.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  dataframe.
  """
  from rdkit import Chem
  from rdkit.Chem import rdmolfiles
  from rdkit.Chem import rdmolops
  sample_elems = df[field].tolist()

  features = []
  for ind, elem in enumerate(sample_elems):
    mol = Chem.MolFromSmiles(elem)
    # TODO (ytz) this is a bandage solution to reorder the atoms so
    # that they're always in the same canonical order. Presumably this
    # should be correctly implemented in the future for graph mols.
    if mol:
      new_order = rdmolfiles.CanonicalRankAtoms(mol)
      mol = rdmolops.RenumberAtoms(mol, new_order)
    if ind % log_every_N == 0:
      log("Featurizing sample %d" % ind, verbose)
    features.append(featurizer.featurize([mol]))

  valid_inds = np.array(
      [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
  features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
  return np.squeeze(np.array(features), axis=1), valid_inds
def write_dataframe(val,
                    data_dir,
                    featurizer=None,
                    tasks=None,
                    raw_data=None,
                    basename=None,
                    mol_id_field="mol_id",
                    verbosity=None,
                    compute_feature_statistics=None):
  """Writes data from dataframe to disk."""
  if featurizer is not None and tasks is not None:
    feature_type = featurizer.__class__.__name__
    (basename, df) = val
    # TODO(rbharath): This is a hack. clean up.
    if not len(df):
      return None
    if compute_feature_statistics is None:
      if hasattr(featurizer, "dtype"):
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True
    time1 = time.time()
    ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
                                       dtype, verbosity)
    time2 = time.time()
    log("TIMING: convert_df_to_numpy took %0.3f s" % (time2 - time1),
        verbosity)
  else:
    ids, X, y, w = raw_data
    basename = ""
  assert X.shape[0] == y.shape[0]
  assert y.shape == w.shape
  assert len(ids) == X.shape[0]
  return DiskDataset.write_data_to_disk(
      data_dir,
      basename,
      tasks,
      X,
      y,
      w,
      ids,
      compute_feature_statistics=compute_feature_statistics)
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
  """Featurize individual compounds in a numpy array.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  array.
  """
  features = []
  from rdkit import Chem
  from rdkit.Chem import rdmolfiles
  from rdkit.Chem import rdmolops
  for ind, elem in enumerate(arr.tolist()):
    mol = Chem.MolFromSmiles(elem)
    if mol:
      new_order = rdmolfiles.CanonicalRankAtoms(mol)
      mol = rdmolops.RenumberAtoms(mol, new_order)
    if ind % log_every_N == 0:
      log("Featurizing sample %d" % ind, verbose)
    features.append(featurizer.featurize([mol]))

  valid_inds = np.array(
      [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
  features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
  features = np.squeeze(np.array(features))
  return features.reshape(-1,)
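# Usage sketch (hypothetical): featurize an array of SMILES strings with a
# fingerprint featurizer. CircularFingerprint is assumed to be available
# (e.g. from deepchem.feat); any featurizer exposing .featurize works here.
import numpy as np

smiles = np.array(["CCO", "c1ccccc1", "CC(=O)O"])
featurizer = CircularFingerprint(size=1024)
fingerprints = featurize_smiles_np(smiles, featurizer)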
def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                              threshold=None):
  """Computes statistics of model on test data and saves results to csv."""
  y = self.dataset.get_labels()
  y = undo_transforms(y, self.transformers)
  w = self.dataset.get_weights()

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  if mode == "classification":
    y_pred = self.model.predict_proba(self.dataset, self.transformers)
    y_pred_print = self.model.predict(self.dataset,
                                      self.transformers).astype(int)
  else:
    y_pred = self.model.predict(self.dataset, self.transformers)
    y_pred_print = y_pred
  multitask_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbosity)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbosity)
    self.output_statistics(multitask_scores, stats_out)

  return multitask_scores
def featurize_complexes(self, mol_pdbs, protein_pdbs, verbose=True,
                        log_every_n=1000):
  """Calculate features for mol/protein complexes.

  Parameters
  ----------
  mol_pdbs: list
    List of PDBs for molecules. Each PDB should be a list of lines of the
    PDB file.
  protein_pdbs: list
    List of PDBs for proteins. Each PDB should be a list of lines of the
    PDB file.
  """
  features = []
  for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
    if verbose and i % log_every_n == 0:
      log("Featurizing %d / %d" % (i, len(mol_pdbs)))
    features.append(self._featurize_complex(mol_pdb, protein_pdb))
  features = np.asarray(features)
  return features
def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
  """Calculate features for mol/protein complexes.

  Parameters
  ----------
  mol_files: list
    List of PDB filenames for molecules.
  protein_pdbs: list
    List of PDB filenames for proteins.
  """
  features = []
  for i, (mol_file, protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
    if i % log_every_n == 0:
      log("Featurizing %d / %d" % (i, len(mol_files)))
    ligand_ext = get_ligand_filetype(mol_file)
    with open(mol_file) as mol_f:
      mol_lines = mol_f.readlines()
    with open(protein_pdb) as protein_file:
      protein_pdb_lines = protein_file.readlines()
    features += self._featurize_complex(ligand_ext, mol_lines,
                                        protein_pdb_lines)
  features = np.asarray(features)
  return features
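# A minimal sketch of the get_ligand_filetype helper assumed above: infer
# the ligand format from the filename extension. The exact set of supported
# extensions in the original code is an assumption.
import os

def get_ligand_filetype(ligand_filename):
  """Returns the filetype (extension) of the ligand file, e.g. 'pdb'."""
  ext = os.path.splitext(ligand_filename)[1].lstrip(".").lower()
  if ext not in ("pdb", "pdbqt", "mol2", "sdf"):
    raise ValueError("Unrecognized ligand filetype: %s" % ligand_filename)
  return ext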
def _add_user_specified_features(self, df, featurizer):
  """Merge user specified features.

  Merge features included in dataset provided by user into final features
  dataframe.

  Three types of featurization here:

  1) Molecule featurization
     - SMILES string featurization
     - RDKit MOL featurization
  2) Complex featurization
     - PDB files for interacting molecules.
  3) User specified featurizations.
  """
  time1 = time.time()
  df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(
      pd.to_numeric)
  # Note: df.as_matrix() was removed in modern pandas; .values is equivalent.
  X_shard = df[featurizer.feature_fields].values
  df[featurizer.__class__.__name__] = [
      np.array(elt) for elt in X_shard.tolist()
  ]
  time2 = time.time()
  log("TIMING: user specified processing took %0.3f s" % (time2 - time1),
      self.verbosity)
def generate_scaffolds(self, dataset, log_every_n=1000):
  """Returns all scaffolds from the dataset."""
  scaffolds = {}
  data_len = len(dataset)

  log("About to generate scaffolds", self.verbose)
  for ind, smiles in enumerate(dataset.ids):
    if ind % log_every_n == 0:
      log(f"Generating scaffold {ind}/{data_len}", self.verbose)
    scaffold = generate_scaffold(smiles)
    if scaffold not in scaffolds:
      scaffolds[scaffold] = [ind]
    else:
      scaffolds[scaffold].append(ind)

  # Sort from largest to smallest scaffold sets
  scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
  scaffold_sets = [
      scaffold_set for (scaffold, scaffold_set) in sorted(
          scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True)
  ]
  return scaffold_sets
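# A minimal sketch of the generate_scaffold helper called above, assuming
# the standard Bemis-Murcko scaffold from RDKit; the original helper may
# differ in details such as chirality handling.
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

def generate_scaffold(smiles, include_chirality=False):
  """Computes the Bemis-Murcko scaffold SMILES for a SMILES string."""
  mol = Chem.MolFromSmiles(smiles)
  return MurckoScaffold.MurckoScaffoldSmiles(
      mol=mol, includeChirality=include_chirality)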
def split(self,
          dataset,
          frac_train=.8,
          frac_valid=.1,
          frac_test=.1,
          log_every_n=1000):
  """Splits internal compounds into train/validation/test by scaffold."""
  np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
  scaffold_sets = self.generate_scaffolds(dataset)

  train_cutoff = frac_train * len(dataset)
  valid_cutoff = (frac_train + frac_valid) * len(dataset)
  train_inds, valid_inds, test_inds = [], [], []

  log("About to sort in scaffold sets", self.verbose)
  for scaffold_set in scaffold_sets:
    if len(train_inds) + len(scaffold_set) > train_cutoff:
      if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
        test_inds += scaffold_set
      else:
        valid_inds += scaffold_set
    else:
      train_inds += scaffold_set
  return train_inds, valid_inds, test_inds
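# Usage sketch (hypothetical; `dataset` is a placeholder): split by scaffold
# so that structurally related compounds never straddle the train/test
# boundary. ScaffoldSplitter is assumed to be the class defining split above.
splitter = ScaffoldSplitter()
train_inds, valid_inds, test_inds = splitter.split(
    dataset, frac_train=.8, frac_valid=.1, frac_test=.1)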
def _featurize_shard(self, df_shard, write_fn, shard_num, input_type):
  """Featurizes a shard of an input dataframe."""
  field = self.mol_field if input_type == "sdf" else self.smiles_field
  field_type = "mol" if input_type == "sdf" else "smiles"
  log("Currently featurizing feature_type: %s" %
      self.featurizer.__class__.__name__, self.verbosity)
  if isinstance(self.featurizer, UserDefinedFeaturizer):
    self._add_user_specified_features(df_shard, self.featurizer)
  elif isinstance(self.featurizer, Featurizer):
    self._featurize_mol(df_shard, self.featurizer, field=field,
                        field_type=field_type)
  elif isinstance(self.featurizer, ComplexFeaturizer):
    self._featurize_complexes(df_shard, self.featurizer)
  basename = "shard-%d" % shard_num
  time1 = time.time()
  metadata_row = write_fn((basename, df_shard))
  time2 = time.time()
  log("TIMING: writing metadata row took %0.3f s" % (time2 - time1),
      self.verbosity)
  return metadata_row
def __init__(self,
             shard_generator=[],
             data_dir=None,
             tasks=[],
             reload=False,
             verbose=True):
  """Turns featurized dataframes into numpy files, writes them & metadata
  to disk."""
  if data_dir is not None:
    if not os.path.exists(data_dir):
      os.makedirs(data_dir)
  else:
    data_dir = tempfile.mkdtemp()
  self.data_dir = data_dir
  self.verbose = verbose

  if reload:
    log("Loading pre-existing dataset.", self.verbose)
    if os.path.exists(self._get_metadata_filename()):
      (self.tasks, self.metadata_df) = load_from_disk(
          self._get_metadata_filename())
    else:
      raise ValueError("No metadata found.")
    return

  metadata_rows = []
  time1 = time.time()
  for shard_num, (X, y, w, ids) in enumerate(shard_generator):
    basename = "shard-%d" % shard_num
    metadata_rows.append(
        DiskDataset.write_data_to_disk(self.data_dir, basename, tasks, X, y,
                                       w, ids))
  self.tasks = tasks
  self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
  self.save_to_disk()
  time2 = time.time()
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
      self.verbose)
def sparse_shuffle(self):
  """Shuffling that exploits data sparsity to shuffle large datasets.

  Only for 1-dimensional feature vectors (does not work for tensorial
  featurizations).
  """
  time1 = time.time()
  shard_size = self.get_shard_size()
  num_shards = self.get_number_shards()
  X_sparses, ys, ws, ids = [], [], [], []
  num_features = None
  for i in range(num_shards):
    (X_s, y_s, w_s, ids_s) = self.get_shard(i)
    if num_features is None:
      num_features = X_s.shape[1]
    X_sparse = sparsify_features(X_s)
    X_sparses, ys, ws, ids = (X_sparses + [X_sparse], ys + [y_s], ws + [w_s],
                              ids + [np.atleast_1d(np.squeeze(ids_s))])
  # Get full dataset in memory
  (X_sparse, y, w, ids) = (np.vstack(X_sparses), np.vstack(ys), np.vstack(ws),
                           np.concatenate(ids))
  # Shuffle in memory
  num_samples = len(X_sparse)
  permutation = np.random.permutation(num_samples)
  X_sparse, y, w, ids = (X_sparse[permutation], y[permutation],
                         w[permutation], ids[permutation])
  # Write shuffled shards out to disk
  for i in range(num_shards):
    start, stop = i * shard_size, (i + 1) * shard_size
    (X_sparse_s, y_s, w_s, ids_s) = (X_sparse[start:stop], y[start:stop],
                                     w[start:stop], ids[start:stop])
    X_s = densify_features(X_sparse_s, num_features)
    self.set_shard(i, X_s, y_s, w_s, ids_s)
  time2 = time.time()
  log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
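# Minimal sketches of the sparsify_features/densify_features helpers that
# sparse_shuffle relies on: each row is stored as (nonzero indices, nonzero
# values) and later reconstructed into a dense vector of num_features
# entries. These are assumptions about the helpers, not their original
# definitions.
import numpy as np

def sparsify_features(X):
  """Extracts a sparse representation from a dense 2-D feature array."""
  X_sparse = []
  for i in range(len(X)):
    nonzero_inds = np.nonzero(X[i])[0]
    nonzero_vals = X[i][nonzero_inds]
    X_sparse.append((nonzero_inds, nonzero_vals))
  return np.array(X_sparse, dtype=object)

def densify_features(X_sparse, num_features):
  """Expands the sparse representation back into a dense feature array."""
  X = np.zeros((len(X_sparse), num_features))
  for i in range(len(X_sparse)):
    nonzero_inds, nonzero_vals = X_sparse[i]
    X[i][nonzero_inds.astype(int)] = nonzero_vals
  return X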
def featurize(self, protein_file, pockets, pocket_atoms_map, pocket_coords,
              verbose=False):
  """Computes residue-type count features for binding pockets."""
  import mdtraj
  protein = mdtraj.load(protein_file)
  n_pockets = len(pockets)
  n_residues = len(BindingPocketFeaturizer.residues)
  res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
  all_features = np.zeros((n_pockets, n_residues))
  for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
    pocket_atoms = pocket_atoms_map[pocket]
    for ind, atom in enumerate(pocket_atoms):
      atom_name = str(protein.top.atom(atom))
      # atom_name is of format RESX-ATOMTYPE, where X is a 1 to 4 digit number
      residue = atom_name[:3]
      if residue not in res_map:
        log("Warning: Non-standard residue in PDB file", verbose)
        continue
      all_features[pocket_num, res_map[residue]] += 1
  return all_features
def _featurize_complex(self, ligand_pdb_lines, protein_pdb_lines):
  tempdir = tempfile.mkdtemp()

  time1 = time.time()
  ligand_pdb_file = os.path.join(tempdir, "ligand.pdb")
  with open(ligand_pdb_file, "w") as mol_f:
    mol_f.writelines(ligand_pdb_lines)
  time2 = time.time()
  log("TIMING: Writing ligand took %0.3f s" % (time2 - time1), self.verbose)

  time1 = time.time()
  protein_pdb_file = os.path.join(tempdir, "protein.pdb")
  with open(protein_pdb_file, "w") as protein_f:
    protein_f.writelines(protein_pdb_lines)
  time2 = time.time()
  log("TIMING: Writing protein took %0.3f s" % (time2 - time1), self.verbose)

  features_dict = self._transform(protein_pdb_file, ligand_pdb_file)
  shutil.rmtree(tempdir)
  # Return a list so callers get a stable sequence under Python 3.
  return list(features_dict.values())
def _featurize_compounds(self, df, featurizer, parallel=True,
                         worker_pool=None):
  """Featurize individual compounds.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  dataframe.
  """
  sample_smiles = df["smiles"].tolist()
  if worker_pool is None:
    features = []
    for ind, smiles in enumerate(sample_smiles):
      if ind % self.log_every_n == 0:
        log("Featurizing sample %d" % ind, self.verbosity)
      mol = Chem.MolFromSmiles(smiles)
      features.append(featurizer.featurize([mol], verbosity=self.verbosity))
  else:

    def featurize_wrapper(smiles, dilled_featurizer):
      print("Featurizing %s" % smiles)
      mol = Chem.MolFromSmiles(smiles)
      featurizer = dill.loads(dilled_featurizer)
      feature = featurizer.featurize([mol], verbosity=self.verbosity)
      return feature

    # Serialize the featurizer so it can be shipped to worker processes;
    # bind it via partial since map_sync only iterates over the smiles.
    dilled_featurizer = dill.dumps(featurizer)
    featurize_wrapper = partial(
        featurize_wrapper, dilled_featurizer=dilled_featurizer)
    features = worker_pool.map_sync(featurize_wrapper, sample_smiles)
  df[featurizer.__class__.__name__] = features
def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                              threshold=None):
  """Computes statistics of model on test data and saves results to csv."""
  y = self.dataset.get_labels()
  y = undo_transforms(y, self.output_transformers)
  w = self.dataset.get_weights()

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  if mode == "classification":
    y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
    y_pred_print = self.model.predict(self.dataset,
                                      self.output_transformers).astype(int)
  else:
    y_pred = self.model.predict(self.dataset, self.output_transformers)
    y_pred_print = y_pred
  multitask_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbosity)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbosity)
    self.output_statistics(multitask_scores, stats_out)

  return multitask_scores
def train_valid_test_split(self,
                           dataset,
                           train_dir=None,
                           valid_dir=None,
                           test_dir=None,
                           frac_train=.8,
                           frac_valid=.1,
                           frac_test=.1,
                           seed=None,
                           log_every_n=1000):
  """Splits self into train/validation/test sets.

  Returns Dataset objects.
  """
  log("Computing train/valid/test indices", self.verbose)
  train_inds, valid_inds, test_inds = self.split(
      dataset,
      frac_train=frac_train,
      frac_test=frac_test,
      frac_valid=frac_valid,
      log_every_n=log_every_n)
  if train_dir is None:
    train_dir = tempfile.mkdtemp()
  if valid_dir is None:
    valid_dir = tempfile.mkdtemp()
  if test_dir is None:
    test_dir = tempfile.mkdtemp()
  train_dataset = dataset.select(train_inds, train_dir)
  if frac_valid != 0:
    valid_dataset = dataset.select(valid_inds, valid_dir)
  else:
    valid_dataset = None
  test_dataset = dataset.select(test_inds, test_dir)
  return train_dataset, valid_dataset, test_dataset
def __init__(self,
             tasks,
             task_types,
             model_params,
             model_dir,
             model_builder,
             store_in_memory=False,
             verbosity=None):
  self.tasks = tasks
  self.task_types = task_types
  self.model_params = model_params
  self.models = {}
  self.model_dir = model_dir
  # If models are TF models, they don't use up RAM, so can keep in memory
  self.task_models = {}
  self.task_model_dirs = {}
  self.model_builder = model_builder
  self.verbosity = verbosity
  self.store_in_memory = store_in_memory
  log("About to initialize singletask to multitask model", self.verbosity,
      "high")
  if not os.path.exists(self.model_dir):
    os.makedirs(self.model_dir)
  self.fit_transformers = False
  for task in self.tasks:
    task_type = self.task_types[task]
    task_model_dir = os.path.join(self.model_dir, str(task))
    if not os.path.exists(task_model_dir):
      os.makedirs(task_model_dir)
    log("Initializing model for task %s" % task, self.verbosity, "high")
    self.task_model_dirs[task] = task_model_dir
def fit(self, dataset, nb_epoch=10, batch_size=50, pad_batches=False,
        **kwargs):
  """Fits a model on data in a Dataset object."""
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    log("Starting epoch %s" % str(epoch + 1), self.verbosity)
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, pad_batches=pad_batches):
      if self.fit_transformers:
        X_batch, y_batch, w_batch = self.transform_on_batch(
            X_batch, y_batch, w_batch)
      if pad_batches:
        X_batch, y_batch, w_batch, ids_batch = pad_batch(
            batch_size, X_batch, y_batch, w_batch, ids_batch)
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
        self.verbosity)
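# A minimal sketch of the pad_batch helper used above when pad_batches is
# set: tile the batch contents until the arrays reach exactly batch_size
# rows. This is an assumption about the helper's behavior, not its original
# definition.
import numpy as np

def pad_batch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads a short batch up to batch_size samples by repeating entries."""
  num_samples = len(X_b)
  if num_samples == batch_size:
    return X_b, y_b, w_b, ids_b
  # Number of full copies needed to cover batch_size, then truncate.
  reps = int(np.ceil(float(batch_size) / num_samples))
  tile = lambda arr: np.concatenate([arr] * reps, axis=0)[:batch_size]
  return tile(X_b), tile(y_b), tile(w_b), tile(ids_b)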
def k_fold_split(self, dataset, k, directories=None):
  """Does K-fold split of dataset."""
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(k)]
  else:
    assert len(directories) == k
  fold_datasets = []
  # rem_dataset is remaining portion of dataset
  rem_dataset = dataset
  for fold in range(k):
    # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
    # to k-1.
    frac_fold = 1. / (k - fold)
    fold_dir = directories[fold]
    fold_inds, rem_inds, _ = self.split(
        rem_dataset,
        frac_train=frac_fold,
        frac_valid=1 - frac_fold,
        frac_test=0)
    fold_dataset = rem_dataset.select(fold_inds, fold_dir)
    rem_dir = tempfile.mkdtemp()
    rem_dataset = rem_dataset.select(rem_inds, rem_dir)
    fold_datasets.append(fold_dataset)
  return fold_datasets
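# Usage sketch (hypothetical; `splitter` and `dataset` are placeholders):
# carve a dataset into 5 folds, each written to its own temporary directory.
folds = splitter.k_fold_split(dataset, k=5)
for fold_num, fold_dataset in enumerate(folds):
  print("Fold %d has %d samples" % (fold_num, len(fold_dataset)))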
def train_valid_test_split(self,
                           dataset,
                           train_dir,
                           valid_dir,
                           test_dir,
                           frac_train=.8,
                           frac_valid=.1,
                           frac_test=.1,
                           seed=None,
                           log_every_n=1000,
                           compute_feature_statistics=True):
  """Splits self into train/validation/test sets.

  Returns Dataset objects.
  """
  log("Computing train/valid/test indices", self.verbosity)
  train_inds, valid_inds, test_inds = self.split(
      dataset,
      frac_train=frac_train,
      frac_test=frac_test,
      frac_valid=frac_valid,
      log_every_n=log_every_n)
  train_dataset = dataset.select(
      train_dir, train_inds,
      compute_feature_statistics=compute_feature_statistics)
  if valid_dir is not None:
    valid_dataset = dataset.select(
        valid_dir, valid_inds,
        compute_feature_statistics=compute_feature_statistics)
  else:
    valid_dataset = None
  test_dataset = dataset.select(
      test_dir, test_inds,
      compute_feature_statistics=compute_feature_statistics)
  return train_dataset, valid_dataset, test_dataset
def compute_metric(self, y_true, y_pred, w=None, n_classes=2,
                   filter_nans=True):
  """Compute a performance metric for each task.

  Args:
    y_true: A list of arrays containing true values for each task.
    y_pred: A list of arrays containing predicted values for each task.
    w: Optional sample weights, one array per task.
    n_classes: Number of classes (classification mode only).
    filter_nans: If True, drop per-task metric values that are NaN.

  Returns:
    A numpy array containing metric values for each task.
  """
  if len(y_true.shape) > 1:
    n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
  else:
    n_samples, n_tasks = y_true.shape[0], 1
  if self.mode == "classification":
    y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
  else:
    y_pred = np.reshape(y_pred, (n_samples, n_tasks))
  y_true = np.reshape(y_true, (n_samples, n_tasks))
  if w is None:
    w = np.ones_like(y_true)
  assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
  computed_metrics = []
  for task in range(n_tasks):
    y_task = y_true[:, task]
    if self.mode == "regression":
      y_pred_task = y_pred[:, task]
    else:
      y_pred_task = y_pred[:, task, :]
    w_task = w[:, task]
    metric_value = self.compute_singletask_metric(y_task, y_pred_task, w_task)
    computed_metrics.append(metric_value)
  log("computed_metrics: %s" % str(computed_metrics), self.verbosity)
  if n_tasks == 1:
    computed_metrics = computed_metrics[0]
  if not self.is_multitask:
    return computed_metrics
  else:
    if filter_nans:
      computed_metrics = np.array(computed_metrics)
      computed_metrics = computed_metrics[~np.isnan(computed_metrics)]
    if self.compute_energy_metric:
      # TODO(rbharath, joegomes): What is this magic number?
      force_error = self.task_averager(computed_metrics[1:]) * 4961.47596096
      print("Force error (metric: np.mean(%s)): %f kJ/mol/A" %
            (self.name, force_error))
      return computed_metrics[0]
    else:
      return self.task_averager(computed_metrics)
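# Usage sketch (hypothetical; y_true, y_pred, w are placeholder arrays):
# wrap a sklearn scorer in the Metric class whose compute_metric is shown
# above, then score per-task predictions. The Metric constructor signature
# is an assumption.
import numpy as np
from sklearn.metrics import roc_auc_score

metric = Metric(roc_auc_score, np.mean, mode="classification")
score = metric.compute_metric(y_true, y_pred, w)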
def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                              per_task_metrics=False):
  """Computes statistics of model on test data and saves results to csv.

  Parameters
  ----------
  metrics: list
    List of dc.metrics.Metric objects
  csv_out: str, optional
    Filename to write CSV of model predictions.
  stats_out: str, optional
    Filename to write computed statistics.
  per_task_metrics: bool, optional
    If true, return computed metric for each task on multitask dataset.
  """
  y = self.dataset.y
  y = undo_transforms(y, self.output_transformers)
  w = self.dataset.w

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  if mode == "classification":
    y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
    y_pred_print = self.model.predict(self.dataset,
                                      self.output_transformers).astype(int)
  else:
    y_pred = self.model.predict(self.dataset, self.output_transformers)
    y_pred_print = y_pred
  multitask_scores = {}
  all_task_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbose)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    if per_task_metrics:
      multitask_scores[metric.name], computed_metrics = metric.compute_metric(
          y, y_pred, w, per_task_metrics=True)
      all_task_scores[metric.name] = computed_metrics
    else:
      multitask_scores[metric.name] = metric.compute_metric(
          y, y_pred, w, per_task_metrics=False)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbose)
    self.output_statistics(multitask_scores, stats_out)

  if not per_task_metrics:
    return multitask_scores
  else:
    return multitask_scores, all_task_scores
def featurize(self, mols, parallel=False, client_kwargs=None,
              view_flags=None, verbosity=None, log_every_n=1000):
  """Calculate features for molecules.

  Parameters
  ----------
  mols : iterable
    RDKit Mol objects.
  parallel : bool, optional
    Whether to featurize in parallel using IPython.parallel (default False).
  client_kwargs : dict, optional
    Keyword arguments for IPython.parallel Client.
  view_flags : dict, optional
    Flags for IPython.parallel LoadBalancedView.
  """
  if self.conformers and isinstance(mols, types.GeneratorType):
    mols = list(mols)
  assert verbosity in [None, "low", "high"]

  if parallel:
    from IPython.parallel import Client
    if client_kwargs is None:
      client_kwargs = {}
    if view_flags is None:
      view_flags = {}
    client = Client(**client_kwargs)
    client.direct_view().use_dill()  # use dill
    view = client.load_balanced_view()
    view.set_flags(**view_flags)
    call = view.map(self._featurize, mols, block=False)
    features = call.get()  # get output from engines
    call.display_outputs()
  else:
    features = []
    for i, mol in enumerate(mols):
      if verbosity is not None and i % log_every_n == 0:
        log("Featurizing %d / %d" % (i, len(mols)))
      if mol is not None:
        features.append(self._featurize(mol))
      else:
        features.append(np.array([]))

  if self.conformers:
    features = self.conformer_container(mols, features)
  else:
    features = np.asarray(features)
  return features
def __init__(self,
             data_dir=None,
             tasks=[],
             metadata_rows=None,
             raw_data=None,
             verbosity=None,
             reload=False,
             compute_feature_statistics=True):
  """Turns featurized dataframes into numpy files, writes them & metadata
  to disk."""
  if data_dir is None:
    data_dir = tempfile.mkdtemp()
  elif not os.path.exists(data_dir):
    os.makedirs(data_dir)
  self.data_dir = data_dir
  assert verbosity in [None, "low", "high"]
  self.verbosity = verbosity

  if not reload or not os.path.exists(self._get_metadata_filename()):
    if metadata_rows is not None:
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
    elif raw_data is not None:
      metadata_rows = []
      ids, X, y, w = raw_data
      metadata_rows.append(
          DiskDataset.write_data_to_disk(
              self.data_dir, "data", tasks, X, y, w, ids,
              compute_feature_statistics=compute_feature_statistics))
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
    else:
      # Create an empty metadata dataframe to be filled at a later time
      basename = "metadata"
      metadata_rows = [
          DiskDataset.write_data_to_disk(self.data_dir, basename, tasks)
      ]
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
  else:
    log("Loading pre-existing metadata file.", self.verbosity)
    if os.path.exists(self._get_metadata_filename()):
      self.metadata_df = load_from_disk(self._get_metadata_filename())
    else:
      raise ValueError("No metadata found.")
def __init__(self, data_dir, verbose=True):
  """Turns featurized dataframes into numpy files, writes them & metadata
  to disk."""
  self.data_dir = data_dir
  self.verbose = verbose
  log("Loading dataset from disk.", self.verbose)
  self.tasks, self.metadata_df = self.load_metadata()
def compute_model_performance(self, csv_out, stats_file):
  """Computes statistics of model on test data and saves results to csv."""
  pred_y_df = self.model.predict(self.dataset)
  log("Saving predictions to %s" % csv_out, self.verbose)
  pred_y_df.to_csv(csv_out)

  if self.task_type == "classification":
    colnames = [
        "task_name", "roc_auc_score", "matthews_corrcoef", "recall_score",
        "accuracy_score"
    ]
  elif self.task_type == "regression":
    colnames = ["task_name", "r2_score", "rms_error"]
  else:
    raise ValueError("Unrecognized task type: %s" % self.task_type)

  performance_df = pd.DataFrame(columns=colnames)
  y_means = next(pred_y_df.iterrows())[1]["y_means"]
  y_stds = next(pred_y_df.iterrows())[1]["y_stds"]

  for i, task_name in enumerate(self.task_names):
    y = pred_y_df[task_name].values
    y_pred = pred_y_df["%s_pred" % task_name].values
    w = pred_y_df["%s_weight" % task_name].values
    y = undo_transform(y, y_means, y_stds, self.output_transforms)
    y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)

    if self.task_type == "classification":
      y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
      # Sometimes all samples have zero weight. In this case, continue.
      if not len(y):
        continue
      auc = compute_roc_auc_scores(y, y_pred)
      mcc = matthews_corrcoef(y, y_pred)
      recall = recall_score(y, y_pred)
      accuracy = accuracy_score(y, y_pred)
      performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]
    elif self.task_type == "regression":
      try:
        r2s = r2_score(y, y_pred)
        rms = np.sqrt(mean_squared_error(y, y_pred))
      except ValueError:
        r2s = np.nan
        rms = np.nan
      performance_df.loc[i] = [task_name, r2s, rms]

  log("Saving model performance scores to %s" % stats_file, self.verbose)
  performance_df.to_csv(stats_file)
  return pred_y_df, performance_df
def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
  super().__init__(self, model_dir=model_dir, verbose=verbose)
  self.tasks = tasks
  self.task_model_dirs = {}
  self.model_builder = model_builder
  log("About to initialize singletask to multitask model", self.verbose)
  for task in self.tasks:
    task_model_dir = os.path.join(self.model_dir, str(task))
    if not os.path.exists(task_model_dir):
      os.makedirs(task_model_dir)
    log("Initializing directory for task %s" % task, self.verbose)
    self.task_model_dirs[task] = task_model_dir
def featurize_smiles_df(df, featurizer, field, log_every_N=1000,
                        verbose=True):
  """Featurize individual compounds in dataframe.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  dataframe.
  """
  sample_elems = df[field].tolist()
  features = []
  from rdkit import Chem
  from rdkit.Chem import rdmolfiles
  from rdkit.Chem import rdmolops

  if 'Comet' in str(featurizer.__class__.__qualname__):
    mols = preprocess_df(sample_elems, NUM_WORKERS)
    mols_chunks = np.array_split(mols, len(mols) // BATCH_SIZE + 1)
    for chunk in mols_chunks:
      X, A, L = list(zip(*chunk))
      X = np.array(X, dtype=np.uint8)
      A = np.array(A, dtype=np.float32)
      L = np.array(L, dtype=np.uint8)
      max_len = L[-1]
      X = X[:, :max_len, :]
      A = A[:, :max_len, :max_len]
      temp = featurizer._featurize((X, A))
      features += list(temp)
    valid_inds = np.array(
        [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    return np.array(features), valid_inds
  else:
    for ind, elem in enumerate(sample_elems):
      mol = Chem.MolFromSmiles(elem)
      # TODO (ytz) this is a bandage solution to reorder the atoms so
      # that they're always in the same canonical order. Presumably this
      # should be correctly implemented in the future for graph mols.
      if mol:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
      if ind % log_every_N == 0:
        log("Featurizing sample %d" % ind, verbose)
      features.append(featurizer.featurize([mol]))
    valid_inds = np.array(
        [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    return np.squeeze(np.array(features), axis=1), valid_inds
def shard_generator():
  for shard_num, shard in enumerate(
      self.get_shards(input_files, shard_size)):
    time1 = time.time()
    X, valid_inds = self.featurize_shard(shard)
    ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
    # Filter out examples where featurization failed.
    ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
    assert len(X) == len(ids) == len(y) == len(w)
    time2 = time.time()
    log("TIMING: featurizing shard %d took %0.3f s" %
        (shard_num, time2 - time1), self.verbose)
    yield X, y, w, ids
def __init__(self, data_dir, verbose=True):
  """Turns featurized dataframes into numpy files, writes them & metadata
  to disk."""
  self.data_dir = data_dir
  self.verbose = verbose
  log("Loading dataset from disk.", self.verbose)
  if os.path.exists(self._get_metadata_filename()):
    (self.tasks, self.metadata_df) = load_from_disk(
        self._get_metadata_filename())
  else:
    raise ValueError("No metadata found on disk.")
def _featurize_mol(self, df, featurizer, parallel=True, field_type="mol",
                   field=None, worker_pool=None):
  """Featurize individual compounds.

  Given a featurizer that operates on individual chemical compounds or
  macromolecules, compute & add features for that compound to the features
  dataframe.

  When featurizing a .sdf file, the 3-D structure should be preserved, so we
  use the rdkit "mol" object created from .sdf instead of smiles string.
  Some featurizers such as CoulombMatrix also require a 3-D structure.
  Featurizing from .sdf is currently the only way to perform CM
  featurization.

  TODO(rbharath): Needs to be merged with _featurize_compounds
  """
  assert field_type in ["mol", "smiles"]
  assert field is not None
  sample_elems = df[field].tolist()

  if worker_pool is None:
    features = []
    for ind, elem in enumerate(sample_elems):
      if field_type == "smiles":
        mol = Chem.MolFromSmiles(elem)
      else:
        mol = elem
      if ind % self.log_every_n == 0:
        log("Featurizing sample %d" % ind, self.verbosity)
      features.append(featurizer.featurize([mol], verbosity=self.verbosity))
  else:

    def featurize_wrapper(elem, dilled_featurizer):
      print("Featurizing %s" % elem)
      if field_type == "smiles":
        mol = Chem.MolFromSmiles(elem)
      else:
        mol = elem
      featurizer = dill.loads(dilled_featurizer)
      feature = featurizer.featurize([mol], verbosity=self.verbosity)
      return feature

    # Serialize the featurizer so it can be shipped to worker processes;
    # bind it via partial since map_sync only iterates over the elements.
    dilled_featurizer = dill.dumps(featurizer)
    featurize_wrapper = partial(
        featurize_wrapper, dilled_featurizer=dilled_featurizer)
    features = worker_pool.map_sync(featurize_wrapper, sample_elems)
  df[featurizer.__class__.__name__] = features
def featurize(self, input_files, data_dir=None, shard_size=8192):
  """Featurize provided files and write to specified location.

  For large datasets, automatically shards into smaller chunks for
  convenience.

  Parameters
  ----------
  input_files: list
    List of input filenames.
  data_dir: str
    (Optional) Directory to store featurized dataset.
  shard_size: int
    (Optional) Number of examples stored in each shard.
  """
  log("Loading raw samples now.", self.verbose)
  log("shard_size: %d" % shard_size, self.verbose)

  if not isinstance(input_files, list):
    input_files = [input_files]

  def shard_generator():
    for shard_num, shard in enumerate(
        self.get_shards(input_files, shard_size)):
      time1 = time.time()
      X, valid_inds = self.featurize_shard(shard)
      ids = shard[self.id_field].values
      ids = ids[valid_inds]
      if len(self.tasks) > 0:
        # Featurize task results iff they exist.
        y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
        # Filter out examples where featurization failed.
        y, w = (y[valid_inds], w[valid_inds])
        assert len(X) == len(ids) == len(y) == len(w)
      else:
        # For prospective data where results are unknown, it makes
        # no sense to have y values or weights.
        y, w = (None, None)
        assert len(X) == len(ids)
      time2 = time.time()
      log("TIMING: featurizing shard %d took %0.3f s" %
          (shard_num, time2 - time1), self.verbose)
      yield X, y, w, ids

  return DiskDataset.create_dataset(
      shard_generator(), data_dir, self.tasks, verbose=self.verbose)
def _create_task_datasets(self, dataset):
  """Make directories to hold data for tasks."""
  task_data_dirs = []
  for task in self.tasks:
    task_data_dir = os.path.join(self.model_dir, str(task) + "_data")
    if os.path.exists(task_data_dir):
      shutil.rmtree(task_data_dir)
    os.makedirs(task_data_dir)
    task_data_dirs.append(task_data_dir)
  task_datasets = self._to_singletask(dataset, task_data_dirs)
  for task, task_dataset in zip(self.tasks, task_datasets):
    log("Dataset for task %s has shape %s" %
        (task, str(task_dataset.get_shape())), self.verbose)
  return task_datasets
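# A minimal sketch of the _to_singletask helper assumed above: for each
# task, keep only the examples with nonzero weight for that task and write
# them out as a singletask DiskDataset. Details are assumptions, not the
# original implementation.
@staticmethod
def _to_singletask(dataset, task_dirs):
  """Transforms a multitask dataset into per-task singletask datasets."""
  tasks = dataset.get_task_names()
  assert len(tasks) == len(task_dirs)
  task_datasets = []
  for task_num, (task, task_dir) in enumerate(zip(tasks, task_dirs)):

    def task_shards(task_num=task_num):
      for (X, y, w, ids) in dataset.itershards():
        w_task = w[:, task_num]
        present = w_task != 0  # examples actually measured for this task
        yield (X[present], y[present, task_num:task_num + 1],
               w[present, task_num:task_num + 1], ids[present])

    task_datasets.append(
        DiskDataset.create_dataset(task_shards(), task_dir, [task]))
  return task_datasets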
def featurize_map_function(args):
  time1 = time.time()
  ((loader, shard_size, input_type, data_dir), (shard_num,
                                                raw_df_shard)) = args
  log("Loading shard %d of size %s from file." %
      (shard_num + 1, str(shard_size)), loader.verbosity)
  log("About to featurize shard.", loader.verbosity)
  write_fn = partial(
      Dataset.write_dataframe,
      data_dir=data_dir,
      featurizer=loader.featurizer,
      tasks=loader.tasks,
      mol_id_field=loader.id_field,
      verbosity=loader.verbosity)
  shard_time1 = time.time()
  metadata_row = loader._featurize_shard(raw_df_shard, write_fn, shard_num,
                                         input_type)
  shard_time2 = time.time()
  log("TIMING: shard featurization took %0.3f s" %
      (shard_time2 - shard_time1), loader.verbosity)
  time2 = time.time()
  log("TIMING: featurization map function took %0.3f s" % (time2 - time1),
      loader.verbosity)
  return metadata_row
def fit(self, dataset, nb_epoch=10, batch_size=50, **kwargs):
  """Fits a model on data in a Dataset object."""
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    log("Starting epoch %s" % str(epoch + 1), self.verbose)
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size):
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
        self.verbose)
def fit(self, dataset, **kwargs):
  """Updates all singletask models with new information.

  Warning: This current implementation is only functional for sklearn
  models.
  """
  if not isinstance(dataset, DiskDataset):
    raise ValueError("SingletaskToMultitask only works with DiskDatasets")
  log("About to create task-specific datasets", self.verbose)
  task_datasets = self._create_task_datasets(dataset)
  for ind, task in enumerate(self.tasks):
    log("Fitting model for task %s" % task, self.verbose)
    task_model = self.model_builder(self.task_model_dirs[task])
    task_model.fit(task_datasets[ind], **kwargs)
    task_model.save()
def fit(self, dataset, nb_epoch=10, pad_batches=False, shuffle=False,
        max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
  """Fit the model.

  Args:
    dataset: Dataset object that represents data on disk.
    max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep;
      older checkpoints will be deleted.

  Raises:
    AssertionError: If model is not in training mode.
  """
  time1 = time.time()
  n_datapoints = len(dataset)
  batch_size = self.batch_size
  step_per_epoch = np.ceil(float(n_datapoints) / batch_size)
  log("Training for %d epochs" % nb_epoch, self.verbosity)
  with self.train_graph.graph.as_default():
    train_op = self.get_training_op(self.train_graph.graph,
                                    self.train_graph.loss)
    with self._get_shared_session(train=True) as sess:
      sess.run(tf.initialize_all_variables())
      saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
      # Save an initial checkpoint.
      saver.save(sess, self._save_path, global_step=0)
      for epoch in range(nb_epoch):
        avg_loss, n_batches = 0., 0
        if shuffle:
          log("About to shuffle dataset before epoch start.", self.verbosity)
          dataset.shuffle()
        for ind, (X_b, y_b, w_b, ids_b) in enumerate(
            dataset.iterbatches(batch_size, pad_batches=True)):
          # hardcode pad_batches=True to work around limitations in Tensorflow
          if ind % log_every_N_batches == 0:
            log("On batch %d" % ind, self.verbosity)
          # Run training op.
          feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
          fetches = self.train_graph.output + [train_op,
                                               self.train_graph.loss]
          fetched_values = sess.run(fetches, feed_dict=feed_dict)
          output = fetched_values[:len(self.train_graph.output)]
          loss = fetched_values[-1]
          avg_loss += loss
          y_pred = np.squeeze(np.array(output))
          y_b = y_b.flatten()
          n_batches += 1
        saver.save(sess, self._save_path, global_step=epoch)
        avg_loss = float(avg_loss) / n_batches
        log('Ending epoch %d: Average loss %g' % (epoch, avg_loss),
            self.verbosity)
      # Always save a final checkpoint when complete.
      saver.save(sess, self._save_path, global_step=epoch + 1)
  time2 = time.time()
  log("TIMING: model fitting took %0.3f s" % (time2 - time1), self.verbosity)