def output_predictions(dataset: Dataset, y_preds: np.ndarray,
                       csv_out: str) -> None:
  """Writes predictions to file.

  Writes predictions made on `dataset` to a specified file on disk.
  `dataset.ids` are used to format predictions. The resulting CSV file
  has the following format:

  | ID          | Task1Name    | Task2Name    |
  | ----------- | ------------ | ------------ |
  | identifier1 | prediction11 | prediction12 |
  | identifier2 | prediction21 | prediction22 |

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset on which predictions have been made.
  y_preds: np.ndarray
    Predictions to output.
  csv_out: str
    Name of file to write predictions to.
  """
  data_ids = dataset.ids
  n_tasks = len(dataset.get_task_names())
  y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
  assert len(y_preds) == len(data_ids)
  with open(csv_out, "w") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["ID"] + dataset.get_task_names())
    for mol_id, y_pred in zip(data_ids, y_preds):
      csvwriter.writerow([mol_id] + list(y_pred))
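# Usage sketch for output_predictions (illustrative addition, not part of the
# original module). The `_ToyPredictionDataset` stand-in is hypothetical; any
# dc.data.Dataset whose get_task_names() yields string task names behaves the
# same way here.
def _example_output_predictions():
  """Write made-up predictions for three identified samples to a CSV file."""

  class _ToyPredictionDataset:
    # Only the two members output_predictions touches are provided.
    ids = np.array(["mol-a", "mol-b", "mol-c"])

    def get_task_names(self):
      return ["pIC50", "Class"]

  y_preds = np.array([[0.1, 0.9], [0.4, 0.6], [0.8, 0.2]])
  output_predictions(_ToyPredictionDataset(), y_preds, "predictions.csv")
  # predictions.csv now contains a header row plus one row per ID.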
def predict(self, dataset: Dataset,
            transformers: List[Transformer] = []) -> np.ndarray:
  """
  Uses self to make predictions on provided Dataset object.

  Parameters
  ----------
  dataset: Dataset
    Dataset to make prediction on
  transformers: List[Transformer]
    Transformers that the input data has been transformed by. The output
    is passed through these transformers to undo the transformations.

  Returns
  -------
  np.ndarray
    A numpy array of predictions the model produces.
  """
  y_preds = []
  for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_on_batch(X_batch)
    # Discard any padded predictions
    y_pred_batch = y_pred_batch[:n_samples]
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.concatenate(y_preds)
  return y_pred
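# Usage sketch for predict() with a y-normalization transformer (illustrative
# addition). Assumes deepchem and scikit-learn are installed; SklearnModel is
# just one concrete model exposing predict_on_batch, and any fitted subclass
# would do.
def _example_predict_with_transformers():
  import deepchem as dc
  from sklearn.linear_model import LinearRegression

  X = np.random.rand(20, 5)
  y = 10.0 * np.random.rand(20, 1) + 50.0  # deliberately not zero-centred
  dataset = dc.data.NumpyDataset(X, y)

  # z-score y for training; predict() undoes this via undo_transforms.
  transformer = dc.trans.NormalizationTransformer(
      transform_y=True, dataset=dataset)
  dataset = transformer.transform(dataset)

  model = dc.models.SklearnModel(LinearRegression())
  model.fit(dataset)
  y_pred = model.predict(dataset, transformers=[transformer])
  return y_pred  # back on the original (unnormalized) scale of y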
def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
  """
  Fits a model on data in a Dataset object.

  Parameters
  ----------
  dataset: Dataset
    the Dataset to train on
  nb_epoch: int
    the number of epochs to train for

  Returns
  -------
  float
    The average loss over the most recent epoch.
  """
  for epoch in range(nb_epoch):
    logger.info("Starting epoch %s" % str(epoch + 1))
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches():
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    logger.info("Avg loss for epoch %d: %f" %
                (epoch + 1, np.array(losses).mean()))
  return np.array(losses).mean()
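# Call-site sketch for the training loop above (illustrative addition).
# `model` is assumed to be an instance of a concrete subclass implementing
# fit_on_batch, which is the only hook this generic loop relies on.
def _example_fit(model):
  import deepchem as dc

  X = np.random.rand(100, 8)
  y = np.random.rand(100, 1)
  train = dc.data.NumpyDataset(X, y)

  # Runs 5 passes over the data, delegating each mini-batch to fit_on_batch,
  # and returns the mean loss of the final epoch.
  avg_loss = model.fit(train, nb_epoch=5)
  return avg_loss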
def predict(self, dataset: Dataset,
            transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
  """
  Uses self to make predictions on provided Dataset object.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset to make prediction on
  transformers: list of dc.trans.Transformers
    Transformers that the input data has been transformed by. The output
    is passed through these transformers to undo the transformations.

  Returns
  -------
  a NumPy array if the model produces a single output, or a list of arrays
  if it produces multiple outputs
  """
  y_preds = []
  n_tasks = self.get_num_tasks()
  ind = 0
  for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_on_batch(X_batch)
    # Discard any padded predictions
    y_pred_batch = y_pred_batch[:n_samples]
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.concatenate(y_preds)
  return y_pred
def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
  """
  Fits a model on data in a Dataset object.

  Parameters
  ----------
  dataset: Dataset
    the Dataset to train on
  nb_epoch: int
    the number of epochs to train for

  Returns
  -------
  the average loss over the most recent epoch
  """
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    logger.info("Starting epoch %s" % str(epoch + 1))
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches():
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    logger.info("Avg loss for epoch %d: %f" %
                (epoch + 1, np.array(losses).mean()))
  return np.array(losses).mean()
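# To exercise the generic fit()/predict() loops end to end, here is a toy
# subclass sketch (illustrative addition). It assumes the enclosing class
# whose fit()/predict() appear above is named Model and that its constructor
# can be called without arguments; both are assumptions.
class _MeanRegressorSketch(Model):
  """Toy model: predicts the running mean of the labels it has seen."""

  def __init__(self, **kwargs):
    super(_MeanRegressorSketch, self).__init__(**kwargs)
    self._sum = 0.0
    self._count = 0

  def fit_on_batch(self, X, y, w):
    self._sum += float(np.sum(y))
    self._count += y.size
    # Report the batch loss as squared error against the current mean.
    return float(np.mean((y - self._sum / self._count)**2))

  def predict_on_batch(self, X):
    mean = self._sum / max(self._count, 1)
    return np.full((len(X), 1), mean)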
def __init__(self, dataset: Dataset):
  super(DuplicateBalancingTransformer, self).__init__(
      transform_X=True,
      transform_y=True,
      transform_w=True,
      transform_ids=True,
      dataset=dataset)

  if len(dataset.get_task_names()) > 1:
    raise ValueError(
        "This transformation is only defined for singletask datasets.")

  # Get the labels/weights
  y = dataset.y
  w = dataset.w
  # Normalize shapes
  if len(y.shape) == 1:
    y = np.reshape(y, (len(y), 1))
  if len(w.shape) == 1:
    w = np.reshape(w, (len(w), 1))
  if len(y.shape) != 2:
    raise ValueError("y must be of shape (N,) or (N, n_tasks)")
  if len(w.shape) != 2:
    raise ValueError("w must be of shape (N,) or (N, n_tasks)")

  self.classes = sorted(np.unique(y))
  # Remove labels with zero weights. Filter w with the same mask so the
  # per-class masks below line up with it.
  nonzero = (w != 0)
  y = y[nonzero]
  w = w[nonzero]
  N = len(y)
  class_weights = []
  # Note that we may have 0 elements of a given class since we remove those
  # labels with zero weight.
  for c in self.classes:
    # this works because y is 1D
    c_weight = np.sum(w[y == c])
    class_weights.append(c_weight)
  weight_largest = max(class_weights)
  # This is the right ratio since int(N/num_c) * num_c \approx N
  # for all classes
  duplication_ratio = [
      int(weight_largest / float(c_weight)) if c_weight > 0 else 0
      for c_weight in class_weights
  ]
  self.duplication_ratio = duplication_ratio
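# Worked sketch of the duplication-ratio arithmetic above on a toy imbalanced
# binary task (illustrative addition, plain NumPy).
def _example_duplication_ratio():
  y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])  # 8 negatives, 2 positives
  w = np.ones_like(y, dtype=float)              # every example has weight 1

  classes = sorted(np.unique(y))
  class_weights = [np.sum(w[y == c]) for c in classes]  # [8.0, 2.0]
  weight_largest = max(class_weights)                   # 8.0
  duplication_ratio = [
      int(weight_largest / float(cw)) if cw > 0 else 0 for cw in class_weights
  ]
  return duplication_ratio  # [1, 4]: each positive is duplicated 4 times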
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Create a generator that iterates batches for a dataset.

  Subclasses may override this method to customize how model inputs are
  generated from the data.

  Parameters
  ----------
  dataset: Dataset
    the data to iterate
  epochs: int
    the number of times to iterate over the full dataset
  mode: str
    allowed values are 'fit' (called during training), 'predict' (called
    during prediction), and 'uncertainty' (called during uncertainty
    prediction)
  deterministic: bool
    whether to iterate over the dataset in order, or randomly shuffle the
    data for each epoch
  pad_batches: bool
    whether to pad each batch up to this model's preferred batch size

  Returns
  -------
  a generator that iterates batches, each represented as a tuple of lists:
  ([inputs], [outputs], [weights])
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      yield ([X_b], [y_b], [w_b])
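# Sketch of consuming the generator above (illustrative addition). `model` is
# assumed to be a concrete instance with `batch_size` set; the generator only
# depends on that attribute.
def _example_default_generator(model):
  import deepchem as dc

  dataset = dc.data.NumpyDataset(np.random.rand(10, 4), np.random.rand(10, 1))
  for inputs, outputs, weights in model.default_generator(
      dataset, epochs=1, pad_batches=False):
    X_b, y_b, w_b = inputs[0], outputs[0], weights[0]
    print(X_b.shape, y_b.shape, w_b.shape)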
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Convert a dataset into the tensors needed for learning.

  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to convert
  epochs: int, optional (Default 1)
    Number of times to walk over `dataset`
  mode: str, optional (Default 'fit')
    Ignored in this implementation.
  deterministic: bool, optional (Default True)
    Whether the dataset should be walked in a deterministic fashion
  pad_batches: bool, optional (Default True)
    If true, each returned batch will have size `self.batch_size`.

  Returns
  -------
  Iterator which walks over the batches
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      if y_b is not None:
        if self.mode == 'classification':
          y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
              -1, self.n_tasks, self.n_classes)
      inputs = self.compute_features_on_batch(X_b)
      yield (inputs, [y_b], [w_b])
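# Sketch of the classification label handling above in isolation (illustrative
# addition): labels are flattened, one-hot encoded, and reshaped back to
# (batch, n_tasks, n_classes). Uses deepchem.metrics.to_one_hot.
def _example_one_hot_labels():
  from deepchem.metrics import to_one_hot

  n_tasks, n_classes = 3, 2
  y_b = np.array([[0, 1, 1],
                  [1, 0, 1]])  # (batch=2, n_tasks=3) integer class labels
  one_hot = to_one_hot(y_b.flatten(), n_classes).reshape(
      -1, n_tasks, n_classes)
  # one_hot.shape == (2, 3, 2); one_hot[0, 1] == [0., 1.] (label 1)
  return one_hot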
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Convert a dataset into the tensors needed for learning.

  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to convert
  epochs: int, optional (Default 1)
    Number of times to walk over `dataset`
  mode: str, optional (Default 'fit')
    Ignored in this implementation.
  deterministic: bool, optional (Default True)
    Whether the dataset should be walked in a deterministic fashion
  pad_batches: bool, optional (Default True)
    If true, each returned batch will have size `self.batch_size`.

  Returns
  -------
  Iterator which walks over the batches
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      if y_b is not None:
        if self.mode == 'classification':
          y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
              -1, self.n_tasks, self.n_classes)
      atom_feat = []
      pair_feat = []
      atom_split = []
      atom_to_pair = []
      pair_split = []
      start = 0
      for im, mol in enumerate(X_b):
        n_atoms = mol.get_num_atoms()
        # number of atoms in each molecule
        atom_split.extend([im] * n_atoms)
        # index of pair features
        C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
        atom_to_pair.append(
            np.transpose(
                np.array([C1.flatten() + start, C0.flatten() + start])))
        # number of pairs for each atom
        pair_split.extend(C1.flatten() + start)
        start = start + n_atoms

        # atom features
        atom_feat.append(mol.get_atom_features())
        # pair features
        pair_feat.append(
            np.reshape(mol.get_pair_features(),
                       (n_atoms * n_atoms, self.n_pair_feat[0])))

      inputs = [
          np.concatenate(atom_feat, axis=0),
          np.concatenate(pair_feat, axis=0),
          np.array(pair_split),
          np.array(atom_split),
          np.concatenate(atom_to_pair, axis=0)
      ]
      yield (inputs, [y_b], [w_b])
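# Trace of the pair-indexing bookkeeping above for two toy molecules with 2
# and 3 atoms (illustrative addition, plain NumPy; no featurizer needed).
def _example_atom_pair_indexing():
  n_atoms_per_mol = [2, 3]
  atom_split, atom_to_pair, pair_split = [], [], []
  start = 0
  for im, n_atoms in enumerate(n_atoms_per_mol):
    atom_split.extend([im] * n_atoms)
    C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
    atom_to_pair.append(
        np.transpose(np.array([C1.flatten() + start, C0.flatten() + start])))
    pair_split.extend(C1.flatten() + start)
    start += n_atoms

  atom_to_pair = np.concatenate(atom_to_pair, axis=0)
  # atom_split == [0, 0, 1, 1, 1]: which molecule each atom belongs to
  # atom_to_pair[:4] == [[0, 0], [0, 1], [1, 0], [1, 1]]: all pairs in mol 0
  # pair_split[:4] == [0, 0, 1, 1]: the first atom index of each pair
  return atom_split, atom_to_pair, pair_split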
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load the BACE-1 dataset as a regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]
  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = ("[" + ",".join(
      ["'%s'" % column for column in dataset.columns.values[:num_display]]) +
                    ",...]")

  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)

  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s" %
        str(crystal_dataset.shape[0]))

  # Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")

  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(
      tasks=bace_tasks,
      smiles_field="mol",
      id_field="CID",
      featurizer=featurizer)

  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)

  if (not reload or not os.path.exists(train_dir) or
      not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  # NOTE THE RENAMING:
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset
  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)
    ]
    if mode == "regression":
      output_transformers = [
          NormalizationTransformer(transform_y=True, dataset=train_dataset)
      ]
    else:
      output_transformers = []
  else:
    input_transformers, output_transformers = [], []

  transformers = input_transformers + output_transformers
  # Apply the transformers; keep the returned datasets so the transformed
  # versions (rather than the untransformed originals) are returned below.
  datasets = [train_dataset, valid_dataset, test_dataset, crystal_dataset]
  for i, dataset in enumerate(datasets):
    for transformer in transformers:
      transformed = transformer.transform(dataset)
      if transformed is not None:
        dataset = transformed
    datasets[i] = dataset
  train_dataset, valid_dataset, test_dataset, crystal_dataset = datasets

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
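# Usage sketch for load_bace (illustrative addition). The dataset paths are
# resolved relative to this file, so this only runs from a repository checkout
# that contains the BACE CSV files referenced above.
def _example_load_bace():
  tasks, train, valid, test, crystal, transformers = load_bace(
      mode="classification", transform=True, split="20-80")
  print(tasks)  # ['Class']
  print(len(train), len(valid), len(test), len(crystal))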