def evaluate_error_class2(self, dataset, transformers=[]):
  """
  Evaluate the error in energy and gradient components, forcebalance-style.

  TODO(rbharath): Should be a method of a PhysicalModel subclass. Also, need to
  find a better name for this method (class2 doesn't tell us anything about the
  semantics of this method).
  """
  y_preds = []
  y_train = []
  grads = []
  batch_size = self.model_params["batch_size"]
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
    # untransformed E is needed for undo_grad_transform
    energy_batch = self.predict_on_batch(X_batch)
    grad_batch = self.predict_grad_on_batch(X_batch)
    grad_batch = undo_grad_transforms(grad_batch, energy_batch, transformers)
    grads.append(grad_batch)
    y_pred_batch = np.reshape(energy_batch, y_batch.shape)
    # y_pred_batch gives us the pred E and pred multitask trained gradE
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
    # undo transforms on y_batch should know how to handle E and gradE separately
    y_batch = undo_transforms(y_batch, transformers)
    y_train.append(y_batch)
  y_pred = np.vstack(y_preds)
  y = np.vstack(y_train)
  grad = np.vstack(grads)

  n_samples, n_tasks = len(dataset), len(self.tasks)
  n_atoms = int((n_tasks - 1) / 3)
  y_pred = np.reshape(y_pred, (n_samples, n_tasks))
  y = np.reshape(y, (n_samples, n_tasks))
  grad_train = y[:, 1:]
  energy_error = y[:, 0] - y_pred[:, 0]
  energy_error = np.sqrt(np.mean(energy_error * energy_error)) * 2625.5002
  grad = np.reshape(grad, (n_samples, n_atoms, 3))
  grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))
  grad_error = grad - grad_train
  grad_error = np.sqrt(np.mean(grad_error * grad_error)) * 4961.47596096
  print("Energy error (RMSD): %f kJ/mol" % energy_error)
  print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
  return energy_error, grad_error

def evaluate_error(self, dataset, transformers=[]):
  """
  Evaluate the error in energy and gradient components, forcebalance-style.

  TODO(rbharath): This looks like it should be a subclass method for a
  PhysicalMethod class. forcebalance style errors aren't meaningful for most
  chem-informatic datasets.
  """
  y_preds = []
  y_train = []
  batch_size = self.model_params["batch_size"]
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
    y_pred_batch = self.predict_on_batch(X_batch)
    y_pred_batch = np.reshape(y_pred_batch, y_batch.shape)
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
    y_batch = undo_transforms(y_batch, transformers)
    y_train.append(y_batch)
  y_pred = np.vstack(y_preds)
  y = np.vstack(y_train)

  n_samples, n_tasks = len(dataset), len(self.tasks)
  n_atoms = int((n_tasks - 1) / 3)
  y_pred = np.reshape(y_pred, (n_samples, n_tasks))
  y = np.reshape(y, (n_samples, n_tasks))
  grad = y_pred[:, 1:]
  grad_train = y[:, 1:]
  energy_error = y[:, 0] - y_pred[:, 0]
  # convert Hartree to kJ/mol
  energy_error = np.sqrt(np.mean(energy_error * energy_error)) * 2625.5002
  grad = np.reshape(grad, (n_samples, n_atoms, 3))
  grad_train = np.reshape(grad_train, (n_samples, n_atoms, 3))
  grad_error = grad - grad_train
  # convert Hartree/bohr to kJ/mol/Angstrom
  grad_error = np.sqrt(np.mean(grad_error * grad_error)) * 4961.47596096
  print("Energy error (RMSD): %f kJ/mol" % energy_error)
  print("Grad error (RMSD): %f kJ/mol/A" % grad_error)
  return energy_error, grad_error

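# Note on the conversion factors used above (this sketch is not part of the
# original code): 1 Hartree ~= 2625.4996 kJ/mol and 1 bohr ~= 0.52917721
# Angstrom, so a gradient in Hartree/bohr converts to kJ/mol/Angstrom via
# 2625.5 / 0.529177. A minimal sanity check, assuming only these constants:
HARTREE_TO_KJ_PER_MOL = 2625.4996   # kJ/mol per Hartree
BOHR_TO_ANGSTROM = 0.52917721       # Angstrom per bohr
print(HARTREE_TO_KJ_PER_MOL)                      # ~2625.50, cf. 2625.5002 above
print(HARTREE_TO_KJ_PER_MOL / BOHR_TO_ANGSTROM)   # ~4961.47, cf. 4961.47596096 above
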
def predict_proba(self, dataset, transformers=[], batch_size=None,
                  n_classes=2, pad_batches=False):
  """
  TODO: Do transformers even make sense here?

  Returns:
    y_pred: numpy ndarray of shape (n_samples, n_tasks, n_classes)
  """
  y_preds = []
  n_tasks = self.get_num_tasks()
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
      batch_size, deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_proba_on_batch(X_batch, pad_batch=pad_batches)
    y_pred_batch = y_pred_batch[:n_samples]
    y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks, n_classes))
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.vstack(y_preds)

  # The iterbatches does padding with zero-weight examples on the last batch.
  # Remove padded examples.
  n_samples = len(dataset)
  y_pred = y_pred[:n_samples]
  y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
  return y_pred

def predict(self, dataset, transformers=[], batch_size=None, pad_batches=False):
  """
  Uses self to make predictions on provided Dataset object.

  Returns:
    y_pred: numpy ndarray of shape (n_samples,) for singletask models,
      or (n_samples, n_tasks) for multitask models.
  """
  y_preds = []
  n_tasks = self.get_num_tasks()
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
      batch_size, deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_on_batch(X_batch, pad_batch=pad_batches)
    # Discard any padded predictions
    y_pred_batch = y_pred_batch[:n_samples]
    y_pred_batch = np.reshape(y_pred_batch, (n_samples, n_tasks))
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.vstack(y_preds)

  # The iterbatches does padding with zero-weight examples on the last batch.
  # Remove padded examples.
  n_samples = len(dataset)
  y_pred = np.reshape(y_pred, (n_samples, n_tasks))
  # Special case to handle singletasks.
  if n_tasks == 1:
    y_pred = np.reshape(y_pred, (n_samples,))
  return y_pred

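# Hedged usage sketch (variable names below are illustrative, not from the
# original code). Both predict and predict_proba iterate the dataset in
# deterministic batches, undo any output transformers, and then strip the
# zero-weight padding that iterbatches adds to the final batch:
#
#   y_hat = model.predict(test_dataset, transformers=output_transformers)
#   # (n_samples,) for a single task, (n_samples, n_tasks) otherwise
#   p_hat = model.predict_proba(test_dataset, transformers=output_transformers)
#   # (n_samples, n_tasks, n_classes)
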
def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                              threshold=None):
  """
  Computes statistics of model on test data and saves results to csv.
  """
  y = self.dataset.get_labels()
  y = undo_transforms(y, self.output_transformers)
  w = self.dataset.get_weights()

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  if mode == "classification":
    y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
    y_pred_print = self.model.predict(
        self.dataset, self.output_transformers).astype(int)
  else:
    y_pred = self.model.predict(self.dataset, self.output_transformers)
    y_pred_print = y_pred
  multitask_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbosity)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbosity)
    self.output_statistics(multitask_scores, stats_out)

  return multitask_scores

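# Hedged usage sketch for the evaluator method above (metric and file names are
# illustrative). Classification metrics are scored against predict_proba
# output, regression metrics against predict output, and the returned dict is
# keyed by each metric's name:
#
#   scores = evaluator.compute_model_performance(
#       [roc_auc_metric], csv_out="preds.csv", stats_out="stats.txt")
#   # e.g. {"roc_auc_score": 0.87}
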
def test_fd_grad(self, dataset, transformers=[]):
  """
  Uses self to calculate finite difference gradient on provided Dataset object.
  Currently only useful if your task is energy and self contains
  predict_grad_on_batch.

  TODO(rbharath): This shouldn't be a method of the Model class. Perhaps a
  method of PhysicalModel subclass. Leaving it in for time-being while
  refactoring continues.

  Returns:
    y_pred: numpy ndarray of shape (n_samples,)
  """
  y_preds = []
  batch_size = self.model_params["batch_size"]
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size):
    for xb in X_batch:
      num_atoms = xb.shape[0]
      coords = 3
      h = 0.001
      fd_batch = []
      # Filling a new batch with displaced geometries
      for i in xrange(num_atoms):
        for j in xrange(coords):
          displace = np.zeros((num_atoms, coords))
          displace[i][j] += h / 2
          fd_batch.append(xb + displace)
          fd_batch.append(xb - displace)
      fd_batch = np.asarray(fd_batch)
      # Predict energy on displaced geometry batch
      y_pred_batch = self.predict_on_batch(fd_batch)
      energy = y_pred_batch[:, 0]
      y_pred_batch = undo_transforms(y_pred_batch, transformers)
      y_pred_batch = y_pred_batch[:, 0]
      y_pred_batch = np.reshape(y_pred_batch, (3 * num_atoms, 2))
      fd_grads = []
      # Calculate numerical gradient by centered finite difference
      for x in y_pred_batch:
        fd_grads.append((x[0] - x[1]) / h)
      fd_grads = np.asarray(fd_grads)
      fd_grads = np.reshape(fd_grads, (num_atoms, coords))
      xb = np.asarray([xb])
      y_pred_batch = self.predict_grad_on_batch(xb)
      y_pred_batch = undo_grad_transforms(energy, y_pred_batch, transformers)
      # Calculate error between symbolic gradient and numerical gradient
      y_pred_batch = y_pred_batch - fd_grads
      #print(y_pred_batch)
      y_preds.append(y_pred_batch)
  y_pred = np.vstack(y_preds)
  return y_pred

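# A minimal, self-contained sketch (not from the original code) of the centered
# finite-difference scheme used in test_fd_grad: each coordinate is displaced
# by +h/2 and -h/2, and the gradient component is (E(x + h/2) - E(x - h/2)) / h.
import numpy as np

def central_difference(f, x, h=0.001):
  """Numerical gradient of a scalar function f at a 1-D point x."""
  grad = np.zeros_like(x)
  for i in range(len(x)):
    displace = np.zeros_like(x)
    displace[i] = h / 2
    grad[i] = (f(x + displace) - f(x - displace)) / h
  return grad

# Example: f(x) = sum(x**2) has exact gradient 2*x.
x0 = np.array([1.0, -2.0, 0.5])
print(central_difference(lambda x: np.sum(x ** 2), x0))  # ~[2.0, -4.0, 1.0]
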
def predict(self, dataset, transformers=[]): """ Uses self to make predictions on provided Dataset object. Returns: y_pred: numpy ndarray of shape (n_samples,) """ y_preds = [] batch_size = self.model_params["batch_size"] n_tasks = len(self.tasks) for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size, deterministic=True): n_samples = len(X_batch) y_pred_batch = np.reshape(self.predict_on_batch(X_batch), (n_samples, n_tasks)) y_pred_batch = undo_transforms(y_pred_batch, transformers) y_preds.append(y_pred_batch) y_pred = np.vstack(y_preds) # The iterbatches does padding with zero-weight examples on the last batch. # Remove padded examples. n_samples, n_tasks = len(dataset), len(self.tasks) y_pred = np.reshape(y_pred, (n_samples, n_tasks)) # Special case to handle singletasks. if n_tasks == 1: y_pred = np.reshape(y_pred, (n_samples, )) return y_pred
def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                              threshold=None):
  """
  Computes statistics of model on test data and saves results to csv.
  """
  y = self.dataset.get_labels()
  y = undo_transforms(y, self.transformers)
  w = self.dataset.get_weights()

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  if mode == "classification":
    y_pred = self.model.predict_proba(self.dataset, self.transformers)
    y_pred_print = self.model.predict(
        self.dataset, self.transformers).astype(int)
  else:
    y_pred = self.model.predict(self.dataset, self.transformers)
    y_pred_print = y_pred
  multitask_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbosity)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbosity)
    self.output_statistics(multitask_scores, stats_out)

  return multitask_scores

def predict(self, dataset, transformers=[]): """ Uses self to make predictions on provided Dataset object. Returns: y_pred: numpy ndarray of shape (n_samples,) """ y_preds = [] batch_size = self.model_params["batch_size"] n_tasks = len(self.tasks) for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(batch_size, deterministic=True): n_samples = len(X_batch) y_pred_batch = np.reshape(self.predict_on_batch(X_batch), (n_samples, n_tasks)) y_pred_batch = undo_transforms(y_pred_batch, transformers) y_preds.append(y_pred_batch) y_pred = np.vstack(y_preds) # The iterbatches does padding with zero-weight examples on the last batch. # Remove padded examples. n_samples, n_tasks = len(dataset), len(self.tasks) y_pred = np.reshape(y_pred, (n_samples, n_tasks)) # Special case to handle singletasks. if n_tasks == 1: y_pred = np.reshape(y_pred, (n_samples,)) return y_pred
def predict(self, dataset, transformers=[]): """ Prediction for multitask models. """ n_tasks = len(self.tasks) n_samples = len(dataset) y_pred = np.zeros((n_samples, n_tasks)) for ind, task in enumerate(self.tasks): task_model = self.model_builder(self.task_model_dirs[task]) task_model.reload() y_pred[:, ind] = task_model.predict(dataset, []) y_pred = undo_transforms(y_pred, transformers) return y_pred
def predict(self, dataset, transformers=[]): """ Prediction for multitask models. """ n_tasks = len(self.tasks) n_samples = len(dataset) y_pred = np.zeros((n_samples, n_tasks)) for ind, task in enumerate(self.tasks): task_type = self.task_types[task] if self.store_in_memory: task_model = self.task_models[task] else: task_model = self.model_builder( [task], {task: self.task_types[task]}, self.model_params, self.task_model_dirs[task], verbosity=self.verbosity) task_model.reload() y_pred[:, ind] = task_model.predict(dataset, []) y_pred = undo_transforms(y_pred, transformers) return y_pred
def predict(self, dataset, transformers=[]): """ Prediction for multitask models. """ n_tasks = len(self.tasks) n_samples = len(dataset) y_pred = np.zeros((n_samples, n_tasks)) for ind, task in enumerate(self.tasks): task_type = self.task_types[task] if self.store_in_memory: task_model = self.task_models[task] else: task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params, self.task_model_dirs[task], verbosity=self.verbosity) task_model.reload() y_pred[:, ind] = task_model.predict(dataset, []) y_pred = undo_transforms(y_pred, transformers) return y_pred
def predict_proba(self, dataset, transformers=[], n_classes=2):
  """
  TODO: Do transformers even make sense here?

  Returns:
    y_pred: numpy ndarray of shape (n_samples, n_tasks, n_classes)
  """
  y_preds = []
  batch_size = self.model_params["batch_size"]
  n_tasks = len(self.tasks)
  for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
      batch_size, deterministic=True):
    y_pred_batch = self.predict_proba_on_batch(X_batch)
    batch_size = len(y_batch)
    y_pred_batch = np.reshape(y_pred_batch, (batch_size, n_tasks, n_classes))
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.vstack(y_preds)

  # The iterbatches does padding with zero-weight examples on the last batch.
  # Remove padded examples.
  n_samples, n_tasks = len(dataset), len(self.tasks)
  y_pred = y_pred[:n_samples]
  y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
  return y_pred
