def fit(self, dataset, nb_epoch=10, batch_size=50, pad_batches=False,
        **kwargs):
  """
  Fits a model on data in a Dataset object.
  """
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    log("Starting epoch %s" % str(epoch + 1), self.verbosity)
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, pad_batches=pad_batches):
      if self.fit_transformers:
        X_batch, y_batch, w_batch = self.transform_on_batch(
            X_batch, y_batch, w_batch)
      if pad_batches:
        X_batch, y_batch, w_batch, ids_batch = pad_batch(
            batch_size, X_batch, y_batch, w_batch, ids_batch)
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
        self.verbosity)
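# Hypothetical usage of the fit() method above; the model and dataset names
# are illustrative placeholders, not taken from the source:
#
#   model.fit(train_dataset, nb_epoch=20, batch_size=32, pad_batches=True)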
def predict_proba_on_batch(self, support, test_batch):
  """Make predictions on batch of data."""
  n_samples = len(test_batch)
  padded_test_batch = NumpyDataset(*pad_batch(
      self.test_batch_size, test_batch.X, test_batch.y, test_batch.w,
      test_batch.ids))
  feed_dict = self.construct_feed_dict(padded_test_batch, support)
  # Get scores
  pred, scores = self.sess.run([self.pred_op, self.scores_op],
                               feed_dict=feed_dict)
  y_pred_batch = to_one_hot(np.round(pred))
  return y_pred_batch
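# The method above rounds the raw predictions and converts them to one-hot
# rows via to_one_hot. A minimal sketch of such a binary one-hot helper is
# shown below; it assumes a 1-D array of 0/1 labels and two classes, and is an
# illustration of the conversion, not the library's actual implementation.
import numpy as np

def to_one_hot_sketch(y, n_classes=2):
  """Converts integer class labels to one-hot rows (illustrative only)."""
  n_samples = len(y)
  y_hot = np.zeros((n_samples, n_classes))
  # Set the column corresponding to each sample's class label to 1.
  y_hot[np.arange(n_samples), np.asarray(y, dtype=int)] = 1
  return y_hot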
def fit(self, dataset):
  """
  Fits a model on data in a Dataset object.
  """
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  batch_size = self.model_params["batch_size"]
  if "pad_batches" in self.model_params:
    pad_batches = self.model_params["pad_batches"]
  else:
    pad_batches = False
  for epoch in range(self.model_params["nb_epoch"]):
    log("Starting epoch %s" % str(epoch + 1), self.verbosity)
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size, pad_batches=pad_batches):
      if self.fit_transformers:
        X_batch, y_batch, w_batch = self.transform_on_batch(
            X_batch, y_batch, w_batch)
      if pad_batches:
        X_batch, y_batch, w_batch, ids_batch = pad_batch(
            batch_size, X_batch, y_batch, w_batch, ids_batch)
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
        self.verbosity)
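# Both fit() variants above and the test below rely on a pad_batch helper that
# grows a short batch to exactly batch_size rows. A minimal sketch consistent
# with that usage is given here: it tiles the batch contents until batch_size
# entries are available and then truncates. This illustrates the padding idea
# and is not necessarily the library's actual implementation.
import numpy as np

def pad_batch_sketch(batch_size, X_b, y_b, w_b, ids_b):
  """Pads a batch by repeating its samples until it has batch_size rows."""
  num_samples = len(X_b)
  if num_samples == batch_size:
    return (X_b, y_b, w_b, ids_b)
  # Number of copies needed so that tiling covers at least batch_size samples.
  num_tiles = int(np.ceil(float(batch_size) / num_samples))
  X_out = np.concatenate([X_b] * num_tiles, axis=0)[:batch_size]
  y_out = np.concatenate([y_b] * num_tiles, axis=0)[:batch_size]
  w_out = np.concatenate([w_b] * num_tiles, axis=0)[:batch_size]
  ids_out = np.concatenate([ids_b] * num_tiles, axis=0)[:batch_size]
  return (X_out, y_out, w_out, ids_out)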
def test_pad_batches(self):
  """Test that pad_batch pads batches correctly."""
  batch_size = 100
  num_features = 10
  num_tasks = 5

  # Test case where 2 * n_samples < batch_size
  n_samples = 29
  X_b = np.zeros((n_samples, num_features))
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  # Test case where n_samples < batch_size < 2 * n_samples
  n_samples = 79
  X_b = np.zeros((n_samples, num_features))
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  # Test case where n_samples == batch_size
  n_samples = 100
  X_b = np.zeros((n_samples, num_features))
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  # Test case for object featurization.
  n_samples = 2
  X_b = np.array([{"a": 1}, {"b": 2}])
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  # Test case for more complicated object featurization.
  n_samples = 2
  X_b = np.array([(1, {"a": 1}), (2, {"b": 2})])
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size

  # Test case with multidimensional data.
  n_samples = 50
  num_atoms = 15
  d = 3
  X_b = np.zeros((n_samples, num_atoms, d))
  y_b = np.zeros((n_samples, num_tasks))
  w_b = np.zeros((n_samples, num_tasks))
  ids_b = np.zeros((n_samples,))
  X_out, y_out, w_out, ids_out = pad_batch(batch_size, X_b, y_b, w_b, ids_b)
  assert len(X_out) == len(y_out) == len(w_out) == len(ids_out) == batch_size