def fit(self, training_data): """Train the network. Parameters ---------- training_data : list of pairs In each pair, the first element should be a list of items from the vocabulary (for the NLI task, this is the concatenation of the premise and hypothesis), and the second element should be the one-hot label vector. Attributes ---------- self.output_dim : int Set based on the length of the labels in `training_data`. self.W_xh : np.array Dense connections between the word representations and the hidden layers self.W_hh : np.array Dense connections between the hidden representations. self.W_hy : np.array Dense connections from the final hidden layer to the output layer. """ self.output_dim = len(training_data[0][1]) self.W_xh = randmatrix(self.word_dim, self.hidden_dim) self.W_hh = randmatrix(self.hidden_dim, self.hidden_dim) self.W_hy = randmatrix(self.hidden_dim, self.output_dim) # SGD: iteration = 0 error = sys.float_info.max while error > self.epsilon and iteration < self.maxiter: error = 0.0 random.shuffle(training_data) for seq, labels in training_data: self._forward_propagation(seq) # Cross-entropy error reduces to log(prediction-for-correct-label): error += -np.log(self.y[np.argmax(labels)]) # Back-prop: d_W_hy, d_W_hh, d_W_xh = self._backward_propagation(seq, labels) # Updates: self.W_hy -= self.eta * d_W_hy self.W_hh -= self.eta * d_W_hh self.W_xh -= self.eta * d_W_xh iteration += 1 if self.display_progress: # Report the average error: error /= len(training_data) progress_bar("Finished epoch %s of %s; error is %s" % (iteration, self.maxiter, error)) if self.display_progress: sys.stderr.write('\n')
def fit(self, training_data): """The training algorithm. Parameters ---------- training_data : list A list of (example, label) pairs, where `example` and `label` are both np.array instances. Attributes ---------- self.x : the input layer self.h : the hidden layer self.y : the output layer self.W1 : dense weight connection from self.x to self.h self.W2 : dense weight connection from self.h to self.y Both self.W1 and self.W2 have the bias as their final column. The following attributes are created here for efficiency but used only in `backward_propagation`: self.y_err : vector of output errors self.x_err : vector of input errors """ # Dimensions determined by the data: self.input_dim = len(training_data[0][0]) self.output_dim = len(training_data[0][1]) # Parameter initialization: self.x = np.ones(self.input_dim + 1) # +1 for the bias self.h = np.ones(self.hidden_dim + 1) # +1 for the bias self.y = np.ones(self.output_dim) self.W1 = utils.randmatrix(self.input_dim + 1, self.hidden_dim) self.W2 = utils.randmatrix(self.hidden_dim + 1, self.output_dim) self.y_err = np.zeros(self.output_dim) self.x_err = np.zeros(self.input_dim + 1) # SGD: iteration = 0 error = sys.float_info.max while error > self.epsilon and iteration < self.maxiter: error = 0.0 random.shuffle(training_data) for ex, labels in training_data: self.forward_propagation(ex) error += self.backward_propagation(labels) iteration += 1 if self.display_progress: utils.progress_bar('completed iteration %s; error is %s' % (iteration, error)) if self.display_progress: sys.stderr.write('\n')
def fit(self, training_data): """The training algorithm. Parameters ---------- training_data : list A list of (example, label) pairs, where `example` and `label` are both np.array instances. Attributes ---------- self.x : the input layer self.h : the hidden layer self.y : the output layer self.W1 : dense weight connection from self.x to self.h self.W2 : dense weight connection from self.h to self.y Both self.W1 and self.W2 have the bias as their final column. The following attributes are created here for efficiency but used only in `backward_propagation`: self.y_err : vector of output errors self.x_err : vector of input errors """ # Dimensions determined by the data: self.input_dim = len(training_data[0][0]) self.output_dim = len(training_data[0][1]) # Parameter initialization: self.x = np.ones(self.input_dim+1) # +1 for the bias self.h = np.ones(self.hidden_dim+1) # +1 for the bias self.y = np.ones(self.output_dim) self.W1 = utils.randmatrix(self.input_dim+1, self.hidden_dim) self.W2 = utils.randmatrix(self.hidden_dim+1, self.output_dim) self.y_err = np.zeros(self.output_dim) self.x_err = np.zeros(self.input_dim+1) # SGD: iteration = 0 error = sys.float_info.max while error > self.epsilon and iteration < self.maxiter: error = 0.0 random.shuffle(training_data) for ex, labels in training_data: self.forward_propagation(ex) error += self.backward_propagation(labels) iteration += 1 if self.display_progress: utils.progress_bar('completed iteration %s; error is %s' % (iteration, error)) if self.display_progress: sys.stderr.write('\n')
def test_np_autoencoder(hidden_activation, d_hidden_activation):
    model = Autoencoder(
        max_iter=10,
        hidden_dim=2,
        hidden_activation=hidden_activation,
        d_hidden_activation=d_hidden_activation)
    # A tiny dataset so that we can run `fit` and set all the model
    # parameters:
    X = utils.randmatrix(5, 5)
    model.fit(X)
    # Use the first example for the check:
    ex = X[0]
    label = X[0]
    # Forward and backward to get the gradients:
    hidden, pred = model.forward_propagation(ex)
    d_W_hy, d_b_hy, d_W_xh, d_b_xh = model.backward_propagation(
        hidden, pred, ex, label)
    # Model parameters to check:
    param_pairs = (
        ('W_hy', d_W_hy),
        ('b_hy', d_b_hy),
        ('W_xh', d_W_xh),
        ('b_xh', d_b_xh)
    )
    gradient_check(param_pairs, model, ex, label)
def test_np_shallow_neural_classifier_gradients(
        hidden_activation, d_hidden_activation):
    model = ShallowNeuralClassifier(
        max_iter=10,
        hidden_activation=hidden_activation,
        d_hidden_activation=d_hidden_activation)
    # A tiny dataset so that we can run `fit` and set all the model
    # parameters:
    X = utils.randmatrix(5, 2)
    y = np.random.choice((0, 1), 5)
    model.fit(X, y)
    # Use the first example for the check:
    ex = X[0]
    label = model._onehot_encode([y[0]])[0]
    # Forward and backward to get the gradients:
    hidden, pred = model.forward_propagation(ex)
    d_W_hy, d_b_hy, d_W_xh, d_b_xh = model.backward_propagation(
        hidden, pred, ex, label)
    # Model parameters to check:
    param_pairs = (
        ('W_hy', d_W_hy),
        ('b_hy', d_b_hy),
        ('W_xh', d_W_xh),
        ('b_xh', d_b_xh)
    )
    gradient_check(param_pairs, model, ex, label)
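The `gradient_check` helper called in the two tests above is defined elsewhere in the test suite and is not shown in these snippets. As a rough illustration of the technique it implements, here is a minimal centered finite-difference check; the function name, signature, and tolerances below are assumptions for illustration, not the repository's actual helper.

import numpy as np

def finite_difference_gradient_check(param_pairs, loss_fn, epsilon=1e-4, atol=1e-6):
    """Compare analytic gradients against centered finite differences.

    `param_pairs` is a sequence of (param_array, analytic_grad) pairs,
    and `loss_fn` is a zero-argument callable that recomputes the loss
    from the current parameter values, so that perturbing a parameter
    in place changes its return value.
    """
    for param, grad in param_pairs:
        it = np.nditer(param, flags=['multi_index'])
        while not it.finished:
            idx = it.multi_index
            original = param[idx]
            # Perturb the single parameter value in both directions:
            param[idx] = original + epsilon
            loss_plus = loss_fn()
            param[idx] = original - epsilon
            loss_minus = loss_fn()
            param[idx] = original  # restore
            numeric = (loss_plus - loss_minus) / (2 * epsilon)
            assert np.isclose(numeric, grad[idx], atol=atol), \
                "Gradient mismatch at {}: {} vs {}".format(idx, numeric, grad[idx])
            it.iternext()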
def test_tf_autoencoder():
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X = utils.randmatrix(20, 50)
    ae = TfAutoencoder(hidden_dim=5, max_iter=100)
    ae.fit(X)
    ae.predict(X)
def test_torch_autoencoder_save_load():
    X = utils.randmatrix(20, 50)
    mod = torch_autoencoder.TorchAutoencoder(hidden_dim=5, max_iter=2)
    mod.fit(X)
    mod.predict(X)
    with tempfile.NamedTemporaryFile(mode='wb') as f:
        name = f.name
        mod.to_pickle(name)
        mod2 = torch_autoencoder.TorchAutoencoder.from_pickle(name)
        mod2.predict(X)
        mod2.fit(X)
def test_torch_autoencoder(pandas):
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X = utils.randmatrix(20, 50)
    if pandas:
        X = pd.DataFrame(X)
    ae = torch_autoencoder.TorchAutoencoder(hidden_dim=5, max_iter=100)
    H = ae.fit(X)
    ae.predict(X)
    H_is_pandas = isinstance(H, pd.DataFrame)
    assert H_is_pandas == pandas
def weight_init(m, n):
    """Uses the Xavier Glorot method for initializing the weights of
    an `m` by `n` matrix.

    Parameters
    ----------
    m : int
        Row dimension
    n : int
        Column dimension

    Returns
    -------
    np.array, shape `(m, n)`

    """
    x = np.sqrt(6.0 / (m + n))
    return randmatrix(m, n, lower=-x, upper=x)
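A brief usage sketch for the Glorot initialization above. The local `randmatrix` here is a stand-in with the signature these snippets assume for `utils.randmatrix` (uniform sampling over `[lower, upper)`); the seed and dimensions are illustrative only.

import numpy as np

np.random.seed(42)

def randmatrix(m, n, lower=-0.5, upper=0.5):
    # Uniform random matrix, mirroring the assumed `utils.randmatrix` API.
    return np.random.uniform(low=lower, high=upper, size=(m, n))

# Glorot/Xavier uniform bound: sqrt(6 / (fan_in + fan_out)).
W = weight_init(50, 100)
bound = np.sqrt(6.0 / (50 + 100))
assert W.shape == (50, 100)
assert np.all(np.abs(W) <= bound)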
def fit(self, df):
    """Learn the GloVe matrix.

    Parameters
    ----------
    df : pd.DataFrame or np.array, shape `(n_vocab, n_vocab)`
        This should be a matrix of (possibly scaled) co-occurrence
        counts.

    Returns
    -------
    pd.DataFrame or np.array, shape `(n_vocab, self.n)`
        The type will be the same as the user's `df`. If it's a
        `pd.DataFrame`, the index will be the same as `df.index`.

    """
    X = self.convert_input_to_array(df)
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, self.n)  # Word weights.
    C = utils.randmatrix(m, self.n)  # Context weights.
    B = utils.randmatrix(2, m)       # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, self.xmax) / self.xmax)**self.alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(self.max_iter):
        epoch_error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= self.eta * wgrad
                C[j] -= self.eta * cgrad
                B[0, i] -= self.eta * wbgrad
                B[1, j] -= self.eta * wcgrad
                # One-half squared error term:
                epoch_error += 0.5 * weight * (diff**2)
        epoch_error /= m
        if epoch_error <= self.tol:
            utils.progress_bar(
                "Converged on iteration {} with error {}".format(
                    iteration, epoch_error),
                self.display_progress)
            break
        utils.progress_bar(
            "Finished epoch {} of {}; error is {}".format(
                iteration, self.max_iter, epoch_error),
            self.display_progress)
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    self.embedding = self.convert_output(G, df)
    return self.embedding
import pytest
import torch.nn as nn

import utils
from test_torch_model_base import PARAMS_WITH_TEST_VALUES as BASE_PARAMS
from torch_tree_nn import TorchTreeNN
from torch_tree_nn import simple_example
from np_tree_nn import TreeNN
from np_tree_nn import simple_example as np_simple_example

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2021"

utils.fix_random_seeds()

PARAMS_WITH_TEST_VALUES = [
    ["embed_dim", 10],
    ["embedding", utils.randmatrix(4, 10)],
    ["hidden_activation", nn.ReLU()],
    ['freeze_embedding', True]]

PARAMS_WITH_TEST_VALUES += BASE_PARAMS


@pytest.fixture
def dataset():
    vocab = ["1", "+", "2", "$UNK"]
    train = [
        "(odd 1)",
        "(even 2)",
        "(even (odd 1) (neutral (neutral +) (odd 1)))",
        "(odd (odd 1) (neutral (neutral +) (even 2)))",
        "(odd (even 2) (neutral (neutral +) (odd 1)))",
        "(even (even 2) (neutral (neutral +) (even 2)))",
def glove(df, n=100, xmax=100, alpha=0.75,
          max_iter=100, eta=0.05, tol=1e-4, display_progress=True):
    """Basic GloVe. This is mainly here as a reference implementation.
    We recommend using `mittens.GloVe` instead.

    Parameters
    ----------
    df : pd.DataFrame or np.array
        This must be a square matrix.
    n : int (default: 100)
        The dimensionality of the output vectors.
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight
        (c/xmax)**alpha, where c is their count in `mat`
        (see the paper, eq. (9)).
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    max_iter : int (default: 100)
        Number of training epochs.
    eta : float (default: 0.05)
        Controls the rate of SGD weight updates.
    tol : float (default: 1e-4)
        Stopping criterion for the loss.
    display_progress : bool (default: True)
        Whether to print iteration number and current error to stdout.

    Returns
    -------
    pd.DataFrame
        With dimension `(df.shape[0], n)`

    """
    X = df.values if isinstance(df, pd.DataFrame) else df
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, n)   # Word weights.
    C = utils.randmatrix(m, n)   # Context weights.
    B = utils.randmatrix(2, m)   # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, xmax) / xmax)**alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(max_iter):
        error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= eta * wgrad
                C[j] -= eta * cgrad
                B[0, i] -= eta * wbgrad
                B[1, j] -= eta * wcgrad
                # One-half squared error term:
                error += 0.5 * weight * (diff**2)
        error /= m
        if error < tol:
            if display_progress:
                utils.progress_bar("Stopping at iteration {} with "
                                   "error {}".format(iteration, error))
            break
        elif display_progress:
            utils.progress_bar("Iteration {}: error {}".format(
                iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    if isinstance(df, pd.DataFrame):
        G = pd.DataFrame(G, index=df.index)
    return G
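A quick usage sketch for the reference implementation above, assuming the `glove` function and its `utils` dependencies are in scope; the toy co-occurrence counts below are made up for illustration.

import numpy as np
import pandas as pd

# Toy symmetric co-occurrence counts for a four-word vocabulary
# (values are illustrative only).
vocab = ["cat", "dog", "runs", "sleeps"]
counts = pd.DataFrame(
    np.array([
        [0, 10, 4, 6],
        [10, 0, 5, 3],
        [4, 5, 0, 1],
        [6, 3, 1, 0]], dtype=float),
    index=vocab, columns=vocab)

# Learn 2-dimensional GloVe vectors; the result is a DataFrame
# indexed by `vocab` because the input was a DataFrame.
G = glove(counts, n=2, max_iter=50, display_progress=False)
print(G.shape)  # (4, 2)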
def random_matrix():
    return utils.randmatrix(20, 50)
def weight_init(m, n):
    """Uses the Xavier Glorot method for initializing the weights of
    an `m` by `n` matrix.
    """
    x = np.sqrt(6.0 / (m + n))
    return utils.randmatrix(m, n, lower=-x, upper=x)
def glove(mat, rownames, n=100, xmax=100, alpha=0.75,
          iterations=100, learning_rate=0.05, display_progress=True):
    """Basic GloVe.

    Parameters
    ----------
    mat : 2d np.array
        This must be a square count matrix.
    rownames : list of str or None
        Not used; it's an argument only for consistency with other
        methods defined here.
    n : int (default: 100)
        The dimensionality of the output vectors.
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight
        (c/xmax)**alpha, where c is their count in `mat`
        (see the paper, eq. (9)).
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    iterations : int (default: 100)
        Number of training epochs.
    learning_rate : float (default: 0.05)
        Controls the rate of SGD weight updates.
    display_progress : bool (default: True)
        Whether to print iteration number and current error to stdout.

    Returns
    -------
    (np.array, list of str)
        The first member is the learned GloVe matrix and the second
        is `rownames` (unchanged).

    """
    m = mat.shape[0]
    W = utils.randmatrix(m, n)   # Word weights.
    C = utils.randmatrix(m, n)   # Context weights.
    B = utils.randmatrix(2, m)   # Word and context biases.
    indices = list(range(m))
    for iteration in range(iterations):
        error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if mat[i, j] > 0.0:
                # Weighting function from eq. (9):
                weight = (mat[i, j] / xmax)**alpha if mat[i, j] < xmax else 1.0
                # Cost is J' based on eq. (8) in the paper:
                diff = np.dot(W[i], C[j]) + B[0, i] + B[1, j] - np.log(mat[i, j])
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= learning_rate * wgrad
                C[j] -= learning_rate * cgrad
                B[0, i] -= learning_rate * wbgrad
                B[1, j] -= learning_rate * wcgrad
                # One-half squared error term:
                error += 0.5 * weight * (diff**2)
        if display_progress:
            utils.progress_bar("iteration %s: error %s" % (iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    return (W + C, rownames)
def test_randmatrix():
    X = utils.randmatrix(10, 20)
    assert X.shape == (10, 20)
def fit(self, training_data): """Train the network. Parameters ---------- training_data : list of pairs In each pair, the first element should be a list of items from the vocabulary (for the NLI task, this is the concatenation of the premise and hypothesis), and the second element should be the one-hot label vector. Attributes ---------- self.output_dim : int Set based on the length of the labels in `training_data`. self.W_xh : np.array Dense connections between the word representations and the hidden layers. Random initialization. self.W_hh : np.array Dense connections between the hidden representations. Random initialization. self.W_hy : np.array Dense connections from the final hidden layer to the output layer. Random initialization. self.b : np.array Output bias. Initialized to all 0. """ self.output_dim = len(training_data[0][1]) self.W_xh = randmatrix(self.word_dim, self.hidden_dim) self.W_hh = randmatrix(self.hidden_dim, self.hidden_dim) self.W_hy = randmatrix(self.hidden_dim, self.output_dim) self.b = np.zeros(self.output_dim) # SGD: iteration = 0 error = sys.float_info.max while error > self.epsilon and iteration < self.maxiter: error = 0.0 random.shuffle(training_data) for seq, labels in training_data: self._forward_propagation(seq) # Cross-entropy error reduces to log(prediction-for-correct-label): error += -np.log(self.y[np.argmax(labels)]) # Back-prop: d_W_hy, d_b, d_W_hh, d_W_xh = self._backward_propagation( seq, labels) # Updates: self.W_hy -= self.eta * d_W_hy self.b -= self.eta * d_b self.W_hh -= self.eta * d_W_hh self.W_xh -= self.eta * d_W_xh iteration += 1 if self.display_progress: # Report the average error: error /= len(training_data) progress_bar("Finished epoch %s of %s; error is %s" % (iteration, self.maxiter, error)) if self.display_progress: sys.stderr.write('\n')