Example #1
    def fit(self, training_data):
        """Train the network.

        Parameters
        ----------
        training_data : list of pairs
            In each pair, the first element should be a list of items
            from the vocabulary (for the NLI task, this is the
            concatenation of the premise and hypothesis), and the
            second element should be the one-hot label vector.

        Attributes
        ----------
        self.output_dim : int
            Set based on the length of the labels in `training_data`.
        
        self.W_xh : np.array
            Dense connections between the word representations
            and the hidden layers.

        self.W_hh : np.array
            Dense connections between the hidden representations.

        self.W_hy : np.array
            Dense connections from the final hidden layer to
            the output layer.
    
        """              
        self.output_dim = len(training_data[0][1])
        self.W_xh = randmatrix(self.word_dim, self.hidden_dim)
        self.W_hh = randmatrix(self.hidden_dim, self.hidden_dim)
        self.W_hy = randmatrix(self.hidden_dim, self.output_dim)
        # SGD:
        iteration = 0
        error = sys.float_info.max
        while error > self.epsilon and iteration < self.maxiter:
            error = 0.0
            random.shuffle(training_data)
            for seq, labels in training_data:
                self._forward_propagation(seq)
                # Cross-entropy error reduces to -log(prediction-for-correct-label):
                error += -np.log(self.y[np.argmax(labels)])
                # Back-prop:
                d_W_hy, d_W_hh, d_W_xh = self._backward_propagation(seq, labels)
                # Updates:
                self.W_hy -= self.eta * d_W_hy
                self.W_hh -= self.eta * d_W_hh
                self.W_xh -= self.eta * d_W_xh
            iteration += 1
            if self.display_progress:
                # Report the average error:
                error /= len(training_data)
                progress_bar("Finished epoch %s of %s; error is %s" % (iteration, self.maxiter, error))
        if self.display_progress:
            sys.stderr.write('\n')
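A minimal sketch of the `training_data` format this `fit` expects, per the docstring: a list of (token-sequence, one-hot label) pairs. The class name `RNNClassifier` and its constructor arguments below are assumptions for illustration only, not part of the listing above.

import numpy as np

vocab = ["a", "b", "$UNK"]
training_data = [
    (["a", "b"], np.array([1.0, 0.0])),       # class 0 as a one-hot vector
    (["b", "a", "b"], np.array([0.0, 1.0])),  # class 1 as a one-hot vector
]
# model = RNNClassifier(vocab=vocab, hidden_dim=10)  # assumed constructor
# model.fit(training_data)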
Example #2
 def fit(self, training_data):
     """The training algorithm. 
     
     Parameters
     ----------
     training_data : list
         A list of (example, label) pairs, where `example`
         and `label` are both np.array instances.
     
     Attributes
     ----------
     self.x : the input layer 
     self.h : the hidden layer
     self.y : the output layer
     self.W1 : dense weight connection from self.x to self.h
     self.W2 : dense weight connection from self.h to self.y
     
     Both self.W1 and self.W2 have the bias as their final column.
     
     The following attributes are created here for efficiency but 
     used only in `backward_propagation`:
     
     self.y_err : vector of output errors
     self.x_err : vector of input errors
     
     """
     # Dimensions determined by the data:
     self.input_dim = len(training_data[0][0])
     self.output_dim = len(training_data[0][1])
     # Parameter initialization:
     self.x = np.ones(self.input_dim + 1)  # +1 for the bias
     self.h = np.ones(self.hidden_dim + 1)  # +1 for the bias
     self.y = np.ones(self.output_dim)
     self.W1 = utils.randmatrix(self.input_dim + 1, self.hidden_dim)
     self.W2 = utils.randmatrix(self.hidden_dim + 1, self.output_dim)
     self.y_err = np.zeros(self.output_dim)
     self.x_err = np.zeros(self.input_dim + 1)
     # SGD:
     iteration = 0
     error = sys.float_info.max
     while error > self.epsilon and iteration < self.maxiter:
         error = 0.0
         random.shuffle(training_data)
         for ex, labels in training_data:
             self.forward_propagation(ex)
             error += self.backward_propagation(labels)
         iteration += 1
         if self.display_progress:
             utils.progress_bar('completed iteration %s; error is %s' %
                                (iteration, error))
     if self.display_progress:
         sys.stderr.write('\n')
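A small illustration (not part of the original code) of the bias convention used above: each layer vector keeps a trailing constant 1.0, so the weights that multiply that constant act as the bias term.

import numpy as np

input_dim, hidden_dim = 3, 2
x = np.ones(input_dim + 1)           # trailing entry stays fixed at 1.0
x[:input_dim] = [0.5, -1.0, 2.0]     # the actual features
W1 = np.zeros((input_dim + 1, hidden_dim))
W1[-1] = [0.25, -0.25]               # these weights behave as the bias
print(x.dot(W1))                     # -> [ 0.25 -0.25]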
Example #3
 def fit(self, training_data):
     """The training algorithm. 
     
     Parameters
     ----------
     training_data : list
         A list of (example, label) pairs, where `example`
         and `label` are both np.array instances.
     
     Attributes
     ----------
     self.x : the input layer 
     self.h : the hidden layer
     self.y : the output layer
     self.W1 : dense weight connection from self.x to self.h
     self.W2 : dense weight connection from self.h to self.y
     
     Both self.W1 and self.W2 have the bias as their final column.
     
     The following attributes are created here for efficiency but 
     used only in `backward_propagation`:
     
     self.y_err : vector of output errors
     self.x_err : vector of input errors
     
     """
     # Dimensions determined by the data:
     self.input_dim = len(training_data[0][0])
     self.output_dim = len(training_data[0][1])
     # Parameter initialization:
     self.x = np.ones(self.input_dim+1)  # +1 for the bias                                         
     self.h = np.ones(self.hidden_dim+1) # +1 for the bias        
     self.y = np.ones(self.output_dim)        
     self.W1 = utils.randmatrix(self.input_dim+1, self.hidden_dim)
     self.W2 = utils.randmatrix(self.hidden_dim+1, self.output_dim)        
     self.y_err = np.zeros(self.output_dim)
     self.x_err = np.zeros(self.input_dim+1)
     # SGD:
     iteration = 0
     error = sys.float_info.max
     while error > self.epsilon and iteration < self.maxiter:            
         error = 0.0
         random.shuffle(training_data)
         for ex, labels in training_data:
             self.forward_propagation(ex)
             error += self.backward_propagation(labels)
         iteration += 1
         if self.display_progress:
             utils.progress_bar('completed iteration %s; error is %s' % (iteration, error))
     if self.display_progress:
         sys.stderr.write('\n')
Example #4
def test_np_autoencoder(hidden_activation, d_hidden_activation):
    model = Autoencoder(
        max_iter=10,
        hidden_dim=2,
        hidden_activation=hidden_activation,
        d_hidden_activation=d_hidden_activation)
    # A tiny dataset so that we can run `fit` and set all the model
    # parameters:
    X = utils.randmatrix(5, 5)
    model.fit(X)
    # Use the first example for the check:
    ex = X[0]
    label = X[0]
    # Forward and backward to get the gradients:
    hidden, pred = model.forward_propagation(ex)
    d_W_hy, d_b_hy, d_W_xh, d_b_xh = model.backward_propagation(
        hidden, pred, ex, label)
    # Model parameters to check:
    param_pairs = (
        ('W_hy', d_W_hy),
        ('b_hy', d_b_hy),
        ('W_xh', d_W_xh),
        ('b_xh', d_b_xh)
    )
    gradient_check(param_pairs, model, ex, label)
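The `gradient_check` helper itself is not shown in this listing. The sketch below (function name, signature, and tolerances are assumptions) illustrates the idea it implements: compare each analytic gradient with a centered finite-difference estimate of the loss.

import numpy as np

def finite_difference_check(param, analytic_grad, loss_fn, eps=1e-4, tol=1e-4):
    """Compare `analytic_grad` with a numerical estimate of the gradient of
    `loss_fn()` with respect to `param`, perturbing one entry at a time."""
    it = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        original = param[ix]
        param[ix] = original + eps
        loss_plus = loss_fn()
        param[ix] = original - eps
        loss_minus = loss_fn()
        param[ix] = original                      # restore the parameter
        numeric = (loss_plus - loss_minus) / (2 * eps)
        assert abs(numeric - analytic_grad[ix]) <= tol, \
            "Mismatch at {}: numeric {} vs analytic {}".format(
                ix, numeric, analytic_grad[ix])
        it.iternext()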
Example #5
def test_np_shallow_neural_classifier_gradients(hidden_activation, d_hidden_activation):
    model = ShallowNeuralClassifier(
        max_iter=10,
        hidden_activation=hidden_activation,
        d_hidden_activation=d_hidden_activation)
    # A tiny dataset so that we can run `fit` and set all the model
    # parameters:
    X = utils.randmatrix(5, 2)
    y = np.random.choice((0,1), 5)
    model.fit(X, y)
    # Use the first example for the check:
    ex = X[0]
    label = model._onehot_encode([y[0]])[0]
    # Forward and backward to get the gradients:
    hidden, pred = model.forward_propagation(ex)
    d_W_hy, d_b_hy, d_W_xh, d_b_xh = model.backward_propagation(
        hidden, pred, ex, label)
    # Model parameters to check:
    param_pairs = (
        ('W_hy', d_W_hy),
        ('b_hy', d_b_hy),
        ('W_xh', d_W_xh),
        ('b_xh', d_b_xh)
    )
    gradient_check(param_pairs, model, ex, label)
Example #6
def test_tf_autoencoder():
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X = utils.randmatrix(20, 50)
    ae = TfAutoencoder(hidden_dim=5, max_iter=100)
    ae.fit(X)
    ae.predict(X)
Example #7
def test_torch_autoencoder_save_load():
    X = utils.randmatrix(20, 50)
    mod = torch_autoencoder.TorchAutoencoder(hidden_dim=5, max_iter=2)
    mod.fit(X)
    mod.predict(X)
    with tempfile.NamedTemporaryFile(mode='wb') as f:
        name = f.name
        mod.to_pickle(name)
        mod2 = torch_autoencoder.TorchAutoencoder.from_pickle(name)
        mod2.predict(X)
        mod2.fit(X)
Example #8
def test_torch_autoencoder(pandas):
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X = utils.randmatrix(20, 50)
    if pandas:
        X = pd.DataFrame(X)
    ae = torch_autoencoder.TorchAutoencoder(hidden_dim=5, max_iter=100)
    H = ae.fit(X)
    ae.predict(X)
    H_is_pandas = isinstance(H, pd.DataFrame)
    assert H_is_pandas == pandas
Example #9
    def weight_init(m, n):
        """Uses the Xavier Glorot method for initializing the weights
        of an `m` by `n` matrix.

        Parameters
        ----------
        m : int
            Row dimension
        n : int
            Column dimension

        Returns
        -------
        np.array, shape `(m, n)`

        """
        x = np.sqrt(6.0 / (m + n))
        return randmatrix(m, n, lower=-x, upper=x)
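A quick check (illustration only) of the Glorot bound computed above: for a 100 x 50 matrix, weights are drawn uniformly from [-x, x] with x = sqrt(6 / (100 + 50)) = 0.2. `np.random.uniform` stands in for `randmatrix` here just to keep the sketch self-contained.

import numpy as np

m, n = 100, 50
x = np.sqrt(6.0 / (m + n))
print(round(x, 4))                            # -> 0.2
W = np.random.uniform(-x, x, size=(m, n))
assert W.shape == (m, n) and W.min() >= -x and W.max() <= x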
Example #10
    def fit(self, df):
        """
        Learn the GloVe matrix.

        Parameters
        ----------
        df : pd.DataFrame or np.array, shape `(n_vocab, n_vocab)`
            This should be a matrix of (possibly scaled) co-occurrence
            counts.

        Returns
        -------
        pd.DataFrame or np.array, shape `(n_vocab, self.n)`
           The type will be the same as the user's `df`. If it's a
           `pd.DataFrame`, the index will be the same as `df.index`.

        """
        X = self.convert_input_to_array(df)
        m = X.shape[0]
        # Parameters:
        W = utils.randmatrix(m, self.n)  # Word weights.
        C = utils.randmatrix(m, self.n)  # Context weights.
        B = utils.randmatrix(2, m)  # Word and context biases.
        # Precomputable GloVe values:
        X_log = utils.log_of_array_ignoring_zeros(X)
        X_weights = (np.minimum(X, self.xmax) /
                     self.xmax)**self.alpha  # eq. (9)
        # Learning:
        indices = list(range(m))
        for iteration in range(self.max_iter):
            epoch_error = 0.0
            random.shuffle(indices)
            for i, j in itertools.product(indices, indices):
                if X[i, j] > 0.0:
                    weight = X_weights[i, j]
                    # Cost is J' based on eq. (8) in the paper:
                    diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                    fdiff = diff * weight
                    # Gradients:
                    wgrad = fdiff * C[j]
                    cgrad = fdiff * W[i]
                    wbgrad = fdiff
                    wcgrad = fdiff
                    # Updates:
                    W[i] -= self.eta * wgrad
                    C[j] -= self.eta * cgrad
                    B[0, i] -= self.eta * wbgrad
                    B[1, j] -= self.eta * wcgrad
                    # One-half squared error term:
                    epoch_error += 0.5 * weight * (diff**2)

            epoch_error /= m

            if epoch_error <= self.tol:
                if self.display_progress:
                    utils.progress_bar(
                        "Converged on iteration {} with error {}".format(
                            iteration, epoch_error))
                break

            if self.display_progress:
                utils.progress_bar(
                    "Finished epoch {} of {}; error is {}".format(
                        iteration, self.max_iter, epoch_error))

        # Return the sum of the word and context matrices, per the advice
        # in section 4.2:
        G = W + C
        self.embedding = self.convert_output(G, df)
        return self.embedding
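A standalone numeric check (not part of the original code) that the per-cell updates above follow from the weighted squared-error cost 0.5 * weight * (W[i].dot(C[j]) + B[0, i] + B[1, j] - log X[i, j])**2: the analytic gradient with respect to W[i] is `fdiff * C[j]`, and it matches a finite-difference estimate. All numbers here are made up.

import numpy as np

rng = np.random.RandomState(0)
w_i, c_j = rng.uniform(-0.5, 0.5, 4), rng.uniform(-0.5, 0.5, 4)
b_i, b_j, log_x, weight = 0.1, -0.2, 1.5, 0.7

def cell_cost(w):
    diff = w.dot(c_j) + b_i + b_j - log_x
    return 0.5 * weight * diff ** 2

diff = w_i.dot(c_j) + b_i + b_j - log_x
analytic = (diff * weight) * c_j              # the `wgrad = fdiff * C[j]` step
eps = 1e-6
numeric = np.array([
    (cell_cost(w_i + eps * np.eye(4)[k]) - cell_cost(w_i - eps * np.eye(4)[k])) / (2 * eps)
    for k in range(4)])
assert np.allclose(analytic, numeric, atol=1e-6)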
Example #11
import pytest
import torch.nn as nn
import utils

from test_torch_model_base import PARAMS_WITH_TEST_VALUES as BASE_PARAMS
from torch_tree_nn import TorchTreeNN
from torch_tree_nn import simple_example
from np_tree_nn import TreeNN
from np_tree_nn import simple_example as np_simple_example

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2021"

utils.fix_random_seeds()

PARAMS_WITH_TEST_VALUES = [
    ["embed_dim", 10],
    ["embedding", utils.randmatrix(4, 10)],
    ["hidden_activation", nn.ReLU()],
    ["freeze_embedding", True]]

PARAMS_WITH_TEST_VALUES += BASE_PARAMS


@pytest.fixture
def dataset():
    vocab = ["1", "+", "2", "$UNK"]

    train = [
        "(odd 1)", "(even 2)", "(even (odd 1) (neutral (neutral +) (odd 1)))",
        "(odd (odd 1) (neutral (neutral +) (even 2)))",
        "(odd (even 2) (neutral (neutral +) (odd 1)))",
        "(even (even 2) (neutral (neutral +) (even 2)))",
Example #12
def glove(df,
          n=100,
          xmax=100,
          alpha=0.75,
          max_iter=100,
          eta=0.05,
          tol=1e-4,
          display_progress=True):
    """Basic GloVe. This is mainly here as a reference implementation.
    We recommend using `mittens.GloVe` instead.

    Parameters
    ----------
    df : pd.DataFrame or np.array
        This must be a square matrix.
    n : int (default: 100)
        The dimensionality of the output vectors.
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight (c/xmax)**alpha
        where c is their count in `df` (see the paper, eq. (9)).
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    max_iter : int (default: 100)
        Number of training epochs.
    eta : float (default: 0.05)
        Controls the rate of SGD weight updates.
    tol : float (default: 1e-4)
        Stopping criterion for the loss.
    display_progress : bool (default: True)
        Whether to print iteration number and current error to stdout.

    Returns
    -------
    pd.DataFrame
        With dimension `(df.shape[0], n)`

    """
    X = df.values if isinstance(df, pd.DataFrame) else df
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, n)  # Word weights.
    C = utils.randmatrix(m, n)  # Context weights.
    B = utils.randmatrix(2, m)  # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, xmax) / xmax)**alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(max_iter):
        error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= eta * wgrad
                C[j] -= eta * cgrad
                B[0, i] -= eta * wbgrad
                B[1, j] -= eta * wcgrad
                # One-half squared error term:
                error += 0.5 * weight * (diff**2)
        error /= m
        if display_progress:
            if error < tol:
                utils.progress_bar("Stopping at iteration {} with "
                                   "error {}".format(iteration, error))
                break
            else:
                utils.progress_bar("Iteration {}: error {}".format(
                    iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    if isinstance(df, pd.DataFrame):
        G = pd.DataFrame(G, index=df.index)
    return G
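A hypothetical usage of the `glove` function above; the import path and the toy co-occurrence counts are assumptions for illustration only.

import numpy as np
import pandas as pd
# from vsm import glove   # assumed location of the function above

vocab = ["rock", "jazz", "blues"]
counts = pd.DataFrame(
    np.array([[0.0, 10.0, 4.0],
              [10.0, 0.0, 6.0],
              [4.0, 6.0, 0.0]]),
    index=vocab, columns=vocab)
# G = glove(counts, n=2, max_iter=10)
# G.shape -> (3, 2), with G.index equal to `vocab`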
Example #13
def random_matrix():
    return utils.randmatrix(20, 50)
Example #14
 def weight_init(m, n):
     """Uses the Xavier Glorot method for initializing the weights
     of an `m` by `n` matrix.
     """
     x = np.sqrt(6.0 / (m + n))
     return utils.randmatrix(m, n, lower=-x, upper=x)
Example #15
def glove(mat, rownames, n=100, xmax=100, alpha=0.75, 
          iterations=100, learning_rate=0.05, 
          display_progress=True):
    """Basic GloVe. 
    
    Parameters
    ----------
    mat : 2d np.array
        This must be a square count matrix.
        
    rownames : list of str or None
        Not used; it's an argument only for consistency with other methods 
        defined here.
        
    n : int (default: 100)
        The dimensionality of the output vectors.
    
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight (c/xmax)**alpha
        where c is their count in mat (see the paper, eq. (9)).
        
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    
    iterations : int (default: 100)
        Number of training epochs.
        
    learning_rate : float (default: 0.05)
        Controls the rate of SGD weight updates.
        
    display_progress : bool (default: True) 
        Whether to print iteration number and current error to stdout.
        
    Returns
    -------
    (np.array, list of str)
       The first member is the learned GloVe matrix and the second is
       `rownames` (unchanged).

    """        
    m = mat.shape[0]
    W = utils.randmatrix(m, n) # Word weights.
    C = utils.randmatrix(m, n) # Context weights.
    B = utils.randmatrix(2, m) # Word and context biases.
    indices = list(range(m))
    for iteration in range(iterations):
        error = 0.0        
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if mat[i,j] > 0.0:     
                # Weighting function from eq. (9)
                weight = (mat[i,j] / xmax)**alpha if mat[i,j] < xmax else 1.0
                # Cost is J' based on eq. (8) in the paper:
                diff = np.dot(W[i], C[j]) + B[0,i] + B[1,j] - np.log(mat[i,j])                
                fdiff = diff * weight                
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= (learning_rate * wgrad) 
                C[j] -= (learning_rate * cgrad) 
                B[0,i] -= (learning_rate * wbgrad) 
                B[1,j] -= (learning_rate * wcgrad)                 
                # One-half squared error term:                              
                error += 0.5 * weight * (diff**2)
        if display_progress:
            utils.progress_bar("iteration %s: error %s" % (iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice 
    # in section 4.2:
    return (W + C, rownames)
Example #16
def test_randmatrix():
    X = utils.randmatrix(10, 20)
    assert X.shape == (10, 20)
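`utils.randmatrix` itself is not included in this listing. Given how it is called in the examples (two shape arguments plus optional `lower`/`upper` bounds), a minimal compatible sketch might look like the following; the default bounds are an assumption, not the actual library code.

import numpy as np

def randmatrix(m, n, lower=-0.5, upper=0.5):
    """Return an m x n matrix of floats drawn uniformly from [lower, upper)."""
    return np.random.uniform(low=lower, high=upper, size=(m, n))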
Example #17
    def fit(self, training_data):
        """Train the network.

        Parameters
        ----------
        training_data : list of pairs
            In each pair, the first element should be a list of items
            from the vocabulary (for the NLI task, this is the
            concatenation of the premise and hypothesis), and the
            second element should be the one-hot label vector.

        Attributes
        ----------
        self.output_dim : int
            Set based on the length of the labels in `training_data`.
        
        self.W_xh : np.array
            Dense connections between the word representations
            and the hidden layers. Random initialization.

        self.W_hh : np.array
            Dense connections between the hidden representations.
            Random initialization.

        self.W_hy : np.array
            Dense connections from the final hidden layer to
            the output layer. Random initialization.

        self.b : np.array
            Output bias. Initialized to all 0.
    
        """
        self.output_dim = len(training_data[0][1])
        self.W_xh = randmatrix(self.word_dim, self.hidden_dim)
        self.W_hh = randmatrix(self.hidden_dim, self.hidden_dim)
        self.W_hy = randmatrix(self.hidden_dim, self.output_dim)
        self.b = np.zeros(self.output_dim)
        # SGD:
        iteration = 0
        error = sys.float_info.max
        while error > self.epsilon and iteration < self.maxiter:
            error = 0.0
            random.shuffle(training_data)
            for seq, labels in training_data:
                self._forward_propagation(seq)
                # Cross-entropy error reduces to -log(prediction-for-correct-label):
                error += -np.log(self.y[np.argmax(labels)])
                # Back-prop:
                d_W_hy, d_b, d_W_hh, d_W_xh = self._backward_propagation(
                    seq, labels)
                # Updates:
                self.W_hy -= self.eta * d_W_hy
                self.b -= self.eta * d_b
                self.W_hh -= self.eta * d_W_hh
                self.W_xh -= self.eta * d_W_xh
            iteration += 1
            if self.display_progress:
                # Report the average error:
                error /= len(training_data)
                progress_bar("Finished epoch %s of %s; error is %s" %
                             (iteration, self.maxiter, error))
        if self.display_progress:
            sys.stderr.write('\n')
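A quick numeric illustration (not part of the original code) of the comment in the loop above: with a one-hot label, the full cross-entropy -sum(label * log(y)) keeps only the term for the correct class, which is exactly the `-np.log(self.y[np.argmax(labels)])` shortcut.

import numpy as np

y = np.array([0.2, 0.7, 0.1])         # predicted distribution
label = np.array([0.0, 1.0, 0.0])     # one-hot target
full = -np.sum(label * np.log(y))
reduced = -np.log(y[np.argmax(label)])
assert np.isclose(full, reduced)
print(round(reduced, 4))              # -> 0.3567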