Example #1
    def get_gradient(self,
                     starting_gradient=None,
                     cost=None,
                     additional_cost=None):
        """
        This method allows you to define the gradient for this model manually. It should either work with a provided
        starting gradient (from upstream layers/models), or grab the training cost if no start gradient is provided.

        Theano's subgraph gradient function specified here:
        http://deeplearning.net/software/theano/library/gradient.html#theano.gradient.subgraph_grad

        .. warning::
            If the gradient of the cost with respect to any of the start variables is already part of the
            start dictionary, it may be counted twice with respect
            to wrt (`get_params()`) and end (`get_inputs()`).

        You should only implement this method if you want to manually define your gradients for the model.

        Parameters
        ----------
        starting_gradient : dictionary of {variable: known_gradient}, optional
            The starting, known gradients for parameters.
        cost : theano expression, optional
            The cost expression to use when calculating the gradients. Defaults to `get_train_cost()`.
        additional_cost : theano expression, optional
            Any additional cost to add to the gradient.

        Returns
        -------
        tuple
            (Gradient with respect to params, gradient with respect to inputs)
        """
        # check whether a starting gradient was provided.
        # if there are known gradients to start from, use those instead of this model's cost
        if starting_gradient is not None:
            params_grad, next_starting_grad = theano.subgraph_grad(
                wrt=self.get_params(),
                end=raise_to_list(self.get_inputs()),
                start=starting_gradient,
                cost=additional_cost,
                details=False)
        # otherwise, just use this model's cost to determine gradient
        else:
            # use the provided cost, falling back to the training cost if none was given
            cost = cost if cost is not None else self.get_train_cost()
            if additional_cost is not None:
                cost = cost + additional_cost
            params_grad, next_starting_grad = theano.subgraph_grad(
                wrt=self.get_params(),
                end=raise_to_list(self.get_inputs()),
                cost=cost,
                details=False)
        return (OrderedDict(zip(self.get_params(), params_grad)),
                OrderedDict(
                    zip(raise_to_list(self.get_inputs()), next_starting_grad)))
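For reference, here is a minimal, self-contained sketch of the chaining pattern this method wraps (the tiny graph and variable names are illustrative, not part of the model API above): `theano.subgraph_grad` is called on the downstream block first, and the gradients it returns for that block's inputs are fed as `start` to the upstream block.

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

x = T.vector('x')
w1 = theano.shared(np.random.randn(3, 4), name='w1')
w2 = theano.shared(np.random.randn(4, 2), name='w2')
h = T.tanh(T.dot(x, w1))   # upstream block's output
y = T.tanh(T.dot(h, w2))   # downstream block's output
cost = T.sqr(y).sum()

# Downstream block: gradient w.r.t. w2, stopping at the block boundary h.
(g_w2,), (g_h,) = theano.subgraph_grad(wrt=[w2], end=[h], cost=cost)

# Upstream block: no cost of its own, only the gradient arriving through h.
(g_w1,), _ = theano.subgraph_grad(wrt=[w1], end=[x],
                                  start=OrderedDict([(h, g_h)]))

f = theano.function([x], [g_w1, g_w2])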
Example #2
def compile_grad_functions(split_outputs, param_blocks, input_vars, loss, givens):
    """
    Compiles functions that compute the gradients for each block given the preceding block.
    :return:
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = OrderedDict()
            for s in split_outputs[i]:
                start[s] = like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, next = subgraph_grad(
                start=start,
                end=end,
                cost=loss,
                wrt=param_blocks[i]
            )
            # Create the grad function
            grad_fns.append(theano.function(
                inputs=input_vars + start_vars,
                outputs=[loss] + grads + next,
                on_unused_input='ignore',
                givens=givens
            ))
        else:
            grads, next = subgraph_grad(
                start=start,
                end=end,
                wrt=param_blocks[i]
            )
            # Create the grad function
            grad_fns.append(theano.function(
                inputs=input_vars + start_vars,
                outputs=grads + next,
                on_unused_input='ignore',
                givens=givens
            ))

    return grad_fns[::-1]
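The `like` helper is not included in this snippet. A minimal stand-in that is consistent with how it is used here, as a placeholder for the gradient flowing into a split output, might look like the following (an assumption, not the repository's actual code):

def like(variable):
    # Fresh symbolic variable with the same dtype and broadcastable pattern
    # as `variable`; it stands in for the gradient arriving at a split output.
    # e.g. like(T.matrix('h')) yields a new matrix variable named 'grad_h'.
    return variable.type('grad_' + (variable.name or 'split'))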
Example #3
def compile_grad_functions(split_outputs, param_blocks, input_vars, loss,
                           givens):
    """Compiles functions that compute the gradients for each block.

    Args:
        split_outputs: The split nodes.
        param_blocks: The parameters for each block.
        input_vars: The input variables to the network.
        loss: The training loss.
        givens: A dictionary of given variable values (computed during a
            dedicated forward pass).
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = collections.OrderedDict()
            for s in split_outputs[i]:
                start[s] = _like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, out_grads = subgraph_grad(end=end,
                                             cost=loss,
                                             wrt=param_blocks[i])
            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=[loss] + grads + out_grads,
                                on_unused_input='ignore'))
        else:
            grads, out_grads = subgraph_grad(start=start,
                                             end=end,
                                             wrt=param_blocks[i])
            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=grads + out_grads,
                                on_unused_input='ignore',
                                givens=givens))

    return grad_fns[::-1]
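A sketch of how the compiled functions could be driven during training, assuming they are the list returned by `compile_grad_functions` above (the helper name and the `n_out_grads` bookkeeping are illustrative assumptions): backpropagation walks the blocks in reverse, feeding each block's output gradients into the start variables of the block before it.

def backward_pass(grad_fns, inputs, n_out_grads):
    """Run the per-block gradient functions in reverse (last block first).

    grad_fns: list returned by compile_grad_functions (first .. last block).
    inputs: concrete values for input_vars, in the same order.
    n_out_grads: for each block i, how many gradients it hands to block i - 1
        (len(split_outputs[i - 1]) for i > 0, and 0 for the first block).
    """
    loss_value = None
    all_grads = []
    incoming = []  # gradients w.r.t. the current block's split outputs
    for i in reversed(range(len(grad_fns))):
        outputs = list(grad_fns[i](*(list(inputs) + incoming)))
        if i == len(grad_fns) - 1:
            # the last block's function also returns the loss as its first output
            loss_value, outputs = outputs[0], outputs[1:]
        if n_out_grads[i]:
            block_grads = outputs[:-n_out_grads[i]]
            incoming = outputs[-n_out_grads[i]:]
        else:
            block_grads, incoming = outputs, []
        all_grads = block_grads + all_grads
    return loss_value, all_grads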
Example #4
    def get_gradient(self, starting_gradient=None, cost=None, additional_cost=None):
        """
        This method allows you to define the gradient for this model manually. It should either work with a provided
        starting gradient (from upstream layers/models), or grab the training cost if no start gradient is provided.

        Theano's subgraph gradient function specified here:
        http://deeplearning.net/software/theano/library/gradient.html#theano.gradient.subgraph_grad

        .. warning::
            If the gradient of the cost with respect to any of the start variables is already part of the
            start dictionary, it may be counted twice with respect
            to wrt (`get_params()`) and end (`get_inputs()`).

        You should only implement this method if you want to manually define your gradients for the model.

        Parameters
        ----------
        starting_gradient : dictionary of {variable: known_gradient}, optional
            The starting, known gradients for parameters.
        cost : theano expression, optional
            The cost expression to use when calculating the gradients. Defaults to `get_train_cost()`.
        additional_cost : theano expression, optional
            Any additional cost to add to the gradient.

        Returns
        -------
        tuple
            (Gradient with respect to params, gradient with respect to inputs)
        """
        # check whether a starting gradient was provided.
        # if there are known gradients to start from, use those instead of this model's cost
        if starting_gradient is not None:
            params_grad, next_starting_grad = theano.subgraph_grad(wrt=self.get_params(),
                                                                   end=raise_to_list(self.get_inputs()),
                                                                   start=starting_gradient,
                                                                   cost=additional_cost,
                                                                   details=False)
        # otherwise, just use this model's cost to determine gradient
        else:
            # use the provided cost, falling back to the training cost if none was given
            cost = cost if cost is not None else self.get_train_cost()
            if additional_cost is not None:
                cost = cost + additional_cost
            params_grad, next_starting_grad = theano.subgraph_grad(wrt=self.get_params(),
                                                                   end=raise_to_list(self.get_inputs()),
                                                                   cost=cost,
                                                                   details=False)
        return (OrderedDict(zip(self.get_params(), params_grad)),
                OrderedDict(zip(raise_to_list(self.get_inputs()), next_starting_grad)))
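The `raise_to_list` helper used in Examples #1 and #4 is likewise not shown. A minimal stand-in that matches how it is used, wrapping a single variable so the result can always be iterated and zipped, could be (an assumption, not OpenDeep's actual implementation):

def raise_to_list(value):
    # Wrap a lone value in a list; pass lists/tuples through unchanged.
    if value is None:
        return None
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]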
Example #5
def compile_grad_functions(split_outputs, param_blocks, input_vars, loss,
                           givens):
    """
    Compiles functions that compute the gradients for each block given the preceding block.
    :return:
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = OrderedDict()
            for s in split_outputs[i]:
                start[s] = like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, next = subgraph_grad(start=start,
                                        end=end,
                                        cost=loss,
                                        wrt=param_blocks[i])
            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=[loss] + grads + next,
                                on_unused_input='ignore',
                                givens=givens))
        else:
            grads, next = subgraph_grad(start=start,
                                        end=end,
                                        wrt=param_blocks[i])
            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=grads + next,
                                on_unused_input='ignore',
                                givens=givens))

    return grad_fns[::-1]
Example #6
def test_subgraph_grad():

    # Tests that the grad method with no known_grads
    # matches what happens if you use successive subgraph_grads

    x = theano.tensor.fvector('x')
    t = theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3, 4))
    w2 = theano.shared(np.random.randn(4, 2))
    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    params = [[w2], [w1]]
    costs = [cost2, cost1]
    grad_ends = [[a1], [x]]

    inputs = [t, x]
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(2), rng.randn(3)]
    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]

    wrt = [w2, w1]
    cost = cost2 + cost1
    true_grads = theano.grad(cost, wrt)
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)
    from collections import OrderedDict
    next_grad = None
    param_grads = []
    for i in range(2):
        param_grad, next_grad = theano.subgraph_grad(wrt=params[i],
                                                     end=grad_ends[i],
                                                     start=next_grad,
                                                     cost=costs[i])
        next_grad = OrderedDict(zip(grad_ends[i], next_grad))
        param_grads.extend(param_grad)

    pgrads = theano.function(inputs, param_grads)
    pgrads = pgrads(*values)

    for true_grad, pgrad in zip(true_grads, pgrads):
        assert (np.sum(np.abs(true_grad - pgrad)) < 0.00001)
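The chaining above relies on the same mechanism that `theano.grad` exposes directly through its `known_grads` argument; the following self-contained sketch (independent of the test, with an illustrative graph) shows a gradient being injected at an intermediate node:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
w = theano.shared(np.random.randn(3, 4), name='w')
h = T.tanh(T.dot(x, w))

# Placeholder for a gradient arriving at h from some downstream subgraph.
g_h = h.type('g_h')

# cost may be None when known_grads supplies the incoming gradient.
g_w = theano.grad(cost=None, wrt=w, known_grads={h: g_h})
f = theano.function([x, g_h], g_w)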
Example #8
def compile_grad_descent_functions(bn_updates, split_updates, split_outputs,
                                   param_blocks, input_vars, loss, givens,
                                   update_fn):
    """Compiles functions that perform a gradient descent stop for each block.

    This function is complementary to `compile_grad_functions`.

    Args:
        bn_updates: A dictionary for updating the BN statistics.
        split_updates: A dictionary containing the update ops for the
            intermediate split outputs.
        split_outputs: The split nodes.
        param_blocks: The parameters for each block.
        input_vars: The input variables to the network.
        loss: The training loss.
        givens: A dictionary of given variable values (computed during a
            dedicated forward pass).
        update_fn: A lasagne update function that takes a list of gradients
            and the corresponding parameters and returns a theano update dict.
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = collections.OrderedDict()
            for s in split_outputs[i]:
                start[s] = _like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, out_grads = subgraph_grad(
                end=end,
                cost=loss,
                wrt=param_blocks[i],
            )

            # Compute the gradient descent update
            updates = update_fn(loss_or_grads=grads, params=param_blocks[i])

            # Update the BN statistics and store the intermediate outputs.
            updates.update(bn_updates)
            updates.update(split_updates)

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=[loss] + out_grads,
                                updates=updates,
                                on_unused_input='ignore'))
        else:
            grads, out_grads = subgraph_grad(start=start,
                                             end=end,
                                             wrt=param_blocks[i])

            # Compute the gradient descent update
            updates = update_fn(loss_or_grads=grads, params=param_blocks[i])

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=out_grads,
                                updates=updates,
                                on_unused_input='ignore',
                                givens=givens))

    return grad_fns
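A sketch of how these functions might be driven for one training step, assuming they are used exactly as returned above (last block first); the helper name is illustrative, not part of the repository:

def train_step(descent_fns, inputs):
    """One block-wise gradient descent step. Each compiled function applies
    its parameter updates as a side effect and returns the gradients that are
    handed to the next (upstream) function as its start variables."""
    incoming = []       # gradients w.r.t. the upcoming block's split outputs
    loss_value = None
    for k, fn in enumerate(descent_fns):
        outputs = list(fn(*(list(inputs) + incoming)))
        if k == 0:
            # the last block's function also returns the loss as its first output
            loss_value, outputs = outputs[0], outputs[1:]
        incoming = outputs
    return loss_value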