Beispiel #1
0
    def test_disconnected_paths(self):
        """Check when ``gradient.grad`` raises ``DisconnectedInputError``.

        ``grad`` must raise when the only route from the cost to the
        requested variable goes through ``disconnected_grad``, and must
        not raise when a second, connected route exists.
        """
        T = theano.tensor
        # Drawn but never used below; kept so ``self.rng`` advances
        # exactly as it did before.
        sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

        x = T.matrix('x')

        # Every path to x is cut: this MUST raise DisconnectedInputError.
        # (The original author notes gradient.py also emits a warning.)
        # The cost is built outside the ``with`` so that only the call
        # to ``grad`` itself is allowed to raise.
        cut_cost = gradient.disconnected_grad(x).sum()
        with self.assertRaises(gradient.DisconnectedInputError):
            gradient.grad(cut_cost, x)

        # x also reaches the cost directly: this MUST NOT raise.
        mixed_cost = (x + gradient.disconnected_grad(x)).sum()
        gradient.grad(mixed_cost, x)

        a = T.matrix('a')
        b = T.matrix('b')
        partly_cut = a + gradient.disconnected_grad(b)

        # b is only reachable through disconnected_grad: MUST raise
        # (and, per the original author, warn as well).
        cost_wrt_b = partly_cut.sum()
        with self.assertRaises(gradient.DisconnectedInputError):
            gradient.grad(cost_wrt_b, b)

        # a is connected directly: MUST NOT raise.
        gradient.grad(partly_cut.sum(), a)
Beispiel #2
0
    def test_disconnected_paths(self):
        """Verify which gradients through ``disconnected_grad`` must fail.

        Taking a gradient whose only path runs through
        ``disconnected_grad`` has to raise ``DisconnectedInputError``;
        a gradient with an additional connected path has to succeed.
        """
        T = theano.tensor

        def expect_disconnected(cost, wrt):
            # Assert that grad(cost, wrt) raises DisconnectedInputError.
            # The original author notes that gradient.py additionally
            # emits a warning in this situation.
            self.assertRaises(gradient.DisconnectedInputError,
                              gradient.grad, cost, wrt)

        # This draw is not used afterwards; it is kept so that the
        # state of ``self.rng`` is unchanged by this rewrite.
        unused_values = np.asarray(self.rng.randn(5, 5),
                                   dtype=config.floatX)

        x = T.matrix('x')

        # Only path to x is disconnected -> MUST raise.
        expect_disconnected(gradient.disconnected_grad(x).sum(), x)

        # x is also used directly -> MUST NOT raise.
        gradient.grad((x + gradient.disconnected_grad(x)).sum(), x)

        a = T.matrix('a')
        b = T.matrix('b')
        y = a + gradient.disconnected_grad(b)

        # b only enters through disconnected_grad -> MUST raise.
        expect_disconnected(y.sum(), b)

        # a enters directly -> MUST NOT raise.
        gradient.grad(y.sum(), a)
Beispiel #3
0
    def functions(self, sequence_length):
        """Return the compiled Theano functions for ``sequence_length``.

        The ``(predict, train, verbose_predict)`` triple is compiled the
        first time a given ``sequence_length`` is requested and stored in
        ``self.cache``; later requests return the cached triple.

        * ``predict(correct_inputs)`` returns ``[correct_score]``.
        * ``train(correct_inputs, noise_inputs, learning_rate)`` returns
          the gradients of the summed loss with respect to both inputs,
          the loss, and both scores; its ``updates`` replace every
          parameter ``p`` with ``p - learning_rate * gradient``.
        * ``verbose_predict(correct_inputs)`` returns
          ``[correct_score, correct_prehidden]``.
        """
        # A plain value, not a tuple: the original ``(sequence_length)``
        # was only a parenthesised expression.
        key = sequence_length

        if key not in self.cache:
            # Arguments are passed to logging so formatting is deferred.
            logging.info("Need to construct graph for sequence_length=%d...", sequence_length)

            # creating network input variable nodes
            correct_inputs = t.ftensor3("correct input")
            noise_inputs = t.ftensor3("noise input")
            learning_rate = t.fscalar("learning rate")

            # creating op nodes for firing the network
            correct_score, correct_prehidden = self.score(correct_inputs)
            noise_score, noise_prehidden = self.score(noise_inputs)

            # creating op nodes for the pairwise ranking cost function;
            # the upper clip bound is infinity (the original spelled it
            # as the overflowing literal 1e999).
            loss = t.clip(1 - correct_score + noise_score, 0, float("inf"))
            total_loss = t.sum(loss)

            # the necessary cost function gradients; the parameter list
            # is materialised once so gradients and updates line up.
            parameters = list(self.parameters)
            parameters_gradient = grad(total_loss, parameters)
            correct_inputs_gradient = grad(total_loss, correct_inputs)
            noise_inputs_gradient = grad(total_loss, noise_inputs)

            # setting network inputs
            predict_inputs = [correct_inputs]
            train_inputs = [correct_inputs, noise_inputs, learning_rate]
            verbose_predict_inputs = predict_inputs

            # setting network outputs
            predict_outputs = [correct_score]
            train_outputs = [correct_inputs_gradient, noise_inputs_gradient, loss, correct_score, noise_score]
            verbose_predict_outputs = [correct_score, correct_prehidden]

            nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
            logging.info("About to compile prediction function over %d ops [nodes]...", nnodes)
            predict = theano.function(predict_inputs, predict_outputs, mode=COMPILE_MODE)
            logging.info("...done constructing graph for sequence_length=%d", sequence_length)

            nnodes = len(theano.gof.graph.ops(verbose_predict_inputs, verbose_predict_outputs))
            logging.info("About to compile verbose prediction function over %d ops [nodes]...", nnodes)
            verbose_predict = theano.function(verbose_predict_inputs, verbose_predict_outputs, mode=COMPILE_MODE)
            logging.info("...done constructing graph for sequence_length=%d", sequence_length)

            nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
            logging.info("About to compile training function over %d ops [nodes]...", nnodes)
            sgd_updates = [(p, p - learning_rate * gp)
                           for p, gp in zip(parameters, parameters_gradient)]
            train = theano.function(train_inputs, train_outputs, mode=COMPILE_MODE, updates=sgd_updates)
            logging.info("...done constructing graph for sequence_length=%d", sequence_length)

            self.cache[key] = (predict, train, verbose_predict)

        return self.cache[key]
Beispiel #4
0
    def grad(self, inputs, output_grads):
        """Return one gradient expression per input of this op.

        The per-input gradient callables are built on the first call
        and stored on ``self.grad_ops``; later calls reuse them.  Each
        callable is invoked with ``inputs + output_grads``.

        OpFromGraph doesn't implement a connection_pattern, so for now
        all inputs and outputs are regarded as connected.  This computes
        the right numerical value for the gradients but could fail to
        raise the disconnected inputs error in some cases.
        """
        if not hasattr(self, "grad_ops"):
            symbolic_grads = G.grad(
                cost=None,
                known_grads=dict(zip(self.new_outputs, output_grads)),
                wrt=self.new_inputs,
                disconnected_inputs='ignore')

            # A missing gradient becomes a callable returning None.
            # For the others, it is normal that some inputs are not
            # needed to compute the gradient, so unused ones are ignored.
            self.grad_ops = [
                (lambda *args: None) if sym_grad is None
                else OpFromGraph(self.new_inputs + output_grads,
                                 [sym_grad],
                                 on_unused_input='ignore')
                for sym_grad in symbolic_grads
            ]

        return [grad_op(*(inputs + output_grads))
                for grad_op in self.grad_ops]
Beispiel #5
0
    def grad(self, inputs, output_grads):
        """Evaluate the cached gradient ops on ``inputs + output_grads``.

        On the first call the gradient ops are derived with ``G.grad``
        and remembered in ``self.grad_ops``; every call then applies
        each of them to the concatenated inputs and output gradients and
        returns the results as a list.

        OpFromGraph doesn't implement a connection_pattern, so for now
        all inputs and outputs are regarded as connected.  This computes
        the right numerical value for the gradients but could fail to
        raise the disconnected inputs error in some cases.
        """
        if hasattr(self, "grad_ops"):
            ops = self.grad_ops
        else:
            def _no_gradient(*args):
                # Stand-in for an input that has no gradient.
                return None

            ops = []
            for partial in G.grad(cost=None,
                                  known_grads=dict(zip(self.new_outputs,
                                                       output_grads)),
                                  wrt=self.new_inputs,
                                  disconnected_inputs='ignore'):
                if partial is not None:
                    # It is normal if some inputs are not needed in
                    # order to compute the gradient, so we ignore them.
                    ops.append(OpFromGraph(self.new_inputs + output_grads,
                                           [partial],
                                           on_unused_input='ignore'))
                else:
                    ops.append(_no_gradient)
            self.grad_ops = ops

        results = []
        for op in ops:
            results.append(op(*(inputs + output_grads)))
        return results
Beispiel #6
0
    def compute_nll_upper_bound(self, seq_length, validation=False):
        """Build the cost made of a reconstruction term plus a KL term.

        Args:
            seq_length: Forwarded to ``self.inference``.
            validation: When false (default), also build the training
                updates; when true, return the individual terms instead.

        Returns:
            ``(cost, updates_train)`` when ``validation`` is false,
            otherwise ``(cost, recon_term, kl_term, dec_bin, updates)``
            where ``updates`` is what ``self.inference`` returned.
        """
        #############
        # Inference
        (enc_mu, enc_sig, prior_mu, prior_sig, dec_bin), updates = \
            self.inference(self.orch, self.piano, seq_length)

        #############
        # Cost
        dec_bin_non_zero = T.switch(dec_bin > 0, dec_bin, 1e-30)  # Avoid log zero
        # binary_crossentropy = nll for binary input. Sum along input dimension, mean along time (i.e. batch)
        # for real-valued units, use GaussianNLL
        recon = T.sum(T.nnet.binary_crossentropy(dec_bin_non_zero, self.orch), axis=1)
        kl = KLGaussianGaussian(enc_mu, enc_sig, prior_mu, prior_sig)
        # Mean over batches
        recon_term = T.mean(recon)
        kl_term = T.mean(kl)
        # The quantity to minimise is reconstruction + KL; going by the
        # method name this is the upper bound on the negative
        # log-likelihood (NOTE(review): confirm against the model docs).
        cost = recon_term + kl_term

        if validation:
            return cost, recon_term, kl_term, dec_bin, updates

        #############
        # Gradient
        # Materialise the parameters once: on Python 3 ``dict.values()``
        # is a view, which ``G.grad`` does not treat as a list of
        # variables, and a single list keeps parameters and gradients in
        # the same order for the optimizer.
        params = list(self.params_dico.values())
        gparams = G.grad(cost, params)
        #############
        # Updates
        updates_train = self.optimizer.get_updates(params, gparams, updates)
        return cost, updates_train
Beispiel #7
0
    def test_disconnected_nan(self):
        """Check that ``connection_pattern`` keeps an undefined gradient out.

        ``Op1`` has two outputs and declares through its
        ``connection_pattern`` that only the first depends on its input.
        ``Op2`` has no defined gradient for its second input.  Taking
        the gradient of ``Op2(Op1(x))`` with respect to ``x`` must
        therefore succeed.
        """

        class Op1(theano.gof.Op):
            # Two outputs (f, g); the input is connected to f only.
            def make_node(self, x):
                outputs = [x.type(), theano.tensor.scalar()]
                return theano.Apply(self, inputs=[x], outputs=outputs)

            def connection_pattern(self, node):
                # Single input: connected to f, not connected to g.
                return [[True, False]]

            def grad(self, inputs, output_grads):
                (x,) = inputs[:1]
                return [x.zeros_like()]

        class Op2(theano.gof.Op):
            # Two inputs (f, g); the gradient w.r.t. g is not defined.
            def make_node(self, f, g):
                outputs = [theano.tensor.scalar()]
                return theano.Apply(self, inputs=[f, g], outputs=outputs)

            def grad(self, inputs, output_grads):
                grad_f = inputs[0].zeros_like()
                grad_g = NullType()()
                return [grad_f, grad_g]

        x = theano.tensor.vector()
        connected_out, disconnected_out = Op1()(x)
        cost = Op2()(connected_out, disconnected_out)

        # cost is differentiable wrt x, but that is only visible through
        # Op1's connection pattern: looking at the theano graph alone,
        # the second output is an ancestor of cost and has x as an
        # ancestor, so its gradient would have to be computed.
        gradient.grad(cost, x)
Beispiel #8
0
 def inner_function(*args):
     """Return the gradients of ``expr[idx]`` w.r.t. each remaining argument.

     ``args[0]`` is the index, ``args[1]`` the expression, and every
     further argument is a variable to differentiate with respect to.
     ``consider_constant`` and ``disconnected_inputs`` come from the
     enclosing scope and are forwarded to ``grad`` unchanged.
     """
     idx, expr = args[0], args[1]
     return [
         grad(expr[idx],
              wrt_var,
              consider_constant=consider_constant,
              disconnected_inputs=disconnected_inputs)
         for wrt_var in args[2:]
     ]
Beispiel #9
0
def jacobian(expression, wrt, consider_constant=None,
             disconnected_inputs='raise'):
    '''
    Compute the Jacobian of `expression` with respect to `wrt`.

    Per the original author this mirrors the implementation in
    theano.gradient, with one difference: a non-empty list of updates
    returned by scan is tolerated instead of triggering an assertion
    (the author notes that updates are expected when this is used with
    Lasagne).

    `expression` must be a Variable with fewer than two dimensions.
    `wrt` may be a single variable, a list or a tuple; the result is
    passed through `format_as` with flags recording which of these
    forms was given.
    '''
    from theano.tensor import arange

    # Check inputs have the right format
    assert isinstance(expression, Variable), \
        "tensor.jacobian expects a Variable as `expression`"
    assert expression.ndim < 2, \
        ("tensor.jacobian expects a 1 dimensional variable as "
         "`expression`. If not use flatten to make it a vector")

    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)
    wrt = list(wrt) if (using_list or using_tuple) else [wrt]

    grad_options = dict(consider_constant=consider_constant,
                        disconnected_inputs=disconnected_inputs)

    if expression.ndim == 0:
        # expression is just a scalar, use grad
        scalar_grads = grad(expression, wrt, **grad_options)
        return format_as(using_list, using_tuple, scalar_grads)

    def inner_function(*args):
        # args = (row index, expression, *wrt variables)
        idx, expr = args[0], args[1]
        return [grad(expr[idx], inp, **grad_options) for inp in args[2:]]

    # Computing the gradients does not affect the random seeds on any
    # random generator used in expression (because during computing
    # gradients we are just backtracking over old values. (rp Jan 2012 -
    # if anyone has a counter example please show me)
    #
    # The only difference from the theano implementation: the updates
    # returned by scan are deliberately not asserted to be empty.
    jacobs, _ignored_updates = theano.scan(
        inner_function,
        sequences=arange(expression.shape[0]),
        non_sequences=[expression] + wrt)
    return format_as(using_list, using_tuple, jacobs)
Beispiel #10
0
    def __init__(self,
                 intpic_parameters=None,
                 case_costs=None,
                 pics=None,
                 case_labels=None,
                 batch_size=None,
                 pic_size=None,
                 label_count=None,
                 **kwargs):
        """Set up per-parameter histograms and register their updates.

        Args:
            intpic_parameters: Parameters to build histograms for.
            case_costs: Per-case costs; the mean is differentiated with
                respect to ``pics``.
            pics: Input pictures.  Indexed as ``pics[:, 0, :, :]``, so
                at least four axes are expected.
            case_labels: Labels; flattened and one-hot encoded with
                ``label_count`` classes.
            batch_size: Stored on the instance.
            pic_size: Forwarded to ``_create_intpic_histogram_for``.
            label_count: Number of label classes.
            **kwargs: Forwarded to the parent constructor.
        """
        super(IntpicGradientDescent, self).__init__(**kwargs)
        # Value subtracted from the pictures to centre them.
        center_val = 0.5
        self.input_pics = pics
        self.case_costs = case_costs
        self.batch_size = batch_size
        self.label_count = label_count
        self.intpic_parameters = intpic_parameters
        self.jacobians = self._compute_jacobians()
        self.gradpics = OrderedDict([
            (param, _create_intpic_histogram_for(param, pic_size, label_count))
            for param in self.intpic_parameters
        ])
        self.intpics = OrderedDict([
            (param, _create_intpic_histogram_for(param, pic_size, label_count))
            for param in self.intpic_parameters
        ])

        # One-hot labels are needed for both attributions below, so the
        # expression is built once and shared.
        one_hot_labels = tensor.extra_ops.to_one_hot(case_labels.flatten(),
                                                     label_count)

        # attributes pics: (cases, picy, picx) to (cases, labels, picy, picx)
        zeroed_pics = pics - center_val
        attributed_pics = tensor.batched_tensordot(one_hot_labels,
                                                   zeroed_pics[:, 0, :, :],
                                                   axes=0)
        self.gradpic_updates = OrderedDict([
            _create_gradpic_updates(self.gradpics[param],
                                    self.jacobians[param], attributed_pics)
            for param in self.intpic_parameters
        ])
        self.add_updates(self.gradpic_updates)

        # Centred pictures weighted by the gradient of the mean cost
        # with respect to the pictures.
        intensity_pics = (zeroed_pics * gradient.grad(case_costs.mean(), pics))
        attributed_i_pics = tensor.batched_tensordot(
            one_hot_labels,
            intensity_pics[:, 0, :, :],
            axes=0)

        self.intpic_updates = OrderedDict([
            _create_intensity_updates(self.intpics[param],
                                      self.jacobians[param], attributed_i_pics)
            for param in self.intpic_parameters
        ])
        self.add_updates(self.intpic_updates)
Beispiel #11
0
def generate_adv_example(embedded, loss, perturb_scale):
    """Return ``embedded`` shifted along the normalised loss gradient.

    The gradient of ``loss`` with respect to ``embedded`` is wrapped in
    ``disconnected_grad``, rescaled by ``_scale_unit_l2`` and multiplied
    by ``perturb_scale`` and by the square root of the per-example
    element count.
    """
    # embedded: [n_examples, input_length, feature_dim]

    loss_grad = gradient.disconnected_grad(gradient.grad(loss, embedded))

    # Adding max(|embedded|) + 1 makes every entry strictly positive,
    # so dividing the tensor by itself yields all ones without a 0 / 0.
    largest_abs = T.max(T.abs_(embedded))
    shifted = embedded + largest_abs + 1.0
    ones_like_embedded = shifted / shifted

    # grad dim for each example
    grad_dim = ones_like_embedded.sum(axis=(1, 2)).mean(axis=0)
    # sqrt(input_length * emb_dim)
    sqrt_grad_dim = T.sqrt(grad_dim)

    unit_direction = _scale_unit_l2(loss_grad)
    perturb = perturb_scale * sqrt_grad_dim * unit_direction

    return embedded + perturb
Beispiel #12
0
    def __init__(self, intpic_parameters=None,
            case_costs=None, pics=None, case_labels=None,
            batch_size=None, pic_size=None, label_count=None, **kwargs):
        """Create the gradient/intensity histograms and their updates.

        Args:
            intpic_parameters: Parameters that each get a pair of
                histograms (``gradpics`` and ``intpics``).
            case_costs: Per-case costs; their mean is differentiated
                with respect to ``pics``.
            pics: Input pictures, indexed as ``pics[:, 0, :, :]`` (so at
                least four axes are expected).
            case_labels: Labels, flattened and one-hot encoded into
                ``label_count`` classes.
            batch_size: Stored on the instance.
            pic_size: Passed to ``_create_intpic_histogram_for``.
            label_count: Number of label classes.
            **kwargs: Passed to the parent constructor.
        """
        super(IntpicGradientDescent, self).__init__(**kwargs)
        # Centre value removed from the pictures before attribution.
        center_val = 0.5
        self.input_pics = pics
        self.case_costs = case_costs
        self.batch_size = batch_size
        self.label_count = label_count
        self.intpic_parameters = intpic_parameters
        self.jacobians = self._compute_jacobians()
        self.gradpics = OrderedDict(
          [(param, _create_intpic_histogram_for(param, pic_size, label_count))
                for param in self.intpic_parameters])
        self.intpics = OrderedDict(
          [(param, _create_intpic_histogram_for(param, pic_size, label_count))
                for param in self.intpic_parameters])

        # Both attributions use the same one-hot labels; build them once.
        label_one_hot = tensor.extra_ops.to_one_hot(
            case_labels.flatten(), label_count)

        # attributes pics: (cases, picy, picx) to (cases, labels, picy, picx)
        zeroed_pics = pics - center_val
        attributed_pics = tensor.batched_tensordot(
            label_one_hot,
            zeroed_pics[:, 0, :, :],
            axes=0)
        self.gradpic_updates = OrderedDict(
            [_create_gradpic_updates(
                self.gradpics[param],
                self.jacobians[param],
                attributed_pics) for param in self.intpic_parameters])
        self.add_updates(self.gradpic_updates)

        # Centred pictures scaled by d(mean cost)/d(pics).
        intensity_pics = (zeroed_pics *
                gradient.grad(case_costs.mean(), pics))
        attributed_i_pics = tensor.batched_tensordot(
            label_one_hot,
            intensity_pics[:, 0, :, :],
            axes=0)

        self.intpic_updates = OrderedDict(
            [_create_intensity_updates(
                self.intpics[param],
                self.jacobians[param],
                attributed_i_pics) for param in self.intpic_parameters])
        self.add_updates(self.intpic_updates)
Beispiel #13
0
    def test_disconnected_nan(self):
        """Ensure an op's ``connection_pattern`` shields an undefined gradient.

        ``Op1`` produces two outputs and reports, via
        ``connection_pattern``, that its input reaches only the first.
        ``Op2`` returns a ``NullType`` gradient for its second input.
        Differentiating ``Op2(*Op1(x))`` with respect to ``x`` must
        still work.
        """

        class Op1(theano.gof.Op):
            # Outputs f and g; x is connected to f but not to g.
            __props__ = ()

            def make_node(self, x):
                f_out = x.type()
                g_out = theano.tensor.scalar()
                return theano.Apply(self, inputs=[x], outputs=[f_out, g_out])

            def connection_pattern(self, node):
                return [[True, False]]

            def grad(self, inputs, output_grads):
                return [inputs[0].zeros_like()]

        class Op2(theano.gof.Op):
            # Inputs f and g; the gradient w.r.t. g is not defined.
            __props__ = ()

            def make_node(self, f, g):
                result = theano.tensor.scalar()
                return theano.Apply(self, inputs=[f, g], outputs=[result])

            def grad(self, inputs, output_grads):
                return [inputs[0].zeros_like(), NullType()()]

        x = theano.tensor.vector()
        op1_outputs = Op1()(x)
        f, g = op1_outputs
        cost = Op2()(f, g)

        # cost is differentiable wrt x, but we can't tell that without
        # using Op1's connection pattern: looking at the theano graph
        # alone, g is an ancestor of cost and has x as an ancestor, so
        # its gradient would have to be computed.
        gradient.grad(cost, x)
Beispiel #14
0
    def test_grad(self):
        """Compare gradients through ``disconnected_grad`` with known results.

        Each case pairs an expression with the gradient expected for
        its sum with respect to ``x``; both are compiled and evaluated
        on the same random 5x5 matrix.
        """
        T = theano.tensor
        sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

        x = T.matrix("x")

        # (expression, expected gradient of expression.sum() wrt x)
        cases = [
            (x * gradient.disconnected_grad(x), x),
            (x * gradient.disconnected_grad(T.exp(x)), T.exp(x)),
            (x ** 2 * gradient.disconnected_grad(x), 2 * x ** 2),
        ]

        for expression, expected in cases:
            computed = gradient.grad(expression.sum(), x)
            # gradient according to theano
            computed_fn = theano.function([x], computed,
                                          on_unused_input="ignore")
            # desired gradient
            expected_fn = theano.function([x], expected,
                                          on_unused_input="ignore")

            assert np.allclose(computed_fn(sample), expected_fn(sample))
Beispiel #15
0
    def test_grad(self):
        """Check ``grad`` of expressions containing ``disconnected_grad``.

        For every (expression, expected gradient) pair, the gradient of
        the summed expression with respect to ``x`` is compiled and
        compared numerically against the expected expression.
        """
        T = theano.tensor
        values = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

        x = T.matrix("x")

        def compile_for_x(output):
            # Compile ``output`` as a function of x alone.
            return theano.function([x], output, on_unused_input="ignore")

        expressions_gradients = [
            (x * gradient.disconnected_grad(x), x),
            (x * gradient.disconnected_grad(T.exp(x)), T.exp(x)),
            (x**2 * gradient.disconnected_grad(x), 2 * x**2),
        ]

        for expr, expr_grad in expressions_gradients:
            # gradient according to theano
            theano_grad = compile_for_x(gradient.grad(expr.sum(), x))
            # desired gradient
            wanted_grad = compile_for_x(expr_grad)

            assert np.allclose(theano_grad(values), wanted_grad(values))
Beispiel #16
0
    def test_grad_disconnected(self):
        """Exercise gradient corner cases involving ``shape`` and ``alloc``.

        Builds a cost that depends on ``x`` both through its values and
        through its shape, and checks that the gradient with respect to
        ``x`` evaluates to all ones.
        """
        x = theano.tensor.vector(name="x")
        total = x.sum()
        total.name = "total"
        num_elements = x.shape[0]
        num_elements.name = "num_elements"
        silly_vector = theano.tensor.alloc(total / num_elements, num_elements)
        silly_vector.name = "silly_vector"
        cost = silly_vector.sum()
        cost.name = "cost"
        # note that cost simplifies to be the same as "total"
        g = gradient.grad(cost, x, add_names=False)
        # we still need to pass in x because it determines the shape of
        # the output
        f = theano.function([x], g)
        rng = np.random.RandomState([2012, 9, 5])
        # ``ndarray.astype`` replaces ``np.cast[...]``, which NumPy 2.0
        # removed.  Separate names keep the symbolic ``x``/``g`` distinct
        # from their numeric values.
        x_val = rng.randn(3).astype(x.dtype)
        g_val = f(x_val)
        assert np.allclose(g_val, np.ones(x_val.shape, dtype=x_val.dtype))
Beispiel #17
0
def test_grad_disconnected():
    """Test corner cases of the gradient for ``shape`` and ``alloc``.

    The cost depends on ``x`` through both its values and its shape;
    the gradient with respect to ``x`` is expected to be all ones.
    """
    x = theano.tensor.vector(name='x')
    total = x.sum()
    total.name = 'total'
    num_elements = x.shape[0]
    num_elements.name = 'num_elements'
    silly_vector = theano.tensor.alloc(total / num_elements, num_elements)
    silly_vector.name = 'silly_vector'
    cost = silly_vector.sum()
    cost.name = 'cost'
    # note that cost simplifies to be the same as "total"
    g = gradient.grad(cost, x, add_names=False)
    # we still need to pass in x because it determines the shape of the output
    f = theano.function([x], g)
    rng = np.random.RandomState([2012, 9, 5])
    # ``np.cast`` was removed in NumPy 2.0; ``astype`` performs the same
    # conversion.  The numeric values get their own names so the
    # symbolic ``x`` and ``g`` are not rebound.
    x_val = rng.randn(3).astype(x.dtype)
    g_val = f(x_val)
    assert np.allclose(g_val, np.ones(x_val.shape, dtype=x_val.dtype))
Beispiel #18
0
    def __init__(self, inputs, outputs, grad_depth=1, **kwargs):
        """Compile the inner function and, optionally, its gradient ops.

        Args:
            inputs: Variables that are the inputs of the wrapped graph.
            outputs: List of Variables that are its outputs.
            grad_depth: When positive, one gradient op per input is
                built and stored in ``self.grad_ops``; each is created
                with ``grad_depth - 1``.
            **kwargs: Forwarded to ``orig_function``; must not contain
                ``updates``.

        Raises:
            TypeError: If ``outputs`` is not a list, if any input or
                output is not a ``gof.Variable``, or if ``updates`` is
                passed in ``kwargs``.
        """
        if not isinstance(outputs, list):
            raise TypeError('outputs must be list', outputs)
        for candidate in inputs + outputs:
            if not isinstance(candidate, gof.Variable):
                raise TypeError(
                        'inputs and outputs must be Variable instances',
                        candidate)
        if 'updates' in kwargs:
            raise TypeError('updates are not allowed in kwargs')

        # TODO: the graph may have implicit inputs like
        #       SharedVariable instances.
        #       what impact to they have on the validity of this Op?
        self.fn = orig_function(inputs, outputs, **kwargs)
        self.inputs = inputs
        self.outputs = outputs
        self.input_types = [variable.type for variable in inputs]
        self.output_types = [variable.type for variable in outputs]

        if grad_depth <= 0:
            return

        output_grads = [out_type() for out_type in self.output_types]
        # OpFromGraph doesn't implement a connection_pattern, so for now
        # we regard all inputs and outputs as connected. This will
        # compute the right numerical value for the gradients but could
        # fail to raise the disconnected inputs error in some cases.
        symbolic_grads = G.grad(
            cost=None,
            known_grads=dict(zip(self.outputs, output_grads)),
            wrt=self.inputs,
            disconnected_inputs='ignore')

        self.grad_ops = []
        for sym_grad in symbolic_grads:
            if sym_grad is not None:
                # It is normal if some inputs are not needed in order
                # to compute the gradient, so we ignore them.
                grad_op = OpFromGraph(inputs + output_grads,
                                      [sym_grad],
                                      grad_depth=grad_depth - 1,
                                      on_unused_input='ignore')
            else:
                grad_op = lambda *args: None
            self.grad_ops.append(grad_op)
Beispiel #19
0
 def __init__(self,
              synpic_parameters=None,
              case_costs=None,
              pics=None,
              case_labels=None,
              batch_size=None,
              pic_size=None,
              label_count=None,
              **kwargs):
     """Build the per-parameter synpic histograms and their updates.

     Args:
         synpic_parameters: Parameters that each get a histogram.
         case_costs: Per-case costs; their mean is differentiated with
             respect to ``pics``.
         pics: Input pictures, indexed as ``pics[:, 0, :, :]`` (so at
             least four axes are expected).
         case_labels: Labels, flattened and one-hot encoded into
             ``label_count`` classes.
         batch_size: Stored on the instance.
         pic_size: Passed to ``_create_synpic_histogram_for``.
         label_count: Number of label classes.
         **kwargs: Passed to the parent constructor;
             ``before_training`` defaults to True.
     """
     kwargs.setdefault("before_training", True)
     # Centre value removed from the pictures before attribution.
     center_val = 0.5
     self.input_pics = pics
     self.case_costs = case_costs
     self.batch_size = batch_size
     self.label_count = label_count
     self.synpic_parameters = synpic_parameters
     self.jacobians = self._compute_jacobians()
     self.synpics = OrderedDict([
         (param, _create_synpic_histogram_for(param, pic_size, label_count))
         for param in self.synpic_parameters
     ])
     # attributes pics: (cases, picy, picx) to (cases, labels, picy, picx)
     zeroed_pics = pics - center_val
     # Centred pictures weighted by the magnitude of d(mean cost)/d(pics).
     focused_pics = zeroed_pics * abs(gradient.grad(case_costs.mean(),
                                                    pics))
     attributed_pics = tensor.batched_tensordot(
         tensor.extra_ops.to_one_hot(case_labels.flatten(), label_count),
         focused_pics[:, 0, :, :],
         axes=0)
     self.synpic_updates = OrderedDict([
         _create_synpic_updates(self.synpics[param], self.jacobians[param],
                                attributed_pics)
         for param in self.synpic_parameters
     ])
     # The parent constructor runs last, after all attributes are set.
     super(SynpicExtension, self).__init__(**kwargs)
Beispiel #20
0
 def make_grad_func(X):
     """Compile the gradient of ``sigmoid(dot(X, W) + b).sum()`` w.r.t. ``X``.

     ``W`` and ``b`` come from the enclosing scope.  The compiled
     function takes ``X``, ``W`` and ``b`` as inputs; unused inputs are
     ignored.
     """
     pre_activation = theano.tensor.dot(X, W) + b
     activation = theano.tensor.nnet.sigmoid(pre_activation)
     grad_wrt_X = gradient.grad(activation.sum(), X)
     return theano.function([X, W, b], grad_wrt_X,
                            on_unused_input="ignore")
Beispiel #21
0
def main(save_to):
    """Optimise input images per unit of a trained LeNet-5 and save them.

    A shared input ``x`` is initialised from a basis of MNIST test
    images.  For every convolutional/linear output and every unit of it,
    ``x`` is reset to the basis and repeatedly updated by a gradient
    step on the negative log-softmax of that unit; snapshots of ``x``
    are written to a ``Filmstrip`` saved under ``<layer name><suffix>``.

    Args:
        save_to: Path of the file with the trained parameters; it is
            opened in binary mode and read with ``load_parameters``.
    """
    # Use ReLUs everywhere and softmax for the final prediction
    convnet = create_lenet_5()

    mnist_test = MNIST(("test", ), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 2)
    n_images = basis_init.shape[0]

    # b = shared_floatx(basis)
    # random_init = numpy.rand.random(100, 1000)
    # r = shared_floatx(random_init)
    # rn = r / r.norm(axis=1)
    # x = tensor.dot(rn, tensor.shape_padright(b))
    x = shared_floatx(basis_init)

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])
    outs = VariableFilter(roles=[OUTPUT], bricks=[Convolutional,
                                                  Linear])(cg.variables)

    # Create an interior activation model
    model = Model([probs] + outs)

    # Load it with trained parameters.  The file stays open until the
    # values have been handed to the model and is then always closed.
    with open(save_to, 'rb') as params_file:
        model.set_parameter_values(load_parameters(params_file))

    learning_rate = shared_floatx(0.01, 'learning_rate')
    unit = shared_floatx(0, 'unit', dtype='int64')
    negate = False
    suffix = '_negsynth.jpg' if negate else '_synth.jpg'
    iterations = 10000

    def save_snapshot(filmstrip, row, filename):
        # Copy the current value of x into row ``row`` and save the strip.
        result = x.get_value()
        for col in range(n_images):
            filmstrip.set_image((row, col), result[col, :, :, :])
        filmstrip.save(filename)

    for output in outs:
        layer = get_brick(output)
        # For now, skip masks -for some reason they are always NaN
        layername = layer.parents[0].name + '-' + layer.name
        # if layername != 'noisylinear_2-linear':
        #     continue
        dims = layer.get_dims(['output'])[0]
        measure = -output if negate else output
        # Keep only the first n_images entries along the leading axis.
        measure = measure[(slice(0, n_images), ) + (slice(None), ) *
                          (measure.ndim - 1)]
        if isinstance(dims, numbers.Integral):
            dims = (dims, )
            costvec = -tensor.log(
                tensor.nnet.softmax(measure)[:, unit].flatten())
        else:
            flatout = measure.flatten(ndim=3)
            maxout = flatout.max(axis=2)
            costvec = -tensor.log(
                tensor.nnet.softmax(maxout)[:, unit].flatten())
        # Add a regularization to favor gray images.
        # cost = costvec.sum() + (x - 0.5).norm(2) * (
        #         10.0 / basis_init.shape[0])
        cost = costvec.sum()
        x_grad = gradient.grad(cost, x)
        stepx = x - learning_rate * x_grad
        # Divide each image by its own maximum, then clip to [0, 1].
        normx = stepx / tensor.shape_padright(
            stepx.flatten(ndim=2).max(axis=1), n_ones=3)
        newx = tensor.clip(normx, 0, 1)
        newx = newx[(slice(0, n_images), ) + (slice(None), ) *
                    (newx.ndim - 1)]
        fn = theano.function([], [cost], updates=[(x, newx)])
        filmstrip = Filmstrip(basis_init.shape[-2:],
                              (dims[0], n_images),
                              background='red')
        filename = layername + suffix
        for u in range(dims[0]):
            unit.set_value(u)
            x.set_value(basis_init)
            print('layer', layername, 'unit', u)
            for index in range(iterations):
                c = fn()[0]
                if index % 1000 == 0:
                    print('cost', c)
                    save_snapshot(filmstrip, u, filename)
            save_snapshot(filmstrip, u, filename)
Beispiel #22
0
def evaluate_lenet5(datasets_=datasets, learning_rate=[17./(3**i) for i in range(6)], n_epochs=42,
                    nkerns=[12, 12, 0, 0], batch_size=1,
                    patience=200000, filter_shape=[3, 3, 0],
                    poolsize=[2, 2, 0] ):
    """Train a small convolutional network with a stepped learning rate.

    The model is conv/pool -> conv/pool -> tanh hidden layer (500 units) ->
    13-way logistic regression.  The ``n_epochs`` epochs are split into
    ``len(learning_rate)`` consecutive phases of (almost) equal length and
    phase ``i`` performs plain SGD with step size ``learning_rate[i]``.
    After training, the error on the training set and on the test set is
    printed and appended (in percent) to the module-level lists
    ``error_on_train`` and ``error_on_test``.

    Parameters
    ----------
    datasets_ : sequence
        ``datasets_[0]`` is the ``(x, y)`` training pair and ``datasets_[1]``
        the ``(x, y)`` test pair.  The ``x`` members must provide
        ``get_value(borrow=True)`` and both members must be sliceable with
        symbolic indices (presumably Theano shared variables).
    learning_rate : sequence of float
        One SGD step size per training phase; any length is accepted.
    n_epochs : int
        Total number of epochs over all phases.
    nkerns : sequence of int
        Number of kernels of the conv layers; only the first two entries
        are used.
    batch_size : int
        Number of examples per minibatch.
    patience : int
        Training stops as soon as the global iteration counter reaches
        this value.
    filter_shape, poolsize : sequence of int
        Side length of the (square) filters / pooling windows of the conv
        layers; only the first two entries are used.

    Notes
    -----
    The input geometry is read from the module-level names ``dim_vals``
    (used as ``(channels, height, width)``) and ``dim_val`` (spatial size
    fed into the first conv layer).
    """
    rng = numpy.random.RandomState(23455)

    train_set_x, train_set_y = datasets_[0]
    test_set_x, test_set_y = datasets_[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    def batch_givens(set_x, set_y):
        # Substitute the symbolic inputs by the minibatch selected by `index`.
        return {
            x: set_x[index * batch_size: (index + 1) * batch_size],
            y: set_y[index * batch_size: (index + 1) * batch_size]
        }

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    image_shape = (batch_size, dim_vals[0], dim_vals[1], dim_vals[2])
    layer0_input = x.reshape(image_shape)

    # First convolutional pooling layer.
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=image_shape,
        filter_shape=(nkerns[0], dim_vals[0], filter_shape[0], filter_shape[0]),
        poolsize=(poolsize[0], poolsize[0])
    )

    # Second convolutional pooling layer.  A 'valid' convolution followed by
    # pooling shrinks each side to (size + 1 - filter) // pool.  Floor
    # division keeps the shapes integral on Python 3 as well.
    layer1_input_shape = (dim_val + 1 - filter_shape[0]) // poolsize[0]
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], layer1_input_shape, layer1_input_shape),
        filter_shape=(nkerns[1], nkerns[0], filter_shape[1], filter_shape[1]),
        poolsize=(poolsize[1], poolsize[1])
    )

    # The HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    layer2_input = layer1.output.flatten(2)
    layer2_input_shape = (layer1_input_shape + 1 - filter_shape[1]) // poolsize[1]
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * layer2_input_shape * layer2_input_shape,
        n_out=500,
        activation=T.tanh
    )

    layer4 = LogisticRegression(input=layer2.output, n_in=500, n_out=13)

    cost = layer4.negative_log_likelihood(y)

    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens=batch_givens(test_set_x, test_set_y)
    )

    test_model_on_train = theano.function(
        [index],
        layer4.errors(y),
        givens=batch_givens(train_set_x, train_set_y)
    )

    params = layer4.params + layer2.params + layer1.params + layer0.params
    grads = grad(cost, params)

    # One compiled SGD step per learning rate (i.e. per training phase).
    train_models = [
        theano.function(
            [index],
            cost,
            updates=[
                (param_i, param_i - rate * grad_i)
                for param_i, grad_i in zip(params, grads)
            ],
            givens=batch_givens(train_set_x, train_set_y)
        )
        for rate in learning_rate
    ]

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    # Keeps the final report well-defined even if no minibatch was processed.
    minibatch_index = -1
    # Epoch boundaries of the phases; they must be ints to be comparable
    # with the epoch counter without rounding surprises.
    n_phases = len(learning_rate)
    phase_bounds = [n_epochs * i // n_phases for i in range(n_phases + 1)]
    for train_model, phase_start, phase_end in zip(
            train_models, phase_bounds, phase_bounds[1:]):
        while phase_start <= epoch < phase_end and not done_looping:
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):

                iteration = (epoch - 1) * n_train_batches + minibatch_index

                if iteration % 1000 == 0:
                    print('training @ iter = ', iteration)
                train_model(minibatch_index)

                if patience <= iteration:
                    done_looping = True
                    break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)

    # test it on the training set
    test_losses = [
        test_model_on_train(i)
        for i in range(n_train_batches)
    ]
    test_score = numpy.mean(test_losses)
    print(('     epoch %i, minibatch %i/%i, test error on training set '
           '%f %%') %
          (epoch, minibatch_index + 1, n_train_batches,
           test_score * 100.))
    error_on_train.append(test_score * 100.)

    # test it on the test set
    test_losses = [
        test_model(i)
        for i in range(n_test_batches)
    ]
    test_score = numpy.mean(test_losses)
    print(('     epoch %i, minibatch %i/%i, test error '
           '%f %%') %
          (epoch, minibatch_index + 1, n_train_batches,
           test_score * 100.))
    error_on_test.append(test_score * 100.)
Beispiel #23
0
 def make_grad_func(X):
     """Compile a function computing the gradient of sum(sigmoid(X.W + b)) w.r.t. X.

     ``W`` and ``b`` are taken from the enclosing scope; they are also
     declared as inputs of the compiled function, which is called as
     ``f(X, W, b)``.  Unused inputs are ignored rather than rejected.
     """
     pre_activation = theano.tensor.dot(X, W) + b
     total = theano.tensor.nnet.sigmoid(pre_activation).sum()
     return theano.function([X, W, b],
                            gradient.grad(total, X),
                            on_unused_input='ignore')
Beispiel #24
0
def main(save_to):
    """Render "stroke" filmstrips for the convolutional layers of a LeNet.

    A LeNet with the hard-coded architecture below is built.  For every
    ``Convolutional`` brick in ``convnet.layers`` one input image per probed
    unit is formed as a coefficient-weighted sum of shifted basis images,
    normalised and fed through the network.  The images are written to
    ``<layer name>_stroke.jpg``.

    Parameters
    ----------
    save_to : str
        Path of the file holding the trained parameters; it is opened in
        binary mode and handed to ``load_parameters``.
    """
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    # Only the trailing (height, width) of this array is used, to size the
    # filmstrip cells.
    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    layers = [l for l in convnet.layers if isinstance(l, Convolutional)]
    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 50)
    basis_set = make_shifted_basis(basis_init, convnet, layers)

    for layer, basis in zip(layers, basis_set):
        # basis is 5d:
        # (probed_units, base_cases, 1-c, 28-y, 28-x)
        b = shared_floatx(basis)
        # coefficients is 2d:
        # (probed_units, base_cases)
        coefficients = shared_floatx(
            numpy.ones(basis.shape[0:2], dtype=theano.config.floatX))
        # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x)
        prod = tensor.shape_padright(coefficients, 3) * b
        # x is 4d: (probed_units, 1-c, 28-y, 28-x)
        ux = prod.sum(axis=1)
        x = tensor.clip(
            ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3),
            0, 1)

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])
        outs = VariableFilter(
            roles=[OUTPUT], bricks=[layer])(cg.variables)

        # Create an interior activation model
        model = Model([probs] + outs)

        # Load it with trained parameters; the context manager makes sure
        # the file handle is released again.
        with open(save_to, 'rb') as param_file:
            params = load_parameters(param_file)
        model.set_parameter_values(params)

        learning_rate = shared_floatx(0.03, 'learning_rate')
        # We will try to do all units at once.
        # But we are only doing one layer at once.
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims, )
            unitrange = tensor.arange(dims[0])
            # Pick entry (u, u): probed unit u looks at its own output unit.
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            # Probe the centre location of each unit's feature map.
            costvec = -tensor.log(tensor.nnet.softmax(output[
                unitrange, unitrange, dims[1] // 2, dims[2] // 2]).
                flatten())
        cost = costvec.sum()
        # grad is dims (probed_units, basis_size)
        # NOTE(review): the gradient is built but not applied below, since
        # the descent step is commented out - confirm this is intended.
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])
        filmstrip = Filmstrip(
            random_init.shape[-2:], (dims[0], 1),
            background='red')
        layer = get_brick(output)
        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                # Decay the learning rate and report progress.
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                    filmstrip.save(layer.name + '_stroke.jpg')
            # NOTE(review): the filmstrip is refreshed and saved on every
            # iteration - presumably meant to run once after the loop.
            for u in range(dims[0]):
                filmstrip.set_image((u, 0), result[u, :, :, :])
            filmstrip.save(layer.name + '_stroke.jpg')
Beispiel #25
0
    def functions(self, sequence_length):
        """Return the compiled ``(predict, train, verbose_predict)`` triple.

        The triple is built on first request for a given ``sequence_length``
        and stored in ``self.cache``; later requests return the cached
        entry unchanged.

        * ``predict(correct_inputs)`` yields the score.
        * ``verbose_predict(correct_inputs)`` additionally yields the
          pre-hidden value returned by ``self.score``.
        * ``train(correct_inputs, noise_inputs, learning_rate)`` yields the
          input gradients, the loss and both scores, and applies one
          gradient-descent step to ``self.parameters``.
        """
        key = (sequence_length)

        # Fast path: everything for this length was compiled before.
        if key in self.cache:
            return self.cache[key]

        logging.info("Need to construct graph for sequence_length=%d..." %
                     (sequence_length))

        # symbolic network inputs
        correct_inputs = t.ftensor3("correct input")
        noise_inputs = t.ftensor3("noise input")
        learning_rate = t.fscalar("learning rate")

        # fire the network on both kinds of input
        correct_score, correct_prehidden = self.score(correct_inputs)
        noise_score, noise_prehidden = self.score(noise_inputs)

        # pairwise ranking cost: hinge on the score margin
        loss = t.clip(1 - correct_score + noise_score, 0, 1e999)
        total_loss = t.sum(loss)

        # gradients of the cost w.r.t. parameters and both inputs
        parameters_gradient = grad(total_loss, list(self.parameters))
        correct_inputs_gradient = grad(total_loss, correct_inputs)
        noise_inputs_gradient = grad(total_loss, noise_inputs)

        # input / output signatures of the three compiled functions
        predict_inputs = [correct_inputs]
        predict_outputs = [correct_score]
        verbose_predict_inputs = predict_inputs
        verbose_predict_outputs = [correct_score, correct_prehidden]
        train_inputs = [correct_inputs, noise_inputs, learning_rate]
        train_outputs = [
            correct_inputs_gradient,
            noise_inputs_gradient,
            loss,
            correct_score,
            noise_score,
        ]

        # --- prediction ---
        op_count = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        logging.info(
            "About to compile prediction function over %d ops [nodes]..." %
            op_count)
        predict = theano.function(predict_inputs,
                                  predict_outputs,
                                  mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        # --- verbose prediction ---
        op_count = len(
            theano.gof.graph.ops(verbose_predict_inputs,
                                 verbose_predict_outputs))
        logging.info(
            "About to compile verbose prediction function over %d ops [nodes]..."
            % op_count)
        verbose_predict = theano.function(verbose_predict_inputs,
                                          verbose_predict_outputs,
                                          mode=COMPILE_MODE)
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        # --- training (one SGD step per call) ---
        op_count = len(theano.gof.graph.ops(train_inputs, train_outputs))
        logging.info(
            "About to compile training function over %d ops [nodes]..." %
            op_count)
        train = theano.function(
            train_inputs,
            train_outputs,
            mode=COMPILE_MODE,
            updates=[
                (param, param - learning_rate * param_grad)
                for param, param_grad in zip(list(self.parameters),
                                             parameters_gradient)
            ])
        logging.info("...done constructing graph for sequence_length=%d" %
                     (sequence_length))

        self.cache[key] = (predict, train, verbose_predict)
        return self.cache[key]
Beispiel #26
0
def assign_step_methods(model,
                        step=None,
                        methods=STEP_METHODS,
                        step_kwargs=None):
    """Pick a step method for every free variable that has none yet.

    Variables already covered by the user-supplied ``step`` (a single step
    method or an iterable of them) are left alone.  For each remaining free
    random variable of ``model`` the candidate in ``methods`` with the
    highest ``_competence(var, has_gradient)`` value is chosen.  This is
    meant to be called automatically from ``sample()``, but can be called
    manually as well.

    Parameters
    ----------
    model : Model object
        A fully-specified model object
    step : step function or vector of step functions
        Step functions already assigned to some subset of the model's
        parameters. Defaults to None (no assigned variables).
    methods : vector of step method classes
        Candidate step method classes to choose from. Defaults to the main
        step methods provided by PyMC3.
    step_kwargs : dict
        Parameters for the samplers. Keys are the lower case names of
        the step method, values a dict of arguments.

    Returns
    -------
    methods : list
        List of step methods associated with the model's variables.
    """
    steps = []
    assigned_vars = set()

    if step is not None:
        # Accept both an iterable of step methods and a single one.
        try:
            steps.extend(list(step))
        except TypeError:
            steps.append(step)
        for given in steps:
            try:
                assigned_vars |= set(given.vars)
            except AttributeError:
                # No ``vars`` attribute: collect from its sub-methods instead.
                for sub_method in given.methods:
                    assigned_vars |= set(sub_method.vars)

    # Use competence classmethods to select step methods for the
    # variables that are still unassigned.
    selected_steps = defaultdict(list)
    for var in model.free_RVs:
        if var in assigned_vars:
            continue

        # A gradient is only attempted for non-discrete variables, and only
        # counts if theano can actually build it.
        has_gradient = var.dtype not in discrete_types
        if has_gradient:
            try:
                tg.grad(model.logpt, var)
            except (AttributeError, NotImplementedError,
                    tg.NullTypeGradError):
                has_gradient = False

        def competence(method):
            return method._competence(var, has_gradient)

        selected = max(methods, key=competence)
        pm._log.info('Assigned {0} to {1}'.format(selected.__name__, var))
        selected_steps[selected].append(var)

    return instantiate_steppers(model, steps, selected_steps, step_kwargs)
Beispiel #27
0
def main(save_to):
    """Synthesise inputs that excite each unit and save them as filmstrips.

    A LeNet-5 is created via ``create_lenet_5`` and loaded with the
    parameters stored in ``save_to``.  For every output variable of a
    ``Convolutional`` or ``Linear`` brick, and for every unit ``u`` of that
    layer, the shared input ``x`` is reset to the basis images and updated
    by ``iterations`` gradient steps that minimise ``-log softmax`` of the
    unit's activation.  The resulting images are written to
    ``<parent name>-<layer name>_synth.jpg`` (``_negsynth.jpg`` when
    ``negate`` is set).

    Parameters
    ----------
    save_to : str
        Path of the file holding the trained parameters; it is opened in
        binary mode and handed to ``load_parameters``.
    """
    convnet = create_lenet_5()

    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 2)

    # The images being optimised live in a shared variable so the compiled
    # function can update them in place.
    x = shared_floatx(basis_init)

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cg = ComputationGraph([probs])
    outs = VariableFilter(
        roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)

    # Create an interior activation model
    model = Model([probs] + outs)

    # Load it with trained parameters; the context manager makes sure the
    # file handle is released again.
    with open(save_to, 'rb') as param_file:
        params = load_parameters(param_file)
    model.set_parameter_values(params)

    learning_rate = shared_floatx(0.01, 'learning_rate')
    unit = shared_floatx(0, 'unit', dtype='int64')
    negate = False
    suffix = '_negsynth.jpg' if negate else '_synth.jpg'
    n_images = basis_init.shape[0]
    for output in outs:
        layer = get_brick(output)
        # For now, skip masks -for some reason they are always NaN
        iterations = 10000
        layername = layer.parents[0].name + '-' + layer.name
        dims = layer.get_dims(['output'])[0]
        if negate:
            measure = -output
        else:
            measure = output
        # Restrict the leading axis to the number of basis images.
        measure = measure[(slice(0, n_images), ) +
                          (slice(None), ) * (measure.ndim - 1)]
        if isinstance(dims, numbers.Integral):
            # Scalar dims: the layer output is indexed directly by unit.
            dims = (dims, )
            costvec = -tensor.log(
                tensor.nnet.softmax(measure)[:, unit].flatten())
        else:
            # Otherwise take the maximum over the flattened trailing axes
            # before comparing units.
            flatout = measure.flatten(ndim=3)
            maxout = flatout.max(axis=2)
            costvec = -tensor.log(
                tensor.nnet.softmax(maxout)[:, unit].flatten())
        # Add a regularization to favor gray images.
        # cost = costvec.sum() + (x - 0.5).norm(2) * (
        #         10.0 / basis_init.shape[0])
        cost = costvec.sum()
        grad = gradient.grad(cost, x)
        # Gradient step, then rescale each image by its maximum and clip
        # to [0, 1].
        stepx = x - learning_rate * grad
        normx = stepx / tensor.shape_padright(
            stepx.flatten(ndim=2).max(axis=1), n_ones=3)
        newx = tensor.clip(normx, 0, 1)
        newx = newx[(slice(0, n_images), ) +
                    (slice(None), ) * (newx.ndim - 1)]
        fn = theano.function([], [cost], updates=[(x, newx)])
        filmstrip = Filmstrip(
            basis_init.shape[-2:], (dims[0], n_images),
            background='red')
        for u in range(dims[0]):
            unit.set_value(u)
            x.set_value(basis_init)
            print('layer', layername, 'unit', u)
            for index in range(iterations):
                c = fn()[0]
                if index % 1000 == 0:
                    # Periodic progress report and snapshot.
                    print('cost', c)
                    result = x.get_value()
                    for i2 in range(n_images):
                        filmstrip.set_image((u, i2), result[i2, :, :, :])
                    filmstrip.save(layername + suffix)
            # Final state for this unit.
            result = x.get_value()
            for i2 in range(n_images):
                filmstrip.set_image((u, i2), result[i2, :, :, :])
            filmstrip.save(layername + suffix)
Beispiel #28
0
def main(save_to):
    """Render "stroke" filmstrips for the convolutional layers of a LeNet.

    A LeNet with the hard-coded architecture below is built.  For every
    ``Convolutional`` brick in ``convnet.layers`` one input image per probed
    unit is formed as a coefficient-weighted sum of shifted basis images,
    normalised and fed through the network.  The images are written to
    ``<layer name>_stroke.jpg``.

    Parameters
    ----------
    save_to : str
        Path of the file holding the trained parameters; it is opened in
        binary mode and handed to ``load_parameters``.
    """
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    1,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    # Only the trailing (height, width) of this array is used, to size the
    # filmstrip cells.
    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    layers = [l for l in convnet.layers if isinstance(l, Convolutional)]
    mnist_test = MNIST(("test", ), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 50)
    basis_set = make_shifted_basis(basis_init, convnet, layers)

    for layer, basis in zip(layers, basis_set):
        # basis is 5d:
        # (probed_units, base_cases, 1-c, 28-y, 28-x)
        b = shared_floatx(basis)
        # coefficients is 2d:
        # (probed_units, base_cases)
        coefficients = shared_floatx(
            numpy.ones(basis.shape[0:2], dtype=theano.config.floatX))
        # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x)
        prod = tensor.shape_padright(coefficients, 3) * b
        # x is 4d: (probed_units, 1-c, 28-y, 28-x)
        ux = prod.sum(axis=1)
        x = tensor.clip(
            ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3), 0,
            1)

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])
        outs = VariableFilter(roles=[OUTPUT], bricks=[layer])(cg.variables)

        # Create an interior activation model
        model = Model([probs] + outs)

        # Load it with trained parameters; the context manager makes sure
        # the file handle is released again.
        with open(save_to, 'rb') as param_file:
            params = load_parameters(param_file)
        model.set_parameter_values(params)

        learning_rate = shared_floatx(0.03, 'learning_rate')
        # We will try to do all units at once.
        # But we are only doing one layer at once.
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims, )
            unitrange = tensor.arange(dims[0])
            # Pick entry (u, u): probed unit u looks at its own output unit.
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            # Probe the centre location of each unit's feature map.
            costvec = -tensor.log(
                tensor.nnet.softmax(output[unitrange, unitrange, dims[1] // 2,
                                           dims[2] // 2]).flatten())
        cost = costvec.sum()
        # grad is dims (probed_units, basis_size)
        # NOTE(review): the gradient is built but not applied below, since
        # the descent step is commented out - confirm this is intended.
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])
        filmstrip = Filmstrip(random_init.shape[-2:], (dims[0], 1),
                              background='red')
        layer = get_brick(output)
        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                # Decay the learning rate and report progress.
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                    filmstrip.save(layer.name + '_stroke.jpg')
            # NOTE(review): the filmstrip is refreshed and saved on every
            # iteration - presumably meant to run once after the loop.
            for u in range(dims[0]):
                filmstrip.set_image((u, 0), result[u, :, :, :])
            filmstrip.save(layer.name + '_stroke.jpg')