Example #1
    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)

        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=subsample)
        img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
        if subsample == (1, 1, 1):
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(img,
                                                                     topgrad)
        else:
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                img, topgrad, shape=filters_shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
Example #2
    def test_compare_1D_and_2D_upsampling_values(self):
        """Compare 1D and 2D upsampling

        This method verifies that bilinear upsampling performed with
        1D and 2D kernels generates the same result.

        """
        # checking upsampling with ratio 5
        input_x = np.random.rand(5, 4, 6, 7).astype(theano.config.floatX)
        mat_1D = bilinear_upsampling(input=input_x, ratio=5,
                                     batch_size=5, num_input_channels=4,
                                     use_1D_kernel=True)
        mat_2D = bilinear_upsampling(input=input_x, ratio=5,
                                     batch_size=5, num_input_channels=4,
                                     use_1D_kernel=False)
        f_1D = theano.function([], mat_1D, mode=self.compile_mode)
        f_2D = theano.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)

        # checking upsampling with ratio 8
        input_x = np.random.rand(12, 11, 10, 7).astype(theano.config.floatX)
        mat_1D = bilinear_upsampling(input=input_x, ratio=8,
                                     batch_size=12, num_input_channels=11,
                                     use_1D_kernel=True)
        mat_2D = bilinear_upsampling(input=input_x, ratio=8,
                                     batch_size=12, num_input_channels=11,
                                     use_1D_kernel=False)
        f_1D = theano.function([], mat_1D, mode=self.compile_mode)
        f_2D = theano.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
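A note on why the two paths agree: the 2D bilinear kernel is separable, i.e. the outer product of the 1D kernel with itself, so two 1D passes reproduce one 2D pass. A minimal NumPy/SciPy sketch of that identity (the triangular kernel construction here is an assumption based on standard bilinear interpolation, not code from the test suite):

import numpy as np
from scipy.signal import convolve2d

def bilinear_kernel_1d(ratio):
    # Triangular kernel of length 2 * ratio - 1, normalized by the ratio.
    half = np.arange(1, ratio + 1, dtype='float64')
    return np.concatenate([half, half[-2::-1]]) / ratio

k1 = bilinear_kernel_1d(5)
k2 = np.outer(k1, k1)  # the equivalent 2D kernel

img = np.random.rand(8, 8)
rows = np.apply_along_axis(np.convolve, 1, img, k1)   # 1D pass along rows
both = np.apply_along_axis(np.convolve, 0, rows, k1)  # then along columns
assert np.allclose(both, convolve2d(img, k2))         # matches one 2D pass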
Example #3
def visualize_states(hidden_states, updates,
                     train_stream, valid_stream,
                     args):

    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not(has_indices(args.dataset)))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the function
    plot("hidden_state", train_stream, compiled, args)
Example #4
def test_in_transit():
    t = np.linspace(-20, 20, 1000)
    m_planet = np.array([0.3, 0.5])
    m_star = 1.45
    r_star = 1.5
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        ecc=np.array([0.1, 0.8]),
        omega=np.array([0.5, 1.3]),
        m_planet=m_planet,
    )

    r_pl = np.array([0.1, 0.03])
    coords = theano.function([], orbit.get_relative_position(t))()
    r2 = coords[0]**2 + coords[1]**2
    inds = theano.function([], orbit.in_transit(t, r=r_pl))()

    m = np.isin(np.arange(len(t)), inds)
    in_ = r2[inds] <= ((r_star + r_pl)**2)[None, :]
    in_ &= coords[2][inds] > 0
    assert np.all(np.any(in_, axis=1))

    out = r2[~m] > ((r_star + r_pl)**2)[None, :]
    out |= coords[2][~m] <= 0
    assert np.all(out)
Example #5
    def test_bilinear_kernel_1D(self):
        """Test 1D kernels used in bilinear upsampling

        This method tests the correctness of the
        1D kernel values used in bilinear upsampling
        for some upsampling ratios.

        """
        rat = tensor.iscalar()
        kernel_ten = bilinear_kernel_1D(ratio=rat, normalize=False)
        f_ten = theano.function([rat], kernel_ten)

        kernel_ten_norm = bilinear_kernel_1D(ratio=rat, normalize=True)
        f_ten_norm = theano.function([rat], kernel_ten_norm)

        for ratio in [2, 3, 4, 5, 6, 7, 8, 9]:
            # getting the un-normalized kernel
            kernel = bilinear_kernel_1D(ratio=ratio, normalize=False)
            f = theano.function([], kernel)
            kernel_1D = self.numerical_kernel_1D(ratio)
            utt.assert_allclose(kernel_1D, f())
            utt.assert_allclose(kernel_1D, f_ten(ratio))

            # getting the normalized kernel
            kernel = bilinear_kernel_1D(ratio=ratio, normalize=True)
            f = theano.function([], kernel)
            kernel_1D = kernel_1D / float(ratio)
            utt.assert_allclose(kernel_1D, f())
            utt.assert_allclose(kernel_1D, f_ten_norm(ratio))
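The helper self.numerical_kernel_1D is not shown above; here is a minimal sketch consistent with the assertions, assuming the un-normalized kernel is the symmetric integer triangle of length 2 * ratio - 1 (which is how Theano documents bilinear_kernel_1D):

import numpy as np
import theano

def numerical_kernel_1D(ratio):
    # NumPy reference for bilinear_kernel_1D(ratio, normalize=False):
    # a triangle rising from 1 to `ratio` and back down to 1.
    up = np.arange(1, ratio + 1)
    return np.concatenate([up, up[-2::-1]]).astype(theano.config.floatX)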
Example #6
    def _compile_models(self):
        tn_x, _ = self.datasets[0]
        v_x, _ = self.datasets[1]
        tt_x, _ = self.datasets[2]

        tn_model = theano.function(
            inputs=[self.index],
            outputs=self.cost,
            updates=self.updates,
            givens={
                self.x: tn_x[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            }
        )
        v_model = theano.function(
            inputs=[self.index],
            outputs=self.learner.error,
            givens={
                self.x: v_x[self.index * self.batch_size: (self.index + 1) * self.batch_size],
            }
        )
        tt_model = theano.function(
            inputs=[self.index],
            outputs=self.learner.error,
            givens={
                self.x: tt_x[self.index * self.batch_size: (self.index + 1) * self.batch_size],
            }
        )
        return [tn_model, v_model, tt_model]
Example #7
    def define_train_test_funcs(self):
        activation = self.layers[-1].activation
        self.Y = T.matrix("Y")
        pYs = T.reshape(activation, (self.maskY.shape[0] * self.batch_size, self.out_size))
        tYs = T.reshape(self.Y, (self.maskY.shape[0] * self.batch_size, self.out_size))
        cost = self.categorical_crossentropy(pYs, tYs)
        
        gparams = []
        for param in self.params:
            #gparam = T.grad(cost, param)
            gparam = T.clip(T.grad(cost, param), -10, 10)
            gparams.append(gparam)

        lr = T.scalar("lr")
        # eval() maps the optimizer's name string to the optimizer function
        optimizer = eval(self.optimizer)
        updates = optimizer(self.params, gparams, lr)

        #updates = sgd(self.params, gparams, lr)
        #updates = momentum(self.params, gparams, lr)
        #updates = rmsprop(self.params, gparams, lr)
        #updates = adagrad(self.params, gparams, lr)
        #updates = dadelta(self.params, gparams, lr)
        #updates = adam(self.params, gparams, lr)
        
        self.train = theano.function(inputs = [self.X, self.maskX, self.Y, self.maskY, lr, self.batch_size],
                                               givens = {self.is_train : np.cast['int32'](1)},
                                               outputs = cost,
                                               updates = updates)
        self.predict = theano.function(inputs = [self.X, self.maskX, self.batch_size],
                                                 givens = {self.is_train : np.cast['int32'](0)},
                                                 outputs = activation)
Example #8
    def test_examples_8(self):
        from theano import shared
        # Force the dtype to int64 to work correctly on 32-bit computers.
        # Otherwise it would create an int32 by default on a 32-bit computer.
        state = shared(0)
        inc = T.iscalar('inc')
        accumulator = function([inc], state, updates=[(state, state+inc)])

        assert state.get_value()       == array(0)
        assert accumulator(1)          == array(0)
        assert state.get_value()       == array(1)
        assert accumulator(300)        == array(1)
        assert state.get_value()       == array(301)

        state.set_value(-1)
        assert accumulator(3)          == array(-1)
        assert state.get_value()       == array(2)

        decrementor = function([inc], state, updates=[(state, state-inc)])
        assert decrementor(2)          == array(2)
        assert state.get_value()       == array(0)

        fn_of_state = state * 2 + inc
        # The type of foo must match the shared variable we are replacing
        # with the ``givens``
        foo = T.scalar(dtype=state.dtype)
        skip_shared = function([inc, foo], fn_of_state,
                               givens=[(state, foo)])
        assert skip_shared(1, 3)       == array(7)
        assert state.get_value()       == array(0)
Example #9
    def test_copy_random_state(self):

        class Graph():
            def __init__(self, seed=123):
                self.rng = RandomStreams(seed)
                self.y = self.rng.uniform(size=(1,))

        g1 = Graph(seed=123)
        f1 = theano.function([], g1.y)

        g2 = Graph(seed=987)
        f2 = theano.function([], g2.y)

        #print 'By default, the two functions are out of sync.'
        v1 = f1()
        v2 = f2()

        def copy_random_state(g1, g2):
            if isinstance(g1.rng, MRG_RandomStreams):
                g2.rng.rstate = g1.rng.rstate
            for (su1, su2) in zip(g1.rng.state_updates, g2.rng.state_updates):
                su2[0].set_value(su1[0].get_value())

        #print 'We now copy the state of the theano random number generators.'
        copy_random_state(g1, g2)
        v3 = f1()
        v4 = f2()
        assert numpy.allclose(v1, 0.72803009)
        assert numpy.allclose(v2, 0.55056769)
        assert numpy.allclose(v3, 0.59044123)
        assert numpy.allclose(v4, 0.59044123)
Example #10
def sgd(lr, tparams, grads, x, mask, y, cost):
    """ Stochastic Gradient Descent

    :note: A more complicated version of sgd than strictly needed.  It is
        written this way to mirror the structure of adadelta and rmsprop.

    """
    # New set of shared variables that will contain the gradients
    # for a mini-batch.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # Function that computes gradients for a mini-batch but does not
    # update the weights.
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]

    # Function that updates the weights from the previously computed
    # gradient.
    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update
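Splitting the step into f_grad_shared and f_update lets callers inspect the cost, clip gradients, or adjust the learning rate between the two calls. A hypothetical driver loop (n_epochs, get_minibatches, train_data and lrate are placeholder names, not part of the source):

f_grad_shared, f_update = sgd(lr, tparams, grads, x, mask, y, cost)
for epoch in range(n_epochs):
    for x_mb, mask_mb, y_mb in get_minibatches(train_data):
        mb_cost = f_grad_shared(x_mb, mask_mb, y_mb)  # gradients land in gshared
        f_update(lrate)                               # apply p <- p - lr * g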
Example #11
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    # Bias-correction terms: the moving-average decay rates below are
    # (1. - b1) and (1. - b2).
    fix1 = 1. - (1. - b1)**(i_t)
    fix2 = 1. - (1. - b2)**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
Example #12
def test_dnn_conv_merge():
    # This tests that we correctly merge multiple dnn_conv ops.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc = dnn.GpuDnnConvDesc(
        border_mode='valid')(kern.shape)

    # Test forward op
    o1 = dnn.dnn_conv(img, kern)
    o2 = dnn.dnn_conv(img, kern)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]) == 1

    # Test grad w op
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1

    # Test grad i op
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
Example #13
def test_flatten():
    m = theano.tensor.fmatrix()
    f = theano.function([m], m.flatten(), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]

    f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val)
    assert res.shape == val.shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]

    m = theano.tensor.tensor3()
    f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11, 12).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.reshape(10, -1))
    assert res.shape == val.reshape(10, -1).shape
    assert GpuReshape in [type(node.op) for node in f.maker.fgraph.toposort()]
Example #14
def test_local_gpualloc_memset_0():
    i = theano.tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = gpu_alloc(z, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = gpu_alloc(o, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = gpu_alloc(ones, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
Example #15
def test_pooling_opt():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.fmatrix()

    f = theano.function(
        [x],
        pool_2d(x, ds=(2, 2), mode='average_inc_pad',
                ignore_border=True),
        mode=mode_with_gpu)

    assert any([isinstance(n.op, dnn.GpuDnnPool)
                for n in f.maker.fgraph.toposort()])

    f(numpy.zeros((10, 10), dtype='float32'))

    f = theano.function(
        [x],
        T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
                       ignore_border=True).sum(),
               x),
        mode=mode_with_gpu.including("cudnn"))

    assert any([isinstance(n.op, dnn.GpuDnnPoolGrad)
                for n in f.maker.fgraph.toposort()])

    f(numpy.zeros((10, 10), dtype='float32'))
Example #16
def sdg(lr,params,grads,x,mask,y,cost):
    '''Stochastic gradient descent
    Parameters:
        lr: learning rate
        params: the network parameters
        grads: gradients
        x, y: input data
        cost: the loss
    Returns:
        two theano functions:
        1. f_grad_shared computes the gradients and returns the cost
        2. f_update updates the weights
    '''
    # New shared variables holding the gradients of a mini-batch
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in params.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # Function that computes the mini-batch gradients but does not update the weights
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(params.values(), gshared)]
    # Function that applies the weight updates
    f_update = theano.function([lr], [], updates=pup, name='sgd_f_update')

    return f_grad_shared, f_update
Example #17
def adadelta(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # Parameter update pairs
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
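numpy_floatX is referenced above but not defined in this snippet; a one-line sketch of the usual casting helper it presumably matches (as in the Theano LSTM tutorial):

import numpy
import theano

def numpy_floatX(data):
    # Cast to the configured float width so the shared accumulators
    # get the right dtype (e.g. float32 when running on a GPU).
    return numpy.asarray(data, dtype=theano.config.floatX)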
Example #18
    def build_model(self, data, batch_size=256):
        # create a neural network
        rng = np.random.RandomState(7919)
        t_X = T.matrix(dtype=theano.config.floatX)
        t_y = T.vector(dtype=theano.config.floatX)
        t_learning_rate = T.scalar(dtype=theano.config.floatX)

        layer1 = NeuralNetworkLayer(rng, t_X, 8, 256, T.nnet.relu)
        layer2 = NeuralNetworkLayer(rng, layer1.output, 256, 256, T.nnet.relu)
        layer3 = NeuralNetworkLayer(rng, layer2.output, 256, 1, T.nnet.relu)
        output = T.cast((layer3.output + 0.5), 'int32')

        cost = T.sum((layer3.output - t_y.reshape((batch_size, 1))) ** 2)
        params = layer3.params + layer2.params + layer1.params
        grads = T.grad(cost, params)

        updates = [
            (param_i, param_i - t_learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

        self.train_model = theano.function(
            [ t_learning_rate ],
            cost,
            updates=updates,
            givens={
                t_X: data.data_X,
                t_y: data.data_y,
            }
        )

        # evaluation function
        self.forward = theano.function([ t_X ], output)
Example #19
def build_train_valid(l_out):
    params = nn.layers.get_all_params(l_out, regularizable = True)
    wc_term = 0.5 * sum(T.sum(param ** 2) for param in params)
    
    x_batch = T.tensor4('x', theano.config.floatX)
    y_batch = T.matrix('y', 'int32')
    train_output = nn.layers.get_output(l_out, x_batch)
    train_loss = nn.objectives.binary_crossentropy(train_output, y_batch)
    train_loss = nn.objectives.aggregate(train_loss, mode = 'mean')
    train_loss += wc * wc_term
    params = nn.layers.get_all_params(l_out, trainable = True)

    valid_output = nn.layers.get_output(l_out, x_batch, deterministic = True)

    lr = theano.shared(np.float32(lr_schedule(0)))
    updates = nn.updates.nesterov_momentum(train_loss, params, lr, momentum)

    x_shared = nn.utils.shared_empty(dim = len(input_dims))
    y_shared = nn.utils.shared_empty(dim = 2, dtype = 'int32')
    idx = T.scalar('idx', 'int32')
    givens = {x_batch: x_shared[idx * batch_size:(idx + 1) * batch_size],
              y_batch: y_shared[idx * batch_size:(idx + 1) * batch_size]}

    iter_train = theano.function([idx], [train_loss, train_output],
                                 givens = givens,
                                 updates = updates)
    
    givens = {x_batch: x_shared[idx * batch_size:(idx + 1) * batch_size]}
    iter_valid = theano.function([idx], valid_output, givens = givens)
    
    return x_shared, y_shared, idx, lr, iter_train, iter_valid
Example #20
 def initialise_model(self, X_train, y_train):
     print 'Initialising model...'
     self.input_shape = X_train.shape[1]
     input_var = T.matrix('inputs')
     target_var = T.matrix('targets')
     
     if self.normalise:
         y_train = self.normalise_y(y_train, reset = True)
         X_train = self.normalise_X(X_train, reset = True)
 
     # Create neural network model
     self.network = self.build_custom_mlp(input_var)
     prediction = lasagne.layers.get_output(self.network)
     loss = lasagne.objectives.squared_error(prediction, target_var)
     loss = loss.mean()
     params = lasagne.layers.get_all_params(self.network, trainable=True)
     updates = lasagne.updates.nesterov_momentum(loss, params, 
                                         learning_rate=self.learning_rate, 
                                         momentum=self.momentum)
     test_prediction = lasagne.layers.get_output(self.network,
                                                 deterministic=True)
     test_loss = lasagne.objectives.squared_error(test_prediction,
                                                  target_var)
     test_loss = test_loss.mean()
     self.train_fn = theano.function([input_var, target_var], loss, 
                                updates=updates, allow_input_downcast=True)
     self.predict_output = theano.function([input_var],
                                           outputs=test_prediction,
                                           allow_input_downcast=True)
     self.initialised = True
Example #21
	def __init__(self, embedding_dim=100, num_hidden_layers=2, hidden_dim=200, in_dropout_p=0.2, hidden_dropout_p=0.5, update_hyperparams={'learning_rate': 0.01}):
		self.embedding_dim = embedding_dim
		self.num_hidden_layers = num_hidden_layers
		self.hidden_dim = hidden_dim
		self.in_dropout_p = in_dropout_p
		self.hidden_dropout_p = hidden_dropout_p
	
		print >> sys.stderr, 'Building computation graph for discriminator...'		
		self.input_var = T.matrix('input')
		self.target_var = T.matrix('target')

		self.l_in = lasagne.layers.InputLayer(shape=(None, self.embedding_dim), input_var=T.tanh(self.input_var), name='l_in')
		self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, self.in_dropout_p)
		self.layers = [self.l_in, self.l_in_dr]
		for i in xrange(self.num_hidden_layers):
			l_hid = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=self.hidden_dim, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform(gain=leaky_relu_gain), name=('l_hid_%s' % i)))
			l_hid_dr = lasagne.layers.DropoutLayer(l_hid, self.hidden_dropout_p)
			self.layers.append(l_hid)
			self.layers.append(l_hid_dr)
		self.l_preout = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=1, nonlinearity=None, name='l_preout'))
		self.l_out = lasagne.layers.NonlinearityLayer(self.l_preout, nonlinearity=lasagne.nonlinearities.sigmoid, name='l_out')

		self.prediction = lasagne.layers.get_output(self.l_out)
		self.loss = lasagne.objectives.binary_crossentropy(self.prediction, self.target_var).mean()
		self.accuracy = T.eq(T.ge(self.prediction, 0.5), self.target_var).mean()

		self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
		self.updates = lasagne.updates.adam(self.loss, self.params, **update_hyperparams)

		print >> sys.stderr, 'Compiling discriminator...'
		self.train_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy], updates=self.updates)
		self.eval_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy])
Example #22
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[],
             exclude_params=set([])):
    '''Adadelta'''
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad'%k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rup2'%k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2'%k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
        for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp, [cost]+extra_outs, updates=zgup+rg2up+extra_ups, profile=profile)

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
        for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tools.itemlist(tparams), updir)
        if p.name not in exclude_params]

    if not isinstance(lr, list): lr = [lr]
    f_update = theano.function(lr, [], updates=ru2up+param_up,
        on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #23
    def time_linker(name, linker):
        steps_a = 5
        steps_b = 100
        x = tensor.vector()
        a = build_graph(x,steps_a)
        b = build_graph(x,steps_b)


        f_a = function([x], a,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_a speed test %s'%name,
                )
        f_b = function([x], b,
                mode=Mode(optimizer=None, linker=linker()),
                #profile='f_b speed test %s'%name,
                )

        print f_a([2.0, 3.0])
        t0 = time.time()
        print f_a([2.0, 3.0])
        t1 = time.time()

        print f_b([2.0, 3.0])

        t2 = time.time()
        print f_b([2.0, 3.0])
        t3 = time.time()

        t_a = t1 - t0
        t_b = t3 - t2

        print "%s takes %f s/Kop" % (
                name,
                (1000*(t_b-t_a) / (steps_b - steps_a)))
Example #24
def test_opt_gpujoin_onlyajoin():
    # from a bug in normal sampling
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)

    f()

    graph_nodes = f.maker.fgraph.toposort()

    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))

    # test mixed dtype
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float64')
    b = theano.tensor.constant(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)

    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, theano.tensor.Join)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
Example #25
    def architecture(self, cons, code_layer):
        """Build up the architecture by theano"""
        for i in range(len(self.layers)-1):
            # Initialize shared variables
            init_w = cons*np.random.randn(self.layers[i], self.layers[i+1])
            self.weights.append(th.shared(init_w))
            init_bias = cons*np.random.randn(self.layers[i+1])
            self.biases.append(th.shared(init_bias))

            # Building architecture
            a_before = T.dot(self.a_n[i], self.weights[i]) + \
                self.biases[i].dimshuffle('x', 0)
            a_next = self.activ(a_before)
            self.a_n.append(a_next)

        # Auxiliary shared variables (one per parameter) used by the update rule
        for param in (self.weights+self.biases):
            self.auxiliary.append(th.shared(np.zeros(param.get_value().shape)))

        self.encode = th.function([self.x], self.a_n[code_layer])
        self.decode = th.function([self.a_n[code_layer]], self.a_n[-1])

        # Calculate the cost and gradients
        Cost = (T.sum((self.a_n[-1]-self.y_hat)**2))/self.batch
        params = self.weights + self.biases
        grads = T.grad(Cost, params, disconnected_inputs='ignore')

        # Update parameters
        update_query = self.update(params, grads, self.auxiliary)
        self.gradient_2 = th.function(inputs=[self.x, self.y_hat],
                                      updates=update_query, outputs=Cost)
Example #26
def test_local_assert_no_cpu_op():
    numpy.random.seed(1)
    m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
    ms = cuda.shared_constructor(m, name="m_shared")
    out = theano.tensor.tanh(ms).dot(ms.T)

    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_0")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_1")

    old = config.assert_no_cpu_op
    old2 = config.on_opt_error
    # If the flag is raise
    try:
        config.assert_no_cpu_op = 'raise'
        config.on_opt_error = 'ignore'

        assert_raises(AssertionError, theano.function,
                        [], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
        config.on_opt_error = old2

    # If the flag is ignore
    try:
        config.assert_no_cpu_op = 'ignore'
        theano.function([], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
Example #27
def test_alloc_memset_0():
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
Example #28
    def test_pattern_output(self):
        #print (self.phi.W.get_value(borrow=True))
        assert (self.phi.W.get_value(borrow=True).shape == (self.n,self.d))
        self.phi.W.set_value(np.array([[1, 0]]).T)
        assert (self.phi.W.get_value(borrow=True).shape == (self.n,self.d))

        assert (self.psi.W.get_value(borrow=True).shape == (self.d,self.num_classes))
        self.psi.W.set_value(np.array([[1]]))
        assert (self.psi.W.get_value(borrow=True).shape == (self.d,self.num_classes))

#        assert (self.beta.W.get_value(borrow=True).shape == (self.d, self.m))
#        # [1,1] means that we will project the intermediate representation
#        # onto both dimensions of the output representation
#        self.beta.W.set_value ( np.array([[1,1]]) )
#        assert (self.beta.W.get_value(borrow=True).shape == (self.d, self.m))
        
        test_prediction = lasagne.layers.get_output(self.pattern, deterministic=True)
        test_fn = theano.function([self.input_var], test_prediction)
        X_hat = test_fn(self.X)
        assert ( np.all(X_hat == self.S) )
        
#        self.phi1 = test_prediction
#        self.phi2 = lasagne.layers.get_output(self.pattern, self.side_var, deterministic=True)
        beta_prediction = self.pattern.get_beta_output_for(self.input_var, self.side_var, deterministic=True)
        beta_fn = theano.function([self.input_var, self.side_var], beta_prediction)
        C_hat = beta_fn(self.X, self.CX)
        assert ( np.all(C_hat == self.Cy) )
Example #29
def test_in_transit_circ():
    t = np.linspace(-20, 20, 1000)
    m_planet = np.array([0.3, 0.5])
    m_star = 1.45
    r_star = 1.5
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        ecc=np.array([0.0, 0.0]),
        omega=np.array([0.0, 0.0]),
        m_planet=m_planet,
    )
    orbit_circ = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        m_planet=m_planet,
    )

    r_pl = np.array([0.1, 0.03])
    inds = theano.function([], orbit.in_transit(t, r=r_pl))()
    inds_circ = theano.function([], orbit_circ.in_transit(t, r=r_pl))()
    assert np.all(inds == inds_circ)
Example #30
  def __init__(self, kernel, max_iter = 10, max_diff = None):
    """

    :param kernel: a function with the signature (expected, observed) -> similarity measure
    that accepts symbolic theano expressions and returns one accordingly.
    See `crayimage.hotornot.em.kernels` for examples.
    :param max_iter: maximal number of iterations.
    :param max_diff: stop iterating once the maximal change in weights since the previous
    iteration is smaller than `max_diff`. If None, this check is not performed.
    """
    self.original_shape = None

    self.kernel = kernel
    self.max_iter = max_iter
    self.max_diff = max_diff

    self.X = theano.shared(
      np.zeros(shape=(0, 0), dtype='float32')
    )

    self.weights = theano.shared(
      np.ones(shape=(0, ), dtype='float32')
    )

    canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights)

    weights_updates = self.kernel(canonical, self.X)
    weights_diff = T.max(abs(weights_updates - self.weights))

    upd = {
      self.weights : weights_updates
    }

    self.iteration = theano.function([], weights_diff if max_diff is not None else [], updates=upd)
    self.get_canonical = theano.function([], canonical)
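Here `canonical` is a 1D vector and self.X a 2D (n_samples, n_features) matrix, so a kernel must map (expected, observed) to one weight per observation. A minimal Gaussian-similarity sketch matching that contract (the kernel choice and bandwidth are illustrative assumptions, not taken from crayimage):

import theano.tensor as T

def gaussian_kernel(expected, observed, bandwidth=1.0):
    # expected: 1D canonical vector; observed: 2D (n_samples, n_features).
    # Returns a 1D vector of similarity weights, one per observation.
    sq_dist = T.sum((observed - expected[None, :]) ** 2, axis=1)
    return T.exp(-sq_dist / (2.0 * bandwidth ** 2))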
Example #31
def test_doctorAI(
	modelFile='model.txt',
	seqFile='seq.txt',
	inputDimSize=20000,
	labelFile='label.txt',
	numClass=500,
	timeFile='',
	predictTime=False,
	useLogTime=True,
	hiddenDimSize=[200,200],
	batchSize=100,
	logEps=1e-8,
	mean_duration=20.0,
	verbose=False
):
	options = locals().copy()

	if len(timeFile) > 0: useTime = True
	else: useTime = False
	options['useTime'] = useTime

	models = np.load(modelFile)
	tparams = init_tparams(models)

	print('build model ... ',)
	if predictTime:
		x, t, mask, codePred, timePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code')
		predict_time = theano.function(inputs=[x,t,mask], outputs=timePred, name='predict_time')
	elif useTime:
		x, t, mask, codePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,t,mask], outputs=codePred, name='predict_code')
	else:
		x, mask, codePred = build_model(tparams, options)
		predict_code = theano.function(inputs=[x,mask], outputs=codePred, name='predict_code')

	options['inputDimSize']=models['W_emb'].shape[0]
	options['numClass']=models['b_output'].shape[0]
	print('load data ... ', )
	testSet = load_data(seqFile, labelFile, timeFile)
	n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
	print('done')

	predVec = []
	trueVec = []
	predTimeVec = []
	trueTimeVec = []
	iteration = 0
	for batchIndex in range(n_batches):
		tempX = testSet[0][batchIndex*batchSize: (batchIndex+1)*batchSize]
		tempY = testSet[1][batchIndex*batchSize: (batchIndex+1)*batchSize]
		if predictTime:
			tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize]
			x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
			codeResults = predict_code(x, t, mask)
			timeResults = predict_time(x, t, mask)
		elif useTime:
			tempT = testSet[2][batchIndex*batchSize: (batchIndex+1)*batchSize]
			x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
			codeResults = predict_code(x, t, mask)
		else:
			x, mask, lengths = padMatrixWithoutTime(tempX, options)
			codeResults = predict_code(x, mask)

		for i in range(codeResults.shape[1]):
			tensorMatrix = codeResults[:,i,:]
			thisY = tempY[i][1:]
			for timeIndex in range(lengths[i]):
				if len(thisY[timeIndex]) == 0: continue
				trueVec.append(thisY[timeIndex])
				output = tensorMatrix[timeIndex]
				predVec.append(list(zip(*heapq.nlargest(30, enumerate(output), key=operator.itemgetter(1))))[0])

		if predictTime:
			for i in range(timeResults.shape[1]):
				timeVec = timeResults[:,i]
				trueTimeVec.extend(tempT[i][1:])
				for timeIndex in range(lengths[i]):
					predTimeVec.append(timeVec[timeIndex])

		if (iteration % 10 == 0) and verbose: print('iteration:%d/%d' % (iteration, n_batches))
		iteration += 1
		if iteration == 10: break
			
	recall = recallTop(trueVec, predVec)
	print('recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1], recall[2]))

	if predictTime: 
		r_squared = calculate_r_squared(trueTimeVec, predTimeVec, options)
		print('R2:%f' % r_squared)
Example #32
    l_action_formed = lasagne.layers.ReshapeLayer(input_layer=l_action,
                                        shape=(N_BATCH, N_TIME_STEPS, N_ACTIONS))


    # Cost function is mean squared error
    input = T.tensor3('input')
    target_output = T.tensor3('target_output')

    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    #
    action_prediction = theano.function([input], l_action_formed.get_output(input))


    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4,1)))


        baseline = None
        num_parameters = 4 # four parameters
        epsilon = 3 # initial sigma
        sigma_list = ones(num_parameters) * epsilon
Example #33
    def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
        self.optimizer = optimizers.get(optimizer)

        self.loss = objectives.get(loss)
        weighted_loss = weighted_objective(objectives.get(loss))

        # input of model
        self.X_train = self.get_input(train=True)
        self.X_test = self.get_input(train=False)

        self.y_train = self.get_output(train=True)
        self.y_test = self.get_output(train=False)

        # target of model
        self.y = T.zeros_like(self.y_train)

        self.weights = T.ones_like(self.y_train)

        if hasattr(self.layers[-1], "get_output_mask"):
            mask = self.layers[-1].get_output_mask()
        else:
            mask = None
        train_loss = weighted_loss(self.y, self.y_train, self.weights, mask)
        test_loss = weighted_loss(self.y, self.y_test, self.weights, mask)

        train_loss.name = 'train_loss'
        test_loss.name = 'test_loss'
        self.y.name = 'y'

        if class_mode == "categorical":
            train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_train, axis=-1)))
            test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_test, axis=-1)))

        elif class_mode == "binary":
            train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
            test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
        else:
            raise Exception("Invalid class mode:" + str(class_mode))
        self.class_mode = class_mode
        self.theano_mode = theano_mode

        for r in self.regularizers:
            train_loss = r(train_loss)
        updates = self.optimizer.get_updates(self.trainable_params, self.constraints, train_loss)
        updates += self.updates

        if type(self.X_train) == list:
            train_ins = self.X_train + [self.y, self.weights]
            test_ins = self.X_test + [self.y, self.weights]
            predict_ins = self.X_test
        else:
            train_ins = [self.X_train, self.y, self.weights]
            test_ins = [self.X_test, self.y, self.weights]
            predict_ins = [self.X_test]

        self._train = theano.function(train_ins, train_loss, updates=updates,
                                      allow_input_downcast=True, mode=theano_mode)
        self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy], updates=updates,
                                               allow_input_downcast=True, mode=theano_mode)
        self._predict = theano.function(predict_ins, self.y_test,
                                        allow_input_downcast=True, mode=theano_mode)
        self._test = theano.function(test_ins, test_loss,
                                     allow_input_downcast=True, mode=theano_mode)
        self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy],
                                              allow_input_downcast=True, mode=theano_mode)
Example #34
 def make_grad_func(X):
     Z = theano.tensor.dot(X, W) + b
     H = theano.tensor.nnet.sigmoid(Z)
     cost = H.sum()
     g = gradient.grad(cost, X)
     return theano.function([X, W, b], g, on_unused_input="ignore")
Example #35
 def test_undefined_grad_func(self):
     # tests that function compilation catches undefined grads in the graph
     a = theano.tensor.vector()
     b = theano.gradient.grad_undefined(theano.tensor.add, 0, a)
     with pytest.raises(TypeError):
         theano.function([a], b, on_unused_input="ignore")
Example #36
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, sent_vector_size, dim, mode, answer_module,
                 input_mask_mode, memory_hops, l2, normalize_attention,
                 batch_norm, dropout, dropout_in, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {None: 0}
        self.ivocab = {0: None}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.sent_vector_size = sent_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.dropout_in = dropout_in

        self.max_inp_sent_len = 0
        self.max_q_len = 0
        """
        #To Use All Vocab
        self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0}
        self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'}
        #self.vocab = {'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0}
        #self.ivocab = {1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'}
        #"""

        self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.imatrix('input_var')
        self.q_var = T.ivector('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        self.attentions = []

        self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
        self.pe_matrix_q = self.pe_matrix(self.max_q_len)

        print "==> building input module"

        #positional encoder weights
        self.W_pe = nn_utils.normal_param(std=0.1,
                                          shape=(self.vocab_size, self.dim))

        #biGRU input fusion weights
        self.W_inp_res_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_res_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in,
                                            sequences=self.input_var)

        self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)

        self.inp_c = self.input_module_full(self.inp_sent_reps)

        self.q_q = self.sum_pos_encodings_q(self.q_var)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.memory_hops, self.dim,
                                                4 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1,
                                         shape=(self.memory_hops, 1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0,
                                           shape=(
                                               self.memory_hops,
                                               self.dim,
                                           ))
        self.b_2 = nn_utils.constant_param(value=0.0,
                                           shape=(
                                               self.memory_hops,
                                               1,
                                           ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            self.mem_weight_num = int(iter - 1)
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in[self.mem_weight_num],
                                self.W_mem_res_hid[self.mem_weight_num],
                                self.b_mem_res[self.mem_weight_num],
                                self.W_mem_upd_in[self.mem_weight_num],
                                self.W_mem_upd_hid[self.mem_weight_num],
                                self.b_mem_upd[self.mem_weight_num],
                                self.W_mem_hid_in[self.mem_weight_num],
                                self.W_mem_hid_hid[self.mem_weight_num],
                                self.b_mem_hid[self.mem_weight_num]))

        last_mem_raw = memory[-1].dimshuffle(('x', 0))

        net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # add conditional ending?
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))

            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_pe,
            self.W_inp_res_in_fwd,
            self.W_inp_res_hid_fwd,
            self.b_inp_res_fwd,
            self.W_inp_upd_in_fwd,
            self.W_inp_upd_hid_fwd,
            self.b_inp_upd_fwd,
            self.W_inp_hid_in_fwd,
            self.W_inp_hid_hid_fwd,
            self.b_inp_hid_fwd,
            self.W_inp_res_in_bwd,
            self.W_inp_res_hid_bwd,
            self.b_inp_res_bwd,
            self.W_inp_upd_in_bwd,
            self.W_inp_upd_hid_bwd,
            self.b_inp_upd_bwd,
            self.W_inp_hid_in_bwd,
            self.W_inp_hid_hid_bwd,
            self.b_inp_hid_bwd,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adam(self.loss,
                                       self.params,
                                       learning_rate=0.0001,
                                       beta1=0.5)  #from DCGAN paper

        self.attentions = T.stack(self.attentions)
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss, self.attentions],
                updates=updates,
                on_unused_input='warn',
                allow_input_downcast=True)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var
            ],
            outputs=[self.prediction, self.loss, self.attentions],
            on_unused_input='warn',
            allow_input_downcast=True)
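
# --- Hedged standalone sketch (not part of the class above) ---
# It isolates the multi-hop memory-update pattern used by the episodic
# memory module: one weight slice per hop, each hop refining the memory
# produced by the previous one. All names and the toy update rule below
# are illustrative assumptions, not the class's actual new_episode/GRU step.
import numpy as np
import theano
import theano.tensor as T

dim, hops = 4, 3
rng = np.random.RandomState(0)
W_hop = theano.shared(
    rng.normal(scale=0.1, size=(hops, dim, dim)).astype('float32'))

q = T.vector('q')              # question representation
episode = T.vector('episode')  # stand-in for an attended episode

memory = [q]
for hop in range(hops):
    # per-hop weights, exactly like W_mem_*[self.mem_weight_num] above
    memory.append(T.tanh(T.dot(W_hop[hop], memory[-1]) + episode))

f = theano.function([q, episode], memory[-1], allow_input_downcast=True)
print f(np.ones(4), np.ones(4))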
Example #37
0
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        the list has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
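
# Hedged usage sketch: how the three returned closures are typically
# driven. `model` and `datasets` are assumed to exist (e.g. an SdA/DBN
# instance and the usual (train, valid, test) shared-variable pairs).
train_fn, valid_score, test_score = model.build_finetune_functions(
    datasets=datasets, batch_size=20, learning_rate=0.1)
n_train_batches = datasets[0][0].get_value(borrow=True).shape[0] // 20
for minibatch_index in xrange(n_train_batches):
    train_fn(minibatch_index)
print 'validation error %f %%' % (numpy.mean(valid_score()) * 100.)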
Example #38
0
    def test_machine_translation(self):
        """
        This test case comes from https://github.com/rizar/scan-grad-speed and
        is an example of actual computation done with scan in the context of
        machine translation

        'dim' has been reduced from 1000 to 5 to make the test run faster
        """

        # Parameters from an actual machine translation run
        batch_size = 80
        seq_len = 50
        n_words = 80 * 50
        dim = 5

        # Weight matrices
        U = theano.shared(
            numpy.random.normal(size=(dim, dim),
                                scale=0.0001).astype(config.floatX))
        U.name = 'U'
        V = theano.shared(U.get_value())
        V.name = 'V'
        W = theano.shared(U.get_value())
        W.name = 'W'

        # Variables and their values
        x = T.tensor3('x')
        x_value = numpy.random.normal(size=(seq_len, batch_size, dim),
                                      scale=0.0001).astype(config.floatX)

        ri = T.tensor3('ri')
        ri_value = x_value

        zi = T.tensor3('zi')
        zi_value = x_value

        init = T.alloc(numpy.cast[config.floatX](0), batch_size, dim)

        def rnn_step1(
                # sequences
                x,
                ri,
                zi,
                # outputs_info
                h):
            pre_r = ri + h.dot(U)
            pre_z = zi + h.dot(V)
            r = T.nnet.sigmoid(pre_r)
            z = T.nnet.sigmoid(pre_z)

            after_r = r * h
            pre_h = x + after_r.dot(W)
            new_h = T.tanh(pre_h)

            res_h = z * new_h + (1 - z) * h
            return res_h

        # Compile the function twice, once with the optimization and once
        # without
        opt_mode = mode.including("scan")
        h, _ = theano.scan(rnn_step1,
                           sequences=[x, ri, zi],
                           n_steps=seq_len,
                           outputs_info=init,
                           name='fpass1',
                           mode=opt_mode)
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_opt = theano.function(inputs=[x, ri, zi],
                                outputs=grad1,
                                mode=opt_mode)

        no_opt_mode = mode.excluding("scanOp_pushout_output")
        h, _ = theano.scan(rnn_step1,
                           sequences=[x, ri, zi],
                           n_steps=seq_len,
                           outputs_info=init,
                           name='fpass1',
                           mode=no_opt_mode)
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_no_opt = theano.function(inputs=[x, ri, zi],
                                   outputs=grad1,
                                   mode=no_opt_mode)

        # Validate that the optimization has been applied
        scan_node_grad = [
            node for node in f_opt.maker.fgraph.toposort()
            if isinstance(node.op, Scan)
        ][1]

        for output in scan_node_grad.op.outputs:
            assert not (
                isinstance(output.owner.op, T.elemwise.Elemwise)
                and any([isinstance(i, T.Dot) for i in output.owner.inputs]))

        # Compare the outputs of the two functions on the same input data.
        f_opt_output = f_opt(x_value, ri_value, zi_value)
        f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
        utt.assert_allclose(f_opt_output, f_no_opt_output)
Example #39
0
# updates from ADAM
updates = Adam(cost, params)

###########################################################
###########################################################

############ THEANO FUNC. FOR TRAINING, VAL., ETC.  #######
###########################################################

print '....compiling training and testing functions'

train_model = theano.function([sent, phonemes],
                              outputs=[cost, encoder_cost, cross_entropy_cost],
                              updates=updates,
                              givens={
                                  x: sent[:-1],
                                  ahead: sent[1:],
                                  y: phonemes[:-1]
                              })

probe_model = theano.function([sent, phonemes],
                              outputs=[cost, encoder_cost, cross_entropy_cost],
                              givens={
                                  x: sent[:-1],
                                  ahead: sent[1:],
                                  y: phonemes[:-1]
                              })

validate_model = theano.function(
    inputs=[sent, phonemes],
    outputs=[cost, encoder_cost, cross_entropy_cost],
    # the snippet was truncated here; completed to mirror probe_model
    givens={
        x: sent[:-1],
        ahead: sent[1:],
        y: phonemes[:-1]
    })
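
# Hedged illustration of the givens pattern above: the model consumes the
# sequence shifted by one step, so `ahead` holds the next-step targets.
# Plain numpy, just to show the slicing; `sent_val` is made up.
import numpy as np
sent_val = np.arange(6)
print sent_val[:-1]  # inputs:  [0 1 2 3 4]
print sent_val[1:]   # targets: [1 2 3 4 5]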
Example #40
0
def train_doctorAI(
	seqFile='seqFile.txt',
	inputDimSize=20000,
	labelFile='labelFile.txt',
	numClass=500,
	outFile='outFile.txt',
	timeFile='timeFile.txt',
	predictTime=False,
	tradeoff=1.0,
	useLogTime=True,
	embFile='embFile.txt',
	embSize=200,
	embFineTune=True,
	hiddenDimSize=[200,200],
	batchSize=100,
	max_epochs=10,
	L2_output=0.001,
	L2_time=0.001,
	dropout_rate=0.5,
	logEps=1e-8,
	verbose=False
):
	options = locals().copy()

	if len(timeFile) > 0: useTime = True
	else: useTime = False
	options['useTime'] = useTime
	
	print 'Initializing the parameters ... ',
	params = init_params(options)
	tparams = init_tparams(params, options)

	print 'Building the model ... ',
	f_grad_shared = None
	f_update = None
	if predictTime and embFineTune:
		print 'predicting duration, fine-tuning code representations'
		use_noise, x, y, t, t_label, mask, lengths, cost =  build_model(tparams, options)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label)
	elif predictTime and not embFineTune:
		print 'predicting duration, not fine-tuning code representations'
		W_emb = theano.shared(params['W_emb'], name='W_emb')
		use_noise, x, y, t, t_label, mask, lengths, cost =  build_model(tparams, options, W_emb)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label)
	elif useTime and embFineTune:
		print 'using duration information, fine-tuning code representations'
		use_noise, x, y, t, mask, lengths, cost =  build_model(tparams, options)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t)
	elif useTime and not embFineTune:
		print 'using duration information, not fine-tuning code representations'
		W_emb = theano.shared(params['W_emb'], name='W_emb')
		use_noise, x, y, t, mask, lengths, cost =  build_model(tparams, options, W_emb)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t)
	elif not useTime and embFineTune:
		print 'not using duration information, fine-tuning code representations'
		use_noise, x, y, mask, lengths, cost =  build_model(tparams, options)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options)
	elif not useTime and not embFineTune:
		print 'not using duration information, not fine-tuning code representations'
		W_emb = theano.shared(params['W_emb'], name='W_emb')
		use_noise, x, y, mask, lengths, cost =  build_model(tparams, options, W_emb)
		grads = T.grad(cost, wrt=tparams.values())
		f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options)

	print 'Loading data ... ',
	trainSet, validSet, testSet = load_data(seqFile, labelFile, timeFile)
	n_batches = int(np.ceil(float(len(trainSet[0])) / float(batchSize)))
	print 'done'

	if predictTime: test_model = theano.function(inputs=[x, y, t, t_label, mask, lengths], outputs=cost, name='test_model')
	elif useTime: test_model = theano.function(inputs=[x, y, t, mask, lengths], outputs=cost, name='test_model')
	else: test_model = theano.function(inputs=[x, y, mask, lengths], outputs=cost, name='test_model')

	bestValidCrossEntropy = 1e20
	bestValidEpoch = 0
	testCrossEntropy = 0.0
	print 'Optimization start !!'
	for epoch in xrange(max_epochs):
		iteration = 0
		costVector = []
		for index in random.sample(range(n_batches), n_batches):
			use_noise.set_value(1.)
			batchX = trainSet[0][index*batchSize:(index+1)*batchSize]
			batchY = trainSet[1][index*batchSize:(index+1)*batchSize]
			if predictTime:
				batchT = trainSet[2][index*batchSize:(index+1)*batchSize]
				x, y, t, t_label, mask, lengths = padMatrixWithTimePrediction(batchX, batchY, batchT, options)
				cost = f_grad_shared(x, y, t, t_label, mask, lengths)
			elif useTime:
				batchT = trainSet[2][index*batchSize:(index+1)*batchSize]
				x, y, t, mask, lengths = padMatrixWithTime(batchX, batchY, batchT, options)
				cost = f_grad_shared(x, y, t, mask, lengths)
			else:
				x, y, mask, lengths = padMatrixWithoutTime(batchX, batchY, options)
				cost = f_grad_shared(x, y, mask, lengths)
			costVector.append(cost)
			f_update()
			if (iteration % 10 == 0) and verbose: print 'epoch:%d, iteration:%d/%d, cost:%f' % (epoch, iteration, n_batches, cost)
			iteration += 1

		print 'epoch:%d, mean_cost:%f' % (epoch, np.mean(costVector))
		use_noise.set_value(0.)
		validAuc = calculate_auc(test_model, validSet, options)
		print 'Validation cross entropy:%f at epoch:%d' % (validAuc, epoch)
		if validAuc < bestValidCrossEntropy: 
			bestValidCrossEntropy = validAuc
			bestValidEpoch = epoch
			bestParams = unzip(tparams)
			testCrossEntropy = calculate_auc(test_model, testSet, options)
			print 'Test cross entropy:%f at epoch:%d' % (testCrossEntropy, epoch)
			tempParams = unzip(tparams)
			np.savez_compressed(outFile + '.' + str(epoch), **tempParams)
	print 'The best valid cross entropy:%f at epoch:%d' % (bestValidCrossEntropy, bestValidEpoch)
	print 'The test cross entropy: %f' % testCrossEntropy
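
# Hedged usage sketch; every file name below is a placeholder, and the
# keyword values simply echo the defaults declared above.
if __name__ == '__main__':
    train_doctorAI(seqFile='seqs.pkl', labelFile='labels.pkl', timeFile='',
                   inputDimSize=20000, numClass=500, outFile='model',
                   hiddenDimSize=[200, 200], max_epochs=10, verbose=True)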
Example #41
0
def evaluate_lenet5(
    learning_rate=0.1,
    n_epochs=200,
    dataset="mnist.pkl.gz",
    nkerns=[20, 50],
    batch_size=500,
):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training/testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # display some chars:
    display_some(train_set_x, train_set_y.eval(), n=5, title="label=")

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print("... building the model")

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2),
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2),
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        inputs=[index],
        outputs=[cost, layer3.errors(y)],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
        },
    )

    ###############
    # TRAIN MODEL #
    ###############
    print("... training")
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # for error_curve plot
    cost_train = []  # observe likelihood cost while training
    err_train = []  # observe train err while training
    err_valid = []  # observe valid err while training
    err_test = []  # observe test  err while training

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print("training @ iter = ", iter)
            train_outputs = train_model(minibatch_index)
            cost_ij = train_outputs[0]
            err_train.append(train_outputs[1])  # add error_train

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                err_valid.append(this_validation_loss)

                print("epoch %i, minibatch %i/%i, validation error %f %%" % (
                    epoch,
                    minibatch_index + 1,
                    n_train_batches,
                    this_validation_loss * 100.0,
                ))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)[0] for i in range(n_test_batches)
                    ]

                    test_score = numpy.mean(test_losses)

                    err_test.append(test_score)

                    print(("     epoch %i, minibatch %i/%i, test error of "
                           "best model %f %%") % (
                               epoch,
                               minibatch_index + 1,
                               n_train_batches,
                               test_score * 100.0,
                           ))
                    """
                    # save the best model
                    with open('../doc/data/best_model.pkl', 'wb') as f:
                        pickle.dump(layer0, layer1, layer2, layer3, f)
                    """

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print("Optimization complete.")
    print("Best validation score of %f %% obtained at iteration %i, "
          "with test performance %f %%" %
          (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0))
    print(
        ("The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" %
         ((end_time - start_time) / 60.0)),
        file=sys.stderr,
    )

    model = [layer0, layer1, layer2, layer3]
    # save the best model
    with open("../doc/data/best_model.pkl", "wb") as f:
        pickle.dump(model, f)

    test_pred_y = test_model(0)[1]  # predict on first batch_size samples

    # display some chars using predict
    display_some(test_set_x, test_pred_y, n=5, title="pred=")  # n < batch_size
    return err_train, err_valid, err_test
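
# Hedged follow-up sketch: the function returns the three error curves,
# which can be plotted directly (assumes matplotlib is installed).
import matplotlib.pyplot as plt

err_train, err_valid, err_test = evaluate_lenet5(n_epochs=5)
plt.plot(err_valid, label='validation error')
plt.plot(err_test, label='test error')
plt.xlabel('evaluation step')
plt.legend()
plt.show()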
Example #42
0
    def _run(self, num_features, num_timesteps, batch_size, mode):
        # determine shapes of inputs and targets depending on the batch size
        if batch_size == 1:
            inputs_size = (num_timesteps, num_features)
            targets_size = (num_timesteps, 1)
        else:
            inputs_size = (num_timesteps, batch_size, num_features)
            targets_size = (num_timesteps, batch_size, 1)

        # make inputs and targets shared variables
        inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
            config.floatX),
                               borrow=True)
        targets = theano.shared(self.rng.uniform(size=targets_size).astype(
            config.floatX),
                                borrow=True)

        # create symbolic inputs and targets variables
        if batch_size == 1:
            x = T.matrix('inputs')
            t = T.matrix('targets')
        else:
            x = T.tensor3('inputs')
            t = T.tensor3('targets')
        x.tag.test_value = inputs.get_value(borrow=True)
        t.tag.test_value = targets.get_value(borrow=True)

        # create a set of parameters for a simple RNN
        W_xh = theano.shared(
            (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
                config.floatX),
            borrow=True)
        W_hh = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
            borrow=True)
        W_hy = theano.shared(
            (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
            borrow=True)
        b_h = theano.shared(numpy.zeros(10).astype(config.floatX), borrow=True)
        b_y = theano.shared(numpy.zeros(1).astype(config.floatX), borrow=True)

        params = [W_xh, W_hh, W_hy, b_h, b_y]

        # recurrent function
        def step(x_t, h_tm1):
            h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
            return h

        # build recurrent graph
        if batch_size == 1:
            h_0 = T.alloc(0.0, 10).astype(config.floatX)
        else:
            h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
        h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])
        # network output
        y = T.dot(h, W_hy) + b_y

        # Create Gauss-Newton-Matrix object. Not really of any use here, but I
        # need it for Hessian-Free optimization.
        gn = GaussNewtonMatrix(y)

        # compute MSE
        cost = ((t - y)**2).sum(axis=1).mean()

        # Compute the cost at some other point in the parameter
        # space. Not really of any use here, but this is how I do it
        # during certain iterations of CG in the HF algorithm. There,
        # it's in fact `pi + current update proposal`.  For simplicity,
        # I just multiply by 2 here.
        cost_ = theano.clone(cost,
                             replace=dict([(pi, 2 * pi) for pi in params]))

        # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
        # but for simplicity, I just take the parameters vector because it's
        # already there.
        Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

        # compile Theano function
        f = theano.function([], [cost_] + Gv,
                            givens={
                                x: inputs,
                                t: targets
                            },
                            mode=mode)
        # execute
        f()
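
# Hedged standalone sketch of the underlying primitive: a Hessian-vector
# product via Theano's R-operator, the same machinery a Gauss-Newton
# matrix-vector product builds on. Everything below is illustrative;
# it is not the GaussNewtonMatrix class used above.
import numpy
import theano
import theano.tensor as T
from theano import config

w = theano.shared(numpy.ones(3, dtype=config.floatX), name='w')
cost = ((w ** 2).sum()) ** 2     # simple scalar cost
g = T.grad(cost, w)              # gradient of the cost
v = T.vector('v')                # direction to multiply by
Hv = T.Rop(g, w, v)              # H.dot(v) without ever forming H
f_Hv = theano.function([v], Hv, allow_input_downcast=True)
print f_Hv(numpy.ones(3))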
Example #43
0
params = layers.get_all_params(NET, trainable=True)

# The dynamic learning rate is applied during the training process
lr_dynamic = T.scalar(name='learning_rate')

# The adam update method is used to update the params based on the loss function & the learning rate
param_updates = updates.adam(loss, params, learning_rate=lr_dynamic)

#################### TRAIN FUNCTION ######################
# The theano train functions takes images and class targets as input
# It updates the parameters of the net and returns the current loss as float value

# Compiling theano functions
#print "COMPILING THEANO TRAIN FUNCTION...",
train_net = theano.function(
    [layers.get_all_layers(NET)[0].input_var, targets, lr_dynamic],
    loss,
    updates=param_updates)

################# PREDICTION FUNCTION ####################
# The prediction function is used to calculate the validation accuracy
# First the CNN's output is retrieved
net_output = layers.get_output(NET)

# Compiling theano test function
print "COMPILING THEANO TEST FUNCTION...",
test_net = theano.function([layers.get_all_layers(NET)[0].input_var, targets],
                           [net_output, loss, accuracy])

##################### STAT PLOT #########################
plt.ion()
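
# Hedged usage sketch of the dynamic learning rate defined above: decay
# it per epoch and feed it as the third input to train_net. `batches`
# is an assumed iterator of (images, targets) minibatches.
lr = 0.001
for epoch in range(10):
    for images, labels in batches:
        loss_value = train_net(images, labels, lr)
    lr *= 0.5  # halve the learning rate each epoch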
Example #44
0
def main():
    # step 1: load the data, transform as needed
    train = loadmat('../large_files/train_32x32.mat')
    test = loadmat('../large_files/test_32x32.mat')

    # Need to scale! don't leave as 0..255
    # Y is a N x 1 matrix with values 1..10 (MATLAB indexes by 1)
    # So flatten it and make it 0..9
    # Also need indicator matrix for cost calculation
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    max_iter = 8
    print_period = 10

    lr = np.float32(0.00001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)

    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz  # integer number of minibatches

    M = 500
    K = 10
    poolsz = (2, 2)

    # after conv will be of dimension 32 - 5 + 1 = 28
    # after downsample 28 / 2 = 14
    W1_shape = (
        20, 3, 5, 5
    )  # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[0],
                       dtype=np.float32)  # one bias per output feature map

    # after conv will be of dimension 14 - 5 + 1 = 10
    # after downsample 10 / 2 = 5
    W2_shape = (
        50, 20, 5, 5
    )  # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    W3_init = np.random.randn(W2_shape[0] * 5 * 5,
                              M) / np.sqrt(W2_shape[0] * 5 * 5 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # step 2: define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')

    # momentum changes
    dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32), 'dW1')
    db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32), 'db1')
    dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32), 'dW2')
    db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32), 'db2')
    dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32), 'dW3')
    db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32), 'db3')
    dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32), 'dW4')
    db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32), 'db4')

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # define the cost function and prediction
    params = (W1, b1, W2, b2, W3, b3, W4, b4)
    reg_cost = reg * sum((param * param).sum() for param in params)
    cost = -(Y * T.log(pY)).sum() + reg_cost
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    update_W1 = W1 + mu * dW1 - lr * T.grad(cost, W1)
    update_b1 = b1 + mu * db1 - lr * T.grad(cost, b1)
    update_W2 = W2 + mu * dW2 - lr * T.grad(cost, W2)
    update_b2 = b2 + mu * db2 - lr * T.grad(cost, b2)
    update_W3 = W3 + mu * dW3 - lr * T.grad(cost, W3)
    update_b3 = b3 + mu * db3 - lr * T.grad(cost, b3)
    update_W4 = W4 + mu * dW4 - lr * T.grad(cost, W4)
    update_b4 = b4 + mu * db4 - lr * T.grad(cost, b4)

    # update weight changes
    update_dW1 = mu * dW1 - lr * T.grad(cost, W1)
    update_db1 = mu * db1 - lr * T.grad(cost, b1)
    update_dW2 = mu * dW2 - lr * T.grad(cost, W2)
    update_db2 = mu * db2 - lr * T.grad(cost, b2)
    update_dW3 = mu * dW3 - lr * T.grad(cost, W3)
    update_db3 = mu * db3 - lr * T.grad(cost, b3)
    update_dW4 = mu * dW4 - lr * T.grad(cost, W4)
    update_db4 = mu * db4 - lr * T.grad(cost, b4)

    train = theano.function(
        inputs=[X, Y],
        updates=[
            (W1, update_W1),
            (b1, update_b1),
            (W2, update_W2),
            (b2, update_b2),
            (W3, update_W3),
            (b3, update_b3),
            (W4, update_W4),
            (b4, update_b4),
            (dW1, update_dW1),
            (db1, update_db1),
            (dW2, update_dW2),
            (db2, update_db2),
            (dW3, update_dW3),
            (db3, update_db3),
            (dW4, update_dW4),
            (db4, update_db4),
        ],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    LL = []
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print "Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (
                    i, j, cost_val, err)
                LL.append(cost_val)
    print "Elapsed time:", (datetime.now() - t0)
    plt.plot(LL)
    plt.show()
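
# Hedged compact alternative: the sixteen manual update pairs above can
# be generated in a loop; `step` reproduces the same classical-momentum
# arithmetic (p_new = p + mu*dp - lr*grad, dp_new = mu*dp - lr*grad).
import numpy as np
import theano
import theano.tensor as T

def momentum_updates(cost, params, lr, mu):
    updates = []
    for p in params:
        dp = theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        step = mu * dp - lr * T.grad(cost, p)
        updates.append((dp, step))     # velocity update
        updates.append((p, p + step))  # parameter update
    return updates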
Example #45
0
w3, b3 = init_weights_bias2((num_filters2 * 3 * 3, 100), X.dtype)
w4, b4 = init_weights_bias2((100, 10), X.dtype)

y1, o1, y2, o2, py_x = model(X, w1, b1, w2, b2, w3, b3, w4, b4)

y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w1, b1, w2, b2, w3, b3, w4, b4]

updates = sgd(cost, params, learningrate, decayparameter)
updates2 = sgd_momentum(cost, params, learningrate, decayparameter, momentum)
updates3 = RMSprop(cost, params, learningrateRMS, decayparameterRMS, p, ebs)

train = theano.function(inputs=[X, Y],
                        outputs=cost,
                        updates=updates,
                        allow_input_downcast=True)
train2 = theano.function(inputs=[X, Y],
                         outputs=cost,
                         updates=updates2,
                         allow_input_downcast=True)
train3 = theano.function(inputs=[X, Y],
                         outputs=cost,
                         updates=updates3,
                         allow_input_downcast=True)

predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
test = theano.function(inputs=[X],
                       outputs=[y1, o1, y2, o2],
                       allow_input_downcast=True)
Example #46
0
def train(data,
          layers,
          negative_importance,
          negative_threshold,
          entropy_importance,
          updates_function,
          batch_size=10,
          epoch_size=200,
          initial_patience=1000,
          improvement_threshold=0.99,
          patience_increase=5,
          max_iter=100000):
    '''
    Utility function for training a siamese net for (potentially cross-modal)
    embedding of sequences.
    Assumes data['X']['train'][n] should be mapped close to
    data['Y']['train'][m] only when n == m
    The networks for hashing sequences from each modality should be given in
    the ``layers`` dictionary (see below).

    Parameters
    ----------
    data : dict of dict of list of np.ndarray
        Dict with keys ``'X'`` and ``'Y'``, corresponding to each modality,
        with each key mapping to a dict with keys ``'train'`` and
        ``'validate'``, each of which containing a list of np.ndarrays of shape
        ``(n_filters, n_time_steps, n_features)``.
    layers : dict of list of lasagne.layers.Layer
        This should be a dict with two keys, ``'X'`` and ``'Y'``, with each key
        mapping to a list of ``lasagne.layers.Layer`` instance corresponding to
        the layers in each network.  The only constraints are that the input
        shape should match the shape produced by ``sample_sequences`` when it's
        called with the provided data arrays (``data['X']['train']``, etc.) and
        that the output dimensionality of both networks should be the same.
    negative_importance : float
        Scaling parameter for cross-modality negative example cost
    negative_threshold : int
        Cross-modality negative example threshold
    entropy_importance : float
        Scaling parameter for hash entropy encouraging term
    updates_function : function
        Function for computing updates, probably from ``lasagne.updates``.
        Should take two arguments, a Theano tensor variable and a list of
        shared variables, and should return a dictionary of updates for those
        parameters (all other arguments, such as learning rate, should be
        factored out).
    batch_size : int
        Mini-batch size
    epoch_size : int
        Number of mini-batches per epoch
    initial_patience : int
        Always train on at least this many batches
    improvement_threshold : float
        Validation cost must decrease by this factor to increase patience
    patience_increase : int
        How many more epochs should we wait when we increase patience
    max_iter : int
        Maximum number of batches to train on

    Returns
    -------
    epoch : iterator
        Results for each epoch are yielded
    '''
    # First neural net, for X modality
    X_p_input = T.tensor4('X_p_input')
    X_n_input = T.tensor4('X_n_input')
    # For eval
    X_input = T.tensor4('X_input')
    # Second neural net, for Y modality
    Y_p_input = T.tensor4('Y_p_input')
    Y_n_input = T.tensor4('Y_n_input')
    Y_input = T.tensor4('Y_input')

    # Compute mean(max(0, m - ||a - b||_2)^2)
    def hinge_cost(m, a, b):
        dist = m - T.sqrt(T.sum((a - b)**2, axis=1))
        return T.mean((dist * (dist > 0))**2)

    def hasher_cost(deterministic):
        X_p_output = lasagne.layers.get_output(layers['X'][-1],
                                               X_p_input,
                                               deterministic=deterministic)
        X_n_output = lasagne.layers.get_output(layers['X'][-1],
                                               X_n_input,
                                               deterministic=deterministic)
        Y_p_output = lasagne.layers.get_output(layers['Y'][-1],
                                               Y_p_input,
                                               deterministic=deterministic)
        Y_n_output = lasagne.layers.get_output(layers['Y'][-1],
                                               Y_n_input,
                                               deterministic=deterministic)

        # Unthresholded, unscaled cost of positive examples across modalities
        cost_p = T.mean(T.sum((X_p_output - Y_p_output)**2, axis=1))
        # Thresholded, scaled cost of cross-modality negative examples
        cost_n = negative_importance * hinge_cost(negative_threshold,
                                                  X_n_output, Y_n_output)
        # Cost to encourage each output unit to vary
        cost_e = entropy_importance * (T.mean(X_p_output**2) +
                                       T.mean(Y_p_output**2))
        # Sum positive and negative costs for overall cost
        cost = cost_p + cost_n + cost_e
        return cost

    # Combine all parameters from both networks
    params = (lasagne.layers.get_all_params(layers['X'][-1], trainable=True) +
              lasagne.layers.get_all_params(layers['Y'][-1], trainable=True))
    # Compute gradient descent updates
    updates = updates_function(hasher_cost(False), params)
    # Function for training the network
    train = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                            hasher_cost(False),
                            updates=updates)

    # Compute cost without training
    cost = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                           hasher_cost(True))

    # Start with infinite validate cost; we will always increase patience once
    current_validate_cost = np.inf
    patience = initial_patience

    # Functions for computing the neural net output on the train and val sets
    X_output = theano.function([X_input],
                               lasagne.layers.get_output(layers['X'][-1],
                                                         X_input,
                                                         deterministic=True))
    Y_output = theano.function([Y_input],
                               lasagne.layers.get_output(layers['Y'][-1],
                                                         Y_input,
                                                         deterministic=True))

    # Create sampled sequences for validation
    X_validate = utils.sample_sequences(data['X']['validate'], batch_size)
    Y_validate = utils.sample_sequences(data['Y']['validate'], batch_size)
    # Create fixed negative example validation set
    X_validate_shuffle = np.random.permutation(len(data['X']['validate']))
    Y_validate_shuffle = X_validate_shuffle[utils.random_derangement(
        len(data['Y']['validate']))]
    X_validate_n = utils.sample_sequences(
        [data['X']['validate'][n] for n in X_validate_shuffle], batch_size)
    Y_validate_n = utils.sample_sequences(
        [data['Y']['validate'][n] for n in Y_validate_shuffle], batch_size)
    # Create iterator to sample sequences from training data
    data_iterator = utils.get_next_batch(data['X']['train'],
                                         data['Y']['train'], batch_size,
                                         max_iter)
    # We will accumulate the mean train cost over each epoch
    train_cost = 0

    for n, (X_p, Y_p, X_n, Y_n) in enumerate(data_iterator):
        # Occasionally Theano was raising a MemoryError; catching it lets
        # training fail gracefully
        try:
            train_cost += train(X_p, X_n, Y_p, Y_n)
        except MemoryError as e:
            print "MemoryError: {}".format(e)
            return
        # Stop training if a NaN is encountered
        if not np.isfinite(train_cost):
            print 'Bad training cost {} at iteration {}'.format(train_cost, n)
            break
        # Validate the net after each epoch
        if n and (not n % epoch_size):
            epoch_result = collections.OrderedDict()
            epoch_result['iteration'] = n
            # Compute average training cost over the epoch
            epoch_result['train_cost'] = train_cost / float(epoch_size)
            # Reset training cost mean accumulation
            train_cost = 0

            # We need to accumulate the validation cost and network output over
            # batches to avoid MemoryErrors
            epoch_result['validate_cost'] = 0
            validate_batches = 0
            X_val_output = []
            Y_val_output = []
            for batch_idx in range(len(X_validate)):
                # Compute and accumulate cost
                epoch_result['validate_cost'] += cost(X_validate[batch_idx],
                                                      X_validate_n[batch_idx],
                                                      Y_validate[batch_idx],
                                                      Y_validate_n[batch_idx])
                # Keep track of # of batches for normalization
                validate_batches += 1
                # Compute network output and accumulate result
                X_val_output.append(X_output(X_validate[batch_idx]))
                Y_val_output.append(Y_output(Y_validate[batch_idx]))
            # Normalize cost by number of batches and store
            epoch_result['validate_cost'] /= float(validate_batches)
            # Concatenate per-batch output to tensors
            X_val_output = np.concatenate(X_val_output, axis=0)
            Y_val_output = np.concatenate(Y_val_output, axis=0)
            # Compute in-class and out-of-class distances
            in_dists = np.mean((X_val_output - Y_val_output)**2, axis=1)
            out_dists = np.mean((X_val_output[X_validate_shuffle] -
                                 Y_val_output[Y_validate_shuffle])**2,
                                axis=1)
            # Objective is the Bhattacharyya coefficient of the in-class and
            # out-of-class distance distributions
            epoch_result['validate_objective'] = utils.bhatt_coeff(
                in_dists, out_dists)

            # Test whether this validate cost is the new smallest
            if epoch_result['validate_cost'] < current_validate_cost:
                # To update patience, we must be smaller than
                # improvement_threshold*(previous lowest validation cost)
                patience_cost = improvement_threshold * current_validate_cost
                if epoch_result['validate_cost'] < patience_cost:
                    # Increase patience by the supplied amount
                    patience += epoch_size * patience_increase
                # Even if we didn't increase patience, update lowest valid cost
                current_validate_cost = epoch_result['validate_cost']
            # Store patience after this epoch
            epoch_result['patience'] = patience

            yield epoch_result

            if n > patience:
                break

    return
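
# The validation objective above relies on utils.bhatt_coeff to measure the
# overlap between the in-class and out-of-class distance distributions. Below
# is a minimal NumPy sketch of one common definition (the Bhattacharyya
# coefficient over a shared histogram); the actual utils implementation may
# differ.
import numpy as np

def bhatt_coeff_sketch(x, y, bins=20):
    # Bin both samples over a common range so the histograms are comparable
    _, edges = np.histogram(np.concatenate([x, y]), bins=bins)
    p, _ = np.histogram(x, bins=edges)
    q, _ = np.histogram(y, bins=edges)
    # Normalize counts to discrete probability distributions
    p = p / float(p.sum())
    q = q / float(q.sum())
    # Overlap of the two distributions, in [0, 1]
    return np.sum(np.sqrt(p * q))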
Example #47
0
def run(only_forward=False):
    logger = afs_safe_logger.Logger(
        os.path.join(FLAGS.log_path, FLAGS.experiment_name) + ".log")

    if FLAGS.data_type == "bl":
        data_manager = load_boolean_data
    elif FLAGS.data_type == "sst":
        data_manager = load_sst_data
    elif FLAGS.data_type == "snli":
        data_manager = load_snli_data
    else:
        logger.Log("Bad data type.")
        return

    pp = pprint.PrettyPrinter(indent=4)
    logger.Log("Flag values:\n" + pp.pformat(FLAGS.FlagValuesDict()))

    # Load the data.
    raw_training_data, vocabulary = data_manager.load_data(
        FLAGS.training_data_path)

    # Load the eval data.
    raw_eval_sets = []
    if FLAGS.eval_data_path:
        for eval_filename in FLAGS.eval_data_path.split(":"):
            eval_data, _ = data_manager.load_data(eval_filename)
            raw_eval_sets.append((eval_filename, eval_data))

    # Prepare the vocabulary.
    if not vocabulary:
        logger.Log(
            "In open vocabulary mode. Using loaded embeddings without fine-tuning."
        )
        train_embeddings = False
        vocabulary = util.BuildVocabulary(
            raw_training_data,
            raw_eval_sets,
            FLAGS.embedding_data_path,
            logger=logger,
            sentence_pair_data=data_manager.SENTENCE_PAIR_DATA)
    else:
        logger.Log("In fixed vocabulary mode. Training embeddings.")
        train_embeddings = True

    # Load pretrained embeddings.
    if FLAGS.embedding_data_path:
        logger.Log("Loading vocabulary with " + str(len(vocabulary)) +
                   " words from " + FLAGS.embedding_data_path)
        initial_embeddings = util.LoadEmbeddingsFromASCII(
            vocabulary, FLAGS.word_embedding_dim, FLAGS.embedding_data_path)
    else:
        initial_embeddings = None

    # Trim dataset, convert token sequences to integer sequences, crop, and
    # pad.
    logger.Log("Preprocessing training data.")
    training_data = util.PreprocessDataset(
        raw_training_data,
        vocabulary,
        FLAGS.seq_length,
        data_manager,
        eval_mode=False,
        logger=logger,
        sentence_pair_data=data_manager.SENTENCE_PAIR_DATA,
        for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW")
    training_data_iter = util.MakeTrainingIterator(training_data,
                                                   FLAGS.batch_size)

    eval_iterators = []
    for filename, raw_eval_set in raw_eval_sets:
        logger.Log("Preprocessing eval data: " + filename)
        e_X, e_transitions, e_y, e_num_transitions = util.PreprocessDataset(
            raw_eval_set,
            vocabulary,
            FLAGS.seq_length,
            data_manager,
            eval_mode=True,
            logger=logger,
            sentence_pair_data=data_manager.SENTENCE_PAIR_DATA,
            for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW")
        eval_iterators.append(
            (filename,
             util.MakeEvalIterator(
                 (e_X, e_transitions, e_y, e_num_transitions),
                 FLAGS.batch_size)))

    # Set up the placeholders.

    y = T.vector("y", dtype="int32")
    lr = T.scalar("lr")
    training_mode = T.scalar(
        "training_mode")  # 1: Training with dropout, 0: Eval
    ground_truth_transitions_visible = T.scalar(
        "ground_truth_transitions_visible", dtype="int32")

    logger.Log("Building model.")
    vs = util.VariableStore(default_initializer=util.UniformInitializer(
        FLAGS.init_range),
                            logger=logger)

    if FLAGS.model_type == "CBOW":
        model_cls = spinn.cbow.CBOW
    elif FLAGS.model_type == "RNN":
        model_cls = spinn.plain_rnn.RNN
    else:
        model_cls = getattr(spinn.fat_stack, FLAGS.model_type)

    # Generator of mask for scheduled sampling
    numpy_random = np.random.RandomState(1234)
    ss_mask_gen = T.shared_randomstreams.RandomStreams(
        numpy_random.randint(999999))

    # Training step number
    ss_prob = T.scalar("ss_prob")

    if data_manager.SENTENCE_PAIR_DATA:
        X = T.itensor3("X")
        transitions = T.itensor3("transitions")
        num_transitions = T.imatrix("num_transitions")

        predicted_premise_transitions, predicted_hypothesis_transitions, logits = build_sentence_pair_model(
            model_cls,
            len(vocabulary),
            FLAGS.seq_length,
            X,
            transitions,
            len(data_manager.LABEL_MAP),
            training_mode,
            ground_truth_transitions_visible,
            vs,
            initial_embeddings=initial_embeddings,
            project_embeddings=(not train_embeddings),
            ss_mask_gen=ss_mask_gen,
            ss_prob=ss_prob)
    else:
        X = T.matrix("X", dtype="int32")
        transitions = T.imatrix("transitions")
        num_transitions = T.vector("num_transitions", dtype="int32")

        predicted_transitions, logits = build_sentence_model(
            model_cls,
            len(vocabulary),
            FLAGS.seq_length,
            X,
            transitions,
            len(data_manager.LABEL_MAP),
            training_mode,
            ground_truth_transitions_visible,
            vs,
            initial_embeddings=initial_embeddings,
            project_embeddings=(not train_embeddings),
            ss_mask_gen=ss_mask_gen,
            ss_prob=ss_prob)

    xent_cost, acc = build_cost(logits, y)

    # Set up L2 regularization.
    l2_cost = 0.0
    for var in vs.trainable_vars:
        l2_cost += FLAGS.l2_lambda * T.sum(T.sqr(vs.vars[var]))

    # Compute cross-entropy cost on action predictions.
    if (not data_manager.SENTENCE_PAIR_DATA) and FLAGS.model_type not in [
            "Model0", "RNN", "CBOW"
    ]:
        transition_cost, action_acc = build_transition_cost(
            predicted_transitions, transitions, num_transitions)
    elif data_manager.SENTENCE_PAIR_DATA and FLAGS.model_type not in [
            "Model0", "RNN", "CBOW"
    ]:
        p_transition_cost, p_action_acc = build_transition_cost(
            predicted_premise_transitions, transitions[:, :, 0],
            num_transitions[:, 0])
        h_transition_cost, h_action_acc = build_transition_cost(
            predicted_hypothesis_transitions, transitions[:, :, 1],
            num_transitions[:, 1])
        transition_cost = p_transition_cost + h_transition_cost
        action_acc = (p_action_acc + h_action_acc
                      ) / 2.0  # TODO(SB): Average over transitions, not words.
    else:
        transition_cost = T.constant(0.0)
        action_acc = T.constant(0.0)
    transition_cost = transition_cost * FLAGS.transition_cost_scale

    total_cost = xent_cost + l2_cost + transition_cost

    if ".ckpt" in FLAGS.ckpt_path:
        checkpoint_path = FLAGS.ckpt_path
    else:
        checkpoint_path = os.path.join(FLAGS.ckpt_path,
                                       FLAGS.experiment_name + ".ckpt")
    if os.path.isfile(checkpoint_path):
        logger.Log("Found checkpoint, restoring.")
        step, best_dev_error = vs.load_checkpoint(
            checkpoint_path,
            num_extra_vars=2,
            skip_saved_unsavables=FLAGS.skip_saved_unsavables)
    else:
        assert not only_forward, "Can't run an eval-only run without a checkpoint. Supply a checkpoint."
        step = 0
        best_dev_error = 1.0

    # Do an evaluation-only run.
    if only_forward:
        if FLAGS.eval_output_paths:
            eval_output_paths = FLAGS.eval_output_paths.strip().split(":")
            assert len(eval_output_paths) == len(
                eval_iterators), "Invalid no. of output paths."
        else:
            eval_output_paths = [
                FLAGS.experiment_name + "-" + os.path.split(eval_set[0])[1] +
                "-parse" for eval_set in eval_iterators
            ]

        # Load model from checkpoint.
        logger.Log("Checkpointed model was trained for %d steps." % (step, ))

        # Generate function for forward pass.
        logger.Log("Building forward pass.")
        if data_manager.SENTENCE_PAIR_DATA:
            eval_fn = theano.function([
                X, transitions, y, num_transitions, training_mode,
                ground_truth_transitions_visible, ss_prob
            ], [
                acc, action_acc, logits, predicted_hypothesis_transitions,
                predicted_premise_transitions
            ],
                                      on_unused_input='ignore',
                                      allow_input_downcast=True)
        else:
            eval_fn = theano.function([
                X, transitions, y, num_transitions, training_mode,
                ground_truth_transitions_visible, ss_prob
            ], [acc, action_acc, logits, predicted_transitions],
                                      on_unused_input='ignore',
                                      allow_input_downcast=True)

        # Generate the inverse vocabulary lookup table.
        ind_to_word = {v: k for k, v in vocabulary.iteritems()}

        # Do a forward pass and write the output to disk.
        for eval_set, eval_out_path in zip(eval_iterators, eval_output_paths):
            logger.Log("Writing eval output for %s." % (eval_set[0], ))
            evaluate_expanded(
                eval_fn, eval_set, eval_out_path, logger, step,
                data_manager.SENTENCE_PAIR_DATA, ind_to_word, FLAGS.model_type
                not in ["Model0", "RNN", "CBOW"])
    else:
        # Train

        new_values = util.RMSprop(total_cost, vs.trainable_vars.values(), lr)
        new_values += [(key, vs.nongradient_updates[key])
                       for key in vs.nongradient_updates]
        # Training open-vocabulary embeddings is a questionable idea right now. Disabled:
        # new_values.append(
        #     util.embedding_SGD(total_cost, embedding_params, embedding_lr))

        # Create training and eval functions.
        # Unused-variable warnings are suppressed so that num_transitions can
        # be passed in when training Model 0, which ignores it. This yields
        # more readable code that is very slightly slower.
        logger.Log("Building update function.")
        update_fn = theano.function([
            X, transitions, y, num_transitions, lr, training_mode,
            ground_truth_transitions_visible, ss_prob
        ], [total_cost, xent_cost, transition_cost, action_acc, l2_cost, acc],
                                    updates=new_values,
                                    on_unused_input='ignore',
                                    allow_input_downcast=True)
        logger.Log("Building eval function.")
        eval_fn = theano.function([
            X, transitions, y, num_transitions, training_mode,
            ground_truth_transitions_visible, ss_prob
        ], [acc, action_acc],
                                  on_unused_input='ignore',
                                  allow_input_downcast=True)
        logger.Log("Training.")

        # Main training loop.
        for step in range(step, FLAGS.training_steps):
            if step % FLAGS.eval_interval_steps == 0:
                for index, eval_set in enumerate(eval_iterators):
                    acc = evaluate(eval_fn, eval_set, logger, step)
                    if FLAGS.ckpt_on_best_dev_error and index == 0 and (
                            1 - acc) < 0.99 * best_dev_error and step > 1000:
                        best_dev_error = 1 - acc
                        logger.Log(
                            "Checkpointing with new best dev accuracy of %f" %
                            acc)
                        vs.save_checkpoint(checkpoint_path + "_best",
                                           extra_vars=[step, best_dev_error])

            (X_batch, transitions_batch, y_batch,
             num_transitions_batch) = training_data_iter.next()
            learning_rate = FLAGS.learning_rate * (
                FLAGS.learning_rate_decay_per_10k_steps**(step / 10000.0))
            ret = update_fn(
                X_batch, transitions_batch, y_batch, num_transitions_batch,
                learning_rate, 1.0, 1.0,
                np.exp(step * np.log(FLAGS.scheduled_sampling_exponent_base)))
            total_cost_val, xent_cost_val, transition_cost_val, action_acc_val, l2_cost_val, acc_val = ret

            if step % FLAGS.statistics_interval_steps == 0:
                logger.Log("Step: %i\tAcc: %f\t%f\tCost: %5f %5f %5f %5f" %
                           (step, acc_val, action_acc_val, total_cost_val,
                            xent_cost_val, transition_cost_val, l2_cost_val))

            if step % FLAGS.ckpt_interval_steps == 0 and step > 0:
                vs.save_checkpoint(checkpoint_path,
                                   extra_vars=[step, best_dev_error])
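
# Example #47 feeds update_fn a decayed learning rate and a scheduled-sampling
# probability, both exponential in the step count. A sketch of those two
# schedules, with hypothetical constants standing in for the FLAGS values:
import numpy as np

BASE_LR = 0.001       # stand-in for FLAGS.learning_rate
DECAY_PER_10K = 0.75  # stand-in for FLAGS.learning_rate_decay_per_10k_steps
SS_BASE = 0.99999     # stand-in for FLAGS.scheduled_sampling_exponent_base

def schedules(step):
    # Learning rate shrinks by a factor of DECAY_PER_10K every 10k steps
    lr = BASE_LR * DECAY_PER_10K ** (step / 10000.0)
    # Scheduled-sampling probability decays as SS_BASE ** step
    ss_prob = np.exp(step * np.log(SS_BASE))
    return lr, ss_prob

print(schedules(20000))  # lr == BASE_LR * 0.75**2, ss_prob == SS_BASE**20000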
Example #48
0
def _get_fprop(large_network=False, output_layers=[-1], detailed=False):
    arch = _get_architecture(large_network, detailed=detailed)
    expressions, input_var = fuse(arch, output_expressions=output_layers,
                                  input_dtype='float32')
    fprop = theano.function([input_var], expressions)
    return fprop
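
# The compiled fprop above is an ordinary callable from a float32 batch to the
# requested layer outputs. Hypothetical usage (the batch shape depends on
# whatever _get_architecture builds, so the shape below is only an assumption):
#
#     fprop = _get_fprop(large_network=False, output_layers=[-1])
#     batch = np.zeros((1, 3, 224, 224), dtype='float32')  # assumed shape
#     outputs = fprop(batch)  # one array per requested output layer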
Example #49
0
def test_mlp(learning_rate=0.01,
             L1_reg=0.00,
             L2_reg=0.0001,
             n_epochs=1000,
             dataset='mnist.pkl.gz',
             batch_size=5,
             n_hidden=10):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """

    #print(trainO)
    Xo = theano.shared(value=np.asarray(trainD, dtype='float64'), name='Xo')
    yo = theano.shared(value=np.asarray(trainO, dtype='int32'), name='yo')
    Xot = theano.shared(value=np.asarray(testD, dtype='float64'), name='Xot')
    yot = theano.shared(value=np.asarray(testO, dtype='int32'), name='yot')
    Xov = theano.shared(value=np.asarray(validD, dtype='float64'), name='Xov')
    yov = theano.shared(value=np.asarray(validO, dtype='int32'), name='yov')

    #print(y)
    #    sys.exit()
    train_set_x, train_set_y = (Xo, yo)
    valid_set_x, valid_set_y = (Xov, yov)
    test_set_x, test_set_y = (Xot, yot)

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    #print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=20285, n_hidden=n_hidden, n_out=2)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    #print('... training')

    # early-stopping parameters
    patience = 900  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                #print(
                #    'epoch %i, minibatch %i/%i, validation error %f %%' %
                #    (
                #        epoch,
                #        minibatch_index + 1,
                #        n_train_batches,
                #        this_validation_loss * 100.
                #    )
                #)

                # if we got the best validation score until now
                if (1 < 2):
                    #if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            #if patience <= iter:
            #done_looping = True
            #break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)
    print("\n")
Example #50
0
def train(
        dataname='5r',
        dataset='5Label_300_40000_glove.6B',
        n_words=40000,
        decay_c=1e-4,
        optimizer=adagrad,
        clip_c=4.,
        valid_batch_size=64,
        batch_size=32,
        disp_frq=1000,
        valid_freq=100,
        save_freq=1000,
        max_epochs=100,
        # lrate=0.05,
        lrate=0.05,
        lrate_embed=0.1,
        use_dropout=True,
        noise_std=0.5,
        patience=15,
        saveto='model.npz',
        encoder='lstm',
        dim_proj=300,
        end=True,
        dim_hidden=100):
    # Model options
    model_options = locals().copy()
    print(model_options)
    print 'Loading data'

    path = os.path.join('..', '..', '..', 'Data', 'TC', dataname,
                        dataset + '.pkl')
    # path = os.path.join('..', '..', 'Data', 'TC', dataname, dataset + '.pkl')
    data = pkl.load(open(path, 'rb'))
    train, valid, test, emb = data
    print(emb.shape)
    ydim = numpy.max(train[1]) - numpy.min(train[1]) + 1

    if numpy.min(train[1]) != 0:
        bias = numpy.min(train[1])
        print 'Min of class is ', bias

        def min_y_to_zero(data_set):
            X, Y = data_set[0], data_set[1]
            new_Y = []
            for y in Y:
                new_Y.append(y - bias)
            return [X, new_Y]

        train = min_y_to_zero(train)
        valid = min_y_to_zero(valid)
        test = min_y_to_zero(test)

    model_options['ydim'] = ydim

    print 'Building model'
    params = init_params(model_options)
    params['Wemb'] = emb.astype(config.floatX)

    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            # weight_decay+=(theano.ifelse(kk is 'Wemb'), ((vv ** 2).sum() / 5.), ((vv ** 2).sum()))
            # if kk is 'Wemb':
            #     weight_decay += (vv ** 2).sum() / 5.
            # else:
            #     weight_decay += (vv ** 2).sum()
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost_decay = weight_decay + cost
    else:
        # Without weight decay, the regularized cost is just the plain cost
        cost_decay = cost

    f_cost = theano.function([x, mask, y], cost_decay, name='f_cost')

    grads = tensor.grad(cost_decay, wrt=tparams.values())

    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    # lrate_embed = tensor.scalar(name='lrate_embed')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost,
                                        cost_decay)

    print 'Optimization'
    # kf_train4valid = get_minibatches_idx(len(train4valid[0]), valid_batch_size)
    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    history_errs = []
    best_p = None
    bad_counter = 0

    if valid_freq == -1:
        valid_freq = len(train[0]) / batch_size
    if save_freq == -1:
        save_freq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    start_time = time.time()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                if use_dropout is True:
                    use_noise.set_value(1.)
                else:
                    use_noise.set_value(0.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swaps the axes!
                # Returns something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost, cost_decay = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, disp_frq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'Cost_decay', cost_decay

                if numpy.mod(uidx, save_freq) == 0:
                    print 'Saving...',

                    # import ipdb; ipdb.set_trace()

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print 'Done'

                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)

                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    history_errs.append(valid_err)

                    if (uidx == 0
                            or valid_err <= numpy.array(history_errs).min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    if (len(history_errs) > patience and
                            valid_err >= numpy.array(
                                history_errs)[:-patience].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    print 'Valid ', valid_err

            print 'Seen %d samples' % n_samples

            if estop:
                break

            if best_p is not None:
                zipp(best_p, tparams)

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    print 'Valid ', valid_err, 'Test ', test_err

    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)

    # print 'Train ', train_err,'Train4Valid ',train4valid_err, 'Valid ', valid_err, 'Test ', test_err
    print 'Train ', train_err
    print 'Dataset', dataname, 'Test Acc', (1. - test_err)
    print(model_options)
    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print 'The code ran for %d epochs, with %f sec/epoch' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))

    return train_err, valid_err, test_err
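
# The clipping block above rescales all gradients whenever their joint L2 norm
# exceeds clip_c. A sketch of the same logic as a reusable helper over
# symbolic gradients:
import theano.tensor as tensor

def clip_by_global_norm(grads, clip_c):
    # Squared global norm across all gradient tensors
    g2 = sum((g ** 2).sum() for g in grads)
    # Leave gradients untouched below the threshold, rescale above it
    return [tensor.switch(g2 > clip_c ** 2,
                          g / tensor.sqrt(g2) * clip_c,
                          g) for g in grads]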
Example #51
0
import theano
import theano.tensor as T

N = T.iscalar('n')


def fibonacci(n, x, x_prev):
    return x + x_prev, x


outputs, updates = theano.scan(
    fn=fibonacci,
    sequences=T.arange(N),
    n_steps=N,
    outputs_info=[1, 1],
)

fib_op = theano.function(inputs=[N], outputs=outputs)

print(fib_op(5))
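# With outputs_info=[1, 1], each step maps (x, x_prev) to (x + x_prev, x), so
# the call above returns two sequences: [2, 3, 5, 8, 13] and, lagged by one
# step, [1, 2, 3, 5, 8].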
Example #52
0
    def SGD(self,
            training_data,
            epochs,
            mini_batch_size,
            eta,
            validation_data,
            test_data,
            lmbda=0.0):
        """Train the network using mini-batch stochastic gradient descent."""
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        test_x, test_y = test_data

        # compute number of minibatches for training, validation and testing
        num_training_batches = size(training_data) / mini_batch_size
        num_validation_batches = size(validation_data) / mini_batch_size
        num_test_batches = size(test_data) / mini_batch_size

        # define the (regularized) cost function, symbolic gradients, and updates
        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
        cost = self.layers[-1].cost(self)+\
               0.5*lmbda*l2_norm_squared/num_training_batches
        grads = T.grad(cost, self.params)
        updates = [(param, param - eta * grad)
                   for param, grad in zip(self.params, grads)]

        # define functions to train a mini-batch, and to compute the
        # accuracy in validation and test mini-batches.
        i = T.lscalar()  # mini-batch index
        train_mb = theano.function(
            [i],
            cost,
            updates=updates,
            givens={
                self.x:
                training_x[i * self.mini_batch_size:(i + 1) *
                           self.mini_batch_size],
                self.y:
                training_y[i * self.mini_batch_size:(i + 1) *
                           self.mini_batch_size]
            })
        validate_mb_accuracy = theano.function(
            [i],
            self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                validation_x[i * self.mini_batch_size:(i + 1) *
                             self.mini_batch_size],
                self.y:
                validation_y[i * self.mini_batch_size:(i + 1) *
                             self.mini_batch_size]
            })
        test_mb_accuracy = theano.function(
            [i],
            self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) *
                       self.mini_batch_size],
                self.y:
                test_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            })
        self.test_mb_predictions = theano.function(
            [i],
            self.layers[-1].y_out,
            givens={
                self.x:
                test_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size]
            })
        # Do the actual training
        best_validation_accuracy = 0.0
        for epoch in xrange(epochs):
            for minibatch_index in xrange(num_training_batches):
                iteration = num_training_batches * epoch + minibatch_index
                if iteration % 1000 == 0:
                    print("Training mini-batch number {0}".format(iteration))
                cost_ij = train_mb(minibatch_index)
                if (iteration + 1) % num_training_batches == 0:
                    validation_accuracy = np.mean([
                        validate_mb_accuracy(j)
                        for j in xrange(num_validation_batches)
                    ])
                    print("Epoch {0}: validation accuracy {1:.2%}".format(
                        epoch, validation_accuracy))
                    if validation_accuracy >= best_validation_accuracy:
                        print("This is the best validation accuracy to date.")
                        best_validation_accuracy = validation_accuracy
                        best_iteration = iteration
                        if test_data:
                            test_accuracy = np.mean([
                                test_mb_accuracy(j)
                                for j in xrange(num_test_batches)
                            ])
                            print('The corresponding test accuracy is {0:.2%}'.
                                  format(test_accuracy))
        print("Finished training network.")
        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".
              format(best_validation_accuracy, best_iteration))
        print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
Example #53
0
    def __init__(self, model, trn_data, trn_loss, trn_target=None, val_data=None, val_loss=None, val_target=None, step=ss.Adam()):
        """
        Constructs and configures the trainer.
        :param model: the model to be trained
        :param trn_data: train inputs and (possibly) train targets
        :param trn_loss: theano variable representing the train loss to minimize
        :param trn_target: theano variable representing the train target
        :param val_data: validation inputs and (possibly) validation targets
        :param val_loss: theano variable representing the validation loss
        :param val_target: theano variable representing the validation target
        :param step: step size strategy object
        :return: None
        """

        # parse input
        # TODO: it would be good to type check the other inputs too
        assert isinstance(step, ss.StepStrategy), 'Step must be a step strategy object.'

        # prepare train data
        n_trn_data_list = set([x.shape[0] for x in trn_data])
        assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.'
        self.n_trn_data = list(n_trn_data_list)[0]
        trn_data = [theano.shared(x.astype(dtype), borrow=True) for x in trn_data]

        # compile theano function for a single training update
        grads = tt.grad(trn_loss, model.parms)
        idx = tt.ivector('idx')
        trn_inputs = [model.input] if trn_target is None else [model.input, trn_target]
        self.make_update = theano.function(
            inputs=[idx],
            outputs=trn_loss,
            givens=zip(trn_inputs, [x[idx] for x in trn_data]),
            updates=step.updates(model.parms, grads)
        )

        # if model uses batch norm, compile a theano function for setting up stats
        if getattr(model, 'batch_norm', False):
            batch_norm_givens = [(bn.m, bn.bm) for bn in model.bns] + [(bn.v, bn.bv) for bn in model.bns]
            self.set_batch_norm_stats = theano.function(
                inputs=[],
                givens=zip(trn_inputs, trn_data),
                updates=[(bn.bm, bn.m) for bn in model.bns] + [(bn.bv, bn.v) for bn in model.bns]
            )
        else:
            self.set_batch_norm_stats = None
            batch_norm_givens = []

        # if validation data is given, then set up validation too
        self.do_validation = val_data is not None

        if self.do_validation:

            # prepare validation data
            n_val_data_list = set([x.shape[0] for x in val_data])
            assert len(n_val_data_list) == 1, 'Number of validation data is not consistent.'
            self.n_val_data = list(n_val_data_list)[0]
            val_data = [theano.shared(x.astype(dtype), borrow=True) for x in val_data]

            # compile theano function for validation
            val_inputs = [model.input] if val_target is None else [model.input, val_target]
            self.validate = theano.function(
                inputs=[],
                outputs=val_loss,
                givens=zip(val_inputs, val_data) + batch_norm_givens
            )

            # create checkpointer to store best model
            self.checkpointer = ModelCheckpointer(model)
            self.best_val_loss = float('inf')

        # initialize some variables
        self.trn_loss = float('inf')
        self.idx_stream = ds.IndexSubSampler(self.n_trn_data)
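
# make_update above receives a vector of row indices and gathers the matching
# rows of the shared training arrays through givens. The same idiom in
# isolation:
import numpy as np
import theano
import theano.tensor as tt

data = theano.shared(np.arange(10, dtype=theano.config.floatX))
idx = tt.ivector('idx')
v = tt.vector('v')

# Advanced indexing happens on the device; only the indices are passed in
f = theano.function([idx], v.sum(), givens={v: data[idx]})
print(f(np.array([0, 3, 4], dtype='int32')))  # prints 7.0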
Example #54
0
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"

    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])

    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])

    if test_data_count != data_count:
        raise Exception(
            "The number of test and train datasets must be the same.")

    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    sentenceW = None
    sentenceB = None
    docW = None
    docB = None

    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=249, \
                     sentenceLayerNodesNum=50, \
                     sentenceLayerNodesSize=[5, 249], \
                     docLayerNodesNum=10, \
                     docLayerNodesSize=[3, 50],
                     sentenceW=sentenceW,
                     sentenceB=sentenceB,
                     docW=docW,
                     docB=docB,
                     pooling_mode=pooling_mode))

        sentenceW = layer0[i].sentenceW
        sentenceB = layer0[i].sentenceB
        docW = layer0[i].docW
        docB = layer0[i].docB

        layer1.append(
            HiddenLayer(rng,
                        input=layer0[i].output,
                        n_in=layer0[i].outputDimension,
                        n_out=10,
                        activation=T.tanh,
                        W=hidden_layer_w,
                        b=hidden_layer_b))

        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b

        layer2.append(
            LogisticRegression(input=layer1[i].output,
                               n_in=10,
                               n_out=2,
                               W=logistic_layer_w,
                               b=logistic_layer_b))
        # 		logistic_layer_w = layer2[i].W
        # 		logistic_layer_b = layer2[i].b

        local_params.append(layer2[i].params)

    share_params = list(layer0[0].params + layer1[0].params)
    # construct the parameter array.
    params = list(layer0[0].params) + layer1[0].params

    for i in xrange(data_count):
        params += layer2[i].params

# 	data_name = "car"

    para_path = "data/" + data_name + "/share_hidden_low_model_multiinput/" + pooling_mode + ".model"
    traintext = [
        "data/" + data_names[i] + "/train/text" for i in xrange(data_count)
    ]
    trainlabel = [
        "data/" + data_names[i] + "/train/label" for i in xrange(data_count)
    ]
    testtext = [
        "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)
    ]
    testlabel = [
        "data/" + test_data_names[i] + "/test/label"
        for i in xrange(data_count)
    ]

    # Load the parameters last time, optionally.
    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
        share_learning_rate = 0.1
        local_learning_rate = 0.1
        n_batches = list()

        print "Loading test data."

        all_pred_label = list()
        all_real_label = list()
        all_pred_prob = list()
        for i in xrange(data_count):
            cr_train = CorpusReader(minDocSentenceNum=5,
                                    minSentenceWordNum=5,
                                    dataset=traintext[i],
                                    labelset=trainlabel[i])
            docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, posList = cr_train.getCorpus(
                [0, 100000])

            # 			docMatrixes = numpy.column_stack((docMatrixes, posList))
            docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
            # 			posList = transToTensor(posList, theano.config.floatX)
            docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
            sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
            labels = transToTensor(labels, numpy.int32)

            index = T.lscalar("index")

            n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) /
                             batchSize + 1)
            print "Dataname: %s" % data_names[i]
            print "Train set size is ", len(docMatrixes.get_value())
            print "Batch size is ", batchSize
            print "Number of training batches  is ", n_batches[i]
            error = layer2[i].errors(docLabel)
            cost = layer2[i].negative_log_likelihood(docLabel)

            share_grads = T.grad(cost, share_params)
            share_updates = [
                (param_i, param_i - share_learning_rate * grad_i)
                for param_i, grad_i in zip(share_params, share_grads)
            ]

            grads = T.grad(cost, local_params[i])
            local_updates = [
                (param_i, param_i - local_learning_rate * grad_i)
                for param_i, grad_i in zip(local_params[i], grads)
            ]
            updates = share_updates + local_updates
            print "Compiling train computing graph."
            if mode == "train":
                train_model.append(
                    theano.function(
                        [index], [cost, error, layer2[i].y_pred, docLabel],
                        updates=updates,
                        givens={
                            corpus:
                            docMatrixes,
                            docSentenceCount:
                            docSentenceNums[index *
                                            batchSize:(index + 1) * batchSize +
                                            1],
                            sentenceWordCount:
                            sentenceWordNums,
                            docLabel:
                            labels[index * batchSize:(index + 1) * batchSize]
                        }))
            print "Compiled."

            print "Load test dataname: %s" % test_data_names[i]
            cr_test = CorpusReader(minDocSentenceNum=5,
                                   minSentenceWordNum=5,
                                   dataset=testtext[i],
                                   labelset=testlabel[i])
            validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, validPosList = cr_test.getCorpus(
                [0, 1000])
            # 			validDocMatrixes = numpy.column_stack((validDocMatrixes, validPosList))
            validDocMatrixes = transToTensor(validDocMatrixes,
                                             theano.config.floatX)
            # 			validPosList = transToTensor(validPosList, theano.config.floatX)
            validDocSentenceNums = transToTensor(validDocSentenceNums,
                                                 numpy.int32)
            validSentenceWordNums = transToTensor(validSentenceWordNums,
                                                  numpy.int32)
            validLabels = transToTensor(validLabels, numpy.int32)
            print "Validating set size is ", len(validDocMatrixes.get_value())
            print "Data loaded."

            print "Compiling test computing graph."
            valid_model.append(
                theano.function(
                    [], [
                        cost, error, layer2[i].y_pred, docLabel,
                        T.transpose(layer2[i].p_y_given_x)[1]
                    ],
                    givens={
                        corpus: validDocMatrixes,
                        docSentenceCount: validDocSentenceNums,
                        sentenceWordCount: validSentenceWordNums,
                        docLabel: validLabels
                    }))
            print "Compiled."
            (costNum, errorNum, pred_label, real_label,
             pred_prob) = valid_model[i]()

            all_pred_label.extend(pred_label)
            all_real_label.extend(real_label)
            all_pred_prob.extend(pred_prob)

            print "Valid current model :", data_names[i]
            print "Cost: ", costNum
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            if 1 in threshold:
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]

        print "Valid current model :", data_names
        errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
        print "Error: ", errorNum

        fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "ROC: ", roc_auc
        fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
        if 1 in threshold:
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]

        if mode == "test":
            return

        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(max(n_batches)):
                for dataset_index in xrange(data_count):
                    if i >= n_batches[dataset_index]:
                        continue
                    # for list-type data
                    print "dataset_index: %d, i: %d" % (dataset_index, i)
                    (costNum, errorNum, pred_label,
                     real_label) = train_model[dataset_index](i)
                    ite = ite + 1
                    # for padding data
                    if (ite % 1 == 0):
                        print
                        print "Dataset name: ", data_names[dataset_index]
                        print "@iter: ", ite
                        print "Cost: ", costNum
                        print "Error: ", errorNum

            # Validate the model
            all_pred_label = list()
            all_real_label = list()
            all_pred_prob = list()
            for dataset_index in xrange(data_count):
                (costNum, errorNum, pred_label, real_label,
                 pred_prob) = valid_model[dataset_index]()

                all_pred_label.extend(pred_label)
                all_real_label.extend(real_label)
                all_pred_prob.extend(pred_prob)

                print "Valid current model :", data_names[dataset_index]
                print "Cost: ", costNum
                print "Error: ", errorNum

                fpr, tpr, _ = roc_curve(real_label, pred_prob)
                roc_auc = auc(fpr, tpr)
                print "data_name: ", data_name
                print "ROC: ", roc_auc

                fpr, tpr, threshold = roc_curve(real_label, pred_label)
                if 1 in threshold:
                    index_of_one = list(threshold).index(1)
                    print "TPR: ", tpr[index_of_one]
                    print "FPR: ", fpr[index_of_one]
                    print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                    print "threshold: ", threshold[index_of_one]

            print "Valid current model :", data_names
            errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
            if 1 in threshold:
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
Example #55
0
def train_lstm(
        dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
        patience=10,  # Number of epochs to wait before early stop if no progress
        max_epochs=5000,  # The maximum number of epochs to run
        dispFreq=10,  # Display to stdout the training progress every N updates
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
        n_words=10000,  # Vocabulary size
        optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use, not recommended (probably needs momentum and a decaying learning rate).
        encoder='lstm',  # TODO: can be removed, must be lstm.
        saveto='lstm_model.npz',  # The best model will be saved there
        validFreq=370,  # Compute the validation error after this number of updates.
        saveFreq=1110,  # Save the parameters after every saveFreq updates
        maxlen=100,  # Sequences longer than this get ignored
        batch_size=16,  # The batch size during training.
        valid_batch_size=64,  # The batch size used for validation/test set.
        dataset='imdb',

        # Parameter for extra option
        noise_std=0.,
        use_dropout=True,  # if False slightly faster, but worse test error
        # This frequently needs a bigger model.
        reload_model=None,  # Path to a saved model we want to start from.
        test_size=-1,  # If >0, we keep only this number of test examples.
):

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    load_data, prepare_data = get_dataset(dataset)

    print 'Loading data'
    train, valid, test = load_data(n_words=n_words,
                                   valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep examples of
        # random sizes, so we select a random subset of them.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print 'Building model'
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano tensor shared variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.clock()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swaps the axes!
                # Returns something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (uidx == 0 or valid_err <=
                            numpy.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ',
                          test_err)

                    if (len(history_errs) > patience and valid_err >=
                            numpy.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.clock()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print 'The code ran for %d epochs, with %f sec/epoch' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
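# For reference, a sketch of the get_minibatches_idx helper used throughout
# the training loop above (an assumption based on the LSTM tutorial; the
# helper itself is not shown in this excerpt): shuffle the example indices if
# requested, cut them into minibatch_size chunks, and pair each chunk with
# its batch index.
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)
    minibatches = [idx_list[i:i + minibatch_size]
                   for i in range(0, n, minibatch_size)]
    return zip(range(len(minibatches)), minibatches)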
    def setup(self):
        """
        Set up the model to train.
        """

        # input_words: shape (n_batch, n_sentence, sentence_len)
        input_words = T.itensor3()
        n_batch, n_sentences, sentence_len = input_words.shape
        # query_words: shape (n_batch, query_len)
        query_words = T.imatrix()
        # correct_output: shape (n_batch, ?, num_output_words)
        correct_output = T.ftensor3()

        # graph_num_new_nodes: shape (n_batch, n_sentence)
        graph_num_new_nodes = T.imatrix()
        # graph_new_node_strengths: shape (n_batch, n_sentence, new_nodes_per_iter)
        graph_new_node_strengths = T.ftensor3()
        # graph_new_node_ids: shape (n_batch, n_sentence, new_nodes_per_iter, num_node_ids)
        graph_new_node_ids = T.ftensor4()
        # graph_new_edges: shape (n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types)
        graph_new_edges = T.TensorType('floatX', (False,)*5)()

        def _build(with_correct_graph, snap_to_best, using_dropout, evaluate_accuracy):
            info = {}
            # Process each sentence, flattened to (?, sentence_len)
            flat_input_words = input_words.reshape([-1, sentence_len])
            flat_input_reprs, flat_ref_matrices = self.input_transformer.process(flat_input_words)
            # flat_input_reprs of shape (?, input_repr_size)
            # flat_ref_matrices of shape (?, num_node_ids, input_repr_size)
            input_reprs = flat_input_reprs.reshape([n_batch, n_sentences, self.input_repr_size])
            ref_matrices = flat_ref_matrices.reshape([n_batch, n_sentences, self.num_node_ids, self.input_repr_size])

            query_repr, query_ref_matrix = self.input_transformer.process(query_words)

            if using_dropout:
                iter_dropouts = []
                states_mask = util.make_dropout_mask((self.node_state_size,), self.dropout_keep, self.srng)
                if self.nodes_mutable:
                    iter_dropouts.extend(self.node_state_updater.dropout_masks(self.srng, states_mask))
                if len(self.word_node_mapping) > 0:
                    iter_dropouts.extend(self.direct_reference_updater.dropout_masks(self.srng, states_mask))
                if self.intermediate_propagate != 0:
                    iter_dropouts.extend(self.intermediate_propagator.dropout_masks(self.srng, states_mask))
                if self.dynamic_nodes:
                    iter_dropouts.extend(self.new_node_adder.dropout_masks(self.srng))
                iter_dropouts.extend(self.edge_state_updater.dropout_masks(self.srng))
            else:
                iter_dropouts = []
                states_mask = None

            def _iter_fn(input_repr, ref_matrix, gstate, correct_num_new_nodes=None, correct_new_strengths=None, correct_new_node_ids=None, correct_edges=None, dropout_masks=None):
                # If necessary, update node state
                if self.nodes_mutable:
                    gstate, dropout_masks = self.node_state_updater.process(gstate, input_repr, dropout_masks)

                if len(self.word_node_mapping) > 0:
                    gstate, dropout_masks = self.direct_reference_updater.process(gstate, ref_matrix, dropout_masks)

                # If necessary, propagate node state
                if self.intermediate_propagate != 0:
                    gstate, dropout_masks = self.intermediate_propagator.process_multiple(gstate, self.intermediate_propagate, dropout_masks)

                node_loss = None
                node_accuracy = None
                # Propose and vote on new nodes
                if self.dynamic_nodes:
                    new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates(gstate, input_repr, self.new_nodes_per_iter, dropout_masks)
                    # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter)
                    # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids)
                    if with_correct_graph:
                        perm_idxs = np.array(list(itertools.permutations(range(self.new_nodes_per_iter))))
                        permuted_correct_str = correct_new_strengths[:,perm_idxs]
                        permuted_correct_ids = correct_new_node_ids[:,perm_idxs]
                        # due to advanced indexing, we should have shape (n_batch, permutation, new_nodes_per_iter, num_node_ids)
                        ext_new_str = T.shape_padaxis(new_strengths,1)
                        ext_new_ids = T.shape_padaxis(new_ids,1)
                        strength_ll = permuted_correct_str * T.log(ext_new_str + util.EPSILON) + (1-permuted_correct_str) * T.log(1-ext_new_str + util.EPSILON)
                        ids_ll = permuted_correct_ids * T.log(ext_new_ids  + util.EPSILON)
                        reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum(ids_ll, axis=[2,3])
                        if self.best_node_match_only:
                            node_loss = -T.max(reduced_perm_lls, 1)
                        else:
                            full_ll = util.reduce_log_sum(reduced_perm_lls, 1)
                            # Note that some of these permutations are identical, since we
                            # likely did not add the maximum number of nodes, so the sum
                            # above counts repeated elements.
                            # We have log(x+x+...+x) = log(kx) = log(k) + log(x), where k is
                            # the repetition factor and x is the probability we want.
                            # Here k = m!, with m = new_nodes_per_iter - correct_num_new_nodes.
                            # Since n! = gamma(n+1), log(k) = log(gamma(m+1)) = gammaln(m+1),
                            # so log(x) = log(kx) - gammaln(m+1).
                            log_rep_factor = T.gammaln(T.cast(self.new_nodes_per_iter - correct_num_new_nodes + 1, 'floatX'))
                            scaled_ll = full_ll - log_rep_factor
                            node_loss = -scaled_ll
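                            # Worked example (illustrative): with new_nodes_per_iter = 3
                            # and correct_num_new_nodes = 1, the two unused slots are
                            # interchangeable, so each distinct assignment appears
                            # k = (3 - 1)! = 2 times in the sum over permutations;
                            # gammaln(3 - 1 + 1) = log(2!) = log(2) is subtracted to
                            # undo that overcount.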
                        if evaluate_accuracy:
                            best_match_idx = T.argmax(reduced_perm_lls, 1)
                            # should be of shape (n_batch), indexing the best permutation
                            best_correct_str = permuted_correct_str[T.arange(n_batch), best_match_idx]
                            best_correct_ids = permuted_correct_ids[T.arange(n_batch), best_match_idx]
                            snapped_strengths = util.independent_best(new_strengths)
                            snapped_ids = util.categorical_best(new_ids) * T.shape_padright(snapped_strengths)
                            close_strengths = T.all(T.isclose(best_correct_str, snapped_strengths), (1))
                            close_ids = T.all(T.isclose(best_correct_ids, snapped_ids), (1,2))
                            node_accuracy = T.and_(close_strengths, close_ids)
                        # now substitute in the correct nodes
                        gstate = gstate.with_additional_nodes(correct_new_strengths, correct_new_node_ids)
                    elif snap_to_best:
                        snapped_strengths = util.independent_best(new_strengths)
                        snapped_ids = util.categorical_best(new_ids)
                        gstate = gstate.with_additional_nodes(snapped_strengths, snapped_ids)
                    else:
                        gstate = gstate.with_additional_nodes(new_strengths, new_ids)


                # Update edge state
                gstate, dropout_masks = self.edge_state_updater.process(gstate, input_repr, dropout_masks)
                if with_correct_graph:
                    cropped_correct_edges = correct_edges[:,:gstate.n_nodes,:gstate.n_nodes,:]
                    edge_lls = cropped_correct_edges * T.log(gstate.edge_strengths + util.EPSILON) + (1-cropped_correct_edges) * T.log(1-gstate.edge_strengths + util.EPSILON)
                    # edge_lls currently penalizes edges connected to nodes that do not exist;
                    # we do not want that, so we mask it with the node strengths
                    mask_src = util.shape_padaxes(gstate.node_strengths,[2,3])
                    mask_dest = util.shape_padaxes(gstate.node_strengths,[1,3])
                    masked_edge_lls = edge_lls * mask_src * mask_dest
                    edge_loss = -T.sum(masked_edge_lls, axis=[1,2,3])
                    if evaluate_accuracy:
                        snapped_edges = util.independent_best(gstate.edge_strengths)
                        close_edges = T.isclose(cropped_correct_edges, snapped_edges)
                        ok_mask = T.invert(T.cast(mask_src * mask_dest,'bool')) # it's OK for things not to match if node strengths are not both 1
                        edge_accuracy = T.all(T.or_(close_edges, ok_mask), (1,2,3))
                        overall_accuracy = edge_accuracy if node_accuracy is None else T.and_(node_accuracy, edge_accuracy)
                    else:
                        overall_accuracy = None
                    gstate = gstate.with_updates(edge_strengths=cropped_correct_edges)
                    return gstate, node_loss, edge_loss, overall_accuracy
                elif snap_to_best:
                    snapped_edges = util.independent_best(gstate.edge_strengths)
                    gstate = gstate.with_updates(edge_strengths=snapped_edges)
                    return gstate
                else:
                    return gstate

            # Scan over each sentence
            def _scan_fn(input_repr, *stuff): # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size)
                stuff = list(stuff)

                if len(self.word_node_mapping) > 0:
                    ref_matrix = stuff[0]
                    stuff = stuff[1:]
                else:
                    ref_matrix = None

                if with_correct_graph:
                    c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[:4]
                    stuff = stuff[4:]

                if using_dropout:
                    dropout_masks = stuff[:len(iter_dropouts)]
                    stuff = stuff[len(iter_dropouts):]
                else:
                    dropout_masks = None

                flat_graph_state = stuff[:-1]
                pad_graph_size = stuff[-1]
                gstate = GraphState.unflatten_from_const_size(flat_graph_state)

                if with_correct_graph:
                    gstate, node_loss, edge_loss, overall_accuracy = _iter_fn(input_repr, ref_matrix, gstate, c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges, dropout_masks=dropout_masks)
                else:
                    gstate = _iter_fn(input_repr, ref_matrix, gstate, dropout_masks=dropout_masks)

                retvals = gstate.flatten_to_const_size(pad_graph_size)
                if with_correct_graph:
                    if self.dynamic_nodes:
                        retvals.append(node_loss)
                    retvals.append(edge_loss)
                    if evaluate_accuracy:
                        retvals.append(overall_accuracy)
                return retvals

            if self.dynamic_nodes:
                initial_gstate = GraphState.create_empty(n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types)
            else:
                initial_gstate = GraphState.create_full_unique(n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types)

            # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness
            if self.dynamic_nodes:
                pad_graph_size = n_sentences * self.new_nodes_per_iter + 1
            else:
                pad_graph_size = self.num_node_ids
            outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size)
            prepped_input = input_reprs.dimshuffle([1,0,2])
            sequences = [prepped_input]
            if len(self.word_node_mapping) > 0:
                sequences.append(ref_matrices.dimshuffle([1,0,2,3]))
            if with_correct_graph:
                sequences.append(graph_num_new_nodes.swapaxes(0,1))
                sequences.append(graph_new_node_strengths.swapaxes(0,1))
                sequences.append(graph_new_node_ids.swapaxes(0,1))
                sequences.append(graph_new_edges.swapaxes(0,1))

                if self.dynamic_nodes:
                    outputs_info.extend([None])
                if evaluate_accuracy:
                    outputs_info.extend([None])
                outputs_info.extend([None])
            if using_dropout:
                sequences.extend(iter_dropouts)
            all_scan_out, _ = theano.scan(_scan_fn, sequences=sequences, outputs_info=outputs_info, non_sequences=[pad_graph_size])
            graph_accurate_list = None
            if with_correct_graph:
                if evaluate_accuracy:
                    full_graph_accuracy = all_scan_out[-1]
                    all_scan_out = all_scan_out[:-1]
                    graph_accurate_list = T.all(full_graph_accuracy, 0)
                    info["graph_accuracy"]=T.sum(graph_accurate_list, dtype='floatX')/T.cast(n_batch, 'floatX')
                if self.dynamic_nodes:
                    all_flat_gstates = all_scan_out[:-2]
                    node_loss, edge_loss = all_scan_out[-2:]
                    reduced_node_loss = T.sum(node_loss)/T.cast(n_batch, 'floatX')
                    reduced_edge_loss = T.sum(edge_loss)/T.cast(n_batch, 'floatX')
                    avg_graph_loss = (reduced_node_loss + reduced_edge_loss)/T.cast(input_words.shape[1], 'floatX')
                    info["node_loss"]=reduced_node_loss
                    info["edge_loss"]=reduced_edge_loss
                else:
                    all_flat_gstates = all_scan_out[:-1]
                    edge_loss = all_scan_out[-1]
                    reduced_edge_loss = T.sum(edge_loss)/T.cast(n_batch, 'floatX')
                    avg_graph_loss = reduced_edge_loss/T.cast(input_words.shape[1], 'floatX')
                    info["edge_loss"]=reduced_edge_loss
            else:
                all_flat_gstates = all_scan_out

            if self.sequence_representation:
                # Each part of all_flat_gstates is of shape (n_sentences, n_batch, ...)
                # except for the last one, which we handle separately
                # Swap to (n_batch, n_sentences, ...)
                # Then flatten to (n_batch*n_sentences, ...) for further processing
                final_flat_gstate = [x.swapaxes(0,1).reshape(T.concatenate([[-1], x.shape[2:]]), ndim=(x.ndim-1)) for x in all_flat_gstates[:-1]]
                # As for the last one, we need a single scalar value. The last
                # timestep's value is the biggest, so we take that. Note that this
                # introduces a bunch of zero-strength nodes, but that's OK; we can
                # process them later. (We REQUIRE that padding in graph_state makes
                # zero-strength nodes here!)
                final_flat_gstate.append(all_flat_gstates[-1][-1])
                # We also need to repeat query_repr and query_ref_matrix so that they broadcast together
                query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0)
                query_ref_matrix = T.extra_ops.repeat(query_ref_matrix, n_sentences, 0)
            else:
                # Extract last timestep
                final_flat_gstate = [x[-1] for x in all_flat_gstates]
            final_gstate = GraphState.unflatten_from_const_size(final_flat_gstate)

            if self.train_with_query:
                if self.wipe_node_state:
                    final_gstate = final_gstate.with_updates(node_states=T.zeros_like(final_gstate.node_states))

                qnsu_dropout_masks = self.query_node_state_updater.dropout_masks(self.srng, states_mask)
                query_gstate, _ = self.query_node_state_updater.process(final_gstate, query_repr, qnsu_dropout_masks)

                if len(self.word_node_mapping) > 0:
                    qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks(self.srng, states_mask)
                    query_gstate, _ = self.query_direct_reference_updater.process(query_gstate, query_ref_matrix, qdru_dropout_masks)

                fp_dropout_masks = self.final_propagator.dropout_masks(self.srng, states_mask)
                propagated_gstate, _ = self.final_propagator.process_multiple(query_gstate, self.final_propagate, fp_dropout_masks)

                agg_dropout_masks = self.aggregator.dropout_masks(self.srng)
                aggregated_repr, _ = self.aggregator.process(propagated_gstate, agg_dropout_masks) # shape (n_batch, output_repr_size)
                
                if self.sequence_representation:
                    # aggregated_repr is of shape (n_batch*n_sentences, repr_width)
                    # We want to split back to timesteps: (n_batch, n_sentences, repr_width)
                    agg_repr_seq = aggregated_repr.reshape([n_batch, n_sentences, -1])
                    # Now collapse it to a summary representation
                    aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks(self.srng)
                    aggregated_repr, _ = self.aggregate_summarizer.process(agg_repr_seq, aggsum_dropout_masks)
                    # At this point aggregated_repr is (n_batch, repr_width) as desired

                max_seq_len = correct_output.shape[1]
                if self.output_format == ModelOutputFormat.sequence:
                    final_output = self.output_processor.process(aggregated_repr, max_seq_len) # shape (n_batch, ?, num_output_words)
                else:
                    final_output = self.output_processor.process(aggregated_repr)

                if snap_to_best:
                    final_output = self.output_processor.snap_to_best(final_output)

                if self.output_format == ModelOutputFormat.subset:
                    elemwise_loss = T.nnet.binary_crossentropy(final_output, correct_output)
                    query_loss = T.sum(elemwise_loss)
                else:
                    flat_final_output = final_output.reshape([-1, self.num_output_words])
                    flat_correct_output = correct_output.reshape([-1, self.num_output_words])
                    timewise_loss = T.nnet.categorical_crossentropy(flat_final_output, flat_correct_output)
                    query_loss = T.sum(timewise_loss)
                query_loss = query_loss/T.cast(n_batch, 'floatX')
                info["query_loss"] = query_loss
            else:
                final_output = T.zeros([])

            full_loss = np.array(0.0, np.float32)
            if with_correct_graph:
                full_loss = full_loss + avg_graph_loss
            if self.train_with_query:
                full_loss = full_loss + query_loss

            if self.train_with_query:
                adjusted_query_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim+1))
                                           if self.sequence_representation else T.shape_padaxis(x,1)
                                           for x in query_gstate.flatten()]
                adjusted_prop_gstates =  [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim+1))
                                           if self.sequence_representation else T.shape_padaxis(x,1)
                                           for x in propagated_gstate.flatten()]
                full_flat_gstates = [T.concatenate([a.swapaxes(0,1),b,c],1)
                                        for a,b,c in zip(all_flat_gstates[:-1],
                                                         adjusted_query_gstates,
                                                         adjusted_prop_gstates)]
            else:
                full_flat_gstates = [a.swapaxes(0,1) for a in all_flat_gstates[:-1]]
                max_seq_len = T.iscalar()
            return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info

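        # _build's positional flags are (with_correct_graph, snap_to_best,
        # using_dropout, evaluate_accuracy); the compiled functions below
        # differ only in how these flags are set.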
        train_loss, _, _, _, _, train_info = _build(self.train_with_graph, False, True, False)
        adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var)

        self.info_keys = list(train_info.keys())

        print("Compiling...")

        optimizer = theano.compile.predefined_optimizers['fast_run' if self.check_mode == 'debug' else theano.config.optimizer]
        optimizer = optimizer.excluding("scanOp_pushout_output","remove_constants_and_unused_inputs_scan")
        if self.check_mode == 'nan':
            mode = NanGuardMode(optimizer=optimizer, nan_is_error=True, inf_is_error=True, big_is_error=True)
        elif self.check_mode == 'debug':
            mode = DebugMode(optimizer=optimizer, check_isfinite=False, check_py_code=False, stability_patience=1)
            theano.tensor.TensorType.filter_checks_isfinite = False
        else:
            mode = theano.Mode(optimizer=optimizer)
        self.train_fn = theano.function([input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges],
                                        [train_loss]+list(train_info.values()),
                                        updates=adam_updates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build(self.train_with_graph, False, False, True)
        self.eval_info_keys = list(eval_info.keys())
        self.eval_fn = theano.function( [input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges],
                                        [eval_loss, graph_accurate_list]+list(eval_info.values()),
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        self.debug_test_fn = theano.function( [input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges],
                                        full_flat_gstates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(False, False, False, False)
        self.fuzzy_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []),
                                        [final_output] + full_flat_gstates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(False, True, False, False)
        self.snap_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []),
                                        [final_output] + full_flat_gstates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)
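        # Minimal usage sketch (hypothetical padded numpy inputs matching the
        # symbolic variables declared at the top of setup):
        #   results = self.train_fn(input_words, query_words, correct_output,
        #                           graph_num_new_nodes, graph_new_node_strengths,
        #                           graph_new_node_ids, graph_new_edges)
        #   loss = results[0]
        #   info = dict(zip(self.info_keys, results[1:]))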
X = T.matrix()
state = model.fprop(X)
target = T.matrix()
wrong_target = T.matrix()

right_cost = model.layers[-1].kl(Y=target, Y_hat=state)
wrong_cost = model.layers[-1].kl(Y=wrong_target, Y_hat=state)

from theano.printing import Print
right_cost = Print('right_cost')(right_cost)

acc = (wrong_cost > right_cost).mean()

from theano import function

f = function([X, target, wrong_target], acc)

wrong_target = dataset.y.copy()
used = np.zeros((500,), dtype='bool')
for i in xrange(wrong_target.shape[0]):
    dists = np.square(dataset.y - dataset.y[i,:]).sum(axis=1)
    dists[i] = np.inf
    dists[used] = np.inf
    idx = np.argmin(dists)
    used[idx] = 1
    wrong_target[i, :] = dataset.y[idx, :].copy()

acc = f(dataset.X, dataset.y, wrong_target)
print dataset.y.sum()
print wrong_target.sum()
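# Note: acc is the fraction of examples whose KL cost against the true target
# is lower than against its nearest distinct (unused) neighbor in dataset.y;
# the two sums printed above are a quick check that the targets were swapped.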
        shared_x = theano.shared(numpy.asarray(Feature_normalized,
                                    dtype=theano.config.floatX),
                                    borrow=True)
        numpy_rng = numpy.random.RandomState(123)

        ##########################
        ### model 1: the first network architecture  #
        ##########################
        dbn = GRBM_DBN(numpy_rng=numpy_rng, n_ins=528,
        hidden_layers_sizes=[1000, 1000, 500],
        n_outs=201)
        dbn.load('dbn_2014-05-23-20-07-28.npy')  # the pre-trained architecture

        # This is Theano's quirky function-construction idiom
        validate_model = theano.function(inputs=[],
            outputs=dbn.logLayer.p_y_given_x,  # the output of the logistic regression layer
            givens={ dbn.x: shared_x})

        observ_likelihood_1 = validate_model()  # call the function to get the result
        del dbn


        """
        ##########################
        ### model 2
        ##########################
        dbn = GRBM_DBN(numpy_rng=numpy_rng, n_ins=528,
        hidden_layers_sizes=[1000, 1000, 500],
        n_outs=201)

        dbn.load('dbn_2014-05-24-05-53-17.npy')
Example #59
0
X = T.matrix('X')
M = T.imatrix('M')

X_complete = T.where(M, X, X_shared)
ll = model.get_log_likelihood(X_complete)

grad = T.grad(ll.mean(), X_shared, disconnected_inputs='warn')
updates = OrderedDict()

lr = T.scalar('lr')
is_noise = sharedX(0., 'is_noise')
updates[X_shared] = X_shared + lr * (grad + model.prior.theano_rng.normal(size=X_shared.shape))
updates[X_shared] = T.where(M, X, updates[X_shared])
updates[X_shared] = T.clip(updates[X_shared], 0, 1)

f = theano.function([X, M, lr], [ll.mean()], updates=updates, allow_input_downcast=True)
print 'Compiled training function'
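# Minimal usage sketch (hypothetical names): X_shared holds the current
# image guesses, M marks the observed entries (nonzero = clamp to the data
# value), and each call takes one noisy gradient step on the rest:
#   for step in xrange(n_steps):           # n_steps is hypothetical
#       ll, = f(X_batch, M_batch, 0.01)    # lr = 0.01 is hypothetical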

# Setup for training and display
dataset_yaml_src = model.dataset_yaml_src
train_set = yaml_parse.load(dataset_yaml_src)
test_set = yaml_parse.load(dataset_yaml_src.replace("unlabeled", "test"))

dataset = train_set
num_samples = n_examples

vis_batch = dataset.get_batch_topo(num_samples)
rval = tuple(vis_batch.shape[dataset.X_topo_space.axes.index(axis)]
             for axis in ('b', 0, 1, 'c'))
_, patch_rows, patch_cols, channels = rval
mapback = hasattr(dataset, 'mapback_for_viewer')
def visualize_gates_lstm(gate_values, hidden_states, updates,
                         train_stream, valid_stream,
                         args):

    in_gates = gate_values["in_gates"]
    out_gates = gate_values["out_gates"]
    forget_gates = gate_values["forget_gates"]

    # Handle the theano shared variables that allow carrying the hidden state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not(has_indices(args.dataset)))

    generate_in = theano.function(inputs=ComputationGraph(in_gates).inputs,
                                  outputs=in_gates,
                                  givens=givens,
                                  updates=f_updates,
                                  mode=Mode(optimizer='fast_compile'))
    generate_out = theano.function(inputs=ComputationGraph(out_gates).inputs,
                                   outputs=out_gates,
                                   givens=givens,
                                   updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    generate_forget = theano.function(inputs=ComputationGraph(forget_gates).inputs,
                                      outputs=forget_gates,
                                      givens=givens,
                                      updates=f_updates,
                                      mode=Mode(optimizer='fast_compile'))
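    # Each generate_* call returns one array per layer, of shape
    # (time, batch, state_dim); the plots below average the absolute gate
    # activations over the state dimension.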

    # Generate
    epoch_iterator = valid_stream.get_epoch_iterator()
    for num in range(10):
        init_ = next(epoch_iterator)[0][0: args.visualize_length, 0:1]

        last_output_in = generate_in(init_)
        last_output_out = generate_out(init_)
        last_output_forget = generate_forget(init_)
        layers = len(last_output_in)

        time = last_output_in[0].shape[0]
        if has_indices(args.dataset):
            ticks = tuple(conv_into_char(init_[:, 0], args.dataset))
        else:
            ticks = tuple(np.arange(time))

        for i in range(layers):

            plt.subplot(3, layers, 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_in[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("in_gate of layer " + str(i))

            plt.subplot(3, layers, layers + 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_out[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("out_gate of layer " + str(i))

            plt.subplot(3, layers, 2 * layers + 1 + i)
            plt.plot(np.arange(time), np.mean(
                np.abs(last_output_forget[i][:, 0, :]), axis=1))
            plt.xticks(range(args.visualize_length), ticks)
            plt.grid(True)
            plt.title("forget_gate of layer " + str(i))
        plt.tight_layout()
        if args.local:
            plt.show()
        else:
            plt.savefig(
                args.save_path + "/visualize_gates_" + str(num) + ".png")
            logger.info("Figure \"visualize_gates_" + str(num) +
                        ".png\" saved at directory: " + args.save_path)