Example #1
def build_fft_scale(x, y, size):
    W = []
    pnet = ll.InputLayer((None, 3, 101, 101), input_var=None)
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(ll.BatchNormLayer(pnet))
    pnet = ll.Pool2DLayer(pnet, (3, 3), (2, 2))
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Conv2DLayer(pnet, 32, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.BatchNormLayer(pnet)
    x_p, y_p = ll.get_output(pnet, x), ll.get_output(pnet, y)
    z_p = Customfftlayer(x_p, y_p)
    net = ll.InputLayer((None, 64, 50, 50), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))

    # return scale difference: x_new / x_old - 1
    p_scale = ll.get_output(net)
    #p_scale = theano.gradient.disconnected_grad(p_scale)
    net_scale = ll.InputLayer((None, 10, 25, 25), p_scale)
    net_scale = ll.DenseLayer(net_scale,
                              100,
                              b=None,
                              nonlinearity=l.nonlinearities.tanh)
    W.append(net_scale.get_params(regularizable=True)[0])
    net_scale = ll.DenseLayer(net_scale, 2, b=None, nonlinearity=None)
    # return a heatmap upsampled ~2x relative to `size`
    net_heat = ll.DenseLayer(net,
                             500,
                             b=None,
                             nonlinearity=l.nonlinearities.tanh)
    W.append(net_heat.get_params(regularizable=True)[0])
    net_heat = ll.DenseLayer(net_heat, size**2, b=None, nonlinearity=None)
    W.append(net_heat.get_params(regularizable=True)[0])
    net_heat = ll.BatchNormLayer(net_heat)
    net_heat = ll.ReshapeLayer(net_heat, ([0], 1, size, size))
    net_heat = ll.Deconv2DLayer(net_heat,
                                64, (5, 5), (2, 2),
                                b=None,
                                crop='same',
                                nonlinearity=None)
    net_heat = ll.BatchNormLayer(net_heat)
    net_heat = ll.Conv2DLayer(net_heat,
                              1, (3, 3),
                              b=None,
                              pad='same',
                              nonlinearity=None)
    W.append(net_heat.get_params(regularizable=True)[0])
    return pnet, net_scale, net_heat, W
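# Hedged usage sketch: how the three heads returned by build_fft_scale might be
# compiled. Assumes `ll` is lasagne.layers and that the custom ops used above
# (Customfftlayer) are importable in this module; all names below are illustrative.
import theano
import theano.tensor as T
x, y = T.tensor4('x'), T.tensor4('y')
pnet, net_scale, net_heat, W = build_fft_scale(x, y, 25)
scale_out = ll.get_output(net_scale)      # (batch, 2) relative scale change
heat_out = ll.get_output(net_heat)        # upsampled (~2x `size`) response heatmap
track_fn = theano.function([x, y], [scale_out, heat_out])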
Example #2
def build_siamese(layer):
    """"""
    smx = nonlinearities.softmax
    lnr = nonlinearities.linear
    layers = L.get_all_layers(layer)
    nl = filter(
        lambda l: hasattr(l, 'nonlinearity') and (
            (l.nonlinearity != smx) and (l.nonlinearity != lnr)),
        layers)[0].nonlinearity

    if len(layers[0].output_shape) == 3:
        Xl = T.tensor3('left')
        Xr = T.tensor3('right')
    elif len(layers[0].output_shape) == 4:
        Xl = T.tensor4('left')
        Xr = T.tensor4('right')

    Ol = L.get_output(layer, inputs=Xl)
    # Ol_vl = L.get_output(layer, inputs=Xl, deterministic=True)
    Or = L.get_output(layer, inputs=Xr)
    O = T.concatenate([Ol, Or], axis=-1)

    layer = L.InputLayer((None, layer.output_shape[-1] * 2), input_var=O)
    layer = L.DenseLayer(layer, 128, nonlinearity=None, name='hc1')
    layer = L.BatchNormLayer(layer)
    layer = L.NonlinearityLayer(layer, nonlinearity=nl)
    layer = L.DenseLayer(layer, 2, nonlinearity=smx)

    return layer, (Xl, Xr)
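# Hedged usage sketch (assumes `L` is lasagne.layers and `nonlinearities` is
# lasagne.nonlinearities; the base network below is illustrative only):
import theano
base = L.InputLayer((None, 1, 28, 28))
base = L.DenseLayer(base, 64, nonlinearity=nonlinearities.rectify)
head, (Xl, Xr) = build_siamese(base)
match_prob = L.get_output(head)             # (batch, 2) pairwise softmax
match_fn = theano.function([Xl, Xr], match_prob)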
Example #3
    def build_network(self, mfcc_input_var):
        print('Building cnn with parameters:')
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.net_opts)

        mfcc_network = layers.InputLayer((None, 130, MC_LENGTH), mfcc_input_var)
        mfcc_network = layers.BatchNormLayer(mfcc_network)
        mfcc_network = self.set_conv_layer(mfcc_network, 'conv_1', bnorm=False)
        mfcc_network = self.set_pool_layer(mfcc_network, 'pool_1')
        mfcc_network = self.set_conv_layer(mfcc_network, 'conv_2', bnorm=False)
        mfcc_network = self.set_pool_layer(mfcc_network, 'pool_2')
        for n in self.net_opts['layer_list']:
            # mfcc_network = layers.batch_norm(layers.DenseLayer(layers.dropout(mfcc_network, p=self.net_opts['dropout_p']), 
            #                                  n, 
            #                                  nonlinearity=lasagne.nonlinearities.rectify)
            #                                 )
            mfcc_network = layers.DenseLayer(layers.dropout(mfcc_network, p=self.net_opts['dropout_p']), 
                                            n, 
                                            nonlinearity=lasagne.nonlinearities.rectify)
            # mfcc_network = layers.BatchNormLayer(mfcc_network)
        mfcc_network = layers.DenseLayer(layers.dropout(mfcc_network, p=self.net_opts['dropout_p']), 
                                        self.net_opts['num_class'], 
                                        nonlinearity=lasagne.nonlinearities.softmax)
        
        self.network = mfcc_network
        return self.network
Example #4
    def __build_48_net__(self):

        model24 = self.subnet
        network = layers.InputLayer((None, 3, 48, 48),
                                    input_var=self.__input_var__)
        network = layers.Conv2DLayer(network,
                                     num_filters=64,
                                     filter_size=(5, 5),
                                     stride=1,
                                     nonlinearity=relu)
        network = layers.batch_norm(
            layers.MaxPool2DLayer(network, pool_size=(3, 3), stride=2))
        network = layers.Conv2DLayer(network,
                                     num_filters=64,
                                     filter_size=(5, 5),
                                     stride=1,
                                     nonlinearity=relu)
        network = layers.BatchNormLayer(network)
        network = layers.MaxPool2DLayer(network, pool_size=(3, 3), stride=2)
        network = layers.DenseLayer(network, num_units=256, nonlinearity=relu)
        #network = layers.Conv2DLayer(network,num_filters=256,filter_size=(1,1),stride=1,nonlinearity=relu)
        denselayer24 = model24.net.input_layer
        network = layers.ConcatLayer([network, denselayer24])
        network = layers.DenseLayer(network, num_units=2, nonlinearity=softmax)
        return network
Example #5
    def build_network(self, ra_input_var, mc_input_var):
        print('Building raw dnn with parameters:')
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.net_opts)

        ra_network_1 = layers.InputLayer((None, 1, 3969), ra_input_var)
        ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_1', dropout=False, pad='same')
        ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_1')
        ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_2', pad='same')
        ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_2')
        ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_3', pad='same')
        ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_3')
        ra_network_1 = self.set_conv_layer(ra_network_1, 'ra_conv_4', pad='same')
        ra_network_1 = self.set_pool_layer(ra_network_1, 'ra_pool_4')
        concat_list = [ra_network_1]
        mc_input = layers.InputLayer((None, 2, MC_LENGTH), mc_input_var)
        concat_list.append(mc_input)
        network = layers.ConcatLayer(concat_list, axis=1, cropping=[None, None, 'center'])
        network = layers.BatchNormLayer(network)
        for n in self.net_opts['layer_list']:
            network = layers.DenseLayer(layers.dropout(network, p=self.net_opts['dropout_p']), 
                                            n, 
                                            nonlinearity=lasagne.nonlinearities.rectify)
        network = layers.DenseLayer(layers.dropout(network, p=self.net_opts['dropout_p']), 
                                        self.net_opts['num_class'], 
                                        nonlinearity=lasagne.nonlinearities.softmax)
        
        # print(layers.get_output_shape(network))
        self.network = network
        return self.network
Example #6
def build_TOY(x, y):
    z_p = T.concatenate((x, y), axis=1)

    net = ll.InputLayer((None, 2, 100, 100), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))
    net = ll.DenseLayer(net, 625, b=None, nonlinearity=None)
    net = ll.ReshapeLayer(net, ([0], 1, 25, 25))
    return net
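# Hedged usage sketch (assumes `ll` is lasagne.layers and `T` is theano.tensor):
import theano
x = T.tensor4('x')                      # (batch, 1, 100, 100)
y = T.tensor4('y')                      # (batch, 1, 100, 100)
toy_net = build_TOY(x, y)
toy_map = ll.get_output(toy_net)        # (batch, 1, 25, 25) response map
toy_fn = theano.function([x, y], toy_map)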
Example #7
def output_block(net, config, non_lin, verbose=True):
    """
    """
    # output setting
    out_acts = []
    for out_act in config.hyper_parameters.out_act:
        exec('from lasagne.nonlinearities import {}'.format(out_act))
        out_acts.append(eval(out_act))
    n_outs = config.hyper_parameters.n_out

    # Global Average Pooling
    last_conv_block_name = next(reversed(net))
    net['gap'] = L.GlobalPoolLayer(net[last_conv_block_name], name='gap')
    net['gap.bn'] = L.BatchNormLayer(net['gap'], name='gap.bn')
    n_features = net['gap.bn'].output_shape[-1]

    # feature Layer
    net['fc'] = L.dropout(L.batch_norm(
        L.DenseLayer(net['gap.bn'],
                     num_units=n_features,
                     nonlinearity=non_lin,
                     name='fc')),
                          name='fc.bn.do')

    # output (prediction)
    # check whether the model is for MTL or STL;
    # the target is passed as a list regardless of whether
    # it's MTL or STL (the configuration checker ensures this)
    targets = config.target
    out_layer_names = []
    for target, n_out, out_act in zip(targets, n_outs, out_acts):

        out_layer_names.append('out.{}'.format(target))

        if target == 'self':
            net[out_layer_names[-1]], inputs = build_siamese(net['fc'])
        else:
            net[out_layer_names[-1]] = L.DenseLayer(net['fc'],
                                                    num_units=n_out,
                                                    nonlinearity=out_act,
                                                    name=out_layer_names[-1])
            inputs = [net['input'].input_var]

    # make a concatenation layer just for save/load purposes
    net['IO'] = L.ConcatLayer([
        L.FlattenLayer(net[target_layer_name])
        if target == 'self' else net[target_layer_name]
        for target_layer_name in out_layer_names
    ],
                              name='IO')

    if verbose:
        print(net['gap.bn'].output_shape)
        print(net['fc'].output_shape)
        for target in targets:
            print(net['out.{}'.format(target)].output_shape)

    return net, inputs
Example #8
def BatchNormRecurrentLayer(incoming,
                            num_units,
                            nonlinearity=None,
                            gradient_steps=-1,
                            grad_clipping=0,
                            layer_type=layers.CustomRecurrentLayer,
                            name='',
                            **kwargs):
    """
    Helper method to define a Vanilla Recurrent Layer with batch normalization
    """
    input_shape = incoming.output_shape
    # Define input to hidden connections
    in_to_hid_rf = layers.InputLayer((None, ) + input_shape[2:])
    in_to_hid_rf = layers.DenseLayer(in_to_hid_rf,
                                     num_units,
                                     b=None,
                                     nonlinearity=None,
                                     name='ith_{0}'.format(name))
    in_to_hid_rf_W = in_to_hid_rf.W

    # Use batch normalization in the input to hidden connections
    in_to_hid_rf = layers.BatchNormLayer(in_to_hid_rf,
                                         name='ith_bn_{0}'.format(name))

    # Define hidden to hidden connections
    hid_to_hid_rf = layers.InputLayer((None, num_units))
    hid_to_hid_rf = layers.DenseLayer(hid_to_hid_rf,
                                      num_units,
                                      b=None,
                                      nonlinearity=None,
                                      name='hth_{0}'.format(name))

    l_r_f = layer_type(incoming,
                       input_to_hidden=in_to_hid_rf,
                       hidden_to_hidden=hid_to_hid_rf,
                       gradient_steps=gradient_steps,
                       grad_clipping=grad_clipping,
                       nonlinearity=nonlinearity,
                       name='l_r_{0}'.format(name),
                       **kwargs)

    # Make layer parameters intuitively accessible
    l_r_f.W_in_to_hid = in_to_hid_rf_W
    l_r_f.W_hid_to_hid = hid_to_hid_rf.W
    l_r_f.beta = in_to_hid_rf.beta
    l_r_f.gamma = in_to_hid_rf.gamma
    l_r_f.mean = in_to_hid_rf.mean
    l_r_f.inv_std = in_to_hid_rf.inv_std
    l_r_f.hid_init = l_r_f.hid_init
    return l_r_f
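# Hedged usage sketch (assumes `layers` is lasagne.layers; the shapes and
# hyperparameters below are illustrative only):
import lasagne
seq_in = layers.InputLayer((None, 20, 40))         # (batch, seq_len, features)
rnn = BatchNormRecurrentLayer(seq_in,
                              num_units=64,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              grad_clipping=5.0,
                              name='rnn1')
print(rnn.output_shape)                             # (None, 20, 64)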
Example #9
def build_correlation_fft(x, y, size):
    pnet = ll.InputLayer((None, 3, 101, 101), input_var=None)
    pnet = ll.BatchNormLayer(pnet)
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Pool2DLayer(pnet, (3, 3), stride=(2, 2))
    pnet = ll.Conv2DLayer(pnet, 64, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.NonlinearityLayer(
        ll.BatchNormLayer(pnet),
        nonlinearity=l.nonlinearities.LeakyRectify(0.1))
    pnet = ll.Conv2DLayer(pnet, 32, (3, 3), pad='same', nonlinearity=None)
    pnet = ll.BatchNormLayer(pnet)
    x_p, y_p = ll.get_output(pnet, x), ll.get_output(pnet, y)
    x_p, y_p = fft.rfft(x_p, 'ortho'), fft.rfft(y_p, 'ortho')

    XX, XY = T.zeros_like(x_p), T.zeros_like(y_p)
    XX = T.set_subtensor(
        XX[:, :, :, :, 0], x_p[:, :, :, :, 0] * x_p[:, :, :, :, 0] +
        x_p[:, :, :, :, 1] * x_p[:, :, :, :, 1])
    XY = T.set_subtensor(
        XY[:, :, :, :, 0], x_p[:, :, :, :, 0] * y_p[:, :, :, :, 0] +
        x_p[:, :, :, :, 1] * y_p[:, :, :, :, 1])
    XY = T.set_subtensor(
        XY[:, :, :, :, 1], x_p[:, :, :, :, 0] * y_p[:, :, :, :, 1] -
        x_p[:, :, :, :, 1] * y_p[:, :, :, :, 0])
    xx = fft.irfft(XX, 'ortho')
    xy = fft.irfft(XY, 'ortho')

    z_p = T.concatenate((xx, xy), axis=1)
    z_p *= T.constant(hanningwindow(50))
    net = ll.InputLayer((None, 64, 50, 50), input_var=z_p)
    net = ll.BatchNormLayer(net)
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
    net = ll.NonlinearityLayer(
        ll.BatchNormLayer(
            ll.Conv2DLayer(net, 64, (5, 5), pad='same', nonlinearity=None)))
    net = ll.BatchNormLayer(ll.Conv2DLayer(net, 10, (1, 1), nonlinearity=None))
    net = ll.DenseLayer(net, size**2, b=None, nonlinearity=None)
    net = ll.ReshapeLayer(net, ([0], 1, size, size))
    return pnet, net
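# Hedged usage sketch (assumes `ll` is lasagne.layers, `fft` is theano.tensor.fft,
# and `hanningwindow` from this codebase is available; names are illustrative):
import theano
import theano.tensor as T
x = T.tensor4('template')
y = T.tensor4('search')
pnet, corr_net = build_correlation_fft(x, y, 25)
response = ll.get_output(corr_net)        # (batch, 1, 25, 25) correlation response
corr_fn = theano.function([x, y], response)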
Example #10
    def build_network(self, mspec_input_var):
        print('Building spec dnn with parameters:')
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(self.net_opts)

        mspec_network = layers.InputLayer((None, 130, MC_LENGTH), mspec_input_var)
        mspec_network = layers.BatchNormLayer(mspec_network)
        for n in self.net_opts['layer_list']:
            mspec_network = layers.DenseLayer(layers.dropout(mspec_network, p=self.net_opts['dropout_p']), 
                                            n, 
                                            nonlinearity=lasagne.nonlinearities.rectify)
        mspec_network = layers.DenseLayer(layers.dropout(mspec_network, p=self.net_opts['dropout_p']), 
                                        self.net_opts['num_class'], 
                                        nonlinearity=lasagne.nonlinearities.softmax)
        
        self.network = mspec_network
        return self.network
Example #11
def build_transition_down(incoming,
                          reduction,
                          p=0.1,
                          W_init=lasagne.init.GlorotUniform(),
                          b_init=None):
    """"Builds a transition in the DenseNet model. 

    Transitions consist of the sequence: Batch Normalization, 1x1 Convolution,
    2x2 Average Pooling. The channels can be compressed by specifying 
    0 < m <= 1, where num_channels = channels * m.
    """
    num_filters = int(incoming.output_shape[1] * reduction)

    network = nn.BatchNormLayer(incoming)
    network = nn.NonlinearityLayer(network, lasagne.nonlinearities.rectify)
    network = nn.Conv2DLayer(network, num_filters, 1, W=W_init, b=b_init)
    if p > 0:
        network = nn.DropoutLayer(network, p=p)
    return nn.Pool2DLayer(network, 2, 2, mode='max')
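# Hedged usage sketch (assumes `nn` is lasagne.layers; shapes are illustrative):
td_in = nn.InputLayer((None, 64, 32, 32))
td_out = build_transition_down(td_in, reduction=0.5)   # 1x1 conv to 32 maps + 2x2 pool
print(td_out.output_shape)                              # (None, 32, 16, 16)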
Example #12
def conv2d(incoming, n_filters, filter_size, stride, pool_size, nonlinearity,
           batch_norm, name, verbose, *args, **kwargs):
    """"""
    if stride is None:
        stride = (1, 1)

    layer = L.Conv2DLayer(incoming,
                          num_filters=n_filters,
                          filter_size=filter_size,
                          stride=stride,
                          pad='same',
                          nonlinearity=None,
                          name=name)
    if batch_norm:
        name += '.bn'
        layer = L.BatchNormLayer(layer, name=name)

    name += '.nonlin'
    layer = L.NonlinearityLayer(layer, nonlinearity=nonlinearity, name=name)
    return layer
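# Hedged usage sketch (assumes `L` is lasagne.layers; arguments are illustrative):
from lasagne.nonlinearities import rectify
cv_in = L.InputLayer((None, 1, 64, 64))
cv_out = conv2d(cv_in, n_filters=32, filter_size=(3, 3), stride=None,
                pool_size=None, nonlinearity=rectify, batch_norm=True,
                name='conv1', verbose=False)
print(cv_out.output_shape)               # (None, 32, 64, 64)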
Example #13
def build_block(
    incoming,
    num_layers,
    num_filters,
    use_linear_skip=True,
    filter_size=3,
    p=0.1,
    W_init=lasagne.init.GlorotUniform(),
    b_init=None,
    nonlinearity=lasagne.nonlinearities.rectify,
):
    """Builds a block in the DenseNet model."""

    feature_maps = [incoming]

    for i in xrange(num_layers):

        if len(feature_maps) == 1:
            network = incoming
        else:
            network = nn.ConcatLayer(feature_maps, axis=1)

        network = nn.BatchNormLayer(network)
        network = nn.NonlinearityLayer(network, nonlinearity)
        network = nn.Conv2DLayer(network,
                                 num_filters,
                                 filter_size,
                                 pad='same',
                                 W=W_init,
                                 b=b_init)
        if p > 0:
            network = nn.DropoutLayer(network, p=p)
        feature_maps.append(network)

    # Whether to return all connections (vanilla DenseNet), or only the feature
    # maps created in the current block, as used in the upscale path for
    # semantic segmentation (the 100-layer Tiramisu)
    if use_linear_skip:
        return nn.ConcatLayer(feature_maps, axis=1)
    return nn.ConcatLayer(feature_maps[1:], axis=1)
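# Hedged usage sketch (assumes `nn` is lasagne.layers and that
# build_transition_down from Example #11 lives in the same module):
db_in = nn.InputLayer((None, 16, 32, 32))
db_out = build_block(db_in, num_layers=4, num_filters=12)   # 16 + 4*12 = 64 maps
print(db_out.output_shape)                                   # (None, 64, 32, 32)
td = build_transition_down(db_out, reduction=0.5)
print(td.output_shape)                                       # (None, 32, 16, 16)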
Example #14
def build_segmenter_simple_absurd_res():
    sys.setrecursionlimit(1500)
    inp = ll.InputLayer(shape=(None, 1, None, None), name='input')
    n_layers = 64  # should get a 128 x 128 receptive field
    layers = [inp]
    for i in range(n_layers):
        # every 2 layers, add a skip connection
        layers.append(
            ll.Conv2DLayer(layers[-1],
                           num_filters=8,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=linear,
                           name='conv%d' % (i + 1)))
        layers.append(ll.BatchNormLayer(layers[-1], name='bn%i' % (i + 1)))
        if (i % 2 == 0) and (i != 0):
            # skip connection: layers[-1] is the previous layer, layers[-6]
            # skips back over the previous block (3 actual layers per block)
            layers.append(
                ll.ElemwiseSumLayer([layers[-1], layers[-6]]))
        layers.append(ll.NonlinearityLayer(layers[-1], nonlinearity=rectify))

    # our output layer is also convolutional; remember that our Y is going to be the same spatial size as the input
    conv_final = ll.Conv2DLayer(layers[-1],
                                num_filters=2,
                                filter_size=(3, 3),
                                pad='same',
                                W=Orthogonal(),
                                name='conv_final',
                                nonlinearity=linear)
    # softmax over the class dimension at every pixel (conceptually: unroll to
    # (batch*n*m, n_classes), apply the softmax, and reshape back)
    softmax = Softmax4D(conv_final, name='4dsoftmax')

    return [softmax]
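# Hedged usage sketch (assumes `ll` is lasagne.layers and that the custom
# Softmax4D layer used above is importable; names are illustrative):
import theano
import theano.tensor as T
seg_layers = build_segmenter_simple_absurd_res()
x = T.tensor4('x')                                   # (batch, 1, H, W)
seg_probs = ll.get_output(seg_layers[0], inputs=x)   # per-pixel class probabilities
seg_fn = theano.function([x], seg_probs)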
Example #15
# assumed name for this maximum-merge combinator; its original name is truncated
merge_max = lambda incomings: layers.ElemwiseMergeLayer(
    incomings, merge_function=T.maximum)
concat = lambda axis=1: lambda incomings: layers.ConcatLayer(incomings,
                                                             axis=axis)

noise = lambda sigma=0.1: lambda incoming: \
  layers.GaussianNoiseLayer(incoming, sigma=sigma) if sigma is not None and sigma > 0 else incoming

nothing = lambda incoming: incoming

dense = lambda num_units, f=None: lambda incoming: \
  layers.DenseLayer(incoming, num_units=num_units, nonlinearity=(nonlinearities.LeakyRectify(0.05) if f is None else f))

dropout = lambda p=0.1, rescale=True: lambda incoming: \
  layers.DropoutLayer(incoming, p=p, rescale=rescale) if p is not None else incoming

batch_norm = lambda axes='auto': lambda incoming: layers.BatchNormLayer(
    incoming, axes=axes)


class Select(object):
    def __getitem__(self, item):
        return lambda incomings: incomings[item]


select = Select()
take = select

nonlinearity = lambda f=None: lambda incoming: layers.NonlinearityLayer(
    incoming, (nonlinearities.LeakyRectify(0.05) if f is None else f))

elementwise = lambda f=T.add: lambda incomings: layers.ElemwiseMergeLayer(
    incomings, f)
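# Hedged usage sketch composing the combinators above (assumes `layers` is
# lasagne.layers and `nonlinearities` is lasagne.nonlinearities):
net = layers.InputLayer((None, 32))
net = dense(64)(net)                  # DenseLayer with LeakyRectify(0.05)
net = batch_norm()(net)
net = dropout(0.1)(net)
net = dense(10, f=nonlinearities.softmax)(net)
print(net.output_shape)               # (None, 10)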
Example #16
    def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim,
                 story_len, patches, cnn_dim_fc, truncate_gradient,
                 learning_rate, mode, answer_module, memory_hops, batch_size,
                 l2, normalize_attention, batch_norm, dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.data_dir = data_dir
        self.truncate_gradient = truncate_gradient
        self.learning_rate = learning_rate

        self.trng = RandomStreams(1234)

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.cnn_dim = cnn_dim
        self.cnn_dim_fc = cnn_dim_fc
        self.story_len = story_len
        self.patches = patches
        self.mode = mode
        self.answer_module = answer_module
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.vocab, self.ivocab = self._load_vocab(self.data_dir)

        self.train_story = None
        self.test_story = None
        self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind_lmdb(
            self.data_dir, 'train')
        self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind_lmdb(
            self.data_dir, 'val')

        self.train_story = self.train_dict_story.keys()
        self.test_story = self.test_dict_story.keys()
        self.vocab_size = len(self.vocab)

        # This is the local patch of each image.
        self.input_var = T.tensor4(
            'input_var')  # (batch_size, seq_len, patches, cnn_dim)
        self.q_var = T.tensor3(
            'q_var')  # Now it's batch * story_len * image_size.
        self.answer_var = T.ivector(
            'answer_var')  # answer of example in minibatch
        self.answer_mask = T.matrix('answer_mask')
        self.answer_idx = T.imatrix('answer_idx')  # batch x seq
        self.answer_inp_var = T.tensor3(
            'answer_inp_var')  # answer of example in minibatch

        print "==> building input module"
        # It's very simple now: the input module just needs to map from cnn_dim to dim.
        logging.info('self.cnn_dim = %d', self.cnn_dim)
        logging.info('self.cnn_dim_fc = %d', self.cnn_dim_fc)
        logging.info('self.dim = %d', self.dim)
        self.W_q_emb_in = nn_utils.normal_param(std=0.1,
                                                shape=(self.dim,
                                                       self.cnn_dim_fc))
        self.b_q_emb_in = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))

        q_var_shuffled = self.q_var.dimshuffle(1, 2, 0)  # seq x cnn x batch.

        def _dot(x, W, b):
            return T.tanh(T.dot(W, x) + b.dimshuffle(0, 'x'))

        q_var_shuffled_emb, _ = theano.scan(
            fn=_dot,
            sequences=q_var_shuffled,
            non_sequences=[self.W_q_emb_in, self.b_q_emb_in])
        #print 'q_var_shuffled_emb', q_var_shuffled_emb.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')})
        q_var_emb = q_var_shuffled_emb.dimshuffle(2, 0,
                                                  1)  # batch x seq x emb_size
        q_var_emb_ext = q_var_emb.dimshuffle(0, 'x', 1, 2)
        q_var_emb_ext = T.repeat(q_var_emb_ext, q_var_emb.shape[1],
                                 1)  # batch x seq x seq x emb_size
        q_var_emb_rhp = T.reshape(
            q_var_emb,
            (q_var_emb.shape[0] * q_var_emb.shape[1], q_var_emb.shape[2]))
        q_var_emb_ext_rhp = T.reshape(
            q_var_emb_ext, (q_var_emb_ext.shape[0] * q_var_emb_ext.shape[1],
                            q_var_emb_ext.shape[2], q_var_emb_ext.shape[3]))
        q_var_emb_ext_rhp = q_var_emb_ext_rhp.dimshuffle(0, 2, 1)
        q_idx = T.arange(self.story_len).dimshuffle('x', 0)
        q_idx = T.repeat(q_idx, self.batch_size, axis=0)
        q_idx = T.reshape(q_idx, (q_idx.shape[0] * q_idx.shape[1], ))

        self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim,
                                                         self.cnn_dim))
        self.b_inp_emb_in = nn_utils.constant_param(value=0.0,
                                                    shape=(self.dim, ))

        inp_rhp = T.reshape(
            self.input_var,
            (self.batch_size * self.story_len * self.patches, self.cnn_dim))
        inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0)
        inp_rhp_emb = T.dot(
            self.W_inp_emb_in,
            inp_rhp_dimshuffled) + self.b_inp_emb_in.dimshuffle(0, 'x')
        inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0)
        inp_emb_raw = T.reshape(
            inp_rhp_emb_dimshuffled,
            (self.batch_size * self.story_len, self.patches, self.cnn_dim))
        inp_emb = T.tanh(
            inp_emb_raw
        )  # Just follow the paper DMN for visual and textual QA.

        self.inp_c = inp_emb.dimshuffle(1, 2, 0)

        logging.info('building question module')
        self.W_qf_res_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim, self.dim))
        self.W_qf_res_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_qf_upd_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim, self.dim))
        self.W_qf_upd_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_qf_hid_in = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim, self.dim))
        self.W_qf_hid_hid = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        inp_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))

        q_var_shuffled_emb_reversed = q_var_shuffled_emb[::-1, :, :]  # seq x emb_size x batch
        q_glb, _ = theano.scan(fn=self.q_gru_step_forward,
                               sequences=q_var_shuffled_emb_reversed,
                               outputs_info=[T.zeros_like(inp_dummy)])
        q_glb_shuffled = q_glb.dimshuffle(2, 0,
                                          1)  # batch_size * seq_len * dim
        q_glb_last = q_glb_shuffled[:, -1, :]  # batch_size * dim

        q_net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                         self.dim),
                                  input_var=q_var_emb_rhp)
        if self.batch_norm:
            q_net = layers.BatchNormLayer(incoming=q_net)
        if self.dropout > 0 and self.mode == 'train':
            q_net = layers.DropoutLayer(q_net, p=self.dropout)
        self.q_q = layers.get_output(q_net).dimshuffle(1, 0)

        #print "==> creating parameters for memory module"
        logging.info('creating parameters for memory module')
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_update1 = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.dim * 2))
        self.b_mem_upd1 = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))
        self.W_mem_update2 = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.dim * 2))
        self.b_mem_upd2 = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))
        self.W_mem_update3 = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.dim * 2))
        self.b_mem_upd3 = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))

        self.W_mem_update = [
            self.W_mem_update1, self.W_mem_update2, self.W_mem_update3
        ]
        self.b_mem_update = [self.b_mem_upd1, self.b_mem_upd2, self.b_mem_upd3]

        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))
        logging.info(
            '==> building episodic memory module (fixed number of steps: %d)',
            self.memory_hops)
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            #m = printing.Print('mem')(memory[iter-1])
            current_episode = self.new_episode(memory[iter - 1])
            # Replace GRU with ReLU activation + MLP.
            c = T.concatenate([memory[iter - 1], current_episode], axis=0)
            cur_mem = T.dot(self.W_mem_update[iter - 1],
                            c) + self.b_mem_update[iter - 1].dimshuffle(
                                0, 'x')
            memory.append(T.nnet.relu(cur_mem))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                       self.dim),
                                input_var=last_mem_raw)

        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        print "==> building answer module"

        answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
        # Sounds good. Now, we need to map last_mem to a new space.
        self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
        self.b_mem_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim,
                                                      self.vocab_size + 1))
        self.b_inp_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        def _dot2(x, W, b):
            #return  T.tanh(T.dot(W, x) + b.dimshuffle(0,'x'))
            return T.dot(W, x) + b.dimshuffle(0, 'x')

        answer_inp_var_shuffled_emb, _ = theano.scan(
            fn=_dot2,
            sequences=answer_inp_var_shuffled,
            non_sequences=[self.W_inp_emb,
                           self.b_inp_emb])  # seq x dim x batch

        init_ans = T.concatenate([self.q_q, last_mem],
                                 axis=0)  # dim x (batch x self.story_len)

        mem_ans = T.dot(self.W_mem_emb, init_ans) + self.b_mem_emb.dimshuffle(
            0, 'x')  # dim x (batchsize x self.story_len)
        #mem_ans_dim = mem_ans.dimshuffle('x',0,1)
        mem_ans_rhp = T.reshape(mem_ans.dimshuffle(
            1, 0), (self.batch_size, self.story_len, mem_ans.shape[0]))
        mem_ans_dim = mem_ans_rhp.dimshuffle(1, 2, 0)
        answer_inp = answer_inp_var_shuffled_emb
        #answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0) #seq + 1 x dim x (batch-size x self.story+len)
        # Now, each answer got its input, our next step is to obtain the sequences.
        answer_inp_shu = answer_inp.dimshuffle(2, 0, 1)
        answer_inp_shu_rhp = T.reshape(answer_inp_shu, (self.batch_size, self.story_len, answer_inp_shu.shape[1],\
                answer_inp_shu.shape[2]))

        answer_inp = answer_inp_shu_rhp.dimshuffle(
            1, 2, 3, 0)  # story_len x seq + 1 x dim x batch_size

        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size + 1, self.dim))

        self.W_ans_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_map = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
        self.b_ans_map = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        results = None
        r = None

        dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))
        for i in range(self.story_len):
            answer_inp_i = answer_inp[i, :]  # seq + 1 x dim x batch_size
            mem_ans_dim_i = mem_ans_dim[i, :]  # dim x batch_size
            if i == 0:
                q_glb_inp = q_glb_last.dimshuffle('x', 1,
                                                  0)  #1 x dim x batch_size
                answer_inp_i = T.concatenate([q_glb_inp, answer_inp_i], axis=0)

                init_h = T.concatenate([dummy, mem_ans_dim_i], axis=0)
                init_h = T.dot(self.W_ans_map,
                               init_h) + self.b_ans_map.dimshuffle(0, 'x')
                init_h = T.tanh(init_h)
                r, _ = theano.scan(fn=self.answer_gru_step,
                                   sequences=answer_inp_i,
                                   truncate_gradient=self.truncate_gradient,
                                   outputs_info=[init_h])
                r = r[1:, :]  # get rid of the first glob one.
                results = r.dimshuffle('x', 0, 1, 2)
            else:
                prev_h = r[self.answer_idx[:, i], :, T.arange(self.batch_size)]
                h_ = T.concatenate([prev_h.dimshuffle(1, 0), mem_ans_dim_i],
                                   axis=0)
                h_ = T.dot(self.W_ans_map, h_) + self.b_ans_map.dimshuffle(
                    0, 'x')
                h_ = T.tanh(h_)

                r, _ = theano.scan(fn=self.answer_gru_step,
                                   sequences=answer_inp_i,
                                   truncate_gradient=self.truncate_gradient,
                                   outputs_info=[h_])
                results = T.concatenate([results, r.dimshuffle('x', 0, 1, 2)])
        ## results: story_len x seq+1 x dim x batch_size
        results = results.dimshuffle(3, 0, 1, 2)
        results = T.reshape(results, (self.batch_size * self.story_len,
                                      results.shape[2], results.shape[3]))
        results = results.dimshuffle(1, 2, 0)  # seq_len x dim x (batch x seq)

        # Assume there is a start token
        #print 'results', results.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'),
        #    self.q_var: np.random.rand(2,5, 4096).astype('float32'),
        #    self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'),
        #    self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')})

        #results = results[1:-1,:,:] # get rid of the last token as well as the first one (image)
        #print results.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(3, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')}, on_unused_input='ignore')

        # Now, we need to transform it to the probabilities.

        prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                              sequences=results,
                              non_sequences=self.W_a)
        #print 'prob', prob.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'),
        #    self.q_var: np.random.rand(2,5, 4096).astype('float32'),
        #    self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'),
        #    self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')})

        #preds = prob[1:,:,:]
        #prob = prob[1:-1,:,:]
        preds = prob
        prob = prob[:-1, :, :]

        prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab
        preds_shuffled = preds.dimshuffle(2, 0, 1)

        logging.info("prob shape.")
        #print prob.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(3, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')})

        n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
        n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1]

        prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
        preds_rhp = T.reshape(preds_shuffled,
                              (n_preds, preds_shuffled.shape[2]))

        prob_sm = nn_utils.softmax_(prob_rhp)
        preds_sm = nn_utils.softmax_(preds_rhp)
        self.prediction = prob_sm  # this one is for the training.

        #print 'prob_sm', prob_sm.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')})
        #print 'lbl', loss_vec.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')})
        # This one is for the beamsearch.
        self.pred = T.reshape(
            preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1],
                       preds_shuffled.shape[2]))

        mask = T.reshape(self.answer_mask, (n, ))
        lbl = T.reshape(self.answer_var, (n, ))

        self.params = [
            self.W_inp_emb_in,
            self.b_inp_emb_in,
            self.W_q_emb_in,
            self.b_q_emb_in,
            #self.W_glb_att_1, self.W_glb_att_2, self.b_glb_att_1, self.b_glb_att_2,
            self.W_qf_res_in,
            self.W_qf_res_hid,
            self.b_qf_res,
            self.W_qf_upd_in,
            self.W_qf_upd_hid,
            self.b_qf_upd,
            self.W_qf_hid_in,
            self.W_qf_hid_hid,
            self.b_qf_hid,
            self.W_mem_emb,
            self.W_inp_emb,
            self.b_mem_emb,
            self.b_inp_emb,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            #self.W_mem_emb, self.W_inp_emb,self.b_mem_emb, self.b_inp_emb,
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a,
            self.W_ans_res_in,
            self.W_ans_res_hid,
            self.b_ans_res,
            self.W_ans_upd_in,
            self.W_ans_upd_hid,
            self.b_ans_upd,
            self.W_ans_hid_in,
            self.W_ans_hid_hid,
            self.b_ans_hid,
            self.W_ans_map,
            self.b_ans_map,
        ]
        self.params += self.W_mem_update
        self.params += self.b_mem_update

        print "==> building loss layer and computing updates"
        reward_prob = prob_sm[T.arange(n), lbl]
        reward_prob = T.reshape(
            reward_prob, (prob_shuffled.shape[0], prob_shuffled.shape[1]))
        #reward_prob = printing.Print('mean_r')(reward_prob)

        loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
        #loss_vec = T.nnet.categorical_crossentropy(prob_sm, T.flatten(self.answer_var))
        #print 'loss_vec', loss_vec.shape.eval({prob_sm: np.random.rand(39,8900).astype('float32'),
        #    lbl: np.random.rand(39,).astype('int32')})

        self.loss_ce = (mask * loss_vec).sum() / mask.sum()
        print 'loss_ce', self.loss_ce.eval({
            prob_sm:
            np.random.rand(39, 8900).astype('float32'),
            lbl:
            np.random.rand(39, ).astype('int32'),
            mask:
            np.random.rand(39, ).astype('float32')
        })

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2
        grads = T.grad(self.loss, wrt=self.params, disconnected_inputs='raise')

        updates = lasagne.updates.adadelta(grads,
                                           self.params,
                                           learning_rate=self.learning_rate)

        if self.mode == 'train':
            logging.info("compiling train_fn")
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.answer_mask, self.answer_inp_var, self.answer_idx
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        logging.info("compiling test_fn")
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.answer_mask,
            self.answer_inp_var, self.answer_idx
        ],
                                       outputs=[self.prediction, self.loss])

        logging.info("compiling pred_fn")
        self.pred_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_inp_var, self.answer_idx
        ],
                                       outputs=[self.pred])
Example #17
    def __init__(
        self,
        image_shape,
        filter_shape,
        num_class,
        conv_type,
        kernel_size,
        kernel_pool_size,
        dropout_rate,
    ):
        """

        """
        self.filter_shape = filter_shape
        self.n_visible = numpy.prod(image_shape)
        self.n_layers = len(filter_shape)
        self.rng = RandomStreams(123)
        self.x = T.matrix()
        self.y = T.ivector()

        self.conv_layers = []

        NoiseLayer = layers.DropoutLayer

        dropout_rate = float(dropout_rate)

        self.l_input = layers.InputLayer((None, self.n_visible), self.x)
        this_layer = layers.ReshapeLayer(self.l_input, ([0], ) + image_shape)

        for l in range(self.n_layers):
            activation = lasagne.nonlinearities.rectify
            if len(filter_shape[l]) == 3:
                if conv_type == 'double' and filter_shape[l][1] > kernel_size:
                    this_layer = DoubleConvLayer(
                        this_layer,
                        filter_shape[l][0],
                        filter_shape[l][1:],
                        pad='same',
                        nonlinearity=activation,
                        kernel_size=kernel_size,
                        kernel_pool_size=kernel_pool_size)
                    this_layer = layers.batch_norm(this_layer)
                elif conv_type == 'maxout':
                    this_layer = layers.Conv2DLayer(this_layer,
                                                    filter_shape[l][0],
                                                    filter_shape[l][1:],
                                                    b=None,
                                                    pad='same',
                                                    nonlinearity=None)
                    this_layer = layers.FeaturePoolLayer(
                        this_layer, pool_size=kernel_pool_size**2)
                    this_layer = layers.BatchNormLayer(this_layer)
                    this_layer = layers.NonlinearityLayer(
                        this_layer, activation)

                elif conv_type == 'cyclic':
                    this_layers = []
                    this_layers.append(
                        layers.Conv2DLayer(this_layer,
                                           filter_shape[l][0],
                                           filter_shape[l][1:],
                                           b=None,
                                           pad='same',
                                           nonlinearity=None))
                    for _ in range(3):
                        W = this_layers[-1].W.dimshuffle(0, 1, 3,
                                                         2)[:, :, :, ::-1]
                        this_layers.append(
                            layers.Conv2DLayer(this_layer,
                                               filter_shape[l][0],
                                               filter_shape[l][1:],
                                               W=W,
                                               b=None,
                                               pad='same',
                                               nonlinearity=None))
                    this_layer = layers.ElemwiseMergeLayer(
                        this_layers, T.maximum)
                    this_layer = layers.BatchNormLayer(this_layer)
                    this_layer = layers.NonlinearityLayer(
                        this_layer, activation)

                elif conv_type == 'standard' \
                     or (conv_type == 'double' and filter_shape[l][1] <= kernel_size):
                    this_layer = layers.Conv2DLayer(this_layer,
                                                    filter_shape[l][0],
                                                    filter_shape[l][1:],
                                                    pad='same',
                                                    nonlinearity=activation)
                    this_layer = layers.batch_norm(this_layer)
                else:
                    raise NotImplementedError

                self.conv_layers.append(this_layer)

            elif len(filter_shape[l]) == 2:
                this_layer = layers.MaxPool2DLayer(this_layer, filter_shape[l])
                this_layer = NoiseLayer(this_layer, dropout_rate)
            elif len(filter_shape[l]) == 1:
                raise NotImplementedError

        self.top_conv_layer = this_layer
        this_layer = layers.GlobalPoolLayer(this_layer, T.mean)
        self.clf_layer = layers.DenseLayer(this_layer,
                                           num_class,
                                           W=lasagne.init.Constant(0.),
                                           nonlinearity=T.nnet.softmax)

        self.params = layers.get_all_params(self.clf_layer, trainable=True)

        self.params_all = layers.get_all_params(self.clf_layer)
Example #18
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units

        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        network = layers.InputLayer(shape=(None, 1, 128, 858),
                                    input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=16,
                                     filter_size=(7, 7),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(5, 5),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var: example}).shape
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=(2, 1),
                                        pad=2)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        self.params = layers.get_all_params(network, trainable=True)

        output = layers.get_output(network)
        output = output.transpose((0, 3, 1, 2))
        output = output.flatten(ndim=3)

        # NOTE: these constants are the shapes of the last pool layer; they could
        # be symbolic, but explicit values are better for optimization
        num_channels = 32
        filter_W = 852
        filter_H = 8

        # InputLayer
        network = layers.InputLayer(shape=(None, filter_W,
                                           num_channels * filter_H),
                                    input_var=output)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # GRULayer
        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)
        print layers.get_output(network).eval({self.input_var: example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var: example}).shape

        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        #print "==> param shapes", [x.eval().shape for x in self.params]

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(
                self.params, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) # good one
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
Ejemplo n.º 19
0
    def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim,
                 story_len, mode, answer_module, memory_hops, batch_size, l2,
                 normalize_attention, batch_norm, dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.data_dir = data_dir

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.cnn_dim = cnn_dim
        self.story_len = story_len
        self.mode = mode
        self.answer_module = answer_module
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.vocab, self.ivocab = self._load_vocab(self.data_dir)

        self.train_story = None
        self.test_story = None
        self.train_dict_story, self.train_lmdb_env_fc = self._process_input_sind_lmdb(
            self.data_dir, 'train')
        self.val_dict_story, self.val_lmdb_env_fc = self._process_input_sind_lmdb(
            self.data_dir, 'val')
        self.test_dict_story, self.test_lmdb_env_fc = self._process_input_sind_lmdb(
            self.data_dir, 'test')

        self.train_story = self.train_dict_story.keys()
        self.val_story = self.val_dict_story.keys()
        self.test_story = self.test_dict_story.keys()
        self.vocab_size = len(self.vocab)

        self.q_var = T.tensor3(
            'q_var')  # Now, it's a batch * story_len * image_size.
        self.answer_var = T.imatrix(
            'answer_var')  # answer of example in minibatch
        self.answer_mask = T.matrix('answer_mask')
        self.answer_inp_var = T.tensor3(
            'answer_inp_var')  # answer of example in minibatch

        print "==> building input module"
        # It's very simple now, the input module just needs to map from cnn_dim to dim.
        logging.info('self.cnn_dim = %d', self.cnn_dim)
        self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim,
                                                         self.cnn_dim))
        self.b_inp_emb_in = nn_utils.constant_param(value=0.0,
                                                    shape=(self.dim, ))

        q_seq = self.q_var.dimshuffle(0, 'x', 1, 2)
        q_seq_rpt = T.repeat(q_seq, self.story_len, 1)
        q_seq_rhp = T.reshape(q_seq_rpt,
                              (q_seq_rpt.shape[0] * q_seq_rpt.shape[1],
                               q_seq_rpt.shape[2], q_seq_rpt.shape[3]))

        inp_var_shuffled = q_seq_rhp.dimshuffle(1, 2, 0)  #seq x cnn x batch

        def _dot(x, W, b):
            return T.dot(W, x) + b.dimshuffle(0, 'x')

        inp_c_hist, _ = theano.scan(
            fn=_dot,
            sequences=inp_var_shuffled,
            non_sequences=[self.W_inp_emb_in, self.b_inp_emb_in])
        #inp_c_hist,_ = theano.scan(fn = _dot, sequences=self.input_var, non_sequences = [self.W_inp_emb_in, self.b_inp_emb_in])

        self.inp_c = inp_c_hist  # seq x emb x batch
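        # Note (illustrative, not part of the original): the scan above applies the same
        # affine map at every time step, so an equivalent vectorized form would be roughly
        #   T.tensordot(self.W_inp_emb_in, inp_var_shuffled, axes=[[1], [1]])
        #       .dimshuffle(1, 0, 2) + self.b_inp_emb_in.dimshuffle('x', 0, 'x')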

        print "==> building question module"
        # Now, share the parameter with the input module.
        q_var_shuffled = self.q_var.dimshuffle(
            1, 2, 0)  # now: story_len * image_size * batch_size

        # This is the RNN used to produce the Global Glimpse
        self.W_inpf_res_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.cnn_dim))
        self.W_inpf_res_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.dim, self.dim))
        self.b_inpf_res = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))

        self.W_inpf_upd_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.cnn_dim))
        self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.dim, self.dim))
        self.b_inpf_upd = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))

        self.W_inpf_hid_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim,
                                                          self.cnn_dim))
        self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.dim, self.dim))
        self.b_inpf_hid = nn_utils.constant_param(value=0.0,
                                                  shape=(self.dim, ))
        inp_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))

        q_glb, _ = theano.scan(fn=self.input_gru_step_forward,
                               sequences=q_var_shuffled,
                               outputs_info=[T.zeros_like(inp_dummy)])
        q_glb_shuffled = q_glb.dimshuffle(2, 0,
                                          1)  # batch_size * seq_len * dim
        q_glb_last = q_glb_shuffled[:, -1, :]  # batch_size * dim

        # Now, we also need to build the individual model.
        #q_var_shuffled = self.q_var.dimshuffle(1,0)
        q_single = T.reshape(
            self.q_var,
            (self.q_var.shape[0] * self.q_var.shape[1], self.q_var.shape[2]))
        q_single_shuffled = q_single.dimshuffle(1,
                                                0)  #cnn_dim x batch_size * 5

        # q_hist: dim x (batch_size * 5)
        q_hist = T.dot(self.W_inp_emb_in,
                       q_single_shuffled) + self.b_inp_emb_in.dimshuffle(
                           0, 'x')
        q_hist_shuffled = q_hist.dimshuffle(1, 0)  # batch_size * 5 x dim

        if self.batch_norm:
            logging.info("Using batch normalization.")
        q_net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                         self.dim),
                                  input_var=q_hist_shuffled)
        if self.batch_norm:
            q_net = layers.BatchNormLayer(incoming=q_net)
        if self.dropout > 0 and self.mode == 'train':
            q_net = layers.DropoutLayer(q_net, p=self.dropout)
        #last_mem = layers.get_output(q_net).dimshuffle((1, 0))
        self.q_q = layers.get_output(q_net).dimshuffle(1, 0)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))
        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            #m = printing.Print('mem')(memory[iter-1])
            current_episode = self.new_episode(memory[iter - 1])
            #current_episode = self.new_episode(m)
            #current_episode = printing.Print('current_episode')(current_episode)
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                       self.dim),
                                input_var=last_mem_raw)

        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        print "==> building answer module"

        answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
        # Sounds good. Now, we need to map last_mem to a new space.
        self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 3))
        self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim,
                                                      self.vocab_size + 1))

        def _dot2(x, W):
            return T.dot(W, x)

        answer_inp_var_shuffled_emb, _ = theano.scan(
            fn=_dot2,
            sequences=answer_inp_var_shuffled,
            non_sequences=self.W_inp_emb)  # seq x dim x batch

        # dim x batch_size * 5
        q_glb_dim = q_glb_last.dimshuffle(0, 'x', 1)  # batch_size * 1 * dim
        q_glb_repmat = T.repeat(q_glb_dim, self.story_len,
                                1)  # batch_size * len * dim
        q_glb_rhp = T.reshape(q_glb_repmat,
                              (q_glb_repmat.shape[0] * q_glb_repmat.shape[1],
                               q_glb_repmat.shape[2]))
        init_ans = T.concatenate(
            [self.q_q, last_mem,
             q_glb_rhp.dimshuffle(1, 0)], axis=0)

        mem_ans = T.dot(self.W_mem_emb, init_ans)  # dim x batchsize.
        mem_ans_dim = mem_ans.dimshuffle('x', 0, 1)
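        # Prepend the fused memory/question/global embedding as the first decoder input,
        # so the answer GRU is conditioned on the story before it reads any word.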
        answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb],
                                   axis=0)

        dummy = theano.shared(
            np.zeros((self.dim, self.batch_size * self.story_len),
                     dtype=floatX))

        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size + 1, self.dim))

        self.W_ans_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        results, _ = theano.scan(fn=self.answer_gru_step,
                                 sequences=answer_inp,
                                 outputs_info=[dummy])
        # Assume there is a start token
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
        #results = results[1:-1,:,:] # get rid of the last token as well as the first one (image)
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')

        # Now, we need to transform it to the probabilities.

        prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                              sequences=results,
                              non_sequences=self.W_a)
        preds = prob[1:, :, :]
        prob = prob[1:-1, :, :]

        prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab
        preds_shuffled = preds.dimshuffle(2, 0, 1)

        logging.info("prob shape.")
        #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

        n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
        n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1]

        prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
        preds_rhp = T.reshape(preds_shuffled,
                              (n_preds, preds_shuffled.shape[2]))

        prob_sm = nn_utils.softmax_(prob_rhp)
        preds_sm = nn_utils.softmax_(preds_rhp)
        self.prediction = prob_sm  # this one is for the training.

        # This one is for the beamsearch.
        self.pred = T.reshape(
            preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1],
                       preds_shuffled.shape[2]))

        mask = T.reshape(self.answer_mask, (n, ))
        lbl = T.reshape(self.answer_var, (n, ))

        self.params = [
            self.W_inp_emb_in,
            self.b_inp_emb_in,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        self.params = self.params + [
            self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
            self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
            self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
            self.W_mem_emb, self.W_inp_emb
        ]

        print "==> building loss layer and computing updates"
        loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
        self.loss_ce = (mask * loss_vec).sum() / mask.sum()

        #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.q_var, self.answer_var, self.answer_mask,
                    self.answer_inp_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var
        ],
                                       outputs=[self.prediction, self.loss])

        print "==> compiling pred_fn"
        self.pred_fn = theano.function(
            inputs=[self.q_var, self.answer_inp_var], outputs=[self.pred])
Ejemplo n.º 20
0
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, batch_size, l2, normalize_attention, batch_norm,
                 dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.vocab = {}
        self.ivocab = {}

        self.type = "batch"

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.tensor3(
            'input_var')  # (batch_size, seq_len, glove_dim)
        self.q_var = T.tensor3('question_var')  # as self.input_var
        self.answer_var = T.ivector(
            'answer_var')  # answer of example in minibatch
        self.fact_count_var = T.ivector(
            'fact_count_var')  # number of facts in the example of minibatch
        self.input_mask_var = T.imatrix(
            'input_mask_var')  # (batch_size, indices)

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        input_var_shuffled = self.input_var.dimshuffle(1, 2, 0)
        inp_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))
        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=input_var_shuffled,
                                       outputs_info=T.zeros_like(inp_dummy))

        inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1)

        inp_c_list = []
        inp_c_mask_list = []
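        # For each example, gather the GRU states at the fact positions given by
        # input_mask_var, zero-pad to the common mask length, and build a binary mask
        # marking which positions hold real facts.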
        for batch_index in range(self.batch_size):
            taken = inp_c_history_shuffled[batch_index].take(
                self.input_mask_var[
                    batch_index, :self.fact_count_var[batch_index]],
                axis=0)
            inp_c_list.append(
                T.concatenate([
                    taken,
                    T.zeros((self.input_mask_var.shape[1] - taken.shape[0],
                             self.dim), floatX)
                ]))
            inp_c_mask_list.append(
                T.concatenate([
                    T.ones((taken.shape[0], ), np.int32),
                    T.zeros((self.input_mask_var.shape[1] - taken.shape[0], ),
                            np.int32)
                ]))

        self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0)
        inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0)

        q_var_shuffled = self.q_var.dimshuffle(1, 2, 0)
        q_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))
        q_q_history, _ = theano.scan(fn=self.input_gru_step,
                                     sequences=q_var_shuffled,
                                     outputs_info=T.zeros_like(q_dummy))
        self.q_q = q_q_history[-1]

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size, self.dim),
                                input_var=last_mem_raw)
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
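                # One decoding step: feed the previous output distribution concatenated
                # with the question encoding through a GRU, then project with W_a and
                # softmax to get the next word distribution.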
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(
                np.zeros((self.vocab_size, self.batch_size), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,  # use the local answer_step defined above
                outputs_info=[last_mem, T.zeros_like(dummy)],  #(last_mem, y)
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        self.prediction = self.prediction.dimshuffle(1, 0)

        self.params = [
            self.W_inp_res_in,
            self.W_inp_res_hid,
            self.b_inp_res,
            self.W_inp_upd_in,
            self.W_inp_upd_hid,
            self.b_inp_upd,
            self.W_inp_hid_in,
            self.W_inp_hid_hid,
            self.b_inp_hid,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(self.prediction,
                                                       self.answer_var).mean()

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.fact_count_var, self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.fact_count_var,
            self.input_mask_var
        ],
                                       outputs=[self.prediction, self.loss])
Ejemplo n.º 21
0
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 l2, mode, rnn_num_units, batch_norm, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units
        self.batch_norm = batch_norm

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        # scale inputs (assumed to be in [0, 1]) to the range [-1, 1]
        input_var_norm = 2 * self.input_var - 1

        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 858, 256),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        # InputLayer
        network = layers.InputLayer(shape=(None, 858, 256),
                                    input_var=input_var_norm)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # GRULayer
        network = layers.GRULayer(incoming=network, num_units=self.num_units)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({
                self.input_var: example
            }).shape

        # GRULayer
        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var: example}).shape

        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
                network, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
Ejemplo n.º 22
0
def build_kpextractor128():
    inp = ll.InputLayer(shape=(None, 1, 128, 128), name='input')
    # alternate pooling and conv layers to minimize parameters
    filter_pad = lambda x, y: (x // 2, y // 2)
    filter3 = (3, 3)
    same_pad3 = filter_pad(*filter3)
    conv1 = ll.Conv2DLayer(inp,
                           num_filters=16,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv1')
    mp1 = ll.MaxPool2DLayer(conv1, 2, stride=2)  # now down to 64 x 64
    bn1 = ll.BatchNormLayer(mp1)
    conv2 = ll.Conv2DLayer(bn1,
                           num_filters=32,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv2')
    mp2 = ll.MaxPool2DLayer(conv2, 2, stride=2)  # now down to 32 x 32
    bn2 = ll.BatchNormLayer(mp2)
    conv3 = ll.Conv2DLayer(bn2,
                           num_filters=64,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv3')
    mp3 = ll.MaxPool2DLayer(conv3, 2, stride=2)  # now down to 16 x 16
    bn3 = ll.BatchNormLayer(mp3)
    conv4 = ll.Conv2DLayer(bn3,
                           num_filters=128,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv4')
    mp4 = ll.MaxPool2DLayer(conv4, 2, stride=2)  # now down to 8 x 8
    bn4 = ll.BatchNormLayer(mp4)
    conv5 = ll.Conv2DLayer(bn4,
                           num_filters=256,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv5')
    mp5 = ll.MaxPool2DLayer(conv5, 2, stride=2)  # down to 4 x 4
    bn5 = ll.BatchNormLayer(mp5)

    conv6 = ll.Conv2DLayer(bn5,
                           num_filters=512,
                           filter_size=filter3,
                           pad=same_pad3,
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv6')
    mp6 = ll.MaxPool2DLayer(conv6, 2, stride=2)  # down to 2 x 2
    bn6 = ll.BatchNormLayer(mp6)

    # now let's bring it down to FC layers that take in the 2x2x512 bn6 output
    fc1 = ll.DenseLayer(bn6, num_units=256, nonlinearity=rectify)
    bn1_fc = ll.BatchNormLayer(fc1)
    #dp1 = ll.DropoutLayer(bn1, p=0.5)
    fc2 = ll.DenseLayer(bn1_fc, num_units=64, nonlinearity=rectify)
    #dp2 = ll.DropoutLayer(fc2, p=0.5)
    bn2_fc = ll.BatchNormLayer(fc2)
    out = ll.DenseLayer(bn2_fc, num_units=6, nonlinearity=linear)
    out_rs = ll.ReshapeLayer(out, ([0], 3, 2))

    return out_rs
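
# Illustrative usage sketch (not part of the original example): one way the keypoint
# extractor above could be trained with a squared-error loss. The function name,
# variable names and hyper-parameters are hypothetical; only standard Theano/Lasagne
# calls are used.
def _train_fn_sketch():
    import theano
    import theano.tensor as T
    import lasagne
    import lasagne.layers as ll

    kp_net = build_kpextractor128()
    images = T.tensor4('images')      # (batch, 1, 128, 128) grayscale crops
    targets = T.tensor3('targets')    # (batch, 3, 2) keypoint coordinates
    preds = ll.get_output(kp_net, inputs=images)
    loss = lasagne.objectives.squared_error(preds, targets).mean()
    params = ll.get_all_params(kp_net, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
    return theano.function([images, targets], loss, updates=updates)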
Ejemplo n.º 23
0
    def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient,
                 learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches,
                 mode, answer_module, memory_hops, batch_size, l2,
                 normalize_attention, batch_norm, dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.data_dir = data_dir
        self.learning_rate = learning_rate

        self.truncate_gradient = truncate_gradient
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.cnn_dim = cnn_dim
        self.cnn_dim_fc = cnn_dim_fc
        self.story_len = story_len
        self.mode = mode
        self.patches = patches
        self.answer_module = answer_module
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        #self.vocab, self.ivocab = self._load_vocab(self.data_dir)
        self.vocab, self.ivocab = self._ext_vocab_from_word2vec()

        self.train_story = None
        self.test_story = None
        self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind(
            self.data_dir, 'train')
        self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind(
            self.data_dir, 'val')

        self.train_story = self.train_dict_story.keys()
        self.test_story = self.test_dict_story.keys()
        self.vocab_size = len(self.vocab)

        # Since this is pretty expensive, we will pass a story each time.
        # We assume that the input has been processed such that the sequence of patches
        # follows a snake-like path.

        self.input_var = T.tensor4(
            'input_var')  # (batch_size, seq_len, patches, cnn_dim)
        self.q_var = T.matrix('q_var')  # Now, it's a batch * image_size.
        self.answer_var = T.imatrix(
            'answer_var')  # answer of example in minibatch
        self.answer_mask = T.matrix('answer_mask')
        self.answer_inp_var = T.tensor3(
            'answer_inp_var')  # answer of example in minibatch

        print "==> building input module"
        self.W_inp_emb_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim,
                                                         self.cnn_dim))
        #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        # First, we embed the visual features before sending it to the bi-GRUs.

        inp_rhp = T.reshape(
            self.input_var,
            (self.batch_size * self.story_len * self.patches, self.cnn_dim))
        inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0)
        inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled)
        inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0)
        inp_emb_raw = T.reshape(
            inp_rhp_emb_dimshuffled,
            (self.batch_size, self.story_len, self.patches, self.cnn_dim))
        inp_emb = T.tanh(
            inp_emb_raw
        )  # Just following the DMN paper for visual and textual QA.

        # Now, we use a bi-directional GRU to produce the input.
        # Forward GRU.
        self.inp_dim = self.dim / 2  # since we have forward and backward
        self.W_inpf_res_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_res_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_res = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpf_upd_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_upd = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpf_hid_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpf_hid = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))
        # Backward GRU.
        self.W_inpb_res_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_res_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_res = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpb_upd_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_upd = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        self.W_inpb_hid_in = nn_utils.normal_param(std=0.1,
                                                   shape=(self.inp_dim,
                                                          self.cnn_dim))
        self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1,
                                                    shape=(self.inp_dim,
                                                           self.inp_dim))
        self.b_inpb_hid = nn_utils.constant_param(value=0.0,
                                                  shape=(self.inp_dim, ))

        # Now, we use the GRU to build the inputs.
        # Two levels of nested scan would get too complicated, so unroll the loop over the batch instead.
        inp_dummy = theano.shared(
            np.zeros((self.inp_dim, self.story_len), dtype=floatX))
        for i in range(self.batch_size):
            if i == 0:
                inp_1st_f, _ = theano.scan(
                    fn=self.input_gru_step_forward,
                    sequences=inp_emb[i, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)

                inp_1st_b, _ = theano.scan(
                    fn=self.input_gru_step_backward,
                    sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)
                # Now, combine them.
                inp_1st = T.concatenate([
                    inp_1st_f.dimshuffle(2, 0, 1),
                    inp_1st_b.dimshuffle(2, 0, 1)
                ],
                                        axis=-1)
                self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2)
            else:
                inp_f, _ = theano.scan(
                    fn=self.input_gru_step_forward,
                    sequences=inp_emb[i, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)

                inp_b, _ = theano.scan(
                    fn=self.input_gru_step_backward,
                    sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0),
                    outputs_info=T.zeros_like(inp_dummy),
                    truncate_gradient=self.truncate_gradient)
                # Now, combine them.
                inp_fb = T.concatenate(
                    [inp_f.dimshuffle(2, 0, 1),
                     inp_b.dimshuffle(2, 0, 1)],
                    axis=-1)
                self.inp_c = T.concatenate(
                    [self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis=0)
        # Done: self.inp_c is now batch_size x story_len x patches x dim
        # (forward and backward halves concatenated). Eventually, we can flatten it.
        # Now, the input dimension is 1024 because we have forward and backward.
        inp_c_t = T.reshape(
            self.inp_c,
            (self.batch_size, self.story_len * self.patches, self.dim))
        inp_c_t_dimshuffled = inp_c_t.dimshuffle(0, 'x', 1, 2)
        inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis=1)
        # Now, it's ready for all 5 images in the same story.
        # 50 * 980 * 512
        self.inp_batch = T.reshape(inp_batch,
                                   (inp_batch.shape[0] * inp_batch.shape[1],
                                    inp_batch.shape[2], inp_batch.shape[3]))
        self.inp_batch_dimshuffled = self.inp_batch.dimshuffle(
            1, 2, 0)  # 980 x 512 x 50
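        # i.e. (story_len * patches) x dim x (batch_size * story_len): every story's full
        # patch sequence is duplicated once per target image, e.g. 5 * 196 = 980 steps,
        # 512 dims and 10 * 5 = 50 columns for the debug shapes used below.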

        # It's very simple now, the input module just needs to map from cnn_dim to dim.
        logging.info('self.cnn_dim = %d', self.cnn_dim)

        print "==> building question module"
        # Now, share the parameter with the input module.
        self.W_inp_emb_q = nn_utils.normal_param(std=0.1,
                                                 shape=(self.dim,
                                                        self.cnn_dim_fc))
        self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, ))
        q_var_shuffled = self.q_var.dimshuffle(1, 0)

        inp_q = T.dot(
            self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle(
                0, 'x')  # 512 x 50
        self.q_q = T.tanh(
            inp_q
        )  # Since this is used to initialize the memory, we need to make it tanh.

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            #m = printing.Print('mem')(memory[iter-1])
            current_episode = self.new_episode(memory[iter - 1])
            #current_episode = self.new_episode(m)
            #current_episode = printing.Print('current_episode')(current_episode)
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size * self.story_len,
                                       self.dim),
                                input_var=last_mem_raw)

        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        logging.info('last_mem size')
        print last_mem.shape.eval({
            self.input_var:
            np.random.rand(10, 5, 196, 512).astype('float32'),
            self.q_var:
            np.random.rand(50, 4096).astype('float32')
        })

        print "==> building answer module"

        answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
        # Sounds good. Now, we need to map last_mem to a new space.
        self.W_mem_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim, self.dim * 2))
        self.W_inp_emb = nn_utils.normal_param(std=0.1,
                                               shape=(self.dim,
                                                      self.word_vector_size))

        def _dot2(x, W):
            return T.dot(W, x)

        answer_inp_var_shuffled_emb, _ = theano.scan(
            fn=_dot2,
            sequences=answer_inp_var_shuffled,
            non_sequences=self.W_inp_emb)  # seq x dim x batch

        # Now, we also need to embed the image and use it to do the memory.
        #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch.
        init_ans = T.concatenate([self.q_q, last_mem], axis=0)

        mem_ans = T.dot(self.W_mem_emb, init_ans)  # dim x batchsize.

        mem_ans_dim = mem_ans.dimshuffle('x', 0, 1)

        answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb],
                                   axis=0)

        # Now, we have both embedding. We can let them go to the rnn.

        # We also need to map the input layer as well.

        dummy = theano.shared(
            np.zeros((self.dim, self.batch_size * self.story_len),
                     dtype=floatX))

        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size + 1, self.dim))

        self.W_ans_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_ans_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        logging.info('answer_inp size')

        #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32')})

        #last_mem = printing.Print('prob_sm')(last_mem)
        results, _ = theano.scan(fn=self.answer_gru_step,
                                 sequences=answer_inp,
                                 outputs_info=[dummy])
        # Assume there is a start token
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
        # get rid of the last token as well as the first one (image)
        results = results[1:-1, :, :]
        #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')

        # Now, we need to transform it to the probabilities.

        prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                              sequences=results,
                              non_sequences=self.W_a)

        prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab

        logging.info("prob shape.")
        #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
        #    self.q_var: np.random.rand(10, 4096).astype('float32'),
        #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

        n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
        prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
        prob_sm = nn_utils.softmax_(prob_rhp)
        self.prediction = prob_sm

        mask = T.reshape(self.answer_mask, (n, ))
        lbl = T.reshape(self.answer_var, (n, ))

        self.params = [
            self.W_inp_emb_in,  #self.b_inp_emb_in, 
            self.W_inpf_res_in,
            self.W_inpf_res_hid,
            self.b_inpf_res,
            self.W_inpf_upd_in,
            self.W_inpf_upd_hid,
            self.b_inpf_upd,
            self.W_inpf_hid_in,
            self.W_inpf_hid_hid,
            self.b_inpf_hid,
            self.W_inpb_res_in,
            self.W_inpb_res_hid,
            self.b_inpb_res,
            self.W_inpb_upd_in,
            self.W_inpb_upd_hid,
            self.b_inpb_upd,
            self.W_inpb_hid_in,
            self.W_inpb_hid_hid,
            self.b_inpb_hid,
            self.W_inp_emb_q,
            self.b_inp_emb_q,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a,
            self.W_mem_emb,
            self.W_inp_emb,
            self.W_ans_res_in,
            self.W_ans_res_hid,
            self.b_ans_res,
            self.W_ans_upd_in,
            self.W_ans_upd_hid,
            self.b_ans_upd,
            self.W_ans_hid_in,
            self.W_ans_hid_hid,
            self.b_ans_hid,
        ]

        print "==> building loss layer and computing updates"
        loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
        self.loss_ce = (mask * loss_vec).sum() / mask.sum()

        #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.adam(self.loss, self.params, learning_rate = self.learning_rate)
        updates = lasagne.updates.rmsprop(self.loss,
                                          self.params,
                                          learning_rate=self.learning_rate)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.answer_mask, self.answer_inp_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)
            #profile = True)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.answer_mask,
            self.answer_inp_var
        ],
                                       outputs=[self.prediction, self.loss])
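
The loss above is a masked sequence cross-entropy: every (batch, timestep) position is flattened into one long batch, softmaxed over the vocabulary, and the per-token losses are averaged only over positions where the answer mask is 1. A minimal NumPy sketch of that computation (shapes and names here are illustrative, not taken from the class above):

import numpy as np

def masked_sequence_xent(logits, labels, mask):
    """logits: (batch, length, vocab); labels, mask: (batch, length)."""
    b, t, v = logits.shape
    flat = logits.reshape(b * t, v)
    # row-wise softmax, stabilized by subtracting the per-row max
    flat = flat - flat.max(axis=1, keepdims=True)
    probs = np.exp(flat) / np.exp(flat).sum(axis=1, keepdims=True)
    # negative log-likelihood of the correct token at each position
    nll = -np.log(probs[np.arange(b * t), labels.reshape(-1)])
    m = mask.reshape(-1)
    return (m * nll).sum() / m.sum()

logits = np.random.randn(2, 5, 10)
labels = np.random.randint(0, 10, size=(2, 5))
mask = (np.arange(5)[None, :] < np.array([[3], [5]])).astype('float32')
print(masked_sequence_xent(logits, labels, mask))
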
Example No. 24
0
def build_segmenter_jet_preconv():
    # Downsample to a small spatial region, then upsample all the way back up, using the jet architecture.
    # This recreates the basic FCN-8s structure (more aptly FCN-1s here, since we upsample back to the original input size).
    # This jet has another conv layer in the final upsample stage.
    # The difference here is that instead of combining softmax layers in the jet, we upsample before the conv_f* layer.
    # This will certainly make the model slower, but should give us better predictions.
    # The awkward part is combining the intermediate conv layers when they have different filter shapes. We could either:
    #   concat them, or
    #   add intermediate conv layers that bring them to the needed shape, then merge them.
    # In the interest of speed we just concat them, though we end up with a lot of filters at the end
    # (the merge step is sketched after this function).
    inp = ll.InputLayer(shape=(None, 1, None, None), name='input')
    conv1 = ll.Conv2DLayer(inp,
                           num_filters=32,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv1_1')
    bn1 = ll.BatchNormLayer(conv1, name='bn1')
    conv2 = ll.Conv2DLayer(conv1,
                           num_filters=64,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv1_2')
    bn2 = ll.BatchNormLayer(conv2, name='bn2')
    mp1 = ll.MaxPool2DLayer(conv2, 2, stride=2, name='mp1')  # 2x downsample
    conv3 = ll.Conv2DLayer(mp1,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv2_1')
    bn3 = ll.BatchNormLayer(conv3, name='bn3')
    conv4 = ll.Conv2DLayer(conv3,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv2_2')
    bn4 = ll.BatchNormLayer(conv4, name='bn4')
    mp2 = ll.MaxPool2DLayer(conv4, 2, stride=2, name='mp2')  # 4x downsample
    conv5 = ll.Conv2DLayer(mp2,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv3_1')
    bn5 = ll.BatchNormLayer(conv5, name='bn5')
    conv6 = ll.Conv2DLayer(conv5,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv3_2')
    bn6 = ll.BatchNormLayer(conv6, name='bn6')
    mp3 = ll.MaxPool2DLayer(conv6, 2, stride=2, name='mp3')  # 8x downsample
    conv7 = ll.Conv2DLayer(mp3,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv4_1')
    bn7 = ll.BatchNormLayer(conv7, name='bn7')
    conv8 = ll.Conv2DLayer(conv7,
                           num_filters=128,
                           filter_size=(3, 3),
                           pad='same',
                           W=Orthogonal(),
                           nonlinearity=rectify,
                           name='conv4_2')
    bn8 = ll.BatchNormLayer(conv8, name='bn8')
    # f 68 s 8
    # now start the upsample
    ## FIRST UPSAMPLE PREDICTION (akin to FCN-32s)

    up8 = ll.Upscale2DLayer(
        bn8, 8,
        name='upsample_8x')  # take loss here, 8x upsample from 8x downsample
    conv_f8 = ll.Conv2DLayer(up8,
                             num_filters=2,
                             filter_size=(3, 3),
                             pad='same',
                             W=Orthogonal(),
                             nonlinearity=linear,
                             name='conv_8xpred')
    softmax_8 = Softmax4D(conv_f8, name='4dsoftmax_8x')

    ## COMBINE BY UPSAMPLING CONV 8 AND CONV 6
    conv_8_up2 = ll.Upscale2DLayer(bn8, 2,
                                   name='upsample_c8_2')  # 4x downsample
    concat_c8_c6 = ll.ConcatLayer([conv_8_up2, bn6],
                                  axis=1,
                                  name='concat_c8_c6')
    up4 = ll.Upscale2DLayer(
        concat_c8_c6, 4,
        name='upsample_4x')  # take loss here, 4x upsample from 4x downsample
    conv_f4 = ll.Conv2DLayer(up4,
                             num_filters=2,
                             filter_size=(3, 3),
                             pad='same',
                             W=Orthogonal(),
                             nonlinearity=linear,
                             name='conv_4xpred')
    softmax_4 = Softmax4D(conv_f4, name='4dsoftmax_4x')  # 4x downsample

    ## COMBINE BY UPSAMPLING CONCAT_86 AND CONV 4
    concat_86_up2 = ll.Upscale2DLayer(
        concat_c8_c6, 2, name='upsample_concat_86_2')  # 2x downsample
    concat_ct86_c4 = ll.ConcatLayer([concat_86_up2, bn4],
                                    axis=1,
                                    name='concat_ct86_c4')

    up2 = ll.Upscale2DLayer(
        concat_ct86_c4, 2, name='upsample_2x'
    )  # final loss here, 2x upsample from a 2x downsample
    conv_f2 = ll.Conv2DLayer(up2,
                             num_filters=2,
                             filter_size=(3, 3),
                             pad='same',
                             W=Orthogonal(),
                             nonlinearity=linear,
                             name='conv_2xpred')

    softmax_2 = Softmax4D(conv_f2, name='4dsoftmax_2x')

    ## COMBINE BY UPSAMPLING CONCAT_864 AND CONV 2
    concat_864_up2 = ll.Upscale2DLayer(
        concat_ct86_c4, 2, name='upsample_concat_864_2')  # no downsample
    concat_864_c2 = ll.ConcatLayer([concat_864_up2, bn2],
                                   axis=1,
                                   name='concat_ct864_c2')
    conv_f1 = ll.Conv2DLayer(concat_864_c2,
                             num_filters=2,
                             filter_size=(3, 3),
                             pad='same',
                             W=Orthogonal(),
                             nonlinearity=linear,
                             name='conv_1xpred')

    softmax_1 = Softmax4D(conv_f1, name='4dsoftmax_1x')

    # no up1 stage here: this branch is already at the full input resolution, so there is nothing left to upsample
    return [softmax_8, softmax_4, softmax_2, softmax_1]
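
The merge step this jet repeats (upscale the coarser branch by 2x, then concatenate it with the finer branch along the channel axis) can be isolated into a small helper; the full-resolution prediction heads (Upscale2DLayer + conv_f* + the custom Softmax4D) would stay as written above. A hedged Lasagne sketch, with an illustrative helper name:

import lasagne.layers as ll

def merge_up2(coarse, fine, name):
    """Upscale `coarse` 2x and concatenate it with `fine` on the channel axis."""
    # `fine` must already be at twice the spatial resolution of `coarse`
    # for the concatenation to be valid.
    up = ll.Upscale2DLayer(coarse, 2, name='up_' + name)
    return ll.ConcatLayer([up, fine], axis=1, name='concat_' + name)

Under that assumption, concat_c8_c6, concat_ct86_c4 and concat_864_c2 above correspond to merge_up2(bn8, bn6, ...), merge_up2(concat_c8_c6, bn4, ...) and merge_up2(concat_ct86_c4, bn2, ...).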
Example No. 25
0
    def buildModel(self):
        print(' -- Building...')
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        gx_init = sparse.csr_matrix('gx', dtype='float32')
        gy_init = T.ivector('gy')
        gz_init = T.vector('gz')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        x_to_label = layers.SparseLayer(x_input, self.y.shape[1],
                                        nonlinearity=lg.nonlinearities.softmax)
        x_to_emd = layers.SparseLayer(x_input, self.embedding_size)
        W = x_to_emd.W
        x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1)
        x_concat = layers.DenseLayer(x_concat, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        pred = lgl.get_output(x_concat)
        step_loss = lgo.categorical_crossentropy(pred, y_init).mean()
        hid_loss = lgl.get_output(x_to_label)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(x_to_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = lgl.get_all_params(x_concat)
        step_updates = lg.updates.sgd(step_loss, step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init], step_loss,
                                          updates=step_updates)
        self.test_fn = theano.function([x_init], pred)

        # supervised train
        gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                  input_var=gx_init)
        gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W)
        gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver,
                                   nonlinearity=lg.nonlinearities.softmax)
        gx_pred = lgl.get_output(gx_to_emd)
        g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum()
        sup_params = lgl.get_all_params(gx_to_emd)
        sup_updates = lg.updates.sgd(g_loss, sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        # handle lstm input
        cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init)
        cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None)
        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        sub_path_batch1 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch1)
        sub_path_batch2 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch2)
        sub_path_batch3 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch3)
        sub_path_batch4 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch4)
        sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size,
                                           W=W)
        sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd2 = layers.SparseLayer(sub_path_input2,
                                           self.embedding_size, W=W)
        sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size,
                                           W=W)
        sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size,
                                           W=W)
        sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2,
                                         sub_path_emd3, sub_path_emd4], axis=1)
        sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1,
                                                      self.embedding_size),
                                               input_var=sub_path_concat)

        # lstm layer
        lstm_layer = lgl.LSTMLayer(sub_path_concat_layer,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight

        # unsupervised train
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_updates = lg.updates.sgd(reweight_loss, lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([gx_init, gy_init, gz_init,
                                        sub_path_batch1, sub_path_batch2,
                                        sub_path_batch3, sub_path_batch4,
                                        mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss, sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([gx_init, gy_init, gz_init,
                                         sub_path_batch1, sub_path_batch2,
                                         sub_path_batch3, sub_path_batch4,
                                         mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')

        print(' -- Done!')
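
The "handle path weight" block above collapses each subpath's LSTM output to a single scalar, normalizes it, squashes it through a sigmoid, and scales the result into [1.0, 1.3] before using it to reweight that subpath's cross-entropy. A minimal NumPy sketch of the reweighting (batch normalization is replaced by a plain standardization here, and all shapes are illustrative):

import numpy as np

def reweighted_loss(lstm_out, xent):
    """lstm_out: (num_paths, time, hidden); xent: (num_paths,) per-path cross-entropy."""
    score = lstm_out.mean(axis=(1, 2))                      # one scalar per subpath
    score = (score - score.mean()) / (score.std() + 1e-8)   # stand-in for BatchNormLayer
    weight = 1.0 + 0.3 / (1.0 + np.exp(-score))             # sigmoid scaled into [1.0, 1.3]
    return float(np.dot(xent, weight))

lstm_out = np.random.randn(6, 5, 16)   # 6 subpaths, 5 steps, 16 hidden units
xent = np.random.rand(6)
print(reweighted_loss(lstm_out, xent))
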
Example No. 26
0
def build_network():
    batch_norm = False
    num_units = 500  # rnn hidden units number
    l2 = 0.0  # l2 regularization
    dropout = 0.5

    input_var = T.tensor4('input_var')
    answer_var = T.ivector('answer_var')

    print('==> building network')
    example = np.random.uniform(size=(batch_size, 1, 128, 858),
                                low=0.0,
                                high=1.0).astype(np.float32)
    answer = np.random.randint(low=0, high=176, size=(batch_size, ))

    network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=input_var)
    print(layers.get_output(network).eval({input_var: example}).shape)

    # conv-relu-pool 1
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=16,
                                 filter_size=(7, 7),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 2
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(5, 5),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 3
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(5, 5),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    # conv-relu-pool 4
    network = layers.Conv2DLayer(incoming=network,
                                 num_filters=32,
                                 filter_size=(3, 3),
                                 stride=1,
                                 nonlinearity=rectify)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.MaxPool2DLayer(incoming=network,
                                    pool_size=(3, 3),
                                    stride=2,
                                    pad=2)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)

    params = layers.get_all_params(network, trainable=True)
    output = layers.get_output(network)
    output = output.transpose((0, 3, 1, 2))
    output = output.flatten(ndim=3)

    # These constants are important: they must match the output shape of the last pool layer.
    num_channels = 32
    filter_w = 54
    filter_h = 8

    network = layers.InputLayer(shape=(None, filter_w,
                                       num_channels * filter_h),
                                input_var=output)
    print(layers.get_output(network).eval({input_var: example}).shape)

    network = layers.GRULayer(incoming=network,
                              num_units=num_units,
                              only_return_final=True)
    print(layers.get_output(network).eval({input_var: example}).shape)
    if batch_norm:
        network = layers.BatchNormLayer(incoming=network)
    if dropout > 0:
        network = layers.dropout(network, dropout)

    # last layer: classification
    network = layers.DenseLayer(incoming=network,
                                num_units=176,
                                nonlinearity=softmax)
    print(layers.get_output(network).eval({input_var: example}).shape)

    params += layers.get_all_params(network, trainable=True)
    prediction = layers.get_output(network)

    print('==> param shapes', [x.eval().shape for x in params])

    loss_ce = lasagne.objectives.categorical_crossentropy(
        prediction, answer_var).mean()
    if l2 > 0:
        loss_l2 = l2 * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    else:
        loss_l2 = 0
    loss = loss_ce + loss_l2

    # updates = lasagne.updates.adadelta(loss, params)
    updates = lasagne.updates.momentum(loss, params,
                                       learning_rate=0.003)  # good one
    # updates = lasagne.updates.momentum(loss, params, learning_rate=0.0003)  # good one

    print('==> compiling train_fn')
    train_fn = theano.function(inputs=[input_var, answer_var],
                               outputs=[prediction, loss],
                               updates=updates)
    test_fn = theano.function(inputs=[input_var, answer_var],
                              outputs=[prediction, loss])

    return train_fn, test_fn
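
The hand-off from the convolutional stack to the GRU above hinges on one reshape: the (batch, channels, height, width) feature map is transposed and flattened so that the width axis becomes the sequence axis and each step carries channels x height features. A small NumPy sketch of that shape manipulation (the sizes 32, 8 and 54 mirror the constants used in the function above):

import numpy as np

features = np.random.randn(4, 32, 8, 54)           # (batch, channels, height, width)
seq = features.transpose(0, 3, 1, 2)               # (batch, width, channels, height)
seq = seq.reshape(seq.shape[0], seq.shape[1], -1)  # (batch, width, channels * height)
print(seq.shape)                                   # (4, 54, 256)

This is exactly the (None, filter_w, num_channels * filter_h) shape declared for the GRU's InputLayer above; Theano's flatten(ndim=3) performs the same collapse of the trailing two axes.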
Example No. 27
0
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, rnn_num_units, **kwargs):
        
        print ("==> not used params in DMN class:", kwargs.keys())
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        self.num_units = rnn_num_units
        
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')
        
        print ("==> building network")
        example = np.random.uniform(size=(self.batch_size, 1, 128, 858), low=0.0, high=1.0).astype(np.float32) #########
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) #########
       
        network = layers.InputLayer(shape=(None, 1, 128, 858), input_var=self.input_var)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        
        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        self.params = layers.get_all_params(network, trainable=True)
        
        output = layers.get_output(network)
        num_channels  = 32 
        filter_W = 54
        filter_H = 8
        
        # NOTE: these constants are the output shape of the last pool layer; they could be
        # computed symbolically, but explicit values allow better graph optimization.
        
        channels = []
        for channel_index in range(num_channels):
            channels.append(output[:, channel_index, :, :].transpose((0, 2, 1)))
        
        rnn_network_outputs = []
        W_in_to_updategate = None
        W_hid_to_updategate = None
        b_updategate = None
        W_in_to_resetgate = None
        W_hid_to_resetgate = None
        b_resetgate = None
        W_in_to_hidden_update = None
        W_hid_to_hidden_update = None
        b_hidden_update = None
        
        W_in_to_updategate1 = None
        W_hid_to_updategate1 = None
        b_updategate1 = None
        W_in_to_resetgate1 = None
        W_hid_to_resetgate1 = None
        b_resetgate1 = None
        W_in_to_hidden_update1 = None
        W_hid_to_hidden_update1 = None
        b_hidden_update1 = None
        
        for channel_index in range(num_channels):
            rnn_input_var = channels[channel_index]
            
            # InputLayer       
            network = layers.InputLayer(shape=(None, filter_W, filter_H), input_var=rnn_input_var)

            if (channel_index == 0):
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False)
                W_in_to_updategate = network.W_in_to_updategate
                W_hid_to_updategate = network.W_hid_to_updategate
                b_updategate = network.b_updategate
                W_in_to_resetgate = network.W_in_to_resetgate
                W_hid_to_resetgate = network.W_hid_to_resetgate
                b_resetgate = network.b_resetgate
                W_in_to_hidden_update = network.W_in_to_hidden_update
                W_hid_to_hidden_update = network.W_hid_to_hidden_update
                b_hidden_update = network.b_hidden_update
                
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                
                # GRULayer
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True)
                W_in_to_updategate1 = network.W_in_to_updategate
                W_hid_to_updategate1 = network.W_hid_to_updategate
                b_updategate1 = network.b_updategate
                W_in_to_resetgate1 = network.W_in_to_resetgate
                W_hid_to_resetgate1 = network.W_hid_to_resetgate
                b_resetgate1 = network.b_resetgate
                W_in_to_hidden_update1 = network.W_in_to_hidden_update
                W_hid_to_hidden_update1 = network.W_hid_to_hidden_update
                b_hidden_update1 = network.b_hidden_update
                        
                # add params 
                self.params += layers.get_all_params(network, trainable=True)

            else:
                # GRULayer, but shared
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=False,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate, W_hid=W_hid_to_resetgate, b=b_resetgate),
                            updategate=layers.Gate(W_in=W_in_to_updategate, W_hid=W_hid_to_updategate, b=b_updategate),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update, W_hid=W_hid_to_hidden_update, b=b_hidden_update))
                            
                # BatchNormalization Layer
                if (self.batch_norm):
                    network = layers.BatchNormLayer(incoming=network)
                    
                # GRULayer, but shared
                network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True,
                            resetgate=layers.Gate(W_in=W_in_to_resetgate1, W_hid=W_hid_to_resetgate1, b=b_resetgate1),
                            updategate=layers.Gate(W_in=W_in_to_updategate1, W_hid=W_hid_to_updategate1, b=b_updategate1),
                            hidden_update=layers.Gate(W_in=W_in_to_hidden_update1, W_hid=W_hid_to_hidden_update1, b=b_hidden_update1))
                
            
            rnn_network_outputs.append(layers.get_output(network))
        
        all_output_var = T.concatenate(rnn_network_outputs, axis=1)
        print (all_output_var.eval({self.input_var:example}).shape)
        
        # InputLayer
        network = layers.InputLayer(shape=(None, self.num_units * num_channels), input_var=all_output_var)
        
        # Dropout Layer
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        
        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print (layers.get_output(network).eval({self.input_var:example}).shape)
        
    
        self.params += layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
    
        #print "==> param shapes", [x.eval().shape for x in self.params]
        
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.apply_penalty(self.params, 
                                                                          lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2
        
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
        
        if self.mode == 'train':
            print ("==> compiling train_fn")
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print ("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])
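
The per-channel loop above shares one set of GRU weights across all 32 channels: the gate parameters of the first channel's GRULayer are captured and fed back in through lasagne.layers.Gate for every other channel. A stripped-down sketch of that sharing pattern for two inputs (sizes are illustrative):

import theano.tensor as T
import lasagne.layers as layers
from lasagne.nonlinearities import tanh

x1, x2 = T.tensor3('x1'), T.tensor3('x2')
in1 = layers.InputLayer(shape=(None, 20, 8), input_var=x1)
in2 = layers.InputLayer(shape=(None, 20, 8), input_var=x2)

gru1 = layers.GRULayer(in1, num_units=16, only_return_final=True)

# Reuse gru1's parameters so both inputs pass through the *same* GRU.
gru2 = layers.GRULayer(
    in2, num_units=16, only_return_final=True,
    resetgate=layers.Gate(W_in=gru1.W_in_to_resetgate,
                          W_hid=gru1.W_hid_to_resetgate,
                          b=gru1.b_resetgate),
    updategate=layers.Gate(W_in=gru1.W_in_to_updategate,
                           W_hid=gru1.W_hid_to_updategate,
                           b=gru1.b_updategate),
    # Gate defaults to a sigmoid nonlinearity, so restate tanh here to match
    # the GRU's default hidden-update activation.
    hidden_update=layers.Gate(W_in=gru1.W_in_to_hidden_update,
                              W_hid=gru1.W_hid_to_hidden_update,
                              b=gru1.b_hidden_update,
                              nonlinearity=tanh))

Because gru2 registers the same shared variables, collecting its trainable parameters returns the weights already owned by gru1, which is why the class above only adds the first channel's parameters to self.params.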
Example No. 28
0
def conv_nonl(data, num_filters, name, pad, use_bn=True):
    res = conv(data, num_filters, name, pad=pad)
    if (use_bn):
        res = L.BatchNormLayer(res, name='bn_' + name)
    res = L.NonlinearityLayer(res, rectify, name='relu_' + name)
    return res
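
conv_nonl relies on a conv() helper defined elsewhere in the source module; assuming it builds a plain Conv2DLayer with nonlinearity=None, the conv -> batch-norm -> ReLU decomposition it implements looks like this when written out directly in Lasagne (the 3x3 filter size is an assumption):

import lasagne.layers as L
from lasagne.nonlinearities import rectify

def conv_bn_relu(data, num_filters, name, pad='same'):
    # linear convolution first ...
    res = L.Conv2DLayer(data, num_filters=num_filters, filter_size=(3, 3),
                        pad=pad, nonlinearity=None, name='conv_' + name)
    # ... then normalize ...
    res = L.BatchNormLayer(res, name='bn_' + name)
    # ... and only apply the rectifier afterwards, so batch norm sees the pre-activation.
    res = L.NonlinearityLayer(res, rectify, name='relu_' + name)
    return res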
Example No. 29
0
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, **kwargs):
        
        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm
        
        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')
        
        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 1, 256, 858), low=0.0, high=1.0).astype(np.float32) #########
        answer = np.random.randint(low=0, high=176, size=(self.batch_size,)) #########
       
        network = layers.InputLayer(shape=(None, 1, 256, 858), input_var=self.input_var)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        
        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 5
        network = layers.Conv2DLayer(incoming=network, num_filters=128, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # CONV-RELU-POOL 6
        network = layers.Conv2DLayer(incoming=network, num_filters=256, filter_size=(3, 3), 
                                     stride=1, nonlinearity=rectify)
        print layers.get_output(network).eval({self.input_var:example}).shape
        network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=(3, 2), ignore_border=False)
        print layers.get_output(network).eval({self.input_var:example}).shape
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        
        # DENSE 1
        network = layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
        """
        # DENSE 2
        network = layers.DenseLayer(incoming=network, num_units=1024, nonlinearity=rectify)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        print layers.get_output(network).eval({self.input_var:example}).shape
        """
        
        # Last layer: classification
        network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var:example}).shape
        
        
        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
        
        self.loss_ce = lasagne.objectives.categorical_crossentropy(self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(network, 
                                                                    lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2
        
        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003)
        
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(inputs=[self.input_var, self.answer_var], 
                                            outputs=[self.prediction, self.loss],
                                            updates=updates)
        
        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                                       outputs=[self.prediction, self.loss])
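
One caveat worth flagging: self.prediction above comes from layers.get_output(network) without deterministic=True, so the compiled test_fn still applies dropout (and batch-norm batch statistics) at evaluation time. A common Lasagne pattern is to build a separate deterministic output for testing; a hedged sketch that reuses the names from the class above (network, self.input_var, self.answer_var):

# Deterministic forward pass: dropout is disabled and batch normalization
# uses its stored running averages instead of the current batch statistics.
test_prediction = layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(
    test_prediction, self.answer_var).mean()
self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
                               outputs=[test_prediction, test_loss])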
Example No. 30
0
    def buildModel(self):
        print(' -- Building...')
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        g_init = T.imatrix('g')
        ind_init = T.ivector('ind')
        sub_path_init = T.imatrix('subPathsBatch')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init)
        ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init)
        pair_second = lgl.SliceLayer(g_input, indices=1, axis=1)
        pair_first = lgl.SliceLayer(g_input, indices=0, axis=1)
        pair_first_emd = lgl.EmbeddingLayer(pair_first,
                                            input_size=self.num_ver,
                                            output_size=self.embedding_size)
        emd_to_numver = layers.DenseLayer(
            pair_first_emd,
            self.num_ver,
            nonlinearity=lg.nonlinearities.softmax)
        index_emd = lgl.EmbeddingLayer(ind_input,
                                       input_size=self.num_ver,
                                       output_size=self.embedding_size,
                                       W=pair_first_emd.W)
        x_to_ydim = layers.SparseLayer(x_input,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        index_emd = layers.DenseLayer(index_emd,
                                      self.y.shape[1],
                                      nonlinearity=lg.nonlinearities.softmax)
        concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1)
        concat_two = layers.DenseLayer(concat_two,
                                       self.y.shape[1],
                                       nonlinearity=lg.nonlinearities.softmax)
        concat_two_output = lgl.get_output(concat_two)
        step_loss = lgo.categorical_crossentropy(concat_two_output,
                                                 y_init).mean()
        hid_loss = lgl.get_output(x_to_ydim)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(index_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = [
            index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b, concat_two.W,
            concat_two.b
        ]
        step_updates = lg.updates.sgd(step_loss,
                                      step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init, ind_init],
                                          step_loss,
                                          updates=step_updates,
                                          on_unused_input='ignore')
        self.test_fn = theano.function([x_init, ind_init],
                                       concat_two_output,
                                       on_unused_input='ignore')

        # supervised train
        fc_output = lgl.get_output(emd_to_numver)
        pair_second_output = lgl.get_output(pair_second)
        sup_loss = lgo.categorical_crossentropy(fc_output,
                                                pair_second_output).sum()
        sup_params = lgl.get_all_params(emd_to_numver, trainable=True)
        sup_updates = lg.updates.sgd(sup_loss,
                                     sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([g_init],
                                         sup_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        cross_entropy = lgo.categorical_crossentropy(fc_output,
                                                     pair_second_output)
        cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size),
                                  ndim=None)

        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=sub_path_init)
        sub_path_emd = lgl.EmbeddingLayer(subPath_in,
                                          input_size=self.num_ver,
                                          output_size=self.embedding_size,
                                          W=pair_first_emd.W)

        lstm_layer = lgl.LSTMLayer(sub_path_emd,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight

        # unsupervised train
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params_all = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_params = list(set(lstm_params_all).difference(set(sup_params)))
        lstm_updates = lg.updates.sgd(reweight_loss,
                                      lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([sub_path_init, g_init, mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss,
                                       sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([sub_path_init, g_init, mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')
        print(' -- Done!')
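
The unsupervised phase above drives the same reweight_loss with two disjoint parameter groups and two learning rates: the LSTM parameters (obtained by set-difference against the supervised parameters) at 0.01, and the shared embedding/output parameters at 0.001. If both groups should be stepped in a single call, the two SGD update dictionaries can simply be merged before compiling; a hedged sketch reusing the names from the method above:

# Combine the per-group SGD rules into one update dictionary so a single
# compiled function updates both groups, each with its own learning rate.
lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01)
alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001)
combined_updates = dict(lstm_updates)
combined_updates.update(alpha_updates)
joint_fn = theano.function([sub_path_init, g_init, mask_init],
                           reweight_loss,
                           updates=combined_updates,
                           on_unused_input='ignore')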