예제 #1
0
def deploy_net(conf, batch_size, class_num):
    """Build the deploy-time network definition.

    Layer chain: DummyData -> Convolution(20) -> BatchNorm -> PReLU ->
    InnerProduct(100) -> Dropout -> PReLU -> InnerProduct(class_num).

    :param conf: data-set configuration object; ``conf.use_CK`` selects
        whether ``CK_channels``/``CK_kernel_size`` or
        ``channels``/``kernel_size`` size the input blob and the first
        convolution kernel height
    :param batch_size: first dimension of the dummy input blob
    :param class_num: number of outputs of the final InnerProduct layer
    :return: the result of ``NetSpec.to_proto()``
    """

    def fillers():
        # Fresh filler dicts for every layer that needs them.
        return dict(weight_filler=dict(type='gaussian', std=0.05),
                    bias_filler=dict(type='constant', value=0.1))

    # Only the literal ``True`` selects the CK variant (identity check).
    if conf.use_CK is True:
        input_height, conv_height = conf.CK_channels, conf.CK_kernel_size
    else:
        input_height, conv_height = conf.channels, conf.kernel_size

    spec = caffe.NetSpec()
    spec.data, spec.label = L.DummyData(
        shape={'dim': [batch_size, 1, input_height, 1]}, ntop=2)
    spec.conv1 = L.Convolution(spec.data,
                               kernel_h=conv_height,
                               kernel_w=1,
                               num_output=20,
                               **fillers())
    spec.bn1 = L.BatchNorm(spec.conv1, use_global_stats=1, in_place=True)
    spec.relu1 = L.PReLU(spec.bn1, in_place=True)
    spec.ip1 = L.InnerProduct(spec.relu1, num_output=100, **fillers())
    spec.drop1 = L.Dropout(spec.ip1, dropout_ratio=0.1, in_place=True)
    spec.relu2 = L.PReLU(spec.drop1, in_place=True)
    spec.ip2 = L.InnerProduct(spec.relu2, num_output=class_num, **fillers())
    return spec.to_proto()
예제 #2
0
def gradient_y(bottom):
    """Return a top holding the absolute difference of two crops of ``bottom``.

    Both crops use a 1x1x99x100 DummyData blob as the size reference; the
    first is taken at offset [1, 0], the second at offset [0, 0].  They are
    combined with an Eltwise SUM using coefficients (1, -1) and passed
    through AbsVal.

    :param bottom: the top to differentiate
    :return: the AbsVal top
    """
    reference = L.DummyData(
        dummy_data_param=dict(shape=[dict(dim=[1, 1, 99, 100])]))
    shifted = L.Crop(bottom, reference, crop_param=dict(offset=[1, 0]))
    unshifted = L.Crop(bottom, reference, crop_param=dict(offset=[0, 0]))
    difference = L.Eltwise(
        shifted,
        unshifted,
        eltwise_param=dict(operation=P.Eltwise.SUM, coeff=[1.0, -1.0]))
    return L.AbsVal(difference)
예제 #3
0
    def __init__(self, data_shape, label_shape, last_layer='fc8', params=[]):
        """Create single-sample dummy inputs, build the net and save its strides.

        :param data_shape: shape of the data blob; its first entry is
            replaced by 1
        :param label_shape: shape of the label blob; its first entry is
            replaced by 1
        :param last_layer: stored as ``self.last_layer``
        :param params: forwarded to ``self.net``; ``params['path2net']`` is
            also read to derive the name of the saved ``*_stride.npy`` file
        """
        # NOTE(review): the default ``params=[]`` cannot satisfy the
        # ``params['path2net']`` lookup below, so callers presumably always
        # pass a mapping -- confirm.
        single_data_shape = list(data_shape)
        single_data_shape[0] = 1

        single_label_shape = list(label_shape)
        single_label_shape[0] = 1

        self.data_shape = single_data_shape
        self.data = L.DummyData(shape=dict(dim=single_data_shape))
        self.label = L.DummyData(shape=dict(dim=single_label_shape))

        self.last_layer = last_layer
        # Stride values collected across the whole net; turned into a
        # cumulative product once the net has been built.
        self.receptiveFieldStride = []

        # Rebinds ``self.net`` from the builder to whatever it returns.
        self.net = self.net(params=params)

        self.receptiveFieldStride = np.cumprod(
            np.asarray(self.receptiveFieldStride))
        stride_path = str.split(params['path2net'], '.')[0] + '_stride.npy'
        np.save(stride_path, self.receptiveFieldStride)
예제 #4
0
    def build_retrieval_model_deploy(self, save_tag, visual_feature_dim,
                                     language_feature_dim):
        """Assemble the deploy-time retrieval net and hand it to ``write_net``.

        All inputs are DummyData blobs whose shapes use a hard-coded size of
        21.  The visual model consumes ``image_data`` and ``loc_data``; the
        language model consumes the text input (plus ``cont_data`` when
        ``self.language_layers`` is one of ``recurrent_layers``).  The result
        of ``self.distance_function`` is stored under ``rank_score``.

        :param save_tag: forwarded to ``self.write_net``
        :param visual_feature_dim: last dimension of the image input blob
        :param language_feature_dim: last dimension of the text input blob
        """
        count = 21  # size shared by every dummy input blob

        image_input = L.DummyData(
            shape=[dict(dim=[count, visual_feature_dim])], ntop=1)
        self.n.image_data = image_input

        loc_input = L.DummyData(shape=[dict(dim=[count, 2])], ntop=1)
        self.n.loc_data = loc_input

        im_model, lang_model = self.get_models()
        bottom_visual = im_model(image_input, loc_input)

        if self.language_layers in recurrent_layers:
            seq_len = self.params['sentence_length']

            text_input = L.DummyData(
                shape=[dict(dim=[seq_len, count, language_feature_dim])],
                ntop=1)
            self.n.text_data = text_input

            cont_input = L.DummyData(
                shape=[dict(dim=[seq_len, count])], ntop=1)
            self.n.cont_data = cont_input

            bottom_text = lang_model(text_input, cont_input)
        else:
            text_input = L.DummyData(
                shape=[dict(dim=[count, language_feature_dim])], ntop=1)
            bottom_text = lang_model(text_input)
            # With language_layers == '0' the language model's output itself
            # is registered under the name 'text_data'.
            if self.language_layers == '0':
                self.n.text_data = bottom_text
            else:
                self.n.text_data = text_input

        rank_score = self.distance_function(bottom_visual, bottom_text)
        self.n.tops['rank_score'] = rank_score
        self.write_net(save_tag, self.n)
예제 #5
0
    def create_architecture(self, mode, hdf5_data):
        """Return the architecture (a caffe ``NetSpec``) of the model.

        Jer: One day this should probably be written to be more general.

        :param mode: 'deploy' uses a DummyData input shaped by
            ``pars['deploy_dims']``; 'train' uses an HDF5Data input with
            ``pars['train_batch_size']``; any other value is treated as test
            and uses ``pars['test_batch_size']``
        :param hdf5_data: ``source`` of the HDF5Data layer (unused in deploy
            mode)
        :return: the populated ``NetSpec``
        :raises ValueError: for an unknown layer type in ``self.arch``, or,
            outside deploy mode, when ``pars['loss']`` is not
            'EuclideanLoss'
        """
        arch = self.arch
        pars = self.pars
        n = caffe.NetSpec()

        if mode == 'deploy':
            n.data = L.DummyData(shape=[dict(dim=pars['deploy_dims'])])
        else:
            # Everything that is neither 'deploy' nor 'train' counts as test.
            batch_key = 'train_batch_size' if mode == 'train' else 'test_batch_size'
            n.data, n.label = L.HDF5Data(batch_size=pars[batch_key],
                                         source=hdf5_data,
                                         ntop=pars['ntop'])

        in_layer = n.data

        for layer_type, vals in arch:
            if layer_type == 'e2e':
                in_layer = n.e2e = e2e_conv(in_layer, vals['n_filters'], vals['kernel_h'], vals['kernel_w'])
            elif layer_type == 'e2n':
                in_layer = n.e2n = e2n_conv(in_layer, vals['n_filters'], vals['kernel_h'], vals['kernel_w'])
            elif layer_type == 'fc':
                in_layer = n.fc = full_connect(in_layer, vals['n_filters'])
            elif layer_type == 'out':
                # The output layer does not become the input of later layers.
                n.out = full_connect(in_layer, vals['n_filters'])
            elif layer_type == 'dropout':
                in_layer = n.dropout = L.Dropout(in_layer, in_place=True,
                                                 dropout_param=dict(dropout_ratio=vals['dropout_ratio']))
            elif layer_type == 'relu':
                in_layer = n.relu = L.ReLU(in_layer, in_place=True,
                                           relu_param=dict(negative_slope=vals['negative_slope']))
            else:
                raise ValueError('Unknown layer type: ' + str(layer_type))

        if mode != 'deploy':
            if pars['loss'] == 'EuclideanLoss':
                n.loss = L.EuclideanLoss(n.out, n.label)
            else:
                # Previously the exception was constructed but never raised,
                # silently yielding a train/test net without a loss layer.
                raise ValueError("Only 'EuclideanLoss' currently implemented for pars['loss']!")

        return n
예제 #6
0
def vgg_net(mode, batch_size=1):
    """Build the convolution/pooling skeleton of a VGG-style net.

    This is not the whole network: ReLU layers etc. are missing.

    :param mode: "cl" pads the first convolution by 1 and uses InnerProduct
        layers for fc6/fc7; "sg" pads the first convolution by 96 and uses
        Convolution layers for fc6/fc7
    :param batch_size: first dimension of the 3x224x224 dummy input
    :return: the populated ``NetSpec``
    :raises ValueError: if ``mode`` is neither "cl" nor "sg"
    """
    if not (mode == "cl" or mode == "sg"):
        raise ValueError
    is_classifier = mode == "cl"
    pad_init = 1 if is_classifier else 96

    n = caffe.NetSpec()
    p = 1
    pl = P.Pooling.MAX

    n.data = L.DummyData(shape=[dict(dim=[batch_size, 3, 224, 224])], ntop=1)

    # (stage index, number of 3x3 convolutions, output channels)
    stages = ((1, 2, 64), (2, 2, 128), (3, 3, 256), (4, 3, 512), (5, 3, 512))
    bottom = n.data
    for stage, conv_count, width in stages:
        for position in range(1, conv_count + 1):
            # Only the very first convolution uses the mode-dependent pad.
            pad = pad_init if (stage == 1 and position == 1) else p
            conv = L.Convolution(bottom,
                                 kernel_size=3,
                                 pad=pad,
                                 num_output=width)
            setattr(n, 'conv%d_%d' % (stage, position), conv)
            bottom = conv
        pool = L.Pooling(bottom, kernel_size=2, stride=2, pool=pl)
        setattr(n, 'pool%d' % stage, pool)
        bottom = pool

    if is_classifier:
        n.fc6 = L.InnerProduct(n.pool5, num_output=4096)
        n.fc7 = L.InnerProduct(n.fc6, num_output=4096)
    else:
        n.fc6 = L.Convolution(n.pool5, kernel_size=7, pad=0, num_output=4096)
        n.fc7 = L.Convolution(n.fc6, kernel_size=1, pad=0, num_output=4096)

    return n
예제 #7
0
    def __init__(self, batch_size=32, shape=(32, 32)):
        """Start a NetSpec with a two-top DummyData layer (data and label).

        :param batch_size: first dimension of both blobs
        :param shape: ``shape[0]`` and ``shape[1]`` are the last two
            dimensions of the data blob
        """
        # Counter for layers of different types, e.g. conv, relu, pool.
        self.counters = {}

        self.n = caffe.NetSpec()

        data_dims = dict(dim=[batch_size, 1, shape[0], shape[1]])
        label_dims = dict(dim=[batch_size, 1, 1, 1])
        # Dummy data layer must be edited manually in prototxt
        dummy_tops = layers.DummyData(shape=[data_dims, label_dims],
                                      transform_param=dict(scale=1. / 255),
                                      ntop=2)
        self.n.data, self.n.label = dummy_tops
예제 #8
0
    def build_relational_model_deploy(self, save_tag, visual_feature_dim,
                                      language_feature_dim):
        """Assemble the deploy-time relational net and hand it to ``write_net``.

        The per-item visual input is tiled along axis 1 and concatenated
        with the global visual input along axis 2 before entering the visual
        model.  The language model output is reshaped to
        ``[self.batch_size, 1, -1]`` and tiled along axis 1.  The first
        element of ``self.distance_function``'s result is stored under
        ``scores``.  All dummy blob shapes use a hard-coded size of 21.

        :param save_tag: forwarded to ``self.write_net``
        :param visual_feature_dim: visual feature size; the input blobs use
            ``visual_feature_dim + 2`` as their last dimension
        :param language_feature_dim: last dimension of the text input blob
        """
        count = 21  # size shared by the dummy blobs and the tile factors
        visual_dim = visual_feature_dim + 2

        image_input = L.DummyData(
            shape=[dict(dim=[count, 1, visual_dim])], ntop=1)
        self.n.image_data = image_input

        image_global = L.DummyData(
            shape=[dict(dim=[count, count, visual_dim])], ntop=1)
        self.n.global_data = image_global

        im_model, lang_model = self.get_models()

        self.silence_count += 1

        bottom_tile = L.Tile(image_input, axis=1, tiles=count)
        bottom_concat = L.Concat(bottom_tile, image_global, axis=2)
        bottom_visual = im_model(bottom_concat, axis=2)

        seq_len = self.params['sentence_length']
        text_input = L.DummyData(
            shape=[dict(dim=[seq_len, count, language_feature_dim])], ntop=1)
        self.n.text_data = text_input

        cont_input = L.DummyData(shape=[dict(dim=[seq_len, count])], ntop=1)
        self.n.cont_data = cont_input

        bottom_text = lang_model(text_input, cont_input)

        t_reshape = L.Reshape(bottom_text,
                              shape=dict(dim=[self.batch_size, 1, -1]))
        t_tile = L.Tile(t_reshape, axis=1, tiles=count)

        distances = self.distance_function(bottom_visual, t_tile)
        self.n.tops['scores'] = distances[0]

        self.write_net(save_tag, self.n)
def conv_pool_net():
    """Build a net of three conv/ReLU/pool stages that reuse the same names.

    Inputs are gaussian-filled DummyData blobs (20x1x64x64 data and
    20x10x1x1 label).  The stages are followed by an InnerProduct with 10
    outputs and a SigmoidCrossEntropyLoss against the label.

    :return: the result of ``NetSpec.to_proto()``
    """

    def gaussian_blob(channels, height, width):
        return L.DummyData(dummy_data_param=dict(
            num=20,
            channels=channels,
            height=height,
            width=width,
            data_filler=dict(type="gaussian")))

    n = caffe.NetSpec()
    n.data = gaussian_blob(1, 64, 64)
    n.label = gaussian_blob(10, 1, 1)

    # Convolution settings of the three stages, in order.
    stage_convs = (
        dict(num_output=20, kernel_size=4, stride=3, pad=0),
        dict(num_output=10, kernel_size=4, stride=2, pad=3),
        dict(num_output=10, kernel_size=4, stride=2, pad=3),
    )

    # When a variable name is reused, caffe automatically gives the earlier
    # layers generated names; only the last assignment keeps the
    # user-defined name.
    bottom = n.data
    for conv_kwargs in stage_convs:
        n.conv1 = L.Convolution(bottom, **conv_kwargs)
        n.relu1 = L.ReLU(n.conv1, in_place=True)
        n.pool1 = L.Pooling(n.relu1,
                            pool=P.Pooling.MAX,
                            kernel_size=2,
                            stride=2)
        bottom = n.pool1

    n.ip2 = L.InnerProduct(n.pool1,
                           num_output=10,
                           weight_filler=dict(type='xavier'))
    n.loss = L.SigmoidCrossEntropyLoss(n.ip2, n.label)
    return n.to_proto()
예제 #10
0
def net():
    """Build a two-layer fully connected net on gaussian dummy inputs.

    Layer chain: DummyData (10x1x28x28) -> InnerProduct(50) -> ReLU ->
    InnerProduct(4) -> SoftmaxWithLoss against a 10x1x1x1 dummy label.

    :return: the result of ``NetSpec.to_proto()``
    """

    def gaussian_blob(height, width):
        return L.DummyData(dummy_data_param=dict(
            num=10,
            channels=1,
            height=height,
            width=width,
            data_filler=dict(type='gaussian')))

    def xavier_ip(bottom, outputs):
        return L.InnerProduct(bottom,
                              num_output=outputs,
                              weight_filler=dict(type='xavier'))

    spec = caffe.NetSpec()
    spec.data = gaussian_blob(28, 28)
    spec.label = gaussian_blob(1, 1)
    spec.ip1 = xavier_ip(spec.data, 50)
    spec.relu1 = L.ReLU(spec.ip1, in_place=True)
    spec.ip2 = xavier_ip(spec.relu1, 4)
    spec.loss = L.SoftmaxWithLoss(spec.ip2, spec.label)

    return spec.to_proto()
예제 #11
0
def anon_lenet(batch_size):
    """Build a LeNet-style net without naming any of its tops.

    Layer chain: DummyData -> conv(20) -> max-pool -> conv(50) -> max-pool
    -> InnerProduct(500) -> ReLU -> InnerProduct(10) -> SoftmaxWithLoss.

    :param batch_size: first dimension of the dummy data and label blobs
    :return: the result of calling ``to_proto()`` on the loss top
    """

    def conv5(bottom, outputs):
        return L.Convolution(bottom, kernel_size=5, num_output=outputs,
                             weight_filler=dict(type='xavier'))

    def max_pool2(bottom):
        return L.Pooling(bottom, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    def xavier_ip(bottom, outputs):
        return L.InnerProduct(bottom, num_output=outputs,
                              weight_filler=dict(type='xavier'))

    blob_shapes = [dict(dim=[batch_size, 1, 28, 28]),
                   dict(dim=[batch_size, 1, 1, 1])]
    data, label = L.DummyData(shape=blob_shapes,
                              transform_param=dict(scale=1./255), ntop=2)

    stage1 = max_pool2(conv5(data, 20))
    stage2 = max_pool2(conv5(stage1, 50))
    hidden = L.ReLU(xavier_ip(stage2, 500), in_place=True)
    scores = xavier_ip(hidden, 10)
    return L.SoftmaxWithLoss(scores, label).to_proto()
예제 #12
0
def encoder_network(batch_size):
    """Build a NetSpec with a DummyData input feeding a Python accuracy layer.

    NOTE(review): ``batch_size`` is not used anywhere in the body.

    :param batch_size: currently unused
    :return: the result of ``NetSpec.to_proto()``
    """
    n = caffe.NetSpec()

    # NOTE(review): two tops are requested (ntop=2) but the result is bound
    # to the single name ``image`` -- confirm whether two names were intended.
    n.image = L.DummyData(shape=[dict(dim=[1]), dict(dim=[1])],
                          transform_param=dict(scale=1.0 / 255.0),
                          ntop=2)

    # NOTE(review): ``n.loss`` and ``n.label`` are never assigned in this
    # function, so these lookups presumably fail -- confirm.
    # NOTE(review): param_str looks like a placeholder template rather than
    # a concrete value -- confirm.
    n.accuracy = L.Python(
        n.loss,
        n.label,
        python_param=dict(module='python_accuracy',
                          layer='PythonAccuracy',
                          param_str='{ "param_name": param_value }'),
        ntop=1,
    )

    return n.to_proto()
예제 #13
0
def lenet(batch_size):
    """Build a LeNet-style net with named layers on dummy inputs.

    Layer chain: DummyData -> conv1(20) -> pool1 -> conv2(50) -> pool2 ->
    ip1(500) -> relu1 -> ip2(10) -> SoftmaxWithLoss.

    :param batch_size: first dimension of the dummy data and label blobs
    :return: the result of ``NetSpec.to_proto()``
    """

    def conv5(bottom, outputs):
        return L.Convolution(bottom, kernel_size=5, num_output=outputs,
                             weight_filler=dict(type='xavier'))

    def max_pool2(bottom):
        return L.Pooling(bottom, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    def xavier_ip(bottom, outputs):
        return L.InnerProduct(bottom, num_output=outputs,
                              weight_filler=dict(type='xavier'))

    spec = caffe.NetSpec()
    blob_shapes = [dict(dim=[batch_size, 1, 28, 28]),
                   dict(dim=[batch_size, 1, 1, 1])]
    spec.data, spec.label = L.DummyData(shape=blob_shapes,
                                        transform_param=dict(scale=1./255),
                                        ntop=2)
    spec.conv1 = conv5(spec.data, 20)
    spec.pool1 = max_pool2(spec.conv1)
    spec.conv2 = conv5(spec.pool1, 50)
    spec.pool2 = max_pool2(spec.conv2)
    spec.ip1 = xavier_ip(spec.pool2, 500)
    spec.relu1 = L.ReLU(spec.ip1, in_place=True)
    spec.ip2 = xavier_ip(spec.relu1, 10)
    spec.loss = L.SoftmaxWithLoss(spec.ip2, spec.label)
    return spec.to_proto()
예제 #14
0
def make_net_from_python_layer(input_names_and_values, output_names, py_module, py_layer, param_str=None,
                               propagate_down=None):
    """
    wrap a python layer in a "net"
    :param input_names_and_values: list of tuples [(in_name, in_data_np_array),...]
    :param output_names: names of outputs, non-empty list of strings
    :param py_module: string, "module" parameter of python layer
    :param py_layer: string, "layer" parameter for python layer
    :param param_str: optional string, "param_str" for python layer (default is None)
    :param propagate_down: list of booleans, one per input; defaults to all True

    :return:
        caffe.Net object encapsulating the tested layer
        updated propagate_down boolean vector
    :raises ValueError: if ``output_names`` is empty
    """
    if not output_names:
        raise ValueError('output_names must contain at least one name')

    proto_path = './test_py_layer.prototxt'

    # build net
    ns = caffe.NetSpec()
    inputs = []
    for in_ in input_names_and_values:
        in_name, in_value = in_[0], in_[1]
        in_top = L.DummyData(name=in_name, dummy_data_param={'shape': {'dim': list(in_value.shape)}})
        setattr(ns, in_name, in_top)
        inputs.append(in_top)

    python_param = {'module': py_module, 'layer': py_layer}
    if param_str:
        python_param['param_str'] = param_str
    if propagate_down is None:
        # propagate_down is specified per bottom (input), not per top.
        propagate_down = [True] * len(inputs)
    outputs = L.Python(*inputs, name='tested_py_layer', ntop=len(output_names),
                       python_param=python_param,
                       propagate_down=propagate_down,
                       loss_weight=[1.0] * len(output_names))  # mark this layer as "loss" for gradients

    # A single top comes back bare, several tops come back as a tuple;
    # register every top under its own name.
    tops = outputs if isinstance(outputs, tuple) else (outputs,)
    for out_name, out_top in zip(output_names, tops):
        setattr(ns, out_name, out_top)

    try:
        with open(proto_path, 'w') as tf:
            tf.write('name: "test_py_layer"\n')
            tf.write('force_backward: true\n')  # must have. otherwise python's backward is not called at all.
            tf.write(str(ns.to_proto()))
        net = caffe.Net(proto_path, caffe.TEST)
    finally:
        # Remove the temporary prototxt even when net construction fails.
        if os.path.exists(proto_path):
            os.unlink(proto_path)
    return net, propagate_down
예제 #15
0
    def batch_norm_radnom_data(self):
        """Check a converted BatchNorm layer against a hand-computed result.

        Converts layer 1 of the MatConvNet model, writes a two-layer
        prototxt (dummy input + converted layer), loads it with caffe, sets
        the 'bn_conv1' parameters to mean 1, variance 4 and a third blob of
        1, feeds ``arange(4)`` and compares the output with
        ``(data - 1) / 2`` to one decimal.
        """
        batch_norm_layer_id = 1
        layer_name = 'bn_conv1'
        mu = 1
        sig = 2

        matconv_net = utils.load_matconvnet_from_file(
            'test_data/ucf101-img-resnet-50-split1/net.mat')['net']
        input_layer = L.DummyData(shape=dict(dim=[1, 1, 2, 2]))
        batch_norm_layer = matconv_net.layers[batch_norm_layer_id]
        lr_params_dic = py_matconv_to_caffe.convert_model_params._create_lr_params_dic(
            matconv_net.params)
        layer_lr_params = utils.get_values_for_multi_keys(
            lr_params_dic, batch_norm_layer.params)
        caffe_layer = _dagnn_BatchNorm([input_layer], batch_norm_layer,
                                       layer_lr_params)

        spec = caffe.NetSpec()
        spec.input_layer = input_layer
        setattr(spec, batch_norm_layer.name, caffe_layer)

        output_proto_fn = join('test_data/batch_norm_test/workspace',
                               'net.prototxt')
        prototxt = str(spec.to_proto())
        with open(output_proto_fn, 'w') as prototxt_file:
            prototxt_file.write(prototxt)
        net = caffe.Net(output_proto_fn, caffe.TEST)

        bn_blobs = net.params[layer_name]
        bn_blobs[0].data[...] = np.asarray([mu])
        bn_blobs[1].data[...] = np.asarray([sig**2])
        bn_blobs[2].data[...] = 1

        data = np.arange(4).reshape((1, 1, 2, 2)) * 1.0
        net.blobs['input_layer'].data[...] = data

        # Reference result computed by hand.
        expected = (data - mu) / sig

        net.forward()

        np.testing.assert_array_almost_equal(expected,
                                             net.blobs[layer_name].data,
                                             decimal=1)
예제 #16
0
def gen_net(train_hdf5_in,
            train_batch_size,
            test_hdf5_in,
            test_batch_size,
            deploy=False):
    """Generate the network definition for training/testing or deployment.

    :param train_hdf5_in: ``source`` of the TRAIN-phase HDF5Data layer
    :param train_batch_size: batch size of the TRAIN-phase HDF5Data layer
    :param test_hdf5_in: ``source`` of the TEST-phase HDF5Data layer
    :param test_batch_size: batch size of the TEST-phase HDF5Data layer
    :param deploy: when true, use a 1x1x20x20x20 DummyData input and omit
        the loss layer
    :return: the result of ``NetSpec.to_proto()``
    """
    spec = caffe.NetSpec()

    # Input layers
    if deploy:
        spec.data = L.DummyData(ntop=1, shape=[dict(dim=[1, 1, 20, 20, 20])])
    else:
        spec.data, spec.label = L.HDF5Data(
            ntop=2,
            include=dict(phase=caffe.TRAIN),
            hdf5_data_param=dict(batch_size=train_batch_size),
            source=train_hdf5_in)
        # The TEST-phase layer writes to the same 'data'/'label' tops.
        spec.data2 = L.HDF5Data(
            ntop=0,
            top=['data', 'label'],
            include=dict(phase=caffe.TEST),
            hdf5_data_param=dict(batch_size=test_batch_size),
            source=test_hdf5_in)

    # Core architecture: deconvolution, five conv/BN/ReLU stages, then a
    # single-output convolution added back onto the deconvolution output.
    spec.deconv1 = Deconvolution(spec.data)
    bottom = spec.deconv1
    for index, width in enumerate((64, 64, 32, 16, 16), 1):
        conv, bn, relu = Convolution_BN_ReLU(bottom, num_output=width)
        setattr(spec, 'conv%d' % index, conv)
        setattr(spec, 'bn%d' % index, bn)
        setattr(spec, 'relu%d' % index, relu)
        bottom = relu

    low_lr = [dict(lr_mult=0.1), dict(lr_mult=0.1)]
    spec.conv6 = Convolution(spec.relu5, num_output=1, param=low_lr)
    spec.recon = L.Eltwise(spec.deconv1, spec.conv6, operation=P.Eltwise.SUM)

    # Output layers
    if not deploy:
        spec.loss = L.EuclideanLoss(spec.recon, spec.label)

    return spec.to_proto()
def concat_slice_net():
    """Build a small net demonstrating the Slice, Concat and Eltwise layers.

    :return: the result of ``NetSpec.to_proto()``
    """
    spec = caffe.NetSpec()
    blob_param = dict(num=20,
                      channels=50,
                      height=64,
                      width=64,
                      data_filler=dict(type="gaussian"))
    spec.data = L.DummyData(dummy_data_param=blob_param)

    # Split the input data layer into the outputs a, b and c; the number of
    # slice_points is one less than the number of slices.  Here there are
    # three tops, hence two slice_points.  The first slice_point=20 is the
    # size of top "a", the second slice_point=30 is the size of "a" + "b",
    # and the size of top "c" is channels - second slice_point = 50-30 = 20,
    # so the channels of a, b, c are 20, 10, 20 respectively.
    # NOTE(review): the explanation above talks about the 50 channels, but
    # the Slice is applied with axis=0 while the blob has num=20 -- confirm
    # the intended axis.
    slices = L.Slice(spec.data, ntop=3, slice_point=[20, 30], axis=0)
    spec.a, spec.b, spec.c = slices
    spec.d = L.Concat(spec.a, spec.b, axis=0)

    # Eltwise supports three operations: product (element-wise multiply),
    # sum (add/subtract) and max (maximum); sum is the default.
    spec.e = L.Eltwise(spec.a, spec.c)

    return spec.to_proto()
예제 #18
0
def example_network(batch_size, fname='network.prototxt'):
    """Write a small example network definition to ``fname``.

    Layer chain: two-top DummyData (data and label) -> InnerProduct(3) ->
    Python layer 'LowRankLossLayer' from module 'LowRankLoss'.

    :param batch_size: first dimension of the dummy data and label blobs
    :param fname: path of the prototxt file to (over)write
    """
    n = caffe.NetSpec()

    n.data, n.label = L.DummyData(
        shape=[dict(dim=[batch_size, 3]),
               dict(dim=[batch_size])],
        transform_param=dict(scale=1.0 / 255.0),
        ntop=2)

    n.affine = L.InnerProduct(n.data, num_output=3)
    n.lowrank = L.Python(
        n.affine,
        n.label,
        python_param=dict(module='LowRankLoss', layer='LowRankLossLayer'),
        ntop=1,
    )
    #param_str='{ "param_name": param_value }'),

    # The context manager closes the file even if serialisation fails.
    with open(fname, 'w') as f:
        f.write(str(n.to_proto()))
예제 #19
0
def LowAGAN(w, batchsize, n):
    """Add the "A" layers to NetSpec ``n`` and return it.

    The layers are, in order: an ImageData input that is resized and cropped
    by Python layers; ``level`` uniform DummyData inputs fed through
    ``convBlock`` / ``joinBlock`` (called with train=False) and a 1x1
    "Atexture" convolution whose learning rates are 0; a Python SwapLayer
    producing ``Aswap`` and ``Alabels``; gaussian noise combined with
    ``Aswap``; seven convolutions with leaky ReLUs, global average pooling,
    a single-output InnerProduct and a SigmoidCrossEntropyLoss.

    Reads the module-level name ``full_conv`` and the helpers ``convBlock``
    and ``joinBlock``, none of which are defined in this function.

    :param w: input width; used as crop size and DummyData spatial size
    :param batchsize: batch size of the ImageData and DummyData layers
    :param n: the NetSpec to extend
    :return: ``n``
    """
    # input w = 11 so that output is 10 <= 8*10 is the maximum
    # input w = 16 so that output is 20 <= 4*20 is the maximum

    level = 2
    listofsizes =  []
    if full_conv:
        listofsizes = [w]
    
        for i in range(0, level-1):
            alast = listofsizes[i]
            listofsizes.append((alast - 4)*2)
        listofsizes[0] -= 4


    transform_param = dict(mirror=False, crop_size=w, scale=1., mean_value=103.939)
    if full_conv:
        transform_param = dict(mirror=False, crop_size=120, scale=1., mean_value=103.939)
    n.Adata, n.Anothing = L.ImageData(transform_param=transform_param, source='datasource.txt', 
                            is_color=False, shuffle=True, batch_size=batchsize, ntop=2)
    n.Aresize = L.Python(n.Adata, python_param=dict(module='resizelayer', layer='ResizeData'), param_str=str(4))
    # NOTE(review): listofsizes is only filled when full_conv is truthy, so
    # this index presumably fails otherwise -- confirm.
    n.Acropped = L.Python(n.Aresize, python_param=dict(module='randomrot', layer='RandomRotLayer'), param_str=str(listofsizes[level -1] - 4))

    # Only codings[0] and codings[1] are used below.
    codings = [8, 16, 24, 32, 40]

    d=w
    outname = ""
    for i in range(0,level):
        if full_conv:
            n["AZrand_"+str(i)] = L.DummyData(shape=[dict(dim=[batchsize, 1, listofsizes[level-1 -i] + 4, listofsizes[level-1 -i] + 4])], data_filler=dict(type='uniform',min=0., max=255.), ntop=1)
        else:
            n["AZrand_"+str(i)] = L.DummyData(shape=[dict(dim=[batchsize, 1, d, d])], data_filler=dict(type='uniform',min=0., max=255.), ntop=1)
        n, outname = convBlock("AconvA"+str(i), codings[0], n, "AZrand_"+str(i), train=False)
        # NOTE(review): under Python 3 this is true division and yields a
        # float that is then used as a blob dimension -- confirm.
        d /= 2

    n, outname = joinBlock("AjoinA", codings[0], n, outname, 'gelu'+'AconvA0'+'_3', train=False)

    n, outname = convBlock("AconvB", codings[1], n, outname, train=False)
    convolution_param = dict(num_output=1, kernel_size=1, stride=1, pad=0, weight_filler = dict(type='xavier'))
    # lr_mult=0 / decay_mult=0 for both parameter blobs of this layer.
    n["Atexture"] = L.Convolution(n[outname], convolution_param=convolution_param, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], name="Atextureparam")

    #0 => blurdat
    #1 => data
    n.Aswap, n.Alabels = L.Python(n["Atexture"], n.Acropped, python_param=dict(module='swaplayer', layer='SwapLayer'), propagate_down=[False, False], ntop=2)

    if full_conv:
        n.Anoise = L.DummyData(shape=[dict(dim=[batchsize, 1, listofsizes[level -1] - 4, listofsizes[level -1] - 4])], data_filler=dict(type='gaussian', std=2.0), ntop=1)
    else:
        n.Anoise = L.DummyData(shape=[dict(dim=[batchsize, 1, w, w])], data_filler=dict(type='gaussian', std=2.0), ntop=1)

    # operation 1 is presumably Eltwise SUM -- TODO confirm against caffe.proto
    n.Ainp = L.Eltwise(n.Aswap, n.Anoise, eltwise_param={'operation':1})

    #GAN network
    # Each ReLU below is assigned to the same attribute as its convolution.
    convolution_param = dict(num_output=16, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganAconv1 = L.Convolution(n.Ainp, convolution_param=convolution_param)
    n.ganAconv1 = L.ReLU(n.ganAconv1, negative_slope=0.1)
    
    convolution_param = dict(num_output=16, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganAconv2 = L.Convolution(n.ganAconv1, convolution_param=convolution_param)
    n.ganAconv2 = L.ReLU(n.ganAconv2, negative_slope=0.1)
    
    convolution_param = dict(num_output=32, kernel_size=2, stride=2, pad=0, weight_filler = dict(type='xavier'))
    n.ganAconv3 = L.Convolution(n.ganAconv2, convolution_param=convolution_param)
    n.ganAconv3 = L.ReLU(n.ganAconv3, negative_slope=0.1)

    convolution_param = dict(num_output=32, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganAconv4 = L.Convolution(n.ganAconv3, convolution_param=convolution_param)
    n.ganAconv4 = L.ReLU(n.ganAconv4, negative_slope=0.1)
    
    convolution_param = dict(num_output=32, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganAconv5 = L.Convolution(n.ganAconv4, convolution_param=convolution_param)
    #n.ganAconv5 = L.BatchNorm(n.ganAconv5, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganAconv5 = L.ReLU(n.ganAconv5, negative_slope=0.1)
    convolution_param = dict(num_output=32, kernel_size=2, stride=2, pad=0, weight_filler = dict(type='xavier'))
    n.ganAconv6 = L.Convolution(n.ganAconv5, convolution_param=convolution_param)
    #n.ganAconv6 = L.BatchNorm(n.ganAconv6, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganAconv6 = L.ReLU(n.ganAconv6, negative_slope=0.1)

    convolution_param = dict(num_output=32, kernel_size=1, stride=1, pad=0, weight_filler = dict(type='xavier'))
    n.ganAconv7 = L.Convolution(n.ganAconv6, convolution_param=convolution_param)
    #n.ganAconv7 = L.BatchNorm(n.ganAconv7, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganAconv7 = L.ReLU(n.ganAconv7, negative_slope=0.1)
    n.ganAconv7_pool = L.Pooling(n.ganAconv7, global_pooling=True, pool=P.Pooling.AVE)

    n.Aip3 = L.InnerProduct(n.ganAconv7_pool, num_output=1, weight_filler=dict(type='xavier'), name="last")
    
    n.Aloss = L.SigmoidCrossEntropyLoss(n.Aip3, n.Alabels)

    return n
예제 #20
0
def LowABGAN(w, batchsize, n):
    """Add the "AB" layers to NetSpec ``n`` and return it.

    The layers are, in order: a constant DummyData blob turned into
    ``ABlabels`` by a Python DestroyLayer; ``level`` uniform DummyData inputs
    fed through ``convBlock`` / ``joinBlock`` (called with train=True) and a
    1x1 "ABtexture" convolution; seven convolutions with leaky ReLUs, global
    average pooling and a single-output InnerProduct, all with learning
    rates of 0; and a SigmoidCrossEntropyLoss against ``ABlabels``.

    Reads the module-level name ``full_conv`` and the helpers ``convBlock``
    and ``joinBlock``, none of which are defined in this function.

    :param w: input width; used as the DummyData spatial size
    :param batchsize: batch size of the DummyData layers
    :param n: the NetSpec to extend
    :return: ``n``
    """
    n.ABnothing = L.DummyData(shape=[dict(dim=[batchsize, 1, 1, 1])], data_filler=dict(type='constant'), ntop=1)
    n.ABlabels = L.Python(n.ABnothing, python_param=dict(module='destroy', layer='DestroyLayer'))
    # Only codings[0] and codings[1] are used below.
    codings = [8, 16, 24, 32, 40]

    level = 2
    listofsizes =  []
    if full_conv:
        listofsizes = [w]
    
        for i in range(0, level-1):
            alast = listofsizes[i]
            listofsizes.append((alast - 4)*2)
        listofsizes[0] -= 4

    d=w
    outname = ""
    for i in range(0,level):
        if full_conv:
            n["ABZrand_"+str(i)] = L.DummyData(shape=[dict(dim=[batchsize, 1, listofsizes[level-1 -i] + 4, listofsizes[level-1 -i] + 4])], data_filler=dict(type='uniform',min=0., max=255.), ntop=1)
        else:
            n["ABZrand_"+str(i)] = L.DummyData(shape=[dict(dim=[batchsize, 1, d, d])], data_filler=dict(type='uniform',min=0., max=255.), ntop=1)
        n, outname = convBlock("ABconvA"+str(i), codings[0], n, "ABZrand_"+str(i), train=True)
        # NOTE(review): under Python 3 this is true division and yields a
        # float that is then used as a blob dimension -- confirm.
        d /= 2

    n, outname = joinBlock("ABjoinA", codings[0], n, outname, 'gelu'+'ABconvA0'+'_3', train=True)

    n, outname = convBlock("ABconvB", codings[1], n, outname, train=True)
    convolution_param = dict(num_output=1, kernel_size=1, stride=1, pad=0, weight_filler = dict(type='xavier'))
    # The layer name "Atextureparam" matches the texture layer of LowAGAN;
    # presumably so both nets refer to the same weights -- TODO confirm.
    n["ABtexture"] = L.Convolution(n[outname], convolution_param=convolution_param, name="Atextureparam")

    #GAN network
    # Every layer below has lr_mult=0 / decay_mult=0 for both parameter
    # blobs; each ReLU is assigned to the same attribute as its convolution.
    convolution_param = dict(num_output=16, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganABconv1 = L.Convolution(n["ABtexture"], param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    n.ganABconv1 = L.ReLU(n.ganABconv1, negative_slope=0.1)
    
    convolution_param = dict(num_output=16, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganABconv2 = L.Convolution(n.ganABconv1, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    n.ganABconv2 = L.ReLU(n.ganABconv2, negative_slope=0.1)
    
    convolution_param = dict(num_output=32, kernel_size=2, stride=2, pad=0, weight_filler = dict(type='xavier'))
    n.ganABconv3 = L.Convolution(n.ganABconv2, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    n.ganABconv3 = L.ReLU(n.ganABconv3, negative_slope=0.1)

    convolution_param = dict(num_output=32, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganABconv4 = L.Convolution(n.ganABconv3, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    n.ganABconv4 = L.ReLU(n.ganABconv4, negative_slope=0.1)
    
    convolution_param = dict(num_output=32, kernel_size=3, stride=1, pad=1, weight_filler = dict(type='xavier'))
    n.ganABconv5 = L.Convolution(n.ganABconv4, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    #n.ganAconv5 = L.BatchNorm(n.ganAconv5, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganABconv5 = L.ReLU(n.ganABconv5, negative_slope=0.1)
    convolution_param = dict(num_output=32, kernel_size=2, stride=2, pad=0, weight_filler = dict(type='xavier'))
    n.ganABconv6 = L.Convolution(n.ganABconv5, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    #n.ganAconv6 = L.BatchNorm(n.ganAconv6, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganABconv6 = L.ReLU(n.ganABconv6, negative_slope=0.1)

    convolution_param = dict(num_output=32, kernel_size=1, stride=1, pad=0, weight_filler = dict(type='xavier'))
    n.ganABconv7 = L.Convolution(n.ganABconv6, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], convolution_param=convolution_param)
    #n.ganAconv7 = L.BatchNorm(n.ganAconv7, use_global_stats=global_stat, name=allparamnames.pop(0))#, param=[{"lr_mult":0},{"lr_mult":0},{"lr_mult":0}])
    n.ganABconv7 = L.ReLU(n.ganABconv7, negative_slope=0.1)
    n.ganABconv7_pool = L.Pooling(n.ganABconv7, global_pooling=True, pool=P.Pooling.AVE)

    n.ABip3 = L.InnerProduct(n.ganABconv7_pool, param=[dict(lr_mult=0, decay_mult= 0),dict(lr_mult=0, decay_mult= 0)], num_output=1, weight_filler=dict(type='xavier'), name="last")
    
    n.ABloss = L.SigmoidCrossEntropyLoss(n.ABip3, n.ABlabels)

    return n
def _kernel_stride_kwargs(kernel_size, strides):
    """Translate Keras 2-element kernel/stride pairs into Caffe layer kwargs.

    Equal pairs use the scalar ``kernel_size`` / ``stride`` fields; unequal
    pairs use the explicit ``*_h`` / ``*_w`` fields (index 0 -> h, 1 -> w).
    """
    kwargs = {}
    if kernel_size[0] == kernel_size[1]:
        kwargs['kernel_size'] = kernel_size[0]
    else:
        kwargs['kernel_h'] = kernel_size[0]
        kwargs['kernel_w'] = kernel_size[1]

    if strides[0] == strides[1]:
        kwargs['stride'] = strides[0]
    else:
        kwargs['stride_h'] = strides[0]
        kwargs['stride_w'] = strides[1]
    return kwargs


def _append_activation(caffe_net, name, activation):
    """Add the in-place Caffe layer for an activation fused into a Keras conv.

    The new layer is registered as ``name + 's'``. ``'linear'`` adds nothing;
    anything other than relu/sigmoid/linear raises ``Exception``.
    """
    if activation == 'relu':
        caffe_net[name + 's'] = L.ReLU(caffe_net[name], in_place=True)
    elif activation == 'sigmoid':
        caffe_net[name + 's'] = L.Sigmoid(caffe_net[name], in_place=True)
    elif activation != 'linear':
        raise Exception('Unsupported activation ' + activation)


def convert(keras_model, caffe_net_file, caffe_params_file):
    """Convert a Keras model into a Caffe prototxt plus caffemodel.

    Iterates ``keras_model.layers`` in order, emits one (sometimes two) Caffe
    layers per Keras layer into a ``caffe.NetSpec``, and collects the weights
    re-ordered for Caffe. The prototxt is written to ``caffe_net_file``, the
    net is re-loaded from it in TEST phase, the collected weights are copied
    in, and the result is saved to ``caffe_params_file``.

    Shapes are indexed as ``(batch, rows, cols, channels)`` throughout, i.e.
    the Keras model is assumed to be channels-last.

    :param keras_model: Keras model whose ``layers`` are converted.
    :param caffe_net_file: path the generated prototxt is written to.
    :param caffe_params_file: path the caffemodel is saved to.
    :raises Exception: for a layer with a list of outputs, an unsupported
        layer type or activation, non-square pooling size/stride, or a
        non-uniform upsampling factor.
    """
    caffe_net = caffe.NetSpec()

    # Caffe layer name -> sequence of weight arrays in Caffe blob order.
    net_params = dict()

    # Keras tensor name -> name of the Caffe layer whose top carries it.
    outputs = dict()

    input_str = ''

    for layer in keras_model.layers:
        name = layer.name
        layer_type = type(layer).__name__

        config = layer.get_config()
        blobs = layer.get_weights()

        if isinstance(layer.output, list):
            raise Exception('Layers with multiply outputs are not supported')
        top = layer.output.name

        # Multi-input layers (Concatenate/Merge/Add) resolve their own bottoms.
        if not isinstance(layer.input, list):
            bottom = layer.input.name

        if layer_type == 'InputLayer' or len(caffe_net.tops) == 0:
            # Placeholder layer: it is cut out of the serialized prototxt
            # below and replaced by the "input:" header built here.
            input_name = 'data'
            caffe_net[input_name] = L.Layer()
            input_shape = config['batch_input_shape']
            input_str = 'input: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}'.format(
                '"' + input_name + '"', 1, input_shape[3], input_shape[1],
                input_shape[2])
            outputs[layer.input.name] = input_name
            if layer_type == 'InputLayer':
                continue

        if layer_type == 'Conv2D' or layer_type == 'Convolution2D':

            kwargs = {'num_output': config['filters']}
            kwargs.update(
                _kernel_stride_kwargs(config['kernel_size'],
                                      config['strides']))

            if not config['use_bias']:
                kwargs['bias_term'] = False

            set_padding(config, layer.input_shape, kwargs)

            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)

            # Kernel axes (kh, kw, in, out) -> (out, in, kh, kw).
            blobs[0] = np.array(blobs[0]).transpose(3, 2, 0, 1)
            net_params[name] = blobs

            _append_activation(caffe_net, name, config['activation'])

        elif layer_type == 'Conv2DTranspose':

            strides = config['strides']
            kernel_size = config['kernel_size']
            channels = config['filters']
            use_bias = bool(config['use_bias'])

            # input_shape is (batch, rows, cols, channels).
            in_h = layer.input_shape[1]
            in_w = layer.input_shape[2]

            out_w = math.ceil(in_w / float(strides[1]))
            pad_w = int((kernel_size[1] * out_w -
                         (kernel_size[1] - strides[1]) * (out_w - 1) - in_w) / 2)

            out_h = math.ceil(in_h / float(strides[0]))
            pad_h = int((kernel_size[0] * out_h -
                         (kernel_size[0] - strides[0]) * (out_h - 1) - in_h) / 2)

            # NOTE(review): group=channels makes this a per-channel
            # deconvolution; the weight copy below presumably only matches
            # when the Keras layer has one input channel per filter - confirm.
            conv_param = dict(num_output=channels,
                              group=channels,
                              weight_filler=dict(type='bilinear'),
                              bias_term=use_bias)
            # Scalars / *_h, *_w fields instead of raw tuples, which NetSpec
            # cannot assign to the repeated kernel_size/stride fields.
            conv_param.update(_kernel_stride_kwargs(kernel_size, strides))

            if pad_h == pad_w:
                if pad_w:
                    conv_param['pad'] = pad_w
            else:
                conv_param['pad_h'] = pad_h
                conv_param['pad_w'] = pad_w

            caffe_net[name] = L.Deconvolution(
                caffe_net[outputs[bottom]],
                convolution_param=conv_param,
                param=dict(lr_mult=0, decay_mult=0))

            deconv_blobs = [np.array(blobs[0]).transpose(2, 3, 0, 1)]
            if use_bias:
                # Caffe allocates a second blob when bias_term is set.
                deconv_blobs.append(np.array(blobs[1]))
            net_params[name] = deconv_blobs

            _append_activation(caffe_net, name, config['activation'])

        # Depthwise convolution
        elif layer_type == 'DepthwiseConv2D':

            kwargs = {'num_output': layer.input_shape[3]}
            kwargs.update(
                _kernel_stride_kwargs(config['kernel_size'],
                                      config['strides']))

            set_padding(config, layer.input_shape, kwargs)

            # One group per input channel, never with a bias term.
            kwargs['group'] = layer.input_shape[3]
            kwargs['bias_term'] = False

            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)
            net_params[name] = [np.array(blobs[0]).transpose(2, 3, 0, 1)]

            _append_activation(caffe_net, name, config['activation'])

        elif layer_type == 'SeparableConv2D':

            # Emitted as a grouped (depthwise) convolution followed by a
            # 1x1 (pointwise) convolution named ``name + '_'``.
            # NOTE(review): config['activation'] is not applied here.
            kwargs = {'num_output': layer.input_shape[3]}
            kwargs.update(
                _kernel_stride_kwargs(config['kernel_size'],
                                      config['strides']))

            set_padding(config, layer.input_shape, kwargs)

            kwargs['group'] = layer.input_shape[3]
            kwargs['bias_term'] = False

            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)
            net_params[name] = [np.array(blobs[0]).transpose(2, 3, 0, 1)]

            name2 = name + '_'
            kwargs = {
                'num_output': config['filters'],
                'kernel_size': 1,
                'bias_term': config['use_bias']
            }
            caffe_net[name2] = L.Convolution(caffe_net[name], **kwargs)

            pointwise_blobs = [np.array(blobs[1]).transpose(3, 2, 0, 1)]
            if config['use_bias'] == True:
                pointwise_blobs.append(np.array(blobs[2]))

            net_params[name2] = pointwise_blobs
            # Downstream layers must connect to the pointwise output.
            name = name2

        elif layer_type == 'BatchNormalization':

            # Emitted as BatchNorm (mean/variance) followed by an in-place
            # Scale layer named ``name + 's'`` (gamma/beta).
            variance = np.array(blobs[-1])
            mean = np.array(blobs[-2])

            if config['scale']:
                gamma = np.array(blobs[0])
                scale_specs = [dict(lr_mult=1)]
            else:
                gamma = np.ones(mean.shape, dtype=np.float32)
                scale_specs = [dict(lr_mult=0)]

            if config['center']:
                beta = np.array(blobs[-3])
                # Only add the bias param spec when the Scale layer really
                # has a bias blob; Caffe rejects more specs than blobs.
                scale_specs.append(dict(lr_mult=1))
            else:
                beta = np.zeros(mean.shape, dtype=np.float32)

            bn_kwargs = {'in_place': True}
            if 'epsilon' in config:
                # Carry the Keras epsilon over instead of Caffe's default.
                bn_kwargs['eps'] = config['epsilon']
            caffe_net[name] = L.BatchNorm(caffe_net[outputs[bottom]],
                                          **bn_kwargs)

            # Third blob is the moving-average scale factor.
            net_params[name] = (mean, variance, np.array(1.0))

            name_s = name + 's'

            caffe_net[name_s] = L.Scale(
                caffe_net[name],
                in_place=True,
                param=scale_specs,
                scale_param={'bias_term': config['center']})
            net_params[name_s] = (gamma, beta)

        elif layer_type == 'Dense':
            use_bias = bool(config['use_bias'])
            # bias_term must mirror use_bias so the Caffe layer has exactly
            # as many blobs as are stored in net_params.
            caffe_net[name] = L.InnerProduct(caffe_net[outputs[bottom]],
                                             num_output=config['units'],
                                             bias_term=use_bias,
                                             weight_filler=dict(type='xavier'))

            # (in, out) -> (out, in), with or without bias.
            weight = np.array(blobs[0]).transpose(1, 0)
            if type(layer._inbound_nodes[0].inbound_layers[0]
                    ).__name__ == 'Flatten':
                # The Keras Flatten ran over (rows, cols, channels); re-order
                # every weight row to (channels, rows, cols) to match the
                # blob that the Caffe Flatten layer produces.
                flatten_shape = layer._inbound_nodes[0].inbound_layers[
                    0].input_shape
                for i in range(weight.shape[0]):
                    weight[i] = np.array(weight[i].reshape(
                        flatten_shape[1],
                        flatten_shape[2], flatten_shape[3]).transpose(
                            2, 0, 1).reshape(weight.shape[1]))

            if use_bias:
                net_params[name] = (weight, np.array(blobs[1]))
            else:
                net_params[name] = (weight, )

            # NOTE(review): any other fused activation is silently ignored.
            name_s = name + 's'
            if config['activation'] == 'softmax':
                caffe_net[name_s] = L.Softmax(caffe_net[name], in_place=True)
            elif config['activation'] == 'relu':
                caffe_net[name_s] = L.ReLU(caffe_net[name], in_place=True)
            elif config['activation'] == 'sigmoid':
                caffe_net[name_s] = L.Sigmoid(caffe_net[name], in_place=True)

        elif layer_type == 'Activation':
            if config['activation'] == 'relu':
                # In-place only when this is the sole consumer of the input.
                if len(layer.input.consumers()) > 1:
                    caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]])
                else:
                    caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]],
                                             in_place=True)
            elif config['activation'] == 'relu6':
                # TODO: emitted as a plain ReLU (no upper clamp).
                caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]])
            elif config['activation'] == 'softmax':
                caffe_net[name] = L.Softmax(caffe_net[outputs[bottom]],
                                            in_place=True)
            elif config['activation'] == 'sigmoid':
                caffe_net[name] = L.Sigmoid(caffe_net[outputs[bottom]],
                                            in_place=True)
            else:
                raise Exception('Unsupported activation ' +
                                config['activation'])

        elif layer_type == 'Cropping2D':
            # Crop against a DummyData blob carrying the target shape.
            shape = layer.output_shape
            ddata = L.DummyData(shape=dict(
                dim=[1, shape[3], shape[1], shape[2]]))
            layers = []
            layers.append(caffe_net[outputs[bottom]])
            layers.append(ddata)  # TODO
            caffe_net[name] = L.Crop(*layers)

        elif layer_type == 'Concatenate' or layer_type == 'Merge':
            layers = [caffe_net[outputs[i.name]] for i in layer.input]
            caffe_net[name] = L.Concat(*layers, axis=1)

        elif layer_type == 'Add':
            layers = [caffe_net[outputs[i.name]] for i in layer.input]
            caffe_net[name] = L.Eltwise(*layers)

        elif layer_type == 'Flatten':
            caffe_net[name] = L.Flatten(caffe_net[outputs[bottom]])

        elif layer_type == 'Reshape':
            shape = config['target_shape']
            if len(shape) == 3:
                shape = (1, shape[2], shape[0], shape[1])
            elif len(shape) == 1:
                shape = (1, 1, 1, shape[0])
            elif len(shape) == 2:
                shape = (0, shape[1], -1, 0)
            caffe_net[name] = L.Reshape(
                caffe_net[outputs[bottom]],
                reshape_param={'shape': {
                    'dim': list(shape)
                }})

        elif layer_type == 'MaxPooling2D' or layer_type == 'AveragePooling2D':

            kwargs = {}

            if layer_type == 'MaxPooling2D':
                kwargs['pool'] = P.Pooling.MAX
            else:
                kwargs['pool'] = P.Pooling.AVE

            pool_size = config['pool_size']
            strides = config['strides']

            if pool_size[0] != pool_size[1]:
                raise Exception('Unsupported pool_size')

            if strides[0] != strides[1]:
                raise Exception('Unsupported strides')

            set_padding(config, layer.input_shape, kwargs)

            caffe_net[name] = L.Pooling(caffe_net[outputs[bottom]],
                                        kernel_size=pool_size[0],
                                        stride=strides[0],
                                        **kwargs)

        elif layer_type == 'Dropout':
            caffe_net[name] = L.Dropout(
                caffe_net[outputs[bottom]],
                dropout_param=dict(dropout_ratio=config['rate']))

        elif layer_type == 'GlobalAveragePooling2D':
            caffe_net[name] = L.Pooling(
                caffe_net[outputs[bottom]],
                pool=P.Pooling.AVE,
                pooling_param=dict(global_pooling=True))

        elif layer_type == 'UpSampling2D':
            if config['size'][0] != config['size'][1]:
                raise Exception('Unsupported upsampling factor')
            # Per-channel deconvolution initialised by the bilinear filler;
            # its weights are not overwritten in the copy loop below.
            factor = config['size'][0]
            kernel_size = 2 * factor - factor % 2
            stride = factor
            pad = int(math.ceil((factor - 1) / 2.0))
            channels = layer.input_shape[-1]
            caffe_net[name] = L.Deconvolution(
                caffe_net[outputs[bottom]],
                convolution_param=dict(num_output=channels,
                                       group=channels,
                                       kernel_size=kernel_size,
                                       stride=stride,
                                       pad=pad,
                                       weight_filler=dict(type='bilinear'),
                                       bias_term=False),
                param=dict(lr_mult=0, decay_mult=0))

        elif layer_type == 'LeakyReLU':
            caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]],
                                     negative_slope=config['alpha'],
                                     in_place=True)

        # Caffe has no ZeroPadding2D layer, so that op is not converted;
        # padding has to be folded into the conv/deconv/pooling layers.

        else:
            raise Exception('Unsupported layer type: ' + layer_type)

        outputs[top] = name

    # Drop everything up to and including the first serialized layer (the
    # placeholder input layer) and put the "input:" header in its place.
    net_proto = input_str + '\n' + 'layer {' + 'layer {'.join(
        str(caffe_net.to_proto()).split('layer {')[2:])

    with open(caffe_net_file, 'w') as f:
        f.write(net_proto)

    caffe_model = caffe.Net(caffe_net_file, caffe.TEST)

    for param_name in caffe_model.params.keys():
        # Upsampling layers keep the weights from their bilinear filler.
        if 'up_sampling2d' in param_name:
            continue
        for n in range(0, len(caffe_model.params[param_name])):
            caffe_model.params[param_name][n].data[...] = net_params[param_name][n]

    caffe_model.save(caffe_params_file)
예제 #22
0
    def resnet_mask_rcnn_rpn(self, stage=1):
        """Build the ResNet backbone with RPN and return the net as a proto.

        The backbone (conv1, pool1 and every entry of ``self.stages`` except
        the last) is built with ``fixed=True`` only when ``stage == 1``.

        In deploy mode the RPN outputs are fed to ``self.roi_proposals`` and
        nothing else is added. Otherwise the last entry of ``self.stages`` is
        built on top of a DummyData blob named ``dummy_roi_pool_conv5``; for
        ``stage == 1`` its output is silenced, for ``stage == 2`` the
        class/bbox heads and two mask convolutions are added and silenced.

        :param stage: training stage selector (1 or 2 are handled).
        :return: result of ``self.net.to_proto()``.
        """
        channals = self.channals
        if self.deploy:
            data, im_info = self.data_layer_test()
            gt_boxes = None
        else:
            data, im_info, gt_boxes = self.data_layer_train()

        pre_traned_fixed = bool(stage == 1)

        conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3,
                                  bias_term=True, fixed=pre_traned_fixed)
        out = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
        index = 1

        if self.module == "normal":
            residual_block = self.residual_block
        else:
            residual_block = self.residual_block_basic

        for block_count in self.stages[:-1]:
            index += 1
            for j in range(block_count):
                block_name = "res{}{}".format(index, ascii_lowercase[j])
                if j != 0:
                    out = residual_block(block_name, out, channals,
                                         fixed=pre_traned_fixed)
                    continue
                # First block of a stage carries the stride; only the stage
                # numbered 2 keeps stride 1.
                stride = 1 if index == 2 else 2
                out = residual_block(block_name, out, channals, stride,
                                     fixed=pre_traned_fixed)
            channals *= 2

        if self.deploy:
            rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(
                out, gt_boxes, im_info, data)
            _rois, _scores = self.roi_proposals(
                rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)
            return self.net.to_proto()

        (rpn_cls_loss, rpn_loss_bbox,
         rpn_cls_score_reshape, rpn_bbox_pred) = self.rpn(
             out, gt_boxes, im_info, data)

        # Stand-in input for the last stage during training.
        dummy_name = "dummy_roi_pool_conv5"
        self.net[dummy_name] = L.DummyData(
            name=dummy_name, shape=[dict(dim=[1, channals * 2, 14, 14])])
        out = self.net[dummy_name]

        index += 1
        for j in range(self.stages[-1]):
            block_name = "res{}{}".format(index, ascii_lowercase[j])
            if j == 0:
                out = residual_block(block_name, out, channals, 1)
            else:
                out = residual_block(block_name, out, channals)

        if stage == 1:
            self.net["silence_res"] = L.Silence(out, ntop=0)
        elif stage == 2:
            # Heads for bbox detection.
            pool5 = self.ave_pool(7, 1, "pool5", out)
            cls_score, bbox_pred = self.final_cls_bbox(pool5)
            self.net["silence_cls_score"] = L.Silence(cls_score, ntop=0)
            self.net["silence_bbox_pred"] = L.Silence(bbox_pred, ntop=0)

            # Heads for mask prediction.
            mask_conv1 = self.conv_factory("mask_conv1", out, 3, 256, 1, 1,
                                           bias_term=True)
            mask_out = self.conv_factory("mask_out", mask_conv1, 1,
                                         self.classes, 1, 0, bias_term=True)
            self.net["silence_mask_out"] = L.Silence(mask_out, ntop=0)

        return self.net.to_proto()
예제 #23
0
 def dummy_data_layer(self, shape, filler=1):
     """Return a single-top DummyData layer with the given blob shape.

     ``shape`` is a list of dimensions; ``filler`` is passed to
     ``self.constant_filler`` to build the data filler.
     """
     blob_shape = dict(dim=shape)
     fill_spec = self.constant_filler(filler)
     return L.DummyData(shape=[blob_shape], data_filler=[fill_spec], ntop=1)
예제 #24
0
파일: import.py 프로젝트: kuangliu/mocha
def input_layer(layer_config):
    """Return a single-top DummyData layer shaped by ``layer_config['input_shape']``."""
    blob_shape = dict(dim=layer_config['input_shape'])
    return L.DummyData(shape=[blob_shape], ntop=1)
예제 #25
0
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFB co-attention VQA net and return it as a proto.

    The net is assembled in four parts: question embedding + LSTM, question
    attention, image attention with MFB, and the final image-question MFB
    fusion feeding the answer classifier. Sizes and dropout ratios come from
    the module-level ``config``.

    :param mode: ``'val'`` selects the ``vqa_data_layer_hdf5`` provider and a
        SoftmaxWithLoss; any other value selects ``vqa_data_layer_kld_hdf5``
        and a SoftmaxKLDLoss.
    :param batchsize: batch size, forwarded to the data layer and used for
        the DummyData blobs fed to the SoftAttention layers.
    :param T: unused here; kept for signature compatibility.
    :param question_vocab_size: ``input_dim`` of the word Embed layer.
    :param folder: forwarded to the data layer through ``param_str``.
    :return: result of ``NetSpec.to_proto()``.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize, 'folder': folder})

    # Only the provider module differs between validation and training.
    if mode == 'val':
        provider_module = 'vqa_data_layer_hdf5'
    else:
        provider_module = 'vqa_data_layer_kld_hdf5'
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module=provider_module, layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)
    concat_word_embed = [n.embed_tanh, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1, 2, 0]))
    n.lstm1_resh2 = L.Reshape(n.lstm1_resh,
                              reshape_param=dict(shape=dict(dim=[0, 0, 0, 1])))

    # ---- Question attention ----
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_QUESTION_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(
        dim=[-1, config.NUM_QUESTION_GLIMPSE, config.MAX_WORDS_IN_QUESTION, 1])))  # N*NUM_QUESTION_GLIMPSE*15
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)

    qatt_maps = L.Slice(n.qatt_softmax, ntop=config.NUM_QUESTION_GLIMPSE, slice_param={'axis': 1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]),
                             data_filler=dict(type='constant', value=1), ntop=1)
    qatt_feature_list = []
    for i in range(config.NUM_QUESTION_GLIMPSE):
        feat_name = 'qatt_feat%d' % i
        # With a single glimpse Slice yields one top rather than a sequence.
        if config.NUM_QUESTION_GLIMPSE == 1:
            n[feat_name] = L.SoftAttention(n.lstm1_resh2, qatt_maps, dummy_lstm)
        else:
            n[feat_name] = L.SoftAttention(n.lstm1_resh2, qatt_maps[i], dummy_lstm)
        qatt_feature_list.append(n[feat_name])
    n.qatt_feat_concat = L.Concat(*qatt_feature_list)

    # ---- Image attention with MFB ----
    n.q_feat_resh = L.Reshape(n.qatt_feat_concat, reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))
    n.i_feat_resh = L.Reshape(n.img_feature, reshape_param=dict(shape=dict(
        dim=[0, -1, config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))

    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE,
                                   weight_filler=dict(type='xavier'))
    n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(
        dim=[-1, config.JOINT_EMB_SIZE, 1, 1])))
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH)

    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1,
                                  num_output=config.JOINT_EMB_SIZE, pad=0,
                                  weight_filler=dict(type='xavier'))
    n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(
        dim=[-1, config.JOINT_EMB_SIZE, config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0))
    n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    # Distinct name: this reshape previously shared 'iatt_iq_resh2' with the
    # reshape after the permute, so it lost its name in the prototxt.
    n.iatt_iq_resh1 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(
        dim=[-1, config.JOINT_EMB_SIZE, config.IMG_FEAT_SIZE, 1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh1, permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(
        dim=[-1, config.IMG_FEAT_SIZE, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh2, pool=P.Pooling.SUM,
                                  pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0, 2, 1, 3]))

    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)

    ## 2 conv layers 1000 -> 512 -> 2
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_IMG_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(
        dim=[-1, config.NUM_IMG_GLIMPSE, config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(n.iatt_softmax, reshape_param=dict(shape=dict(
        dim=[-1, config.NUM_IMG_GLIMPSE, config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE, slice_param={'axis': 1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    iatt_feature_list = []
    for i in range(config.NUM_IMG_GLIMPSE):
        feat_name = 'iatt_feat%d' % i
        resh_name = 'iatt_feat%d_resh' % i
        # With a single glimpse Slice yields one top rather than a sequence.
        if config.NUM_IMG_GLIMPSE == 1:
            n[feat_name] = L.SoftAttention(n.i_feat_resh, iatt_maps, dummy)
        else:
            n[feat_name] = L.SoftAttention(n.i_feat_resh, iatt_maps[i], dummy)
        n[resh_name] = L.Reshape(n[feat_name],
                                 reshape_param=dict(shape=dict(dim=[0, -1])))
        iatt_feature_list.append(n[resh_name])
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))

    # ---- Fine-grained image-question MFB fusion ----
    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0))
    n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(
        dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM,
                                 pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(n.mfb_iq_sumpool,
                          reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)

    n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the NetSpec of a two-LSTM question encoder with two-map attention.

    Question tokens are embedded, concatenated with the ``glove`` top of the
    data layer and passed through two stacked LSTM layers.  The last slice of
    each LSTM output is concatenated into a 2048-wide question vector, which is
    combined with the image feature through ``CompactBilinear`` twice: first to
    derive two 14x14 soft-attention maps, then (with the attended image
    features) to produce the final 3000-way prediction.

    Args:
        mode: Forwarded to the data layer inside the JSON ``param_str``; it is
            not used for anything else in this function.
        batchsize: Forwarded to the data layer and used as the first dimension
            of the constant ``DummyData`` blob passed to ``SoftAttention``.
        T: Number of slices taken from each LSTM output along axis 0.  Only the
            last slice is kept; the first ``T - 1`` are silenced.
        question_vocab_size: ``input_dim`` of the question ``Embed`` layer.

    Returns:
        The value of ``n.to_proto()`` for the assembled net.
    """
    # NetSpec is used to generate the network without a hand-written prototxt.
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Caffe "Python layer" format, see
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe
    # The layer is created from the named class and runs automatically once
    # its tops are populated; the actual data-batch loading happens inside
    # that class.
    #
    # GloVe = Global Vectors for Word Representation
    # https://www.aclweb.org/anthology/D14-1162
    # The pretrained GloVe vectors are used in the Concat below.
    #
    # Original author's note (not verifiable from this function): img_feature
    # is an already-preprocessed feature vector, i.e. it has been passed
    # through "Resnet512" and L2-normalised beforehand.
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)
    # module    = name of the Python file
    # layer     = Python class that follows the layer interface
    # param_str = JSON-encoded parameters used when loading data (original
    #             note: stored on the layer instance as self.param_str)
    # ntop      = number of top blobs seen by setup / forward / backward

    # A textual Embed maps text (token ids) to numbers: each of the
    # question_vocab_size ids is represented compactly as a 300-d vector.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # Apply TanH.
    n.embed = L.TanH(n.embed_ba)
    # Concatenate with the GloVe data.
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` rather than the Python-2-only `xrange`, so the builder also runs
    # on Python 3; the iteration itself is unchanged.
    for i in range(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[i])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})

    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # Give each tops2[i] the name 'slice_second<i>' (NetSpec attribute
    # assignment is what names a top).
    for i in range(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[i])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[i], ntop=0))

    # Use the last LSTM output.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # lstm1 output => reshape to 1024, then dropout
    # lstm2 output => reshape to 1024, then dropout
    # then concatenate the two

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Dimensions are not matched automatically, so the question vector is
    # tiled explicitly: 14x along axis 2, then 14x along axis 3, to line up
    # with the [-1, 2048, 14, 14] image reshape below.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)

    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Unlike the figure in the paper, a Dropout is added here.
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output dimension is 2.
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    # The softmax produces the attention maps: two 14x14 softmax maps.

    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)
    # Each attention map is applied to the image feature separately and the
    # two results are concatenated.

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # ...then reshaped to 4096 channels.

    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))

    # Unlike the paper, the two input vectors have different sizes:
    #   paper: 2048 and 2048
    #   code:  4096 and 2048
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    # SignedSqrt
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    # L2 normalisation
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    # Dropout
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected prediction layer
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
예제 #27
0
def qlstm(mode, batchsize, T, question_vocab_size):
    """Assemble the NetSpec of a VQA net: two stacked LSTMs plus two-map attention.

    The data layer's ``data`` top is embedded (300-d, TanH) and concatenated
    with its ``glove`` top, then run through two LSTM layers.  The final slice
    of each LSTM output is reshaped, dropped out and concatenated into
    ``lstm_12``.  That vector is tiled over a 14x14 grid and fused with the
    reshaped image feature by ``CompactBilinear`` to obtain two attention
    maps; the attended image features are fused with ``lstm_12`` by a second
    ``CompactBilinear`` that feeds the 3000-way ``prediction`` layer.

    Args:
        mode: Placed in the JSON ``param_str`` handed to the data layer.
        batchsize: Placed in the same ``param_str``; also the first dimension
            of the constant ``DummyData`` blob given to ``SoftAttention``.
        T: Number of tops the LSTM outputs are sliced into along axis 0.  The
            first ``T - 1`` are silenced and the last one is used.
        question_vocab_size: ``input_dim`` of the question ``Embed`` layer.

    Returns:
        The value of ``n.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # Python data layer exposing five tops.
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    # Learned word embedding, concatenated with the data layer's glove top.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` instead of the Python-2-only `xrange` keeps this working on
    # Python 3 as well; the loop visits the same indices.
    for i in range(T - 1):
        # Name every non-final slice and silence it so it is not left as a
        # dangling top.
        n.__setattr__('slice_first' + str(i), tops1[i])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2 (consumes the dropped-out full LSTM1 output)
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in range(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[i])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    # Question vector: last step of both LSTMs, concatenated.
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question vector 14x along axes 2 and 3 so it lines up with the
    # [-1, 2048, 14, 14] image reshape below.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    # Softmax over the flattened 14*14 positions of each of the two maps.
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # Constant blob of ones passed as the third bottom of SoftAttention.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Classifier and loss.
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
예제 #28
0
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    """Assemble the NetSpec of a VQA net with an additional explanation-attention branch.

    The first half builds a question encoder (Embed + two LSTMs), fuses it
    with the image feature by element-wise product, derives a single 14x14
    attention map and predicts a 3000-way answer.  The second half embeds an
    answer id (ground truth or the arg-max of ``prediction``), fuses it with
    the question/image product, derives a second attention map and ends in
    ``exp_att_feature``, which is silenced.

    Layers created with ``param=fixed_weights`` / ``param=fixed_weights_lstm``
    rely on those two names being defined outside this function.

    Args:
        mode: Placed in the JSON ``param_str`` handed to the data layer.
        batchsize: Placed in the same ``param_str``; also the first dimension
            of the constant ``DummyData`` blobs given to ``SoftAttention``.
        T: Number of tops each LSTM output is sliced into along axis 0; only
            the last one is used.
        exp_T: Accepted but not used in this function.
        question_vocab_size: ``input_dim`` of the question ``Embed`` layer.
        exp_vocab_size: Accepted but not used in this function.
        use_gt: When true the answer embedding reads ``label``; otherwise it
            reads the ``ArgMax`` of ``prediction``.

    Returns:
        The value of ``to_proto()`` for the assembled net.
    """
    net = caffe.NetSpec()

    # ---- local builders for layer patterns that repeat below ----
    def xavier():
        return dict(type='xavier')

    def uniform():
        return dict(type='uniform', min=-0.08, max=0.08)

    def reshape(bottom, dims):
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def dropout(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def product(lhs, rhs):
        return L.Eltwise(lhs, rhs, eltwise_param={'operation': P.Eltwise.PROD})

    def conv1x1(bottom, channels, **extra):
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=xavier(), **extra)

    def dense(bottom, units, **extra):
        return L.InnerProduct(bottom, num_output=units,
                              weight_filler=xavier(), **extra)

    def lstm(bottom):
        return L.LSTM(bottom, net.cont,
                      recurrent_param=dict(
                          num_output=1024,
                          weight_filler=uniform(),
                          bias_filler=dict(type='constant', value=0)),
                      param=fixed_weights_lstm)

    def keep_last_step(sequence, slice_prefix, silence_prefix):
        # Slice into T tops, name and silence the first T-1, hand back the last.
        steps = L.Slice(sequence, ntop=T, slice_param={'axis': 0})
        for idx in range(T - 1):
            setattr(net, slice_prefix + str(idx), steps[idx])
            setattr(net, silence_prefix + str(idx),
                    L.Silence(steps[idx], ntop=0))
        return steps[T - 1]

    def ones_blob():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    # ---- data layer ----
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.data, net.cont, net.img_feature, net.label,
     net.exp, net.exp_out, net.exp_cont_1, net.exp_cont_2) = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=provider_args,
        ntop=8)

    # ---- question embedding ----
    net.embed_ba = L.Embed(net.data,
                           input_dim=question_vocab_size,
                           num_output=300,
                           weight_filler=uniform(),
                           param=fixed_weights)
    net.embed = L.TanH(net.embed_ba)

    # ---- LSTM1 ----
    net.lstm1 = lstm(net.embed)
    net.lstm1_out = keep_last_step(net.lstm1,
                                   'slice_first', 'silence_data_first')
    net.lstm1_reshaped = reshape(net.lstm1_out, [-1, 1024])
    net.lstm1_reshaped_droped = dropout(net.lstm1_reshaped)
    net.lstm1_droped = dropout(net.lstm1)

    # ---- LSTM2 ----
    net.lstm2 = lstm(net.lstm1_droped)
    net.lstm2_out = keep_last_step(net.lstm2,
                                   'slice_second', 'silence_data_second')
    net.lstm2_reshaped = reshape(net.lstm2_out, [-1, 1024])
    net.lstm2_reshaped_droped = dropout(net.lstm2_reshaped)
    net.lstm_12 = L.Concat(net.lstm1_reshaped_droped,
                           net.lstm2_reshaped_droped)

    # ---- tile the question feature over the 14x14 grid ----
    net.q_emb_resh = reshape(net.lstm_12, [-1, 2048, 1, 1])
    net.q_emb_tiled_1 = L.Tile(net.q_emb_resh, axis=2, tiles=14)
    net.q_emb_resh_tiled = L.Tile(net.q_emb_tiled_1, axis=3, tiles=14)

    # ---- embed the image feature ----
    net.i_emb = conv1x1(net.img_feature, 2048, param=fixed_weights)

    # ---- element-wise product and normalisation ----
    net.eltwise = product(net.q_emb_resh_tiled, net.i_emb)
    net.eltwise_sqrt = L.SignedSqrt(net.eltwise)
    net.eltwise_l2 = L.L2Normalize(net.eltwise_sqrt)
    net.eltwise_drop = dropout(net.eltwise_l2)

    # ---- attention for VQA ----
    net.att_conv1 = conv1x1(net.eltwise_drop, 512, param=fixed_weights)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1, param=fixed_weights)
    net.att_reshaped = reshape(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape(net.att_softmax, [-1, 1, 14, 14])

    dummy = ones_blob()
    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, dummy)
    net.att_feature_resh = reshape(net.att_feature, [-1, 2048])

    # ---- second product + normalisation for VQA ----
    net.i_emb2 = dense(net.att_feature_resh, 2048, param=fixed_weights)
    net.eltwise2 = product(net.lstm_12, net.i_emb2)
    net.eltwise2_sqrt = L.SignedSqrt(net.eltwise2)
    net.eltwise2_l2 = L.L2Normalize(net.eltwise2_sqrt)
    net.eltwise2_drop = dropout(net.eltwise2_l2)

    net.prediction = dense(net.eltwise2_drop, 3000, param=fixed_weights)

    # ---- answer embedding: ground-truth label, or the model's own arg-max ----
    if use_gt:
        answer_ids = net.label
    else:
        net.vqa_ans = L.ArgMax(net.prediction, axis=1)
        answer_ids = net.vqa_ans
    net.exp_emb_ans = L.Embed(answer_ids,
                              input_dim=3000,
                              num_output=300,
                              weight_filler=uniform())
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = dense(net.exp_emb_ans_tanh, 2048)

    # ---- merge the answer with the visual+textual feature ----
    net.exp_emb_resh = reshape(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)

    # The explanation branch is fed from the raw product `eltwise` through a
    # fresh 1x1 convolution, not from the normalised `eltwise_drop`.
    net.eltwise_emb = conv1x1(net.eltwise, 2048)
    net.exp_eltwise = product(net.eltwise_emb, net.exp_emb_tiled)
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = dropout(net.exp_eltwise_l2)

    # ---- attention for the explanation ----
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape(net.exp_att_softmax, [-1, 1, 14, 14])

    exp_dummy = ones_blob()
    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map, exp_dummy)
    net.exp_att_feature_resh = reshape(net.exp_att_feature_prev, [-1, 2048])
    net.exp_att_feature_embed = dense(net.exp_att_feature_resh, 2048)
    net.exp_lstm12_embed = dense(net.lstm_12, 2048)
    net.exp_eltwise2 = product(net.exp_lstm12_embed,
                               net.exp_att_feature_embed)
    net.exp_att_feature = product(net.exp_emb_ans2, net.exp_eltwise2)

    net.silence_exp_att = L.Silence(net.exp_att_feature, ntop=0)

    return net.to_proto()
def silent_net():
    """Return the proto of a net whose two DummyData tops are both silenced.

    A single ``DummyData`` layer produces the tops ``data`` and ``data2``;
    each is consumed by its own ``Silence`` layer declared with ``ntop=0``.
    """
    spec = caffe.NetSpec()
    spec.data, spec.data2 = L.DummyData(shape={'dim': 3}, ntop=2)
    # One Silence layer per top.
    spec.silence_data = L.Silence(spec.data, ntop=0)
    spec.silence_data2 = L.Silence(spec.data2, ntop=0)
    return spec.to_proto()
예제 #30
0
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=6)#5 )

    # char embedding
    n.embed_c = L.Embed(n.data1, input_dim=question_c_vocab_size, num_output=15, \
         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c,
                              n.cont1,
                              scale_param=dict(dict(axis=0)))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T_c *
                                           T, -1])))  # N x 1 x T_c x d_c
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})
    for i in xrange(T):
        n.__setattr__('slice_' + str(i + 1), tops[int(i)])

    # char conv
    n.c_feature_1 = L.Convolution(
        n.slice_1,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_2 = L.Convolution(
        n.slice_2,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_3 = L.Convolution(
        n.slice_3,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_4 = L.Convolution(
        n.slice_4,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_5 = L.Convolution(
        n.slice_5,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_6 = L.Convolution(
        n.slice_6,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_7 = L.Convolution(
        n.slice_7,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_8 = L.Convolution(
        n.slice_8,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_9 = L.Convolution(
        n.slice_9,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_10 = L.Convolution(
        n.slice_10,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_11 = L.Convolution(
        n.slice_11,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_12 = L.Convolution(
        n.slice_12,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_13 = L.Convolution(
        n.slice_13,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_14 = L.Convolution(
        n.slice_14,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_15 = L.Convolution(
        n.slice_15,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_16 = L.Convolution(
        n.slice_16,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_17 = L.Convolution(
        n.slice_17,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_18 = L.Convolution(
        n.slice_18,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_19 = L.Convolution(
        n.slice_19,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_20 = L.Convolution(
        n.slice_20,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_21 = L.Convolution(
        n.slice_21,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])
    n.c_feature_22 = L.Convolution(
        n.slice_22,
        convolution_param={
            'kernel_h': 3,
            'kernel_w': 15,
            'stride': 1,
            'num_output': 150,
            'pad_h': 1,
            'pad_w': 0,
            'weight_filler': dict(type='xavier')
        },
        param=[dict(name="conv_c_w"),
               dict(name="conv_c_b")])

    n.c_vec_1 = L.Pooling(n.c_feature_1,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_2 = L.Pooling(n.c_feature_2,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_3 = L.Pooling(n.c_feature_3,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_4 = L.Pooling(n.c_feature_4,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_5 = L.Pooling(n.c_feature_5,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_6 = L.Pooling(n.c_feature_6,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_7 = L.Pooling(n.c_feature_7,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_8 = L.Pooling(n.c_feature_8,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_9 = L.Pooling(n.c_feature_9,
                          kernel_h=T_c,
                          kernel_w=1,
                          stride=T_c,
                          pool=P.Pooling.MAX)
    n.c_vec_10 = L.Pooling(n.c_feature_10,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_11 = L.Pooling(n.c_feature_11,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_12 = L.Pooling(n.c_feature_12,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_13 = L.Pooling(n.c_feature_13,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_14 = L.Pooling(n.c_feature_14,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_15 = L.Pooling(n.c_feature_15,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_16 = L.Pooling(n.c_feature_16,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_17 = L.Pooling(n.c_feature_17,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_18 = L.Pooling(n.c_feature_18,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_19 = L.Pooling(n.c_feature_19,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_20 = L.Pooling(n.c_feature_20,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_21 = L.Pooling(n.c_feature_21,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)
    n.c_vec_22 = L.Pooling(n.c_feature_22,
                           kernel_h=T_c,
                           kernel_w=1,
                           stride=T_c,
                           pool=P.Pooling.MAX)

    n.c_embed_1 = L.Reshape(
        n.c_vec_1, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_2 = L.Reshape(
        n.c_vec_2, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_3 = L.Reshape(
        n.c_vec_3, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_4 = L.Reshape(
        n.c_vec_4, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_5 = L.Reshape(
        n.c_vec_5, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_6 = L.Reshape(
        n.c_vec_6, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_7 = L.Reshape(
        n.c_vec_7, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_8 = L.Reshape(
        n.c_vec_8, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_9 = L.Reshape(
        n.c_vec_9, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_10 = L.Reshape(
        n.c_vec_10, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_11 = L.Reshape(
        n.c_vec_11, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_12 = L.Reshape(
        n.c_vec_12, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_13 = L.Reshape(
        n.c_vec_13, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_14 = L.Reshape(
        n.c_vec_14, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_15 = L.Reshape(
        n.c_vec_15, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_16 = L.Reshape(
        n.c_vec_16, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_17 = L.Reshape(
        n.c_vec_17, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_18 = L.Reshape(
        n.c_vec_18, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_19 = L.Reshape(
        n.c_vec_19, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_20 = L.Reshape(
        n.c_vec_20, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_21 = L.Reshape(
        n.c_vec_21, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
    n.c_embed_22 = L.Reshape(
        n.c_vec_22, reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))

    concat_c_embed = [n.c_embed_1, n.c_embed_2, n.c_embed_3, n.c_embed_4, n.c_embed_5, n.c_embed_6, n.c_embed_7, n.c_embed_8, n.c_embed_9, n.c_embed_10,\
     n.c_embed_11, n.c_embed_12, n.c_embed_13, n.c_embed_14, n.c_embed_15, n.c_embed_16, n.c_embed_17, n.c_embed_18, n.c_embed_19, n.c_embed_20, n.c_embed_21, n.c_embed_22]
    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(n.data, input_dim=question_vocab_size, num_output=150, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08)) # N x T x d_w

    # combine word and char embedding
    concat_word_embed = [n.embed_w, n.concat_char_embed]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # N x T x (d_c+d_w)

    n.embed_scale = L.Scale(n.concat_embed,
                            n.cont,
                            scale_param=dict(dict(axis=0)))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(
            dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # n.glove_scale = L.Scale(n.glove, n.cont, scale_param=dict(dict(axis=0)))
    # n.glove_scale_resh = L.Reshape(n.glove_scale,\
    #                       reshape_param=dict(\
    #                           shape=dict(dim=[batchsize,1,T,300])))
    # concat_word_embed = [n.embed_scale_resh, n.glove_scale_resh]
    # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 1}) # N x 2 x T x 300

    # convolution
    n.word_feature_2 = L.Convolution(
        n.embed_scale_resh,
        kernel_h=2,
        kernel_w=300,
        stride=1,
        num_output=512,
        pad_h=1,
        pad_w=0,
        weight_filler=dict(type='xavier'))  # N x C x ? x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=3,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=2,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=4,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=3,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh,
                                     kernel_h=5,
                                     kernel_w=300,
                                     stride=1,
                                     num_output=512,
                                     pad_h=4,
                                     pad_w=0,
                                     weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2,
                             kernel_h=T + 1,
                             kernel_w=1,
                             stride=T + 1,
                             pool=P.Pooling.MAX)  # N x C x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3,
                             kernel_h=T + 2,
                             kernel_w=1,
                             stride=T + 2,
                             pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4,
                             kernel_h=T + 3,
                             kernel_w=1,
                             stride=T + 3,
                             pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5,
                             kernel_h=T + 4,
                             kernel_w=1,
                             stride=T + 4,
                             pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis':
                                                     1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge the attention feature with the convolutional question vector (concat_vec_dropped) via compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.concat_vec_dropped,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()