def deploy_net(conf, batch_size, class_num):
    """Build the deploy network and return its proto description.

    Layer chain: DummyData -> Convolution(20) -> BatchNorm -> PReLU ->
    InnerProduct(100) -> Dropout(0.1) -> PReLU -> InnerProduct(class_num).

    :param conf: configuration object. When ``conf.use_CK`` is exactly
        ``True`` the input height and convolution kernel height are read from
        ``conf.CK_channels`` / ``conf.CK_kernel_size``; otherwise they come
        from ``conf.channels`` / ``conf.kernel_size``.
    :param batch_size: first dimension of the dummy input blob
    :param class_num: ``num_output`` of the final InnerProduct layer
    :return: the result of ``NetSpec.to_proto()`` for the assembled network
    """
    # Identity comparison is deliberate: only the literal True selects the
    # CK settings, any other truthy value falls through to the defaults.
    if conf.use_CK is True:
        input_height = conf.CK_channels
        kernel_height = conf.CK_kernel_size
    else:
        input_height = conf.channels
        kernel_height = conf.kernel_size

    def fillers():
        # Fresh dicts on every call so no layer shares filler objects.
        return dict(weight_filler=dict(type='gaussian', std=0.05),
                    bias_filler=dict(type='constant', value=0.1))

    n = caffe.NetSpec()
    n.data, n.label = L.DummyData(
        shape={'dim': [batch_size, 1, input_height, 1]}, ntop=2)
    n.conv1 = L.Convolution(n.data, kernel_h=kernel_height, kernel_w=1,
                            num_output=20, **fillers())
    n.bn1 = L.BatchNorm(n.conv1, use_global_stats=1, in_place=True)
    n.relu1 = L.PReLU(n.bn1, in_place=True)
    n.ip1 = L.InnerProduct(n.relu1, num_output=100, **fillers())
    n.drop1 = L.Dropout(n.ip1, dropout_ratio=0.1, in_place=True)
    n.relu2 = L.PReLU(n.drop1, in_place=True)
    n.ip2 = L.InnerProduct(n.relu2, num_output=class_num, **fillers())
    return n.to_proto()
def gradient_y(bottom):
    """Return an AbsVal layer over the difference of two crops of ``bottom``.

    ``bottom`` is cropped twice against the same dummy reference blob
    (dims ``[1, 1, 99, 100]``), once with offset ``[1, 0]`` and once with
    offset ``[0, 0]``. The crops are combined by an Eltwise SUM with
    coefficients ``1.0`` and ``-1.0`` (i.e. a subtraction) and the absolute
    value of that difference is returned.

    :param bottom: the layer output to differentiate
    :return: the ``L.AbsVal`` top
    """
    reference = L.DummyData(
        dummy_data_param=dict(shape=[dict(dim=[1, 1, 99, 100])]))

    shifted = L.Crop(bottom, reference, crop_param=dict(offset=[1, 0]))
    unshifted = L.Crop(bottom, reference, crop_param=dict(offset=[0, 0]))

    subtraction = dict(operation=P.Eltwise.SUM, coeff=[1.0, -1.0])
    difference = L.Eltwise(shifted, unshifted, eltwise_param=subtraction)
    return L.AbsVal(difference)
def __init__(self, data_shape, label_shape, last_layer='fc8', params=None):
    """Create the dummy inputs, build the net and save its cumulative strides.

    :param data_shape: shape of the data blob; the first entry is replaced
        by 1 before use
    :param label_shape: shape of the label blob; the first entry is replaced
        by 1 before use
    :param last_layer: stored on the instance as ``self.last_layer``
    :param params: forwarded to ``self.net(params=...)``. It must provide
        ``params['path2net']``, which determines where the stride file is
        written: ``<path2net up to the first dot of its file name>_stride.npy``.
    """
    # Local import keeps this method self-contained.
    import os

    # Avoid a shared mutable default; an empty list is still what the
    # builder receives when the caller passes nothing.
    if params is None:
        params = []

    data_shape_list = list(data_shape)
    data_shape_list[0] = 1
    label_shape_list = list(label_shape)
    label_shape_list[0] = 1

    self.data_shape = data_shape_list
    self.data = L.DummyData(shape=dict(dim=data_shape_list))
    self.label = L.DummyData(shape=dict(dim=label_shape_list))
    self.last_layer = last_layer

    # Reset before the builder runs; this value is turned into a cumulative
    # product below. NOTE(review): presumably self.net() appends one stride
    # per layer to this list -- confirm against the builder.
    self.receptiveFieldStride = []
    # The builder method is replaced by its own return value.
    self.net = self.net(params=params)
    # self.solver = self.solver(params)

    # cumprod of the stride values across the whole net
    self.receptiveFieldStride = np.cumprod(
        np.asarray(self.receptiveFieldStride))

    # Strip the extension from the file name only. Splitting the whole path
    # on '.' cut paths such as './net.prototxt' or 'models.v2/net.prototxt'
    # at the first dot of a directory component.
    net_dir, net_file = os.path.split(params['path2net'])
    stem = os.path.join(net_dir, net_file.split('.')[0])
    np.save(stem + '_stride.npy', self.receptiveFieldStride)
def build_retrieval_model_deploy(self, save_tag, visual_feature_dim,
                                 language_feature_dim):
    """Assemble the deploy retrieval net on ``self.n`` and write it out.

    Dummy inputs are created for the image features, the location features
    and the text (plus a ``cont`` input when ``self.language_layers`` is in
    ``recurrent_layers``). The visual and language models from
    ``self.get_models()`` are applied and their outputs are passed to
    ``self.distance_function``; the result is stored as top ``rank_score``.

    :param save_tag: passed through to ``self.write_net``
    :param visual_feature_dim: last dimension of the image input
    :param language_feature_dim: last dimension of the text input
    """
    rows = 21  # size shared by every dummy input below

    def dummy_input(dims):
        return L.DummyData(shape=[dict(dim=dims)], ntop=1)

    image_input = dummy_input([rows, visual_feature_dim])
    self.n.image_data = image_input
    loc_input = dummy_input([rows, 2])
    self.n.loc_data = loc_input

    im_model, lang_model = self.get_models()
    bottom_visual = im_model(image_input, loc_input)

    if self.language_layers in recurrent_layers:
        text_input = dummy_input(
            [self.params['sentence_length'], rows, language_feature_dim])
        self.n.text_data = text_input
        cont_input = dummy_input([self.params['sentence_length'], rows])
        self.n.cont_data = cont_input
        bottom_text = lang_model(text_input, cont_input)
    else:
        text_input = dummy_input([rows, language_feature_dim])
        bottom_text = lang_model(text_input)
        # With language_layers == '0' the language model's output itself is
        # registered as 'text_data'; otherwise the raw input is.
        if self.language_layers == '0':
            self.n.text_data = bottom_text
        else:
            self.n.text_data = text_input

    self.n.tops['rank_score'] = self.distance_function(bottom_visual,
                                                       bottom_text)
    self.write_net(save_tag, self.n)
def create_architecture(self, mode, hdf5_data):
    """Return the architecture (a ``caffe.NetSpec``) of the model.

    The input layer depends on ``mode``; the remaining layers are created by
    walking ``self.arch``, a sequence of ``(layer_type, vals)`` pairs.
    Supported layer types are ``'e2e'``, ``'e2n'``, ``'fc'``, ``'out'``,
    ``'dropout'`` and ``'relu'``. Each type is bound to a fixed attribute
    name on the NetSpec, so a repeated type re-binds that name.

    :param mode: ``'deploy'`` uses a DummyData input of
        ``pars['deploy_dims']``; ``'train'`` reads HDF5 data with
        ``pars['train_batch_size']``; any other value is treated as test and
        uses ``pars['test_batch_size']``
    :param hdf5_data: ``source`` of the HDF5Data layer (unused for deploy)
    :return: the populated ``caffe.NetSpec``
    :raises ValueError: for an unknown layer type, or when ``mode`` is not
        ``'deploy'`` and ``pars['loss']`` is not ``'EuclideanLoss'``
    """
    arch = self.arch
    pars = self.pars
    n = caffe.NetSpec()

    if mode == 'deploy':
        n.data = L.DummyData(shape=[dict(dim=pars['deploy_dims'])])
    else:
        # Train and test differ only in which batch size they read.
        batch_key = 'train_batch_size' if mode == 'train' else 'test_batch_size'
        n.data, n.label = L.HDF5Data(batch_size=pars[batch_key],
                                     source=hdf5_data,
                                     ntop=pars['ntop'])

    in_layer = n.data
    for layer_type, vals in arch:
        if layer_type == 'e2e':
            in_layer = n.e2e = e2e_conv(in_layer, vals['n_filters'],
                                        vals['kernel_h'], vals['kernel_w'])
        elif layer_type == 'e2n':
            in_layer = n.e2n = e2n_conv(in_layer, vals['n_filters'],
                                        vals['kernel_h'], vals['kernel_w'])
        elif layer_type == 'fc':
            in_layer = n.fc = full_connect(in_layer, vals['n_filters'])
        elif layer_type == 'out':
            # The output layer does not become the input of later layers.
            n.out = full_connect(in_layer, vals['n_filters'])
        elif layer_type == 'dropout':
            in_layer = n.dropout = L.Dropout(
                in_layer, in_place=True,
                dropout_param=dict(dropout_ratio=vals['dropout_ratio']))
        elif layer_type == 'relu':
            in_layer = n.relu = L.ReLU(
                in_layer, in_place=True,
                relu_param=dict(negative_slope=vals['negative_slope']))
        else:
            raise ValueError('Unknown layer type: ' + str(layer_type))

    if mode != 'deploy':
        if self.pars['loss'] == 'EuclideanLoss':
            n.loss = L.EuclideanLoss(n.out, n.label)
        else:
            # Previously this exception was constructed but never raised, so
            # an unsupported loss silently produced a net without a loss.
            raise ValueError(
                "Only 'EuclideanLoss' currently implemented for pars['loss']!")

    return n
def vgg_net(mode, batch_size=1):
    """Build a partial VGG-style ``caffe.NetSpec``.

    This is not the whole network: ReLU layers etc. are missing.

    :param mode: ``"cl"`` pads the first convolution by 1 and ends with two
        InnerProduct layers; ``"sg"`` pads the first convolution by 96 and
        ends with two Convolution layers instead
    :param batch_size: first dimension of the ``[batch_size, 3, 224, 224]``
        dummy input
    :return: the populated ``caffe.NetSpec``
    :raises ValueError: if ``mode`` is neither ``"cl"`` nor ``"sg"``
    """
    if mode == "cl":
        first_pad = 1
    elif mode == "sg":
        first_pad = 96
    else:
        raise ValueError

    net = caffe.NetSpec()
    net.data = L.DummyData(shape=[dict(dim=[batch_size, 3, 224, 224])],
                           ntop=1)

    # (stage number, convolutions in the stage, num_output of each)
    stages = ((1, 2, 64), (2, 2, 128), (3, 3, 256), (4, 3, 512), (5, 3, 512))
    bottom = net.data
    for stage, conv_count, width in stages:
        for index in range(1, conv_count + 1):
            # Only the very first convolution uses the mode-dependent pad.
            pad = first_pad if (stage, index) == (1, 1) else 1
            conv = L.Convolution(bottom, kernel_size=3, pad=pad,
                                 num_output=width)
            setattr(net, 'conv%d_%d' % (stage, index), conv)
            bottom = conv
        pool = L.Pooling(bottom, kernel_size=2, stride=2,
                         pool=P.Pooling.MAX)
        setattr(net, 'pool%d' % stage, pool)
        bottom = pool

    if mode == "cl":
        net.fc6 = L.InnerProduct(net.pool5, num_output=4096)
        net.fc7 = L.InnerProduct(net.fc6, num_output=4096)
    elif mode == "sg":
        net.fc6 = L.Convolution(net.pool5, kernel_size=7, pad=0,
                                num_output=4096)
        net.fc7 = L.Convolution(net.fc6, kernel_size=1, pad=0,
                                num_output=4096)
    else:
        raise ValueError
    return net
def __init__(self, batch_size=32, shape=(32, 32)):
    """Create an empty NetSpec with a two-top DummyData input.

    :param batch_size: first dimension of both the data and the label blob
    :param shape: ``shape[0]`` and ``shape[1]`` become the last two
        dimensions of the single-channel data blob
    """
    # Counter for layers of different types, e.g. conv, relu, pool.
    self.counters = {}
    self.n = caffe.NetSpec()

    data_dims = dict(dim=[batch_size, 1, shape[0], shape[1]])
    label_dims = dict(dim=[batch_size, 1, 1, 1])

    # Dummy data layer must be edited manually in prototxt
    data_top, label_top = layers.DummyData(
        shape=[data_dims, label_dims],
        transform_param=dict(scale=1. / 255),
        ntop=2)
    self.n.data = data_top
    self.n.label = label_top
def build_relational_model_deploy(self, save_tag, visual_feature_dim,
                                  language_feature_dim):
    """Assemble the deploy relational net on ``self.n`` and write it out.

    The per-item image input is tiled 21 times along axis 1, concatenated
    with the global image input along axis 2 and fed to the visual model.
    The language model output is reshaped to ``[self.batch_size, 1, -1]``,
    tiled 21 times along axis 1, and compared with the visual output through
    ``self.distance_function``; element 0 of that result becomes top
    ``scores``.

    :param save_tag: passed through to ``self.write_net``
    :param visual_feature_dim: image feature size; the inputs use
        ``visual_feature_dim + 2`` as their last dimension
    :param language_feature_dim: last dimension of the text input
    """
    count = 21
    visual_width = visual_feature_dim + 2

    def dummy_input(dims):
        return L.DummyData(shape=[dict(dim=dims)], ntop=1)

    image_input = dummy_input([count, 1, visual_width])
    self.n.image_data = image_input
    image_global = dummy_input([count, count, visual_width])
    self.n.global_data = image_global

    im_model, lang_model = self.get_models()
    self.silence_count += 1

    tiled_image = L.Tile(image_input, axis=1, tiles=count)
    joined = L.Concat(tiled_image, image_global, axis=2)
    bottom_visual = im_model(joined, axis=2)

    text_input = dummy_input(
        [self.params['sentence_length'], count, language_feature_dim])
    self.n.text_data = text_input
    cont_input = dummy_input([self.params['sentence_length'], count])
    self.n.cont_data = cont_input
    bottom_text = lang_model(text_input, cont_input)

    reshaped_text = L.Reshape(bottom_text,
                              shape=dict(dim=[self.batch_size, 1, -1]))
    tiled_text = L.Tile(reshaped_text, axis=1, tiles=count)

    distance = self.distance_function(bottom_visual, tiled_text)
    self.n.tops['scores'] = distance[0]
    self.write_net(save_tag, self.n)
def conv_pool_net():
    """Build a small conv/relu/pool demo net and return its proto.

    One conv -> relu -> pool stage is applied to the data blob, then two more
    stages are stacked on top while reusing the attribute names ``conv1``,
    ``relu1`` and ``pool1``. An InnerProduct layer and a
    SigmoidCrossEntropyLoss against the dummy label close the net.

    :return: the result of ``NetSpec.to_proto()``
    """
    net = caffe.NetSpec()

    def gaussian_blob(channels, height, width):
        return L.DummyData(dummy_data_param=dict(
            num=20, channels=channels, height=height, width=width,
            data_filler=dict(type="gaussian")))

    def add_stage(bottom, num_output, stride, pad):
        # Always binds the same three attribute names on purpose.
        net.conv1 = L.Convolution(bottom, num_output=num_output,
                                  kernel_size=4, stride=stride, pad=pad)
        net.relu1 = L.ReLU(net.conv1, in_place=True)
        net.pool1 = L.Pooling(net.relu1, pool=P.Pooling.MAX,
                              kernel_size=2, stride=2)

    net.data = gaussian_blob(1, 64, 64)
    net.label = gaussian_blob(10, 1, 1)

    add_stage(net.data, num_output=20, stride=3, pad=0)

    # When the same variable name is reused, caffe automatically gives the
    # earlier layers its own generated names; only the last use keeps the
    # user-defined name.
    for _ in range(2):
        add_stage(net.pool1, num_output=10, stride=2, pad=3)

    net.ip2 = L.InnerProduct(net.pool1, num_output=10,
                             weight_filler=dict(type='xavier'))
    net.loss = L.SigmoidCrossEntropyLoss(net.ip2, net.label)
    return net.to_proto()
def net():
    """Build a two-layer fully connected demo net and return its proto.

    Gaussian-filled dummy blobs provide a ``10x1x28x28`` data input and a
    ``10x1x1x1`` label. The data goes through InnerProduct(50) -> ReLU ->
    InnerProduct(4) and is scored with SoftmaxWithLoss against the label.

    :return: the result of ``NetSpec.to_proto()``
    """
    spec = caffe.NetSpec()
    xavier = dict(type='xavier')

    def gaussian_blob(height, width):
        return L.DummyData(dummy_data_param=dict(
            num=10, channels=1, height=height, width=width,
            data_filler=dict(type='gaussian')))

    spec.data = gaussian_blob(28, 28)
    spec.label = gaussian_blob(1, 1)

    spec.ip1 = L.InnerProduct(spec.data, num_output=50, weight_filler=xavier)
    spec.relu1 = L.ReLU(spec.ip1, in_place=True)
    spec.ip2 = L.InnerProduct(spec.relu1, num_output=4, weight_filler=xavier)
    spec.loss = L.SoftmaxWithLoss(spec.ip2, spec.label)
    return spec.to_proto()
def anon_lenet(batch_size):
    """Build LeNet from anonymous layers (no NetSpec) and return its proto.

    The layers are chained through local variables only, and the proto is
    generated from the final loss top.

    :param batch_size: first dimension of the data and label dummy blobs
    :return: the result of ``loss.to_proto()``
    """
    def xavier_conv(bottom, num_output):
        return L.Convolution(bottom, kernel_size=5, num_output=num_output,
                             weight_filler=dict(type='xavier'))

    def max_pool(bottom):
        return L.Pooling(bottom, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    def xavier_ip(bottom, num_output):
        return L.InnerProduct(bottom, num_output=num_output,
                              weight_filler=dict(type='xavier'))

    data, label = L.DummyData(
        shape=[dict(dim=[batch_size, 1, 28, 28]),
               dict(dim=[batch_size, 1, 1, 1])],
        transform_param=dict(scale=1. / 255),
        ntop=2)

    features = max_pool(xavier_conv(data, 20))
    features = max_pool(xavier_conv(features, 50))
    hidden = L.ReLU(xavier_ip(features, 500), in_place=True)
    scores = xavier_ip(hidden, 10)
    return L.SoftmaxWithLoss(scores, label).to_proto()
def encoder_network(batch_size):
    """Build a net that feeds two dummy blobs into a Python accuracy layer.

    The DummyData layer produces two tops of shape ``[1]`` which are bound to
    ``loss`` and ``label``, the two names consumed by the ``PythonAccuracy``
    layer.

    NOTE(review): the previous code bound both DummyData tops, as a tuple, to
    the single name ``image`` and then read the ``loss`` and ``label`` tops,
    which were never defined, so the function could not run. The top names
    used here are inferred from the Python layer's bottoms -- confirm them.
    NOTE(review): ``param_str`` is still the placeholder text
    ``{ "param_name": param_value }``.

    :param batch_size: currently unused; the dummy blobs have fixed shape
    :return: the result of ``NetSpec.to_proto()``
    """
    n = caffe.NetSpec()
    n.loss, n.label = L.DummyData(
        shape=[dict(dim=[1]), dict(dim=[1])],
        transform_param=dict(scale=1.0 / 255.0),
        ntop=2)
    n.accuracy = L.Python(
        n.loss, n.label,
        python_param=dict(module='python_accuracy',
                          layer='PythonAccuracy',
                          param_str='{ "param_name": param_value }'),
        ntop=1,
    )
    return n.to_proto()
def lenet(batch_size):
    """Build LeNet on a ``caffe.NetSpec`` and return its proto.

    Layer chain: DummyData -> conv1(20) -> pool1 -> conv2(50) -> pool2 ->
    ip1(500) -> relu1 -> ip2(10) -> SoftmaxWithLoss against the dummy label.

    :param batch_size: first dimension of the data and label dummy blobs
    :return: the result of ``NetSpec.to_proto()``
    """
    net = caffe.NetSpec()

    def xavier():
        return dict(type='xavier')

    def max_pool(bottom):
        return L.Pooling(bottom, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    input_shapes = [dict(dim=[batch_size, 1, 28, 28]),
                    dict(dim=[batch_size, 1, 1, 1])]
    net.data, net.label = L.DummyData(shape=input_shapes,
                                      transform_param=dict(scale=1. / 255),
                                      ntop=2)

    net.conv1 = L.Convolution(net.data, kernel_size=5, num_output=20,
                              weight_filler=xavier())
    net.pool1 = max_pool(net.conv1)
    net.conv2 = L.Convolution(net.pool1, kernel_size=5, num_output=50,
                              weight_filler=xavier())
    net.pool2 = max_pool(net.conv2)

    net.ip1 = L.InnerProduct(net.pool2, num_output=500,
                             weight_filler=xavier())
    net.relu1 = L.ReLU(net.ip1, in_place=True)
    net.ip2 = L.InnerProduct(net.relu1, num_output=10,
                             weight_filler=xavier())
    net.loss = L.SoftmaxWithLoss(net.ip2, net.label)
    return net.to_proto()
def make_net_from_python_layer(input_names_and_values, output_names,
                               py_module, py_layer, param_str=None,
                               propagate_down=None):
    """
    wrap a python layer in a "net"

    One DummyData layer is created per input (shaped like the given array)
    and all of them feed a single Python layer named ``tested_py_layer``.
    The net is written to ``./test_py_layer.prototxt``, loaded in TEST phase
    and the file is removed again, even if loading fails.

    :param input_names_and_values: list of tuples
        [(in_name, in_data_np_array),...]
    :param output_names: names of outputs, list of strings (at least one)
    :param py_module: string, "module" parameter of python layer
    :param py_layer: string, "layer" parameter for python layer
    :param param_str: optional string, "param_str" for python layer
        (default is None)
    :param propagate_down: list of booleans, one per input; defaults to all
        True
    :return: caffe.Net object encapsulating the tested layer
             updated propagate_down boolean vector
    """
    # build net
    ns = caffe.NetSpec()
    inputs = []
    for in_ in input_names_and_values:
        in_name, in_value = in_[0], in_[1]
        in_layer = L.DummyData(
            name=in_name,
            dummy_data_param={'shape': {'dim': list(in_value.shape)}})
        setattr(ns, in_name, in_layer)
        inputs.append(in_layer)
    # Result is discarded; presumably a leftover sanity check -- kept as is.
    str(ns.to_proto())

    python_param = {'module': py_module, 'layer': py_layer}
    if param_str:
        python_param['param_str'] = param_str

    if propagate_down is None:
        # propagate_down is specified per bottom blob, so the default must
        # match the number of inputs, not the number of outputs.
        propagate_down = [True] * len(inputs)

    outputs = L.Python(*inputs,
                       name='tested_py_layer',
                       ntop=len(output_names),
                       python_param=python_param,
                       propagate_down=propagate_down,
                       # mark this layer as "loss" for gradients
                       loss_weight=[1.0 for _ in output_names])

    # With ntop == 1 pycaffe returns a single top, otherwise a tuple of
    # tops; every top has to be registered under its own name.
    tops = [outputs] if len(output_names) == 1 else outputs
    for out_name, out_top in zip(output_names, tops):
        setattr(ns, out_name, out_top)

    proto_path = './test_py_layer.prototxt'
    try:
        with open(proto_path, 'w') as tf:
            tf.write('name: "test_py_layer"\n')
            # must have. otherwise python's backward is not called at all.
            tf.write('force_backward: true\n')
            tf.write(str(ns.to_proto()))
        net = caffe.Net(proto_path, caffe.TEST)
    finally:
        # Do not leave the temporary prototxt behind when loading fails.
        if os.path.exists(proto_path):
            os.unlink(proto_path)
    return net, propagate_down
def batch_norm_radnom_data(self):
    """Check a converted BatchNorm layer against a hand-computed result.

    Layer 1 of the MatConvNet model is converted with ``_dagnn_BatchNorm``
    on top of a ``1x1x2x2`` dummy input, written to a prototxt and loaded as
    a caffe net. The three parameter blobs of ``bn_conv1`` are set to
    ``mu``, ``sig ** 2`` and ``1``; the input ``0..3`` is then expected to
    come out as ``(data - mu) / sig`` to one decimal place.
    """
    mat_net = utils.load_matconvnet_from_file(
        'test_data/ucf101-img-resnet-50-split1/net.mat')['net']
    bn_index = 1

    dummy_input = L.DummyData(shape=dict(dim=[1, 1, 2, 2]))
    bn_layer = mat_net.layers[bn_index]
    lr_params = py_matconv_to_caffe.convert_model_params._create_lr_params_dic(
        mat_net.params)
    bn_lr_values = utils.get_values_for_multi_keys(lr_params, bn_layer.params)
    converted = _dagnn_BatchNorm([dummy_input], bn_layer, bn_lr_values)

    spec = caffe.NetSpec()
    spec.input_layer = dummy_input
    setattr(spec, bn_layer.name, converted)

    proto_text = str(spec.to_proto())
    proto_path = join('test_data/batch_norm_test/workspace', 'net.prototxt')
    with open(proto_path, 'w') as handle:
        handle.write(proto_text)

    net = caffe.Net(proto_path, caffe.TEST)

    layer_name = 'bn_conv1'
    mu, sig = 1, 2
    bn_blobs = net.params[layer_name]
    bn_blobs[0].data[...] = np.asarray([mu])
    bn_blobs[1].data[...] = np.asarray([sig**2])
    bn_blobs[2].data[...] = 1

    data = np.arange(4).reshape((1, 1, 2, 2)) * 1.0
    net.blobs['input_layer'].data[...] = data
    expected = (data - mu) / sig

    net.forward()
    np.testing.assert_array_almost_equal(expected,
                                         net.blobs[layer_name].data,
                                         decimal=1)
def gen_net(train_hdf5_in, train_batch_size, test_hdf5_in, test_batch_size,
            deploy=False):
    """Build the reconstruction network and return its proto.

    The core is Deconvolution -> five Convolution_BN_ReLU stages (64, 64, 32,
    16, 16 outputs) -> a single-output Convolution, whose result is summed
    with the Deconvolution output to give ``recon``.

    :param train_hdf5_in: ``source`` of the TRAIN-phase HDF5Data layer
    :param train_batch_size: batch size of the TRAIN-phase HDF5Data layer
    :param test_hdf5_in: ``source`` of the TEST-phase HDF5Data layer
    :param test_batch_size: batch size of the TEST-phase HDF5Data layer
    :param deploy: when true, use a ``[1, 1, 20, 20, 20]`` DummyData input
        and omit the HDF5 inputs and the EuclideanLoss
    :return: the result of ``NetSpec.to_proto()``
    """
    net = caffe.NetSpec()

    # Input layers
    if deploy:
        net.data = L.DummyData(ntop=1, shape=[dict(dim=[1, 1, 20, 20, 20])])
    else:
        net.data, net.label = L.HDF5Data(
            ntop=2,
            include=dict(phase=caffe.TRAIN),
            hdf5_data_param=dict(batch_size=train_batch_size),
            source=train_hdf5_in)
        # The TEST-phase layer names its tops explicitly so that it writes
        # to the same 'data' / 'label' blobs as the TRAIN-phase layer.
        net.data2 = L.HDF5Data(
            ntop=0,
            top=['data', 'label'],
            include=dict(phase=caffe.TEST),
            hdf5_data_param=dict(batch_size=test_batch_size),
            source=test_hdf5_in)

    # Core architecture
    net.deconv1 = Deconvolution(net.data)
    bottom = net.deconv1
    for index, width in enumerate((64, 64, 32, 16, 16), 1):
        conv, bn, relu = Convolution_BN_ReLU(bottom, num_output=width)
        setattr(net, 'conv%d' % index, conv)
        setattr(net, 'bn%d' % index, bn)
        setattr(net, 'relu%d' % index, relu)
        bottom = relu
    net.conv6 = Convolution(bottom, num_output=1,
                            param=[dict(lr_mult=0.1), dict(lr_mult=0.1)])
    net.recon = L.Eltwise(net.deconv1, net.conv6, operation=P.Eltwise.SUM)

    # Output layers
    if not deploy:
        net.loss = L.EuclideanLoss(net.recon, net.label)

    return net.to_proto()
def concat_slice_net():
    """Build a Slice / Concat / Eltwise demo net and return its proto.

    The ``20x50x64x64`` data blob is sliced along the channel axis into
    ``a``, ``b`` and ``c``; ``a`` and ``b`` are concatenated again along the
    same axis, and ``a`` and ``c`` are combined element-wise.

    The slice and concat axis used to be 0, which contradicted the intended
    channel split (20/10/20 out of 50 channels) and is not a valid split of
    a batch of 20 at points 20 and 30.

    :return: the result of ``NetSpec.to_proto()``
    """
    channel_axis = 1

    n = caffe.NetSpec()
    n.data = L.DummyData(dummy_data_param=dict(
        num=20, channels=50, height=64, width=64,
        data_filler=dict(type="gaussian")))

    # Split the data layer into outputs a, b and c. There is one slice_point
    # fewer than there are outputs: three tops need two slice points.
    # The first slice_point (20) is the size of top "a"; the second (30) is
    # the combined size of tops "a" and "b"; top "c" receives the rest,
    # channels - 30 = 50 - 30 = 20.
    # The channels of a, b and c are therefore 20, 10 and 20.
    n.a, n.b, n.c = L.Slice(n.data, ntop=3, slice_point=[20, 30],
                            axis=channel_axis)
    n.d = L.Concat(n.a, n.b, axis=channel_axis)

    # Eltwise supports three operations: product (element-wise multiply),
    # sum (add/subtract) and max; sum is the default.
    n.e = L.Eltwise(n.a, n.c)
    return n.to_proto()
def example_network(batch_size, fname='network.prototxt'):
    """Write an example net with a Python ``LowRankLossLayer`` to ``fname``.

    Layer chain: DummyData (data ``[batch_size, 3]``, label ``[batch_size]``)
    -> InnerProduct(3) -> Python layer ``LowRankLoss.LowRankLossLayer`` fed
    with the InnerProduct output and the label.

    :param batch_size: first dimension of the data and label dummy blobs
    :param fname: path of the prototxt file to write
    """
    n = caffe.NetSpec()
    n.data, n.label = L.DummyData(
        shape=[dict(dim=[batch_size, 3]), dict(dim=[batch_size])],
        transform_param=dict(scale=1.0 / 255.0),
        ntop=2)
    n.affine = L.InnerProduct(n.data, num_output=3)
    n.lowrank = L.Python(
        n.affine, n.label,
        python_param=dict(module='LowRankLoss', layer='LowRankLossLayer'),
        ntop=1,
    )

    # Serialize first so a failure cannot leave a truncated file behind, and
    # let the context manager close the handle even if the write fails.
    proto_text = str(n.to_proto())
    with open(fname, 'w') as f:
        f.write(proto_text)
def LowAGAN(w, batchsize, n):
    """Add the "A" branch (frozen texture conv + discriminator) to ``n``.

    Layers added, in order:

    * an ImageData input that is resized and randomly cropped by two Python
      layers,
    * ``level`` uniform-noise DummyData inputs, each run through
      ``convBlock``, then ``joinBlock`` and a final ``convBlock``,
    * a 1x1 ``Atexture`` convolution whose ``lr_mult`` and ``decay_mult``
      are 0,
    * a ``SwapLayer`` Python layer that mixes the texture with the cropped
      data and emits labels, plus additive gaussian noise,
    * seven Convolution + ReLU(0.1) stages, global average pooling, an
      InnerProduct named ``last`` and a SigmoidCrossEntropyLoss.

    Reads the module-level flag ``full_conv``.

    :param w: input size (an int); used as crop size and blob side length
        when ``full_conv`` is false
    :param batchsize: batch size of the input and of all dummy blobs
    :param n: the ``caffe.NetSpec`` to extend
    :return: the same ``n``
    """
    # input w = 11 so that the output is 10; 8*10 is the maximum
    # input w = 16 so that the output is 20; 4*20 is the maximum
    level = 2
    codings = [8, 16, 24, 32, 40]

    listofsizes = []
    if full_conv:
        listofsizes = [w]
        for i in range(0, level - 1):
            listofsizes.append((listofsizes[i] - 4) * 2)
        listofsizes[0] -= 4

    transform_param = dict(mirror=False, crop_size=120 if full_conv else w,
                           scale=1., mean_value=103.939)
    n.Adata, n.Anothing = L.ImageData(transform_param=transform_param,
                                      source='datasource.txt',
                                      is_color=False, shuffle=True,
                                      batch_size=batchsize, ntop=2)
    n.Aresize = L.Python(n.Adata,
                         python_param=dict(module='resizelayer',
                                           layer='ResizeData'),
                         param_str=str(4))
    # NOTE(review): listofsizes is only filled when full_conv is true, so
    # this index raises IndexError otherwise -- unchanged from before.
    n.Acropped = L.Python(n.Aresize,
                          python_param=dict(module='randomrot',
                                            layer='RandomRotLayer'),
                          param_str=str(listofsizes[level - 1] - 4))

    d = w
    outname = ""
    for i in range(0, level):
        blob_name = "AZrand_" + str(i)
        side = listofsizes[level - 1 - i] + 4 if full_conv else d
        n[blob_name] = L.DummyData(
            shape=[dict(dim=[batchsize, 1, side, side])],
            data_filler=dict(type='uniform', min=0., max=255.),
            ntop=1)
        n, outname = convBlock("AconvA" + str(i), codings[0], n, blob_name,
                               train=False)
        # Floor division keeps d an int under Python 3 as well; true
        # division would turn the blob dimensions into floats.
        d //= 2

    n, outname = joinBlock("AjoinA", codings[0], n, outname,
                           'gelu' + 'AconvA0' + '_3', train=False)
    n, outname = convBlock("AconvB", codings[1], n, outname, train=False)

    convolution_param = dict(num_output=1, kernel_size=1, stride=1, pad=0,
                             weight_filler=dict(type='xavier'))
    n["Atexture"] = L.Convolution(
        n[outname],
        convolution_param=convolution_param,
        param=[dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)],
        name="Atextureparam")

    # 0 => blurdat
    # 1 => data
    n.Aswap, n.Alabels = L.Python(n["Atexture"], n.Acropped,
                                  python_param=dict(module='swaplayer',
                                                    layer='SwapLayer'),
                                  propagate_down=[False, False], ntop=2)

    noise_side = listofsizes[level - 1] - 4 if full_conv else w
    n.Anoise = L.DummyData(
        shape=[dict(dim=[batchsize, 1, noise_side, noise_side])],
        data_filler=dict(type='gaussian', std=2.0),
        ntop=1)
    # operation 1 is SUM in caffe.proto's EltwiseOp enum
    n.Ainp = L.Eltwise(n.Aswap, n.Anoise, eltwise_param={'operation': 1})

    # GAN network: (num_output, kernel_size, stride, pad) per stage. The
    # ReLU deliberately re-binds the convolution's name.
    discriminator = [(16, 3, 1, 1), (16, 3, 1, 1), (32, 2, 2, 0),
                     (32, 3, 1, 1), (32, 3, 1, 1), (32, 2, 2, 0),
                     (32, 1, 1, 0)]
    bottom = n.Ainp
    for index, (num_output, kernel_size, stride, pad) in enumerate(
            discriminator, 1):
        layer_name = "ganAconv" + str(index)
        convolution_param = dict(num_output=num_output,
                                 kernel_size=kernel_size, stride=stride,
                                 pad=pad, weight_filler=dict(type='xavier'))
        n[layer_name] = L.Convolution(bottom,
                                      convolution_param=convolution_param)
        n[layer_name] = L.ReLU(n[layer_name], negative_slope=0.1)
        bottom = n[layer_name]

    n.ganAconv7_pool = L.Pooling(n.ganAconv7, global_pooling=True,
                                 pool=P.Pooling.AVE)
    n.Aip3 = L.InnerProduct(n.ganAconv7_pool, num_output=1,
                            weight_filler=dict(type='xavier'), name="last")
    n.Aloss = L.SigmoidCrossEntropyLoss(n.Aip3, n.Alabels)
    return n
def LowABGAN(w, batchsize, n):
    """Add the "AB" branch (texture conv + frozen discriminator) to ``n``.

    Layers added, in order:

    * a constant DummyData blob turned into labels by the ``DestroyLayer``
      Python layer,
    * ``level`` uniform-noise DummyData inputs, each run through
      ``convBlock``, then ``joinBlock`` and a final ``convBlock``,
    * a 1x1 ``ABtexture`` convolution (layer name ``Atextureparam``),
    * seven Convolution + ReLU(0.1) stages, global average pooling and an
      InnerProduct named ``last``, all with ``lr_mult`` and ``decay_mult``
      set to 0, followed by a SigmoidCrossEntropyLoss.

    Reads the module-level flag ``full_conv``.

    :param w: input size (an int); side length of the first noise blob when
        ``full_conv`` is false
    :param batchsize: batch size of all dummy blobs
    :param n: the ``caffe.NetSpec`` to extend
    :return: the same ``n``
    """
    def frozen():
        # Fresh multipliers per layer: weights and bias are not updated.
        return [dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)]

    n.ABnothing = L.DummyData(shape=[dict(dim=[batchsize, 1, 1, 1])],
                              data_filler=dict(type='constant'), ntop=1)
    n.ABlabels = L.Python(n.ABnothing,
                          python_param=dict(module='destroy',
                                            layer='DestroyLayer'))

    codings = [8, 16, 24, 32, 40]
    level = 2

    listofsizes = []
    if full_conv:
        listofsizes = [w]
        for i in range(0, level - 1):
            listofsizes.append((listofsizes[i] - 4) * 2)
        listofsizes[0] -= 4

    d = w
    outname = ""
    for i in range(0, level):
        blob_name = "ABZrand_" + str(i)
        side = listofsizes[level - 1 - i] + 4 if full_conv else d
        n[blob_name] = L.DummyData(
            shape=[dict(dim=[batchsize, 1, side, side])],
            data_filler=dict(type='uniform', min=0., max=255.),
            ntop=1)
        n, outname = convBlock("ABconvA" + str(i), codings[0], n, blob_name,
                               train=True)
        # Floor division keeps d an int under Python 3 as well; true
        # division would turn the blob dimensions into floats.
        d //= 2

    n, outname = joinBlock("ABjoinA", codings[0], n, outname,
                           'gelu' + 'ABconvA0' + '_3', train=True)
    n, outname = convBlock("ABconvB", codings[1], n, outname, train=True)

    convolution_param = dict(num_output=1, kernel_size=1, stride=1, pad=0,
                             weight_filler=dict(type='xavier'))
    n["ABtexture"] = L.Convolution(n[outname],
                                   convolution_param=convolution_param,
                                   name="Atextureparam")

    # GAN network: (num_output, kernel_size, stride, pad) per stage. The
    # ReLU deliberately re-binds the convolution's name.
    discriminator = [(16, 3, 1, 1), (16, 3, 1, 1), (32, 2, 2, 0),
                     (32, 3, 1, 1), (32, 3, 1, 1), (32, 2, 2, 0),
                     (32, 1, 1, 0)]
    bottom = n["ABtexture"]
    for index, (num_output, kernel_size, stride, pad) in enumerate(
            discriminator, 1):
        layer_name = "ganABconv" + str(index)
        convolution_param = dict(num_output=num_output,
                                 kernel_size=kernel_size, stride=stride,
                                 pad=pad, weight_filler=dict(type='xavier'))
        n[layer_name] = L.Convolution(bottom, param=frozen(),
                                      convolution_param=convolution_param)
        n[layer_name] = L.ReLU(n[layer_name], negative_slope=0.1)
        bottom = n[layer_name]

    n.ganABconv7_pool = L.Pooling(n.ganABconv7, global_pooling=True,
                                  pool=P.Pooling.AVE)
    n.ABip3 = L.InnerProduct(n.ganABconv7_pool, param=frozen(),
                             num_output=1,
                             weight_filler=dict(type='xavier'), name="last")
    n.ABloss = L.SigmoidCrossEntropyLoss(n.ABip3, n.ABlabels)
    return n
def _kernel_stride_kwargs(config, num_output):
    """Return Caffe kernel/stride kwargs for a Keras convolution config.

    Square kernels/strides use the scalar ``kernel_size``/``stride`` fields,
    otherwise the explicit ``*_h``/``*_w`` variants are emitted.
    """
    kernel_size = config['kernel_size']
    strides = config['strides']
    kwargs = {'num_output': num_output}
    if kernel_size[0] == kernel_size[1]:
        kwargs['kernel_size'] = kernel_size[0]
    else:
        kwargs['kernel_h'] = kernel_size[0]
        kwargs['kernel_w'] = kernel_size[1]
    if strides[0] == strides[1]:
        kwargs['stride'] = strides[0]
    else:
        kwargs['stride_h'] = strides[0]
        kwargs['stride_w'] = strides[1]
    return kwargs


def _append_activation(caffe_net, name, activation):
    """Attach the in-place layer ``name + 's'`` for a fused Keras activation."""
    if activation == 'relu':
        caffe_net[name + 's'] = L.ReLU(caffe_net[name], in_place=True)
    elif activation == 'sigmoid':
        caffe_net[name + 's'] = L.Sigmoid(caffe_net[name], in_place=True)
    elif activation != 'linear':
        raise Exception('Unsupported activation ' + activation)


def convert(keras_model, caffe_net_file, caffe_params_file):
    """Convert a Keras model to a Caffe prototxt and a caffemodel.

    Walks ``keras_model.layers`` in order, emits one (or more) Caffe layers
    per Keras layer into a ``caffe.NetSpec``, writes the resulting prototxt to
    ``caffe_net_file``, loads it back with ``caffe.Net`` and copies the
    collected Keras weights into it before saving to ``caffe_params_file``.

    :param keras_model: model exposing ``layers``; every layer must provide
        ``get_config()``, ``get_weights()``, ``input``/``output`` tensors.
    :param caffe_net_file: path the network definition is written to.
    :param caffe_params_file: path the weights are saved to.
    :raises Exception: for layers with several outputs, unsupported layer
        types, activations, pooling sizes/strides or upsampling factors.

    Shapes are read as channels-last (N, H, W, C): the input declaration below
    emits ``input_shape[3], input_shape[1], input_shape[2]`` as C, H, W.
    """
    caffe_net = caffe.NetSpec()
    net_params = dict()   # caffe layer name -> sequence of blobs to copy
    outputs = dict()      # keras tensor name -> caffe layer name producing it
    skip_params = set()   # layers whose (filler-initialised) blobs are kept
    input_str = ''

    for layer in keras_model.layers:
        name = layer.name
        layer_type = type(layer).__name__
        config = layer.get_config()
        blobs = layer.get_weights()

        if isinstance(layer.output, list):
            raise Exception('Layers with multiply outputs are not supported')
        top = layer.output.name

        if not isinstance(layer.input, list):
            bottom = layer.input.name

        # The first layer seen (or an explicit InputLayer) declares the input.
        # A placeholder layer is registered and later stripped from the
        # prototxt in favour of the textual `input:` header.
        if layer_type == 'InputLayer' or len(caffe_net.tops) == 0:
            input_name = 'data'
            caffe_net[input_name] = L.Layer()
            input_shape = config['batch_input_shape']
            input_str = 'input: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}'.format(
                '"' + input_name + '"', 1, input_shape[3], input_shape[1],
                input_shape[2])
            outputs[layer.input.name] = input_name
            if layer_type == 'InputLayer':
                continue

        if layer_type == 'Conv2D' or layer_type == 'Convolution2D':
            kwargs = _kernel_stride_kwargs(config, config['filters'])
            if not config['use_bias']:
                kwargs['bias_term'] = False
            set_padding(config, layer.input_shape, kwargs)
            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)
            # Keras kernel (kh, kw, in, out) -> Caffe (out, in, kh, kw).
            blobs[0] = np.array(blobs[0]).transpose(3, 2, 0, 1)
            net_params[name] = blobs
            _append_activation(caffe_net, name, config['activation'])

        elif layer_type == 'Conv2DTranspose':
            # Fixed: the original mixed `stride`/`strides` and referenced the
            # undefined names `config_caffe` and `pad`.
            strides = config['strides']
            kernel_size = config['kernel_size']
            channels = config['filters']
            # Index 0 of kernel_size/strides is the height axis, so pair it
            # with input_shape[1] (height) and index 1 with input_shape[2].
            h = layer.input_shape[1]
            w = layer.input_shape[2]
            out_w = math.ceil(w / float(strides[1]))
            pad_w = int((kernel_size[1] * out_w -
                         (kernel_size[1] - strides[1]) * (out_w - 1) - w) / 2)
            out_h = math.ceil(h / float(strides[0]))
            pad_h = int((kernel_size[0] * out_h -
                         (kernel_size[0] - strides[0]) * (out_h - 1) - h) / 2)

            # Plain lists are passed for the repeated kernel_size/stride
            # fields; Keras configs usually hold tuples here.
            deconv_param = dict(num_output=channels,
                                group=channels,
                                kernel_size=list(kernel_size),
                                stride=list(strides),
                                weight_filler=dict(type='bilinear'),
                                bias_term=bool(config['use_bias']))
            if pad_h == pad_w:
                if pad_w != 0:
                    deconv_param['pad'] = pad_w
            else:
                deconv_param['pad_h'] = pad_h
                deconv_param['pad_w'] = pad_w

            caffe_net[name] = L.Deconvolution(
                caffe_net[outputs[bottom]],
                convolution_param=deconv_param,
                param=dict(lr_mult=0, decay_mult=0))

            # NOTE(review): the layer is declared with group=channels while
            # the full Keras kernel is copied - confirm the transposed kernel
            # matches the Caffe blob shape for the models you convert.
            weights = np.array(blobs[0]).transpose(2, 3, 0, 1)
            if config['use_bias']:
                net_params[name] = (weights, np.array(blobs[1]))
            else:
                net_params[name] = (weights, )
            _append_activation(caffe_net, name, config['activation'])

        # Depthwise convolution: one group per input channel.
        elif layer_type == 'DepthwiseConv2D':
            kwargs = _kernel_stride_kwargs(config, layer.input_shape[3])
            set_padding(config, layer.input_shape, kwargs)
            kwargs['group'] = layer.input_shape[3]
            # NOTE(review): the bias is always dropped here, regardless of
            # config['use_bias'] - confirm this is intended.
            kwargs['bias_term'] = False
            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)
            net_params[name] = (np.array(blobs[0]).transpose(2, 3, 0, 1), )
            _append_activation(caffe_net, name, config['activation'])

        elif layer_type == 'SeparableConv2D':
            # Emitted as a depthwise convolution followed by a 1x1 pointwise
            # convolution named `name + '_'`.
            kwargs = _kernel_stride_kwargs(config, layer.input_shape[3])
            set_padding(config, layer.input_shape, kwargs)
            kwargs['group'] = layer.input_shape[3]
            kwargs['bias_term'] = False
            caffe_net[name] = L.Convolution(caffe_net[outputs[bottom]],
                                            **kwargs)
            net_params[name] = (np.array(blobs[0]).transpose(2, 3, 0, 1), )

            name2 = name + '_'
            kwargs = {
                'num_output': config['filters'],
                'kernel_size': 1,
                'bias_term': config['use_bias']
            }
            caffe_net[name2] = L.Convolution(caffe_net[name], **kwargs)
            pointwise = np.array(blobs[1]).transpose(3, 2, 0, 1)
            if config['use_bias']:
                net_params[name2] = (pointwise, np.array(blobs[2]))
            else:
                net_params[name2] = (pointwise, )
            # Downstream layers must consume the pointwise output.
            name = name2

        elif layer_type == 'BatchNormalization':
            # Keras stores [gamma?, beta?, moving_mean, moving_variance].
            variance = np.array(blobs[-1])
            mean = np.array(blobs[-2])

            if config['scale']:
                gamma = np.array(blobs[0])
                sparam = [dict(lr_mult=1), dict(lr_mult=1)]
            else:
                gamma = np.ones(mean.shape, dtype=np.float32)
                sparam = [dict(lr_mult=0), dict(lr_mult=1)]

            if config['center']:
                beta = np.array(blobs[-3])
            else:
                beta = np.zeros(mean.shape, dtype=np.float32)
                # Without a bias the Scale layer owns a single blob; passing
                # two `param` specs would exceed the layer's blob count.
                sparam = sparam[:1]

            # NOTE(review): Keras' `epsilon` is not forwarded to BatchNorm -
            # confirm the Caffe default is acceptable.
            caffe_net[name] = L.BatchNorm(caffe_net[outputs[bottom]],
                                          in_place=True)
            net_params[name] = (mean, variance, np.array(1.0))

            name_s = name + 's'
            caffe_net[name_s] = L.Scale(
                caffe_net[name],
                in_place=True,
                param=sparam,
                scale_param={'bias_term': config['center']})
            net_params[name_s] = (gamma, beta)

        elif layer_type == 'Dense':
            ip_kwargs = {
                'num_output': config['units'],
                'weight_filler': dict(type='xavier')
            }
            if not config['use_bias']:
                # Fixed: the bias-free case used to leave bias_term enabled
                # and stored an untransposed, non-sequence weight.
                ip_kwargs['bias_term'] = False
            caffe_net[name] = L.InnerProduct(caffe_net[outputs[bottom]],
                                             **ip_kwargs)

            # Keras (in, out) -> Caffe (out, in).
            weight = np.array(blobs[0]).transpose(1, 0)
            inbound = layer._inbound_nodes[0].inbound_layers[0]
            if type(inbound).__name__ == 'Flatten':
                # The flattened input was HWC in Keras but is CHW in Caffe:
                # reorder every output row accordingly.
                flatten_shape = inbound.input_shape
                for i in range(weight.shape[0]):
                    weight[i] = np.array(weight[i].reshape(
                        flatten_shape[1], flatten_shape[2],
                        flatten_shape[3]).transpose(2, 0, 1).reshape(
                            weight.shape[1]))
            if config['use_bias']:
                net_params[name] = (weight, np.array(blobs[1]))
            else:
                net_params[name] = (weight, )

            # Only softmax and relu are emitted; other activations are
            # silently left out (unchanged from the original behaviour).
            name_s = name + 's'
            if config['activation'] == 'softmax':
                caffe_net[name_s] = L.Softmax(caffe_net[name], in_place=True)
            elif config['activation'] == 'relu':
                caffe_net[name_s] = L.ReLU(caffe_net[name], in_place=True)

        elif layer_type == 'Activation':
            activation = config['activation']
            if activation == 'relu':
                # In-place is only used when no other layer reads the input.
                if len(layer.input.consumers()) > 1:
                    caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]])
                else:
                    caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]],
                                             in_place=True)
            elif activation == 'relu6':
                # TODO: emitted as a plain ReLU (no upper clamp).
                caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]])
            elif activation == 'softmax':
                caffe_net[name] = L.Softmax(caffe_net[outputs[bottom]],
                                            in_place=True)
            elif activation == 'sigmoid':
                caffe_net[name] = L.Sigmoid(caffe_net[outputs[bottom]],
                                            in_place=True)
            else:
                raise Exception('Unsupported activation ' + activation)

        elif layer_type == 'Cropping2D':
            # A DummyData blob with the target output shape serves as the
            # reference input of the Crop layer.
            shape = layer.output_shape
            ddata = L.DummyData(shape=dict(
                dim=[1, shape[3], shape[1], shape[2]]))
            # TODO (kept from the original): crop offsets are not set.
            caffe_net[name] = L.Crop(caffe_net[outputs[bottom]], ddata)

        elif layer_type == 'Concatenate' or layer_type == 'Merge':
            layers = [caffe_net[outputs[i.name]] for i in layer.input]
            caffe_net[name] = L.Concat(*layers, axis=1)

        elif layer_type == 'Add':
            layers = [caffe_net[outputs[i.name]] for i in layer.input]
            caffe_net[name] = L.Eltwise(*layers)

        elif layer_type == 'Flatten':
            caffe_net[name] = L.Flatten(caffe_net[outputs[bottom]])

        elif layer_type == 'Reshape':
            shape = config['target_shape']
            if len(shape) == 3:
                shape = (1, shape[2], shape[0], shape[1])
            elif len(shape) == 1:
                shape = (1, 1, 1, shape[0])
            elif len(shape) == 2:
                shape = (0, shape[1], -1, 0)
            caffe_net[name] = L.Reshape(
                caffe_net[outputs[bottom]],
                reshape_param={'shape': {
                    'dim': list(shape)
                }})

        elif layer_type == 'MaxPooling2D' or layer_type == 'AveragePooling2D':
            kwargs = {}
            if layer_type == 'MaxPooling2D':
                kwargs['pool'] = P.Pooling.MAX
            else:
                kwargs['pool'] = P.Pooling.AVE

            pool_size = config['pool_size']
            strides = config['strides']
            if pool_size[0] != pool_size[1]:
                raise Exception('Unsupported pool_size')
            if strides[0] != strides[1]:
                raise Exception('Unsupported strides')

            set_padding(config, layer.input_shape, kwargs)
            caffe_net[name] = L.Pooling(caffe_net[outputs[bottom]],
                                        kernel_size=pool_size[0],
                                        stride=strides[0],
                                        **kwargs)

        elif layer_type == 'Dropout':
            caffe_net[name] = L.Dropout(
                caffe_net[outputs[bottom]],
                dropout_param=dict(dropout_ratio=config['rate']))

        elif layer_type == 'GlobalAveragePooling2D':
            caffe_net[name] = L.Pooling(
                caffe_net[outputs[bottom]],
                pool=P.Pooling.AVE,
                pooling_param=dict(global_pooling=True))

        elif layer_type == 'UpSampling2D':
            if config['size'][0] != config['size'][1]:
                raise Exception('Unsupported upsampling factor')
            # Emitted as a frozen, bilinear-initialised grouped deconvolution.
            factor = config['size'][0]
            kernel_size = 2 * factor - factor % 2
            stride = factor
            pad = int(math.ceil((factor - 1) / 2.0))
            channels = layer.input_shape[-1]
            caffe_net[name] = L.Deconvolution(
                caffe_net[outputs[bottom]],
                convolution_param=dict(num_output=channels,
                                       group=channels,
                                       kernel_size=kernel_size,
                                       stride=stride,
                                       pad=pad,
                                       weight_filler=dict(type='bilinear'),
                                       bias_term=False),
                param=dict(lr_mult=0, decay_mult=0))
            # Its weights come from the filler, not from Keras.
            skip_params.add(name)

        elif layer_type == 'LeakyReLU':
            caffe_net[name] = L.ReLU(caffe_net[outputs[bottom]],
                                     negative_slope=config['alpha'],
                                     in_place=True)

        # Caffe has no ZeroPadding2D layer, so that op must be avoided:
        # fold the padding into the convolution / deconvolution / pooling
        # layers instead.
        else:
            raise Exception('Unsupported layer type: ' + layer_type)

        outputs[top] = name

    # Replace the placeholder input layer with the textual input declaration.
    net_proto = input_str + '\n' + 'layer {' + 'layer {'.join(
        str(caffe_net.to_proto()).split('layer {')[2:])

    with open(caffe_net_file, 'w') as f:
        f.write(net_proto)

    caffe_model = caffe.Net(caffe_net_file, caffe.TEST)

    for layer in caffe_model.params.keys():
        # Upsampling deconvolutions keep their bilinear filler weights; the
        # substring test is kept for layers using Keras' default names.
        if layer in skip_params or 'up_sampling2d' in layer:
            continue
        for n in range(len(caffe_model.params[layer])):
            caffe_model.params[layer][n].data[...] = net_params[layer][n]

    caffe_model.save(caffe_params_file)
def resnet_mask_rcnn_rpn(self, stage=1):
    """Assemble the ResNet backbone plus RPN network and return its proto.

    NOTE(review): this method was recovered from whitespace-collapsed source;
    the nesting below (in particular what sits under the ``else`` of the RPN
    call and under the second ``if not self.deploy``) is a reconstruction -
    confirm against the original file.

    :param stage: stage selector. ``stage == 1`` passes ``fixed=True`` to the
        backbone layer factories and silences the last residual stage;
        ``stage == 2`` passes ``fixed=False`` and adds silenced class/box and
        mask heads on top of the last residual stage.
    :return: ``self.net.to_proto()``.
    """
    channals = self.channals
    # Training nets also receive ground-truth boxes from the data layer.
    if not self.deploy:
        data, im_info, gt_boxes = self.data_layer_train()
    else:
        data, im_info = self.data_layer_test()
        gt_boxes = None
    # Forwarded as `fixed=` to conv1 and the backbone residual blocks.
    if stage == 1:
        pre_traned_fixed = True
    else:
        pre_traned_fixed = False
    conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3, bias_term=True, fixed=pre_traned_fixed)
    pool1 = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
    index = 1
    out = pool1
    # Pick the residual block builder according to self.module.
    if self.module == "normal":
        residual_block = self.residual_block
    else:
        residual_block = self.residual_block_basic
    # All stages except the last; blocks are named res<index><letter>.
    for i in self.stages[:-1]:
        index += 1
        for j in range(i):
            if j == 0:
                # First block of a stage gets an explicit stride:
                # 1 for res2, 2 for every later stage.
                if index == 2:
                    stride = 1
                else:
                    stride = 2
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride, fixed=pre_traned_fixed)
            else:
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, fixed=pre_traned_fixed)
        channals *= 2
    # self.rpn returns four values for training nets and two for deploy nets.
    if not self.deploy:
        rpn_cls_loss, rpn_loss_bbox, rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data)
    else:
        rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data)
        rois, scores = self.roi_proposals(rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)
    if not self.deploy:
        # The last residual stage is fed from a DummyData blob rather than
        # from `out`; its outputs only reach Silence layers below.
        self.net["dummy_roi_pool_conv5"] = L.DummyData(name = "dummy_roi_pool_conv5", shape=[dict(dim=[1,channals*2,14,14])])
        out = self.net["dummy_roi_pool_conv5"]
        index += 1
        for j in range(self.stages[-1]):
            if j == 0:
                stride = 1
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride)
            else:
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals)
        if stage==1:
            self.net["silence_res"] = L.Silence(out, ntop=0)
        if stage==2:
            # for bbox detection
            pool5 = self.ave_pool(7, 1, "pool5", out)
            cls_score, bbox_pred = self.final_cls_bbox(pool5)
            self.net["silence_cls_score"] = L.Silence(cls_score, ntop=0)
            self.net["silence_bbox_pred"] = L.Silence(bbox_pred, ntop=0)
            # for mask prediction
            mask_conv1 = self.conv_factory("mask_conv1", out, 3, 256, 1, 1, bias_term=True)
            mask_out = self.conv_factory("mask_out", mask_conv1, 1, self.classes, 1, 0, bias_term=True)
            self.net["silence_mask_out"] = L.Silence(mask_out, ntop=0)
    return self.net.to_proto()
def dummy_data_layer(self, shape, filler=1):
    """Return a single-top DummyData layer filled via ``self.constant_filler``.

    :param shape: list of dimensions for the generated blob.
    :param filler: value forwarded to ``self.constant_filler``.
    :return: the layer's only top.
    """
    blob_shape = {'dim': shape}
    fill_spec = self.constant_filler(filler)
    return L.DummyData(shape=[blob_shape], data_filler=[fill_spec], ntop=1)
def input_layer(layer_config):
    """Return a single-top DummyData layer shaped like the configured input.

    :param layer_config: mapping with an ``'input_shape'`` entry holding the
        blob dimensions.
    :return: the layer's only top.
    """
    dims = layer_config['input_shape']
    return L.DummyData(shape=[{'dim': dims}], ntop=1)
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFB co-attention VQA network and return it as a proto.

    :param mode: data split; ``'val'`` selects the ``vqa_data_layer_hdf5``
        Python data layer and a SoftmaxWithLoss head, anything else selects
        ``vqa_data_layer_kld_hdf5`` and a SoftmaxKLDLoss head.
    :param batchsize: forwarded to the data layer and used as the first
        dimension of the constant DummyData blobs fed to SoftAttention.
    :param T: accepted for signature compatibility; not used by this builder.
    :param question_vocab_size: ``input_dim`` of the word Embed layer.
    :param folder: forwarded to the data layer through ``param_str``.
    :return: ``NetSpec.to_proto()`` of the assembled network.

    All remaining sizes are read from the module-level ``config`` object.
    """
    n = caffe.NetSpec()
    xavier = dict(type='xavier')
    num_q_glimpse = config.NUM_QUESTION_GLIMPSE
    num_i_glimpse = config.NUM_IMG_GLIMPSE

    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize, 'folder': folder})
    data_module = 'vqa_data_layer_hdf5' if mode == 'val' else 'vqa_data_layer_kld_hdf5'
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module=data_module, layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=xavier)
    n.embed_tanh = L.TanH(n.embed)
    n.concat_embed = L.Concat(n.embed_tanh, n.glove,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=xavier))
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1, 2, 0]))
    n.lstm1_resh2 = L.Reshape(
        n.lstm1_resh, reshape_param=dict(shape=dict(dim=[0, 0, 0, 1])))

    # ---- Question Attention ----
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1,
                                 num_output=512, pad=0, weight_filler=xavier)
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1,
                                 num_output=num_q_glimpse, pad=0,
                                 weight_filler=xavier)
    n.qatt_reshape = L.Reshape(
        n.qatt_conv2,
        reshape_param=dict(shape=dict(
            dim=[-1, num_q_glimpse, config.MAX_WORDS_IN_QUESTION, 1])))  # N*NUM_QUESTION_GLIMPSE*15
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)

    qatt_maps = L.Slice(n.qatt_softmax, ntop=num_q_glimpse,
                        slice_param={'axis': 1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]),
                             data_filler=dict(type='constant', value=1),
                             ntop=1)
    qatt_feature_list = []
    for i in range(num_q_glimpse):
        # With a single glimpse the Slice result is used directly rather
        # than indexed.
        att_map = qatt_maps if num_q_glimpse == 1 else qatt_maps[i]
        feat = L.SoftAttention(n.lstm1_resh2, att_map, dummy_lstm)
        setattr(n, 'qatt_feat%d' % i, feat)
        qatt_feature_list.append(feat)
    n.qatt_feat_concat = L.Concat(*qatt_feature_list)

    # ---- Image Attention with MFB ----
    n.q_feat_resh = L.Reshape(
        n.qatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))
    n.i_feat_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(
            dim=[0, -1, config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))

    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh,
                                   num_output=config.JOINT_EMB_SIZE,
                                   weight_filler=xavier)
    n.iatt_q_resh = L.Reshape(
        n.iatt_q_proj,
        reshape_param=dict(shape=dict(dim=[-1, config.JOINT_EMB_SIZE, 1, 1])))
    # Tile the question projection over both spatial axes so it matches the
    # reshaped image projection below.
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH)
    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1,
                                  num_output=config.JOINT_EMB_SIZE, pad=0,
                                  weight_filler=xavier)
    n.iatt_i_resh1 = L.Reshape(
        n.iatt_i_conv,
        reshape_param=dict(shape=dict(
            dim=[-1, config.JOINT_EMB_SIZE,
                 config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    n.iatt_iq_eltwise = L.Eltwise(
        n.iatt_q_tile2, n.iatt_i_resh1,
        eltwise_param=dict(operation=P.Eltwise.PROD))
    n.iatt_iq_droped = L.Dropout(
        n.iatt_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    # NOTE(review): `iatt_iq_resh2` is assigned twice (here and after the
    # permute), so this first Reshape is no longer reachable under that
    # attribute - kept as in the original; confirm the reuse is intended.
    n.iatt_iq_resh2 = L.Reshape(
        n.iatt_iq_droped,
        reshape_param=dict(shape=dict(
            dim=[-1, config.JOINT_EMB_SIZE, config.IMG_FEAT_SIZE, 1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2,
                                   permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_resh2 = L.Reshape(
        n.iatt_iq_permute1,
        reshape_param=dict(shape=dict(
            dim=[-1, config.IMG_FEAT_SIZE,
                 config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.iatt_iq_sumpool = L.Pooling(
        n.iatt_iq_resh2, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool,
                                   permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)

    ## 2 conv layers 1000 -> 512 -> 2
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1,
                                 num_output=512, pad=0, weight_filler=xavier)
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1,
                                 num_output=num_i_glimpse, pad=0,
                                 weight_filler=xavier)
    n.iatt_resh = L.Reshape(
        n.iatt_conv2,
        reshape_param=dict(shape=dict(
            dim=[-1, num_i_glimpse, config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(
        n.iatt_softmax,
        reshape_param=dict(shape=dict(
            dim=[-1, num_i_glimpse,
                 config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))

    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=num_i_glimpse,
                        slice_param={'axis': 1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    iatt_feature_list = []
    for i in range(num_i_glimpse):
        att_map = iatt_maps if num_i_glimpse == 1 else iatt_maps[i]
        feat = L.SoftAttention(n.i_feat_resh, att_map, dummy)
        setattr(n, 'iatt_feat%d' % i, feat)
        feat_flat = L.Reshape(
            feat, reshape_param=dict(shape=dict(dim=[0, -1])))
        setattr(n, 'iatt_feat%d_resh' % i, feat_flat)
        iatt_feature_list.append(feat_flat)
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(
        n.iatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))

    # ---- Fine-grained Image-Question MFB fusion ----
    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=xavier)
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=xavier)
    n.mfb_iq_eltwise = L.Eltwise(
        n.mfb_q_proj, n.mfb_i_proj,
        eltwise_param=dict(operation=P.Eltwise.PROD))
    n.mfb_iq_drop = L.Dropout(
        n.mfb_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(
        n.mfb_iq_drop,
        reshape_param=dict(shape=dict(
            dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(
        n.mfb_iq_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(
        n.mfb_iq_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)

    n.prediction = L.InnerProduct(n.mfb_l2,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=xavier)
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-layer-LSTM / compact-bilinear attention VQA network.

    Used to generate the network without a hand-written prototxt.

    NOTE(review): a second ``qlstm`` with the same signature is defined later
    in this file and replaces this one at import time.

    :param mode: forwarded to the Python data layer through ``param_str``.
    :param batchsize: forwarded to the data layer and used as the first
        dimension of the constant DummyData blob fed to SoftAttention.
    :param T: number of tops the LSTM outputs are sliced into along axis 0;
        only the last slice is used, the others are silenced.
    :param question_vocab_size: ``input_dim`` of the word Embed layer.
    :return: ``NetSpec.to_proto()`` of the assembled network.
    """
    n = caffe.NetSpec()
    uniform = dict(type='uniform', min=-0.08, max=0.08)
    xavier = dict(type='xavier')

    def last_step(lstm_top, tag):
        """Slice `lstm_top` into T tops, silence all but the last, return it.

        The unused slices are still registered on the net (as
        ``slice_<tag><i>`` / ``silence_data_<tag><i>``) so they get names.
        """
        steps = L.Slice(lstm_top, ntop=T, slice_param={'axis': 0})
        if T == 1:
            # A single-top layer call returns the top itself, not a sequence.
            steps = (steps, )
        for i in range(T - 1):
            setattr(n, 'slice_' + tag + str(i), steps[i])
            setattr(n, 'silence_data_' + tag + str(i),
                    L.Silence(steps[i], ntop=0))
        return steps[T - 1]

    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    # Python layer (see
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe):
    # the layer is created from the named class, and the actual data-batch
    # loading happens inside that class.
    #   module    = name of the python file
    #   layer     = python class following the layer interface
    #   param_str = JSON parameters used for data loading (stored on the
    #               layer instance as self.param_str)
    #   ntop      = number of top blobs seen by setup/forward/backward
    # GloVe = Global Vectors for word representation
    # (https://www.aclweb.org/anthology/D14-1162); the pretrained GloVe
    # vectors are concatenated with the learned embedding below.
    # Original author's note: img_feature is an already-preprocessed feature
    # vector (ResNet output with L2 normalisation applied).
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    # Textual embedding maps tokens to numbers: the vocabulary is represented
    # compactly with 300-dimensional vectors.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size,
                         num_output=300, weight_filler=uniform)
    # Apply tanh.
    n.embed = L.TanH(n.embed_ba)
    # Concatenate with the GloVe data.
    n.concat_embed = L.Concat(n.embed, n.glove,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=uniform,
            bias_filler=dict(type='constant', value=0)))
    n.lstm1_out = last_step(n.lstm1, 'first')
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=uniform,
            bias_filler=dict(type='constant', value=0)))
    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec;
    # only the last LSTM output is used.
    n.lstm2_out = last_step(n.lstm2, 'second')
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})

    # lstm1 output => reshape to 1024, then dropout
    # lstm2 output => reshape to 1024, then dropout
    # then concatenate both.
    n.lstm_12 = L.Concat(n.lstm1_reshaped_droped, n.lstm2_reshaped_droped)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Tile does not broadcast automatically, so the question embedding is
    # tiled explicitly along both spatial axes to match the image feature.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Original author's note: unlike the figure in the paper, a Dropout is
    # added here.
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0, weight_filler=xavier)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Original author's note: unlike the paper figure, the output dim is 2.
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0, weight_filler=xavier)
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    # Softmax produces the attention maps: two 14x14 maps.
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(
        n.att_softmax, reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # Each attention-weighted feature is computed, then both are concatenated.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    # (the concatenated attention feature is reshaped to 4096 first)
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Original author's note: unlike the paper (2048 x 2048) the two input
    # vector sizes differ here (4096 x 2048).
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected classifier and loss.
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=xavier)
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-layer-LSTM / compact-bilinear attention VQA network.

    NOTE(review): an earlier ``qlstm`` with the same signature appears above
    in this file; this later definition is the one bound at import time.

    :param mode: forwarded to the Python data layer through ``param_str``.
    :param batchsize: forwarded to the data layer and used as the first
        dimension of the constant DummyData blob fed to SoftAttention.
    :param T: number of tops each LSTM output is sliced into along axis 0;
        only the last slice is used, the others are silenced.
    :param question_vocab_size: ``input_dim`` of the word Embed layer.
    :return: ``NetSpec.to_proto()`` of the assembled network.
    """
    n = caffe.NetSpec()

    def reshape(bottom, dims):
        """Reshape `bottom` to the given list of dimensions."""
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def dropout(bottom, ratio):
        """Apply Dropout with the given ratio."""
        return L.Dropout(bottom, dropout_param={'dropout_ratio': ratio})

    def lstm(bottom):
        """1024-unit LSTM driven by the sequence-continuation blob."""
        return L.LSTM(
            bottom, n.cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                bias_filler=dict(type='constant', value=0)))

    def slice_steps(bottom):
        """Slice along axis 0 into T tops, always returned as a sequence."""
        steps = L.Slice(bottom, ntop=T, slice_param={'axis': 0})
        # A single-top layer call returns the top itself, not a sequence.
        return (steps, ) if T == 1 else steps

    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    n.concat_embed = L.Concat(n.embed, n.glove,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1: keep the last step, silence the others.
    n.lstm1 = lstm(n.concat_embed)
    tops1 = slice_steps(n.lstm1)
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = reshape(n.lstm1_out, [-1, 1024])
    n.lstm1_reshaped_droped = dropout(n.lstm1_reshaped, 0.3)
    n.lstm1_droped = dropout(n.lstm1, 0.3)

    # LSTM2: stacked on the dropped-out full LSTM1 sequence.
    n.lstm2 = lstm(n.lstm1_droped)
    tops2 = slice_steps(n.lstm2)
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = reshape(n.lstm2_out, [-1, 1024])
    n.lstm2_reshaped_droped = dropout(n.lstm2_reshaped, 0.3)

    # Question representation: last steps of both LSTMs, concatenated.
    n.lstm_12 = L.Concat(n.lstm1_reshaped_droped, n.lstm2_reshaped_droped)

    # Tile the question vector over a 14x14 grid and fuse it with the image
    # feature through compact bilinear pooling.
    n.q_emb_tanh_droped_resh = reshape(n.lstm_12, [-1, 2048, 1, 1])
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = reshape(n.img_feature, [-1, 2048, 14, 14])
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = dropout(n.blcf_sign_sqrt_l2, 0.1)

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = reshape(n.att_conv2, [-1, 2, 14 * 14])
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = reshape(n.att_softmax, [-1, 2, 14, 14])
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = reshape(n.att_feature0, [-1, 2048])
    n.att_feature1_resh = reshape(n.att_feature1, [-1, 2048])
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = reshape(n.att_feature, [-1, 4096, 1, 1])
    n.lstm_12_resh = reshape(n.lstm_12, [-1, 2048, 1, 1])
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = dropout(n.bc_sign_sqrt_l2, 0.1)
    n.bc_dropped_resh = reshape(n.bc_dropped, [-1, 16000])

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    """Build the VQA + explanation-attention net and return it as a proto.

    :param mode: forwarded to the Python data layer inside its JSON param_str
    :param batchsize: forwarded to the data layer; also the first dim of the
        constant DummyData blobs fed to the SoftAttention layers
    :param T: number of tops the LSTM outputs are sliced into along axis 0;
        only the last slice is kept, the others are silenced
    :param exp_T: not referenced in this function
    :param question_vocab_size: input_dim of the question Embed layer
    :param exp_vocab_size: not referenced in this function
    :param use_gt: when true the answer embedding is looked up from
        ``label``; otherwise from an ArgMax over ``prediction``
    :return: result of ``NetSpec.to_proto()``

    Relies on the module-level names ``fixed_weights`` and
    ``fixed_weights_lstm`` for the ``param`` of the VQA-branch layers.
    """
    net = caffe.NetSpec()

    # --- small factories so every layer receives its own parameter dicts ---
    def uniform_filler():
        return dict(type='uniform', min=-0.08, max=0.08)

    def xavier_filler():
        return dict(type='xavier')

    def reshape_to(dims):
        return dict(shape=dict(dim=dims))

    def product():
        return {'operation': P.Eltwise.PROD}

    def drop(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def conv1x1(bottom, channels, **extra):
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=xavier_filler(), **extra)

    def lstm(bottom):
        return L.LSTM(
            bottom, net.cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=uniform_filler(),
                bias_filler=dict(type='constant', value=0)),
            param=fixed_weights_lstm)

    def ones_blob():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    def keep_last_step(sequence, tag):
        # Slice along axis 0 into T tops, register and silence the first
        # T-1 of them, and hand back the final one.
        steps = L.Slice(sequence, ntop=T, slice_param={'axis': 0})
        for idx in range(T - 1):
            setattr(net, 'slice_' + tag + str(idx), steps[idx])
            setattr(net, 'silence_data_' + tag + str(idx),
                    L.Silence(steps[idx], ntop=0))
        return steps[T - 1]

    # --- data layer: eight tops, the four exp* tops are not consumed here ---
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.data, net.cont, net.img_feature, net.label,
     net.exp, net.exp_out, net.exp_cont_1, net.exp_cont_2) = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=provider_args,
        ntop=8)

    net.embed_ba = L.Embed(net.data, input_dim=question_vocab_size,
                           num_output=300, weight_filler=uniform_filler(),
                           param=fixed_weights)
    net.embed = L.TanH(net.embed_ba)

    # LSTM1
    net.lstm1 = lstm(net.embed)
    net.lstm1_out = keep_last_step(net.lstm1, 'first')
    net.lstm1_reshaped = L.Reshape(net.lstm1_out,
                                   reshape_param=reshape_to([-1, 1024]))
    net.lstm1_reshaped_droped = drop(net.lstm1_reshaped)
    net.lstm1_droped = drop(net.lstm1)

    # LSTM2
    net.lstm2 = lstm(net.lstm1_droped)
    net.lstm2_out = keep_last_step(net.lstm2, 'second')
    net.lstm2_reshaped = L.Reshape(net.lstm2_out,
                                   reshape_param=reshape_to([-1, 1024]))
    net.lstm2_reshaped_droped = drop(net.lstm2_reshaped)
    net.lstm_12 = L.Concat(net.lstm1_reshaped_droped,
                           net.lstm2_reshaped_droped)

    # Tile question feature
    net.q_emb_resh = L.Reshape(net.lstm_12,
                               reshape_param=reshape_to([-1, 2048, 1, 1]))
    net.q_emb_tiled_1 = L.Tile(net.q_emb_resh, axis=2, tiles=14)
    net.q_emb_resh_tiled = L.Tile(net.q_emb_tiled_1, axis=3, tiles=14)

    # Embed image feature
    net.i_emb = conv1x1(net.img_feature, 2048, param=fixed_weights)

    # Eltwise product and normalization
    net.eltwise = L.Eltwise(net.q_emb_resh_tiled, net.i_emb,
                            eltwise_param=product())
    net.eltwise_sqrt = L.SignedSqrt(net.eltwise)
    net.eltwise_l2 = L.L2Normalize(net.eltwise_sqrt)
    net.eltwise_drop = drop(net.eltwise_l2)

    # Attention for VQA
    net.att_conv1 = conv1x1(net.eltwise_drop, 512, param=fixed_weights)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1, param=fixed_weights)
    net.att_reshaped = L.Reshape(net.att_conv2,
                                 reshape_param=reshape_to([-1, 1, 14 * 14]))
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = L.Reshape(net.att_softmax,
                            reshape_param=reshape_to([-1, 1, 14, 14]))
    vqa_ones = ones_blob()
    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, vqa_ones)
    net.att_feature_resh = L.Reshape(net.att_feature,
                                     reshape_param=reshape_to([-1, 2048]))

    # eltwise product + normalization again for VQA
    net.i_emb2 = L.InnerProduct(net.att_feature_resh, num_output=2048,
                                weight_filler=xavier_filler(),
                                param=fixed_weights)
    net.eltwise2 = L.Eltwise(net.lstm_12, net.i_emb2, eltwise_param=product())
    net.eltwise2_sqrt = L.SignedSqrt(net.eltwise2)
    net.eltwise2_l2 = L.L2Normalize(net.eltwise2_sqrt)
    net.eltwise2_drop = drop(net.eltwise2_l2)
    net.prediction = L.InnerProduct(net.eltwise2_drop, num_output=3000,
                                    weight_filler=xavier_filler(),
                                    param=fixed_weights)

    # Answer to embed: the ground-truth label, or the arg-max of the logits
    if use_gt:
        answer_ids = net.label
    else:
        net.vqa_ans = L.ArgMax(net.prediction, axis=1)
        answer_ids = net.vqa_ans
    net.exp_emb_ans = L.Embed(answer_ids, input_dim=3000, num_output=300,
                              weight_filler=uniform_filler())
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = L.InnerProduct(net.exp_emb_ans_tanh, num_output=2048,
                                      weight_filler=xavier_filler())

    # Merge VQA answer and visual+textual feature
    net.exp_emb_resh = L.Reshape(net.exp_emb_ans2,
                                 reshape_param=reshape_to([-1, 2048, 1, 1]))
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)
    # The un-normalized product (eltwise, not eltwise_drop) is re-embedded
    # before being combined with the tiled answer embedding.
    net.eltwise_emb = conv1x1(net.eltwise, 2048)
    net.exp_eltwise = L.Eltwise(net.eltwise_emb, net.exp_emb_tiled,
                                eltwise_param=product())
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = drop(net.exp_eltwise_l2)

    # Attention for Explanation
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = L.Reshape(
        net.exp_att_conv2, reshape_param=reshape_to([-1, 1, 14 * 14]))
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = L.Reshape(net.exp_att_softmax,
                                reshape_param=reshape_to([-1, 1, 14, 14]))
    exp_ones = ones_blob()
    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map, exp_ones)
    net.exp_att_feature_resh = L.Reshape(
        net.exp_att_feature_prev, reshape_param=reshape_to([-1, 2048]))
    net.exp_att_feature_embed = L.InnerProduct(
        net.exp_att_feature_resh, num_output=2048,
        weight_filler=xavier_filler())
    net.exp_lstm12_embed = L.InnerProduct(net.lstm_12, num_output=2048,
                                          weight_filler=xavier_filler())
    net.exp_eltwise2 = L.Eltwise(net.exp_lstm12_embed,
                                 net.exp_att_feature_embed,
                                 eltwise_param=product())
    net.exp_att_feature = L.Eltwise(net.exp_emb_ans2, net.exp_eltwise2,
                                    eltwise_param=product())
    net.silence_exp_att = L.Silence(net.exp_att_feature, ntop=0)

    return net.to_proto()
def silent_net():
    """Return the proto of a net with one two-top DummyData layer whose
    tops are each consumed by their own Silence layer."""
    spec = caffe.NetSpec()
    spec.data, spec.data2 = L.DummyData(shape=dict(dim=3), ntop=2)
    # Register one Silence layer per dummy top, in the same order as the tops.
    for top_name in ('data', 'data2'):
        setattr(spec, 'silence_' + top_name,
                L.Silence(getattr(spec, top_name), ntop=0))
    return spec.to_proto()
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    """Build the char-CNN + word-CNN question encoder with two-glimpse
    attention and compact bilinear pooling, and return it as a proto.

    The per-word character branch (conv -> max-pool -> reshape) is built in
    loops over ``T``. All convolutions in that branch share their weights
    through the parameter names ``conv_c_w`` / ``conv_c_b``.

    :param mode: forwarded to the Python data layer inside its JSON param_str
    :param batchsize: forwarded to the data layer; also used in the explicit
        Reshape dims and in the constant DummyData blob
    :param T: number of slices the char embedding is cut into along axis 2,
        and the basis for the word-level pooling kernel heights
    :param T_c: kernel height and stride of the char-level max pooling; the
        char embedding is reshaped to ``T_c * T`` rows before slicing
    :param question_c_vocab_size: input_dim of the character Embed layer
    :param question_vocab_size: input_dim of the word Embed layer
    :return: result of ``NetSpec.to_proto()``

    NOTE(review): assumes T >= 2. With ``ntop=1`` pycaffe appears to return
    a bare top instead of an indexable tuple -- confirm before using T == 1.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=6)

    # char embedding
    n.embed_c = L.Embed(n.data1, input_dim=question_c_vocab_size,
                        num_output=15,
                        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c, n.cont1, scale_param=dict(axis=0))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T_c * T, -1])))
    # N x 1 x T_c x d_c per slice
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})

    # 1-based word positions; layer names are suffixed with these numbers.
    word_steps = range(1, T + 1)
    for step in word_steps:
        setattr(n, 'slice_' + str(step), tops[step - 1])

    # The three stages below are registered in separate passes (all convs,
    # then all pools, then all reshapes) so the layer order in the emitted
    # proto does not interleave the stages.

    # char conv (weights shared across word positions by parameter name)
    char_features = []
    for step in word_steps:
        feature = L.Convolution(
            tops[step - 1],
            convolution_param={
                'kernel_h': 3,
                'kernel_w': 15,
                'stride': 1,
                'num_output': 150,
                'pad_h': 1,
                'pad_w': 0,
                'weight_filler': dict(type='xavier'),
            },
            param=[dict(name="conv_c_w"), dict(name="conv_c_b")])
        setattr(n, 'c_feature_' + str(step), feature)
        char_features.append(feature)

    # max-pool over the T_c character rows of each word
    char_vecs = []
    for step, feature in zip(word_steps, char_features):
        pooled = L.Pooling(feature, kernel_h=T_c, kernel_w=1, stride=T_c,
                           pool=P.Pooling.MAX)
        setattr(n, 'c_vec_' + str(step), pooled)
        char_vecs.append(pooled)

    # flatten every pooled vector to batchsize x 1 x 150
    concat_c_embed = []
    for step, pooled in zip(word_steps, char_vecs):
        flat = L.Reshape(
            pooled,
            reshape_param=dict(shape=dict(dim=[batchsize, 1, 150])))
        setattr(n, 'c_embed_' + str(step), flat)
        concat_c_embed.append(flat)

    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(n.data, input_dim=question_vocab_size,
                        num_output=150,
                        weight_filler=dict(type='uniform', min=-0.08, max=0.08))  # N x T x d_w

    # combine word and char embedding
    concat_word_embed = [n.embed_w, n.concat_char_embed]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # N x T x (d_c+d_w)
    n.embed_scale = L.Scale(n.concat_embed, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # convolution; kernel_w=300 spans the 150 word + 150 char channels
    n.word_feature_2 = L.Convolution(
        n.embed_scale_resh, kernel_h=2, kernel_w=300, stride=1,
        num_output=512, pad_h=1, pad_w=0,
        weight_filler=dict(type='xavier'))  # N x C x ? x 1
    n.word_feature_3 = L.Convolution(
        n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1,
        num_output=512, pad_h=2, pad_w=0,
        weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(
        n.embed_scale_resh, kernel_h=4, kernel_w=300, stride=1,
        num_output=512, pad_h=3, pad_w=0,
        weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(
        n.embed_scale_resh, kernel_h=5, kernel_w=300, stride=1,
        num_output=512, pad_h=4, pad_w=0,
        weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    # With pad_h = kernel_h - 1 the conv output height is T + kernel_h - 1,
    # which is exactly the pooling kernel height used for each branch.
    n.word_vec_2 = L.Pooling(n.word_relu_2, kernel_h=T + 1, kernel_w=1,
                             stride=T + 1, pool=P.Pooling.MAX)  # N x C x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T + 2, kernel_w=1,
                             stride=T + 2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4, kernel_h=T + 3, kernel_w=1,
                             stride=T + 3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T + 4, kernel_w=1,
                             stride=T + 4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    # tile the question vector over the 14 x 14 image grid
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2,
        reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question vector with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature,
        reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.concat_vec_dropped,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()