def net():
    """Build an Input -> Embed network spec and return its proto.

    Reads the module-level names ``data_shape``, ``_input_dim`` and
    ``_num_output``.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the two layers.
    """
    spec = caffe.NetSpec()

    # Input blob whose shape comes from the module-level ``data_shape``.
    spec.data = L.Input(input_param={'shape': {'dim': data_shape}})

    embed_kwargs = {
        'param': [{'lr_mult': 1}],
        'input_dim': _input_dim,
        'num_output': _num_output,
        'weight_filler': {'type': "xavier", 'std': 0},
        'bias_filler': {'type': 'constant', 'value': 0.03},
    }
    spec.dataout = L.Embed(spec.data, **embed_kwargs)

    return spec.to_proto()
def exp_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the explanation-generation network spec.

    The net embeds the explanation tokens, runs them through a 2048-unit
    LSTM, multiplies the result element-wise with the (tiled) attended
    feature, and feeds that through a 1024-unit LSTM and a per-step
    InnerProduct over the explanation vocabulary.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: forwarded to the Python data layer inside ``param_str``.
        T: unused in this function.
        exp_T: number of copies of the attended feature tiled along axis 0.
        question_vocab_size: unused in this function.
        exp_vocab_size: ``input_dim`` of the Embed layer and ``num_output``
            of the final InnerProduct.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    net = caffe.NetSpec()

    def uniform_filler():
        return {'type': 'uniform', 'min': -0.08, 'max': 0.08}

    def dropout(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def lstm(bottom, cont, width):
        return L.LSTM(
            bottom, cont,
            recurrent_param={
                'num_output': width,
                'weight_filler': uniform_filler(),
                'bias_filler': {'type': 'constant', 'value': 0},
            })

    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    provider_tops = L.Python(module='exp_data_provider_layer',
                             layer='ExpDataProviderLayer',
                             param_str=provider_args,
                             ntop=5)
    (net.exp_att_feature,
     net.exp,
     net.exp_out,
     net.exp_cont_1,
     net.exp_cont_2) = provider_tops

    # Token embedding followed by tanh.
    net.exp_embed_ba = L.Embed(net.exp,
                               input_dim=exp_vocab_size,
                               num_output=300,
                               weight_filler=uniform_filler())
    net.exp_embed = L.TanH(net.exp_embed_ba)

    # First explanation LSTM.
    net.exp_lstm1 = lstm(net.exp_embed, net.exp_cont_1, 2048)
    net.exp_lstm1_dropped = dropout(net.exp_lstm1)

    # Merge the attended feature with the first LSTM's output: reshape to
    # [1, -1, 2048], tile exp_T times along axis 0, then multiply.
    net.exp_att_resh = L.Reshape(
        net.exp_att_feature,
        reshape_param={'shape': {'dim': [1, -1, 2048]}})
    net.exp_att_tiled = L.Tile(net.exp_att_resh, axis=0, tiles=exp_T)
    net.exp_eltwise_all = L.Eltwise(
        net.exp_lstm1_dropped, net.exp_att_tiled,
        eltwise_param={'operation': P.Eltwise.PROD})
    net.exp_eltwise_all_sqrt = L.SignedSqrt(net.exp_eltwise_all)
    net.exp_eltwise_all_l2 = L.L2Normalize(net.exp_eltwise_all_sqrt)
    net.exp_eltwise_all_drop = dropout(net.exp_eltwise_all_l2)

    # Second explanation LSTM.
    net.exp_lstm2 = lstm(net.exp_eltwise_all_drop, net.exp_cont_2, 1024)
    net.exp_lstm2_dropped = dropout(net.exp_lstm2)

    # Vocabulary scores; the prediction top is silenced rather than fed
    # to a loss in this spec.
    net.exp_prediction = L.InnerProduct(net.exp_lstm2_dropped,
                                        num_output=exp_vocab_size,
                                        weight_filler={'type': 'xavier'},
                                        axis=2)
    net.silence_exp_prediction = L.Silence(net.exp_prediction, ntop=0)

    return net.to_proto()
def generate_scores(split, config):
    """Build the language + image + spatial scoring network spec.

    The sentence is embedded and run through an LSTM; the last sliced LSTM
    top is L2-normalized, concatenated with the L2-normalized image feature
    and the spatial feature, and classified by a small MLP trained with a
    sigmoid cross-entropy loss.

    Args:
        split: passed to the Python data layer inside ``param_str``.
        config: object providing ``N``, ``data_provider``, ``vocab_size``,
            ``embed_dim``, ``lstm_dim``, ``T``, ``D_im``, ``D_text``,
            ``mlp_hidden_dims`` and ``mlp_dropout``.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    spec = caffe.NetSpec()

    # The data layer receives the str() of a dict (not JSON).
    provider_cfg = {'split': split, 'batch_size': config.N}
    provider_tops = L.Python(module=config.data_provider,
                             layer='TossLayer',
                             param_str=str(provider_cfg),
                             ntop=5)
    (spec.language,
     spec.cont,
     spec.img_feature,
     spec.spatial,
     spec.label) = provider_tops

    # Word embedding.
    spec.embed = L.Embed(
        spec.language,
        input_dim=config.vocab_size,
        num_output=config.embed_dim,
        weight_filler={'type': 'uniform', 'min': -0.08, 'max': 0.08})

    # LSTM over the embedded sequence.
    spec.lstm = L.LSTM(
        spec.embed, spec.cont,
        recurrent_param={
            'num_output': config.lstm_dim,
            'weight_filler': {'type': 'uniform', 'min': -0.08, 'max': 0.08},
            'bias_filler': {'type': 'constant', 'value': 0},
        })

    # Slice along axis 0 into config.T tops; name and silence all but the
    # last one, which is kept as the sentence feature.
    steps = L.Slice(spec.lstm, ntop=config.T, slice_param={'axis': 0})
    for idx in range(config.T - 1):
        setattr(spec, 'slice%d' % idx, steps[idx])
        setattr(spec, 'silence%d' % idx, L.Silence(steps[idx], ntop=0))
    spec.lstm_out = steps[-1]
    spec.lstm_feat = L.Reshape(
        spec.lstm_out,
        reshape_param={'shape': {'dim': [-1, config.lstm_dim]}})

    # L2-normalize image and language features.
    spec.img_l2norm = L.L2Normalize(spec.img_feature)
    spec.lstm_l2norm = L.L2Normalize(spec.lstm_feat)
    spec.img_l2norm_resh = L.Reshape(
        spec.img_l2norm,
        reshape_param={'shape': {'dim': [-1, config.D_im]}})
    spec.lstm_l2norm_resh = L.Reshape(
        spec.lstm_l2norm,
        reshape_param={'shape': {'dim': [-1, config.D_text]}})

    # Concatenate language, image and spatial features along axis 1.
    spec.feat_all = L.Concat(spec.lstm_l2norm_resh,
                             spec.img_l2norm_resh,
                             spec.spatial,
                             concat_param={'axis': 1})

    # MLP classifier over the concatenated feature.
    spec.mlp_l1, spec.mlp_relu1 = fc_relu(spec.feat_all,
                                          config.mlp_hidden_dims)
    classifier_input = spec.mlp_relu1
    if config.mlp_dropout:
        spec.mlp_drop1 = L.Dropout(spec.mlp_relu1,
                                   dropout_ratio=0.5,
                                   in_place=True)
        classifier_input = spec.mlp_drop1
    spec.scores = fc(classifier_input, 1)

    # Loss layer.
    spec.loss = L.SigmoidCrossEntropyLoss(spec.scores, spec.label)

    return spec.to_proto()
def embed(self, bottom, nout, input_dim=8801, weight_filler=None, bias_filler=None, bias_term=True, axis=1, learning_param=None, propagate_down=None):
    """Create an ``L.Embed`` layer on top of ``bottom``.

    Args:
        bottom: input top for the layer.
        nout: forwarded as ``num_output``.
        input_dim: forwarded as ``input_dim``.
        weight_filler: forwarded unchanged.
        bias_filler: forwarded unchanged.
        bias_term: forwarded unchanged.
        axis: accepted for signature compatibility; not used here.
        learning_param: forwarded as ``param``.
        propagate_down: forwarded unchanged.

    Returns:
        The value returned by ``L.Embed``.
    """
    layer_kwargs = {
        'input_dim': input_dim,
        'num_output': nout,
        'weight_filler': weight_filler,
        'bias_filler': bias_filler,
        'bias_term': bias_term,
        'param': learning_param,
        'propagate_down': propagate_down,
    }
    return L.Embed(bottom, **layer_kwargs)
def embed(self, bottom, nout, input_dim=8801, bias_term=True, axis=1, propagate_down=False, weight_filler=None, bias_filler=None, learning_param=None):
    """Create an ``L.Embed`` layer, letting ``init_fillers`` finish the kwargs.

    Args:
        bottom: input top for the layer.
        nout: forwarded as ``num_output``.
        input_dim: forwarded as ``input_dim``.
        bias_term: forwarded as ``bias_term``.
        axis: accepted for signature compatibility; not used here.
        propagate_down: forwarded only when it is not a ``bool`` (so the
            default ``False`` means "do not pass it at all").
        weight_filler: handed to ``self.init_fillers``.
        bias_filler: handed to ``self.init_fillers``.
        learning_param: handed to ``self.init_fillers``.

    Returns:
        The value returned by ``L.Embed``.
    """
    layer_kwargs = dict(num_output=nout,
                        input_dim=input_dim,
                        bias_term=bias_term)

    # A plain True/False is treated as "not specified".
    explicitly_given = not isinstance(propagate_down, bool)
    if explicitly_given:
        layer_kwargs['propagate_down'] = propagate_down

    layer_kwargs = self.init_fillers(layer_kwargs,
                                     weight_filler,
                                     bias_filler,
                                     learning_param)
    return L.Embed(bottom, **layer_kwargs)
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    """Build the VQA + explanation-attention network spec.

    The first half (question embedding, two stacked LSTMs, image embedding,
    attention and answer prediction) passes the module-level
    ``fixed_weights`` / ``fixed_weights_lstm`` as ``param``. The second
    half embeds an answer (ground truth or arg-max prediction), merges it
    with the question/image product, computes a second attention map and
    produces ``exp_att_feature``, which is silenced at the end.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: forwarded to the data layer; also the first dimension
            of the DummyData blobs passed to SoftAttention.
        T: number of tops each question LSTM output is sliced into.
        exp_T: unused in this function.
        question_vocab_size: ``input_dim`` of the question Embed layer.
        exp_vocab_size: unused in this function.
        use_gt: when true embed ``label``; otherwise embed the arg-max of
            ``prediction``.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    net = caffe.NetSpec()

    # ---- local layer factories --------------------------------------
    def uniform():
        return {'type': 'uniform', 'min': -0.08, 'max': 0.08}

    def xavier():
        return {'type': 'xavier'}

    def reshape(bottom, dims):
        return L.Reshape(bottom, reshape_param={'shape': {'dim': dims}})

    def dropout(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def product(first, second):
        return L.Eltwise(first, second,
                         eltwise_param={'operation': P.Eltwise.PROD})

    def conv1x1(bottom, channels, **extra):
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=xavier(), **extra)

    def question_lstm(bottom, cont):
        return L.LSTM(
            bottom, cont,
            recurrent_param={
                'num_output': 1024,
                'weight_filler': uniform(),
                'bias_filler': {'type': 'constant', 'value': 0},
            },
            param=fixed_weights_lstm)

    def ones():
        return L.DummyData(shape={'dim': [batchsize, 1]},
                           data_filler={'type': 'constant', 'value': 1},
                           ntop=1)

    def keep_last_step(sequence, tag):
        # Slice along axis 0 into T tops; name and silence the first T-1
        # and hand back the last one.
        steps = L.Slice(sequence, ntop=T, slice_param={'axis': 0})
        for idx in range(T - 1):
            setattr(net, 'slice_%s%d' % (tag, idx), steps[idx])
            setattr(net, 'silence_data_%s%d' % (tag, idx),
                    L.Silence(steps[idx], ntop=0))
        return steps[T - 1]

    # ---- data -------------------------------------------------------
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    provider_tops = L.Python(module='vqa_data_provider_layer',
                             layer='VQADataProviderLayer',
                             param_str=provider_args,
                             ntop=8)
    (net.data, net.cont, net.img_feature, net.label,
     net.exp, net.exp_out, net.exp_cont_1, net.exp_cont_2) = provider_tops

    # ---- question encoder -------------------------------------------
    net.embed_ba = L.Embed(net.data,
                           input_dim=question_vocab_size,
                           num_output=300,
                           weight_filler=uniform(),
                           param=fixed_weights)
    net.embed = L.TanH(net.embed_ba)

    # LSTM1
    net.lstm1 = question_lstm(net.embed, net.cont)
    net.lstm1_out = keep_last_step(net.lstm1, 'first')
    net.lstm1_reshaped = reshape(net.lstm1_out, [-1, 1024])
    net.lstm1_reshaped_droped = dropout(net.lstm1_reshaped)
    net.lstm1_droped = dropout(net.lstm1)

    # LSTM2
    net.lstm2 = question_lstm(net.lstm1_droped, net.cont)
    net.lstm2_out = keep_last_step(net.lstm2, 'second')
    net.lstm2_reshaped = reshape(net.lstm2_out, [-1, 1024])
    net.lstm2_reshaped_droped = dropout(net.lstm2_reshaped)

    net.lstm_12 = L.Concat(net.lstm1_reshaped_droped,
                           net.lstm2_reshaped_droped)

    # Tile the question feature 14x along axes 2 and 3.
    net.q_emb_resh = reshape(net.lstm_12, [-1, 2048, 1, 1])
    net.q_emb_tiled_1 = L.Tile(net.q_emb_resh, axis=2, tiles=14)
    net.q_emb_resh_tiled = L.Tile(net.q_emb_tiled_1, axis=3, tiles=14)

    # Embed the image feature.
    net.i_emb = conv1x1(net.img_feature, 2048, param=fixed_weights)

    # Element-wise product and normalization.
    net.eltwise = product(net.q_emb_resh_tiled, net.i_emb)
    net.eltwise_sqrt = L.SignedSqrt(net.eltwise)
    net.eltwise_l2 = L.L2Normalize(net.eltwise_sqrt)
    net.eltwise_drop = dropout(net.eltwise_l2)

    # ---- attention for VQA ------------------------------------------
    net.att_conv1 = conv1x1(net.eltwise_drop, 512, param=fixed_weights)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1, param=fixed_weights)
    net.att_reshaped = reshape(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape(net.att_softmax, [-1, 1, 14, 14])
    vqa_ones = ones()
    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, vqa_ones)
    net.att_feature_resh = reshape(net.att_feature, [-1, 2048])

    # Element-wise product + normalization again for VQA.
    net.i_emb2 = L.InnerProduct(net.att_feature_resh,
                                num_output=2048,
                                weight_filler=xavier(),
                                param=fixed_weights)
    net.eltwise2 = product(net.lstm_12, net.i_emb2)
    net.eltwise2_sqrt = L.SignedSqrt(net.eltwise2)
    net.eltwise2_l2 = L.L2Normalize(net.eltwise2_sqrt)
    net.eltwise2_drop = dropout(net.eltwise2_l2)
    net.prediction = L.InnerProduct(net.eltwise2_drop,
                                    num_output=3000,
                                    weight_filler=xavier(),
                                    param=fixed_weights)

    # ---- answer embedding -------------------------------------------
    # Embed either the ground-truth answer or the arg-max of the VQA
    # prediction.
    if use_gt:
        answer_ids = net.label
    else:
        net.vqa_ans = L.ArgMax(net.prediction, axis=1)
        answer_ids = net.vqa_ans
    net.exp_emb_ans = L.Embed(answer_ids,
                              input_dim=3000,
                              num_output=300,
                              weight_filler=uniform())
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = L.InnerProduct(net.exp_emb_ans_tanh,
                                      num_output=2048,
                                      weight_filler=xavier())

    # Merge the VQA answer with the visual+textual feature. Note that the
    # 1x1 convolution reads net.eltwise (pre-normalization), not
    # net.eltwise_drop.
    net.exp_emb_resh = reshape(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)
    net.eltwise_emb = conv1x1(net.eltwise, 2048)
    net.exp_eltwise = product(net.eltwise_emb, net.exp_emb_tiled)
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = dropout(net.exp_eltwise_l2)

    # ---- attention for explanation ----------------------------------
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape(net.exp_att_softmax, [-1, 1, 14, 14])
    exp_ones = ones()
    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map,
                                               exp_ones)
    net.exp_att_feature_resh = reshape(net.exp_att_feature_prev, [-1, 2048])
    net.exp_att_feature_embed = L.InnerProduct(net.exp_att_feature_resh,
                                               num_output=2048,
                                               weight_filler=xavier())
    net.exp_lstm12_embed = L.InnerProduct(net.lstm_12,
                                          num_output=2048,
                                          weight_filler=xavier())
    net.exp_eltwise2 = product(net.exp_lstm12_embed,
                               net.exp_att_feature_embed)
    net.exp_att_feature = product(net.exp_emb_ans2, net.exp_eltwise2)
    net.silence_exp_att = L.Silence(net.exp_att_feature, ntop=0)

    return net.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the GloVe + two-LSTM + compact-bilinear attention VQA net.

    Question tokens are embedded, concatenated with GloVe vectors coming
    from the data layer, and encoded with two stacked 1024-unit LSTMs. The
    encoding is fused with the 2048x14x14 image feature through compact
    bilinear pooling to produce two attention maps; the attended features
    are fused with the question encoding again and classified into 3000
    answers with a softmax loss.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: forwarded to the data layer; also the first dimension
            of the DummyData blob passed to SoftAttention.
        T: number of tops each LSTM output is sliced into along axis 0.
        question_vocab_size: ``input_dim`` of the question Embed layer.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    # NetSpec is used to generate the network without a hand-written
    # prototxt.
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Python data layer
    # (https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe).
    # Original author's notes, translated:
    #   * The layer is created from the named class; the actual data-batch
    #     loading happens inside that class, which fills the returned tops.
    #   * GloVe = Global Vectors for Word Representation
    #     (https://www.aclweb.org/anthology/D14-1162); pretrained GloVe
    #     vectors are used in the concat below.
    #   * img_feature is an already preprocessed feature vector (passed
    #     through "Resnet512" and L2-normalized).
    #   * module    = Python file name
    #   * layer     = Python class that follows the layer interface
    #   * param_str = JSON parameters used when loading data; available
    #                 inside the class as self.param_str
    #   * ntop      = number of entries in the `top` argument of
    #                 setup/forward/backward
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    # Textual embedding (text -> numbers): each token id is mapped to a
    # compact 300-dimensional vector.
    n.embed_ba = L.Embed(
        n.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # Apply tanh.
    n.embed = L.TanH(n.embed_ba)

    # Concatenate with the GloVe data.
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` instead of the Python-2-only `xrange`, so the builder runs
    # on both Python 2 and Python 3.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # Give each tops2[i] the name `slice_second<i>` on the NetSpec.
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    # Use the last LSTM output.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})

    # lstm1 output -> reshape to 1024 -> dropout
    # lstm2 output -> reshape to 1024 -> dropout
    # then concat.
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Dimensions are not matched automatically, so L.Tile is used to
    # expand the 1x1 question feature to 14x14 (original author's note).
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(
        n.q_emb_tanh_droped_resh, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(
        n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Unlike the figure in the paper, a dropout is added here (original
    # author's note).
    n.blcf_droped = L.Dropout(
        n.blcf_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})

    # Multi-channel attention.
    n.att_conv1 = L.Convolution(
        n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0,
        weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output dimension is 2 (original
    # author's note).
    n.att_conv2 = L.Convolution(
        n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0,
        weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2,
        reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    # Softmax over the 14*14 positions yields two attention maps.
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(
        n.att_softmax,
        reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(
        shape=dict(dim=[batchsize, 1]),
        data_filler=dict(type='constant', value=1),
        ntop=1)
    n.att_feature0 = L.SoftAttention(
        n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(
        n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # Concatenate the two attention-weighted features.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # Merge attention and LSTM with compact bilinear pooling.
    # The concatenated attention feature is reshaped to 4096 channels.
    n.att_feature_resh = L.Reshape(
        n.att_feature,
        reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Unlike the paper (2048 x 2048), the two inputs here have different
    # sizes: 4096 and 2048 (original author's note).
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(
        n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped,
        reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected answer classifier and loss.
    n.prediction = L.InnerProduct(
        n.bc_dropped_resh, num_output=3000,
        weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFH baseline VQA network spec.

    The question is embedded and encoded by two stacked LSTMs whose last
    sliced outputs are concatenated into ``q_feat``. Two cascaded
    projection/element-wise/sum-pool stages ("o2" and "o3") fuse ``q_feat``
    with the image feature; their normalized outputs are concatenated and
    classified.

    Reads the module-level ``config`` for ``LSTM_UNIT_NUM``,
    ``MAX_WORDS_IN_QUESTION``, ``LSTM_DROPOUT_RATIO``, ``JOINT_EMB_SIZE``,
    ``MFB_DROPOUT_RATIO``, ``MFB_OUT_DIM``, ``MFB_FACTOR_NUM`` and
    ``NUM_OUTPUT_UNITS``.

    Args:
        mode: ``'val'`` selects the ``vqa_data_layer`` module and
            ``SoftmaxWithLoss``; any other value selects
            ``vqa_data_layer_kld`` and ``SoftmaxKLDLoss``.
        batchsize: forwarded to the data layer inside ``param_str``.
        T: unused in this function (``config.MAX_WORDS_IN_QUESTION`` is
            used for slicing instead).
        question_vocab_size: ``input_dim`` of the Embed layer.
        folder: forwarded to the data layer inside ``param_str``.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize, 'folder': folder})

    is_val = mode == 'val'
    provider_module = 'vqa_data_layer' if is_val else 'vqa_data_layer_kld'
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module=provider_module,
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=4)

    n.embed = L.Embed(
        n.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler=dict(type='xavier'))
    # NOTE(review): embed_tanh is created but nothing below consumes it;
    # LSTM1 reads n.embed directly. Confirm whether LSTM1 was meant to
    # take the tanh output.
    n.embed_tanh = L.TanH(n.embed)

    num_steps = config.MAX_WORDS_IN_QUESTION

    # LSTM1
    n.lstm1 = L.LSTM(
        n.embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    tops1 = L.Slice(n.lstm1, ntop=num_steps, slice_param={'axis': 0})
    # `range` instead of the Python-2-only `xrange`, so the builder runs
    # on both Python 2 and Python 3.
    for i in range(num_steps - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[num_steps - 1]
    # NOTE(review): the reshape width is hard-coded to 1024 while the LSTM
    # width is config.LSTM_UNIT_NUM — presumably these are equal; confirm.
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped,
        dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    n.lstm1_droped = L.Dropout(
        n.lstm1,
        dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    tops2 = L.Slice(n.lstm2, ntop=num_steps, slice_param={'axis': 0})
    for i in range(num_steps - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[num_steps - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped,
        dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.q_feat = L.Concat(*concat_botom)

    # Coarse Image-Question MFH fusion.
    # Stage "o2": project both modalities and combine them element-wise
    # (operation=0 is PROD in stock caffe.proto's EltwiseParameter).
    n.mfb_q_o2_proj = L.InnerProduct(
        n.q_feat, num_output=config.JOINT_EMB_SIZE,
        weight_filler=dict(type='xavier'))
    n.mfb_i_o2_proj = L.InnerProduct(
        n.img_feature, num_output=config.JOINT_EMB_SIZE,
        weight_filler=dict(type='xavier'))
    n.mfb_iq_o2_eltwise = L.Eltwise(
        n.mfb_q_o2_proj, n.mfb_i_o2_proj,
        eltwise_param=dict(operation=0))
    n.mfb_iq_o2_drop = L.Dropout(
        n.mfb_iq_o2_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o2_resh = L.Reshape(
        n.mfb_iq_o2_drop,
        reshape_param=dict(shape=dict(
            dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    # Sum-pool across the factor dimension.
    n.mfb_iq_o2_sumpool = L.Pooling(
        n.mfb_iq_o2_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o2_out = L.Reshape(
        n.mfb_iq_o2_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
    n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)

    # Stage "o3": same as "o2", but the element-wise layer additionally
    # takes the dropped "o2" product as a third input.
    n.mfb_q_o3_proj = L.InnerProduct(
        n.q_feat, num_output=config.JOINT_EMB_SIZE,
        weight_filler=dict(type='xavier'))
    n.mfb_i_o3_proj = L.InnerProduct(
        n.img_feature, num_output=config.JOINT_EMB_SIZE,
        weight_filler=dict(type='xavier'))
    n.mfb_iq_o3_eltwise = L.Eltwise(
        n.mfb_q_o3_proj, n.mfb_i_o3_proj, n.mfb_iq_o2_drop,
        eltwise_param=dict(operation=0))
    n.mfb_iq_o3_drop = L.Dropout(
        n.mfb_iq_o3_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o3_resh = L.Reshape(
        n.mfb_iq_o3_drop,
        reshape_param=dict(shape=dict(
            dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.mfb_iq_o3_sumpool = L.Pooling(
        n.mfb_iq_o3_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o3_out = L.Reshape(
        n.mfb_iq_o3_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
    n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)

    # Concatenate both stages and classify.
    n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2, n.mfb_o3_l2)
    n.prediction = L.InnerProduct(
        n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
        weight_filler=dict(type='xavier'))

    if is_val:
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the gated-CNN question encoder + compact-bilinear attention net.

    Question tokens are embedded, scaled by ``cont`` and reshaped to
    ``[batchsize, 1, T, 300]``. For each kernel height 2..5 a pair of
    convolutions produces a tanh activation and a sigmoid gate whose
    product is max-pooled; the four pooled vectors are concatenated into
    the question feature. That feature is fused with the 2048x14x14 image
    feature through compact bilinear pooling to produce two attention maps;
    the attended features are fused with the question feature again and
    classified into 3000 answers with a softmax loss.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: forwarded to the data layer; also used in the embedding
            reshape and the DummyData blob.
        T: third dimension of the embedding reshape; also sets the pooling
            kernel heights (``T + k - 1`` for kernel height ``k``).
        question_vocab_size: ``input_dim`` of the Embed layer.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    net = caffe.NetSpec()
    kernel_heights = (2, 3, 4, 5)

    def xavier():
        return {'type': 'xavier'}

    def reshape(bottom, dims):
        return L.Reshape(bottom, reshape_param={'shape': {'dim': dims}})

    def word_conv(bottom, height):
        # Full-width (300) convolution over `height` consecutive words,
        # padded by height - 1 rows.
        return L.Convolution(bottom, kernel_h=height, kernel_w=300,
                             stride=1, num_output=512,
                             pad_h=height - 1, pad_w=0,
                             weight_filler=xavier())

    def compact_bilinear(first, second):
        return L.CompactBilinear(
            first, second,
            compact_bilinear_param={'num_output': 16000, 'sum_pool': False})

    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    provider_tops = L.Python(module='vqa_data_provider_layer',
                             layer='VQADataProviderLayer',
                             param_str=provider_args,
                             ntop=4)
    net.data, net.cont, net.img_feature, net.label = provider_tops

    # Word embedding, scaled by cont along axis 0.
    net.embed_ba = L.Embed(
        net.data,
        input_dim=question_vocab_size,
        num_output=300,
        weight_filler={'type': 'uniform', 'min': -0.08, 'max': 0.08})
    net.embed_scale = L.Scale(net.embed_ba, net.cont,
                              scale_param={'axis': 0})
    net.embed_scale_resh = reshape(net.embed_scale, [batchsize, 1, T, 300])

    # Convolutions: a feature map and a gate map per kernel height.
    features, gates = {}, {}
    for k in kernel_heights:
        features[k] = word_conv(net.embed_scale_resh, k)
        setattr(net, 'word_feature_%d' % k, features[k])
        gates[k] = word_conv(net.embed_scale_resh, k)
        setattr(net, 'word_feature_%d_g' % k, gates[k])

    activations = {}
    for k in kernel_heights:
        activations[k] = L.TanH(features[k])
        setattr(net, 'word_%d_acti' % k, activations[k])

    gate_values = {}
    for k in kernel_heights:
        gate_values[k] = L.Sigmoid(gates[k])
        setattr(net, 'word_%d_gate' % k, gate_values[k])

    gated = {}
    for k in kernel_heights:
        gated[k] = L.Eltwise(activations[k], gate_values[k],
                             operation=P.Eltwise.PROD)
        setattr(net, 'word_%d' % k, gated[k])

    # Max-pool each gated map with a kernel height of T + k - 1.
    pooled = []
    for k in kernel_heights:
        span = T + (k - 1)
        vec = L.Pooling(gated[k], kernel_h=span, kernel_w=1, stride=span,
                        pool=P.Pooling.MAX)
        setattr(net, 'word_vec_%d' % k, vec)
        pooled.append(vec)

    net.concat_vec = L.Concat(*pooled,
                              concat_param={'axis': 1})  # N x 4*d_w x 1 x 1
    net.concat_vec_dropped = L.Dropout(
        net.concat_vec, dropout_param={'dropout_ratio': 0.5})

    # Tile the question feature 14x along axes 2 and 3 and fuse it with
    # the image feature.
    net.q_emb_tanh_droped_resh_tiled_1 = L.Tile(
        net.concat_vec_dropped, axis=2, tiles=14)
    net.q_emb_tanh_droped_resh_tiled = L.Tile(
        net.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    net.i_emb_tanh_droped_resh = reshape(net.img_feature,
                                         [-1, 2048, 14, 14])
    net.blcf = compact_bilinear(net.q_emb_tanh_droped_resh_tiled,
                                net.i_emb_tanh_droped_resh)
    net.blcf_sign_sqrt = L.SignedSqrt(net.blcf)
    net.blcf_sign_sqrt_l2 = L.L2Normalize(net.blcf_sign_sqrt)
    net.blcf_droped = L.Dropout(net.blcf_sign_sqrt_l2,
                                dropout_param={'dropout_ratio': 0.1})

    # Multi-channel attention.
    net.att_conv1 = L.Convolution(net.blcf_droped, kernel_size=1, stride=1,
                                  num_output=512, pad=0,
                                  weight_filler=xavier())
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = L.Convolution(net.att_conv1_relu, kernel_size=1,
                                  stride=1, num_output=2, pad=0,
                                  weight_filler=xavier())
    net.att_reshaped = reshape(net.att_conv2, [-1, 2, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att = reshape(net.att_softmax, [-1, 2, 14, 14])
    first_map, second_map = L.Slice(net.att, ntop=2,
                                    slice_param={'axis': 1})
    net.att_map0 = first_map
    net.att_map1 = second_map
    ones = L.DummyData(shape={'dim': [batchsize, 1]},
                       data_filler={'type': 'constant', 'value': 1},
                       ntop=1)
    net.att_feature0 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map0, ones)
    net.att_feature1 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map1, ones)
    net.att_feature0_resh = reshape(net.att_feature0, [-1, 2048])
    net.att_feature1_resh = reshape(net.att_feature1, [-1, 2048])
    net.att_feature = L.Concat(net.att_feature0_resh, net.att_feature1_resh)

    # Merge the attended feature and the question feature with compact
    # bilinear pooling.
    net.att_feature_resh = reshape(net.att_feature, [-1, 4096, 1, 1])
    net.bc_att_lstm = compact_bilinear(net.att_feature_resh,
                                       net.concat_vec_dropped)
    net.bc_sign_sqrt = L.SignedSqrt(net.bc_att_lstm)
    net.bc_sign_sqrt_l2 = L.L2Normalize(net.bc_sign_sqrt)
    net.bc_dropped = L.Dropout(net.bc_sign_sqrt_l2,
                               dropout_param={'dropout_ratio': 0.1})
    net.bc_dropped_resh = reshape(net.bc_dropped, [-1, 16000])

    net.prediction = L.InnerProduct(net.bc_dropped_resh, num_output=3000,
                                    weight_filler=xavier())
    net.loss = L.SoftmaxWithLoss(net.prediction, net.label)
    return net.to_proto()
def dis_net(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the discriminator network spec over (answer, explanation) pairs.

    The explanation tokens are embedded and encoded by two stacked
    1024-unit LSTMs (using the module-level ``fixed_weights_lstm`` as
    ``param``); the last sliced output of each LSTM is concatenated with an
    embedding of the answer label and scored by a single InnerProduct that
    feeds ``SoftmaxWithLoss`` against ``dis_label``.

    Args:
        mode: forwarded to the Python data layer inside ``param_str``.
        batchsize: forwarded to the data layer; also the second dimension
            of the explanation-embedding reshape.
        T: unused in this function. It was previously used to index the
            explanation LSTM slices — see the note on ``_last_step``.
        exp_T: number of tops each explanation LSTM output is sliced into.
        question_vocab_size: unused in this function.
        exp_vocab_size: vocabulary size used by the explanation Embed and
            the following reshape.

    Returns:
        The result of ``NetSpec.to_proto()``.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    def _exp_lstm(bottom, cont):
        return L.LSTM(
            bottom, cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                bias_filler=dict(type='constant', value=0)),
            param=fixed_weights_lstm)

    def _last_step(sequence, tag):
        # Slice into exp_T tops, name and silence the first exp_T - 1 and
        # return the last one. The loop bound and final index must match
        # the Slice's ntop (exp_T). The earlier code used T here, which
        # picked the wrong step / left tops unnamed when T < exp_T and
        # raised IndexError when T > exp_T.
        steps = L.Slice(sequence, ntop=exp_T, slice_param={'axis': 0})
        for i in range(exp_T - 1):
            setattr(n, 'slice_' + tag + str(i), steps[i])
            setattr(n, 'silence_data_' + tag + str(i),
                    L.Silence(steps[i], ntop=0))
        return steps[exp_T - 1]

    (n.data, n.cont, n.img_feature, n.label, n.exp, n.exp_out,
     n.exp_cont_1, n.exp_cont_2, n.dis_label) = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=9)

    # Explanation tokens -> exp_vocab_size-wide embedding -> 300-d vector.
    n.vqa_exp_emb = L.Embed(
        n.exp,
        input_dim=exp_vocab_size,
        num_output=exp_vocab_size,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # Original note: n.vqa_exp = batch x seq x exp_vocab_size
    n.vqa_exp = L.TanH(n.vqa_exp_emb)
    n.vqa_exp_reshape = L.Reshape(
        n.vqa_exp,
        reshape_param=dict(shape=dict(dim=[-1, exp_vocab_size])))
    n.exp_embed_ba = L.InnerProduct(
        n.vqa_exp_reshape, num_output=300,
        weight_filler=dict(type='xavier'))
    n.exp_embed_ba_reshape = L.Reshape(
        n.exp_embed_ba,
        reshape_param=dict(shape=dict(dim=[-1, batchsize, 300])))
    n.exp_embed = L.TanH(n.exp_embed_ba_reshape)

    # Embed VQA GT answer during training.
    n.vqa_ans = L.Embed(
        n.label,
        input_dim=3000,
        num_output=3000,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # Original note: n.vqa_ans = batch x seq x 3000
    n.exp_emb_ans = L.InnerProduct(
        n.vqa_ans, num_output=300, weight_filler=dict(type='xavier'))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(
        n.exp_emb_ans_tanh, num_output=2048,
        weight_filler=dict(type='xavier'))

    # Exp LSTM1
    n.exp_lstm1 = _exp_lstm(n.exp_embed, n.exp_cont_1)
    n.exp_lstm1_out = _last_step(n.exp_lstm1, 'first')
    n.exp_lstm1_reshaped = L.Reshape(
        n.exp_lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.exp_lstm1_reshaped_droped = L.Dropout(
        n.exp_lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.exp_lstm1_droped = L.Dropout(
        n.exp_lstm1, dropout_param={'dropout_ratio': 0.3})

    # Exp LSTM2
    n.exp_lstm2 = _exp_lstm(n.exp_lstm1_droped, n.exp_cont_2)
    n.exp_lstm2_out = _last_step(n.exp_lstm2, 'second')
    n.exp_lstm2_reshaped = L.Reshape(
        n.exp_lstm2_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.exp_lstm2_reshaped_droped = L.Dropout(
        n.exp_lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.exp_lstm1_reshaped_droped,
                    n.exp_lstm2_reshaped_droped]
    n.exp_lstm_12 = L.Concat(*concat_botom)

    # Concatenate the answer embedding and the explanation encoding.
    concat_ans_exp = [n.exp_emb_ans2, n.exp_lstm_12]
    n.concat_ans_exp_layer = L.Concat(*concat_ans_exp)

    # NOTE(review): a single output unit feeding SoftmaxWithLoss looks
    # suspicious (a softmax over one class is constant) — confirm whether
    # num_output=2 or a sigmoid loss was intended. Left unchanged here.
    n.concat_ans_exp_layer_dis = L.InnerProduct(
        n.concat_ans_exp_layer, num_output=1,
        weight_filler=dict(type='xavier'))
    n.discr_loss = L.SoftmaxWithLoss(
        n.concat_ans_exp_layer_dis, n.dis_label,
        loss_param=dict(ignore_label=-1))
    return n.to_proto()
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    """Build the MFB co-attention NetSpec.

    The net has three stages: attention over the LSTM question encoding,
    attention over the image feature map driven by the attended question, and
    a final element-wise-product fusion feeding the answer classifier.

    Args:
        mode: ``'val'`` selects the ``vqa_data_layer_hdf5`` data module and
            SoftmaxWithLoss; any other value selects
            ``vqa_data_layer_kld_hdf5`` and SoftmaxKLDLoss.
        batchsize: Forwarded to the data layer and used as the first
            dimension of the DummyData inputs of the SoftAttention layers.
        T: Not used by this network; kept for interface compatibility.
        question_vocab_size: ``input_dim`` of the word Embed layer.
        folder: Forwarded to the data layer inside ``param_str``.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
    data_module = ('vqa_data_layer_hdf5' if mode == 'val'
                   else 'vqa_data_layer_kld_hdf5')
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module=data_module, layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed)
    # Learned embedding and GloVe vectors are joined on axis 2.
    n.concat_embed = L.Concat(n.embed_tanh, n.glove,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
    # Move the time axis last, then append a singleton axis so the 1x1
    # convolutions below can run over the sequence.
    n.lstm1_resh = L.Permute(n.lstm1_droped,
                             permute_param=dict(order=[1, 2, 0]))
    n.lstm1_resh2 = L.Reshape(
        n.lstm1_resh, reshape_param=dict(shape=dict(dim=[0, 0, 0, 1])))

    # ---- Question Attention ----
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1,
                                 num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_QUESTION_GLIMPSE,
                                 pad=0, weight_filler=dict(type='xavier'))
    n.qatt_reshape = L.Reshape(
        n.qatt_conv2,
        reshape_param=dict(shape=dict(dim=[
            -1, config.NUM_QUESTION_GLIMPSE,
            config.MAX_WORDS_IN_QUESTION, 1])))
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)
    qatt_maps = L.Slice(n.qatt_softmax, ntop=config.NUM_QUESTION_GLIMPSE,
                        slice_param={'axis': 1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]),
                             data_filler=dict(type='constant', value=1),
                             ntop=1)
    # With a single glimpse the Slice result is used directly instead of
    # being indexed, hence the special case below.
    single_q_glimpse = config.NUM_QUESTION_GLIMPSE == 1
    qatt_feature_list = []
    for i in range(config.NUM_QUESTION_GLIMPSE):
        att_map = qatt_maps if single_q_glimpse else qatt_maps[i]
        name = 'qatt_feat%d' % i
        setattr(n, name, L.SoftAttention(n.lstm1_resh2, att_map, dummy_lstm))
        qatt_feature_list.append(getattr(n, name))
    n.qatt_feat_concat = L.Concat(*qatt_feature_list)

    # ---- Image Attention with MFB ----
    n.q_feat_resh = L.Reshape(
        n.qatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))
    n.i_feat_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[
            0, -1, config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh,
                                   num_output=config.JOINT_EMB_SIZE,
                                   weight_filler=dict(type='xavier'))
    n.iatt_q_resh = L.Reshape(
        n.iatt_q_proj,
        reshape_param=dict(shape=dict(dim=[-1, config.JOINT_EMB_SIZE, 1, 1])))
    # Tile the question projection over both spatial axes of the image map.
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2,
                            tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3,
                            tiles=config.IMG_FEAT_WIDTH)
    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1,
                                  num_output=config.JOINT_EMB_SIZE, pad=0,
                                  weight_filler=dict(type='xavier'))
    n.iatt_i_resh1 = L.Reshape(
        n.iatt_i_conv,
        reshape_param=dict(shape=dict(dim=[
            -1, config.JOINT_EMB_SIZE,
            config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    # P.Eltwise.PROD is the enum value 0 that was previously hard-coded.
    n.iatt_iq_eltwise = L.Eltwise(
        n.iatt_q_tile2, n.iatt_i_resh1,
        eltwise_param=dict(operation=P.Eltwise.PROD))
    n.iatt_iq_droped = L.Dropout(
        n.iatt_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    # NOTE: `iatt_iq_resh2` is deliberately assigned twice, exactly as
    # before.  The second assignment takes over the name, so the first
    # Reshape ends up auto-named in the generated net.  Renaming either one
    # would change the emitted layer names.
    n.iatt_iq_resh2 = L.Reshape(
        n.iatt_iq_droped,
        reshape_param=dict(shape=dict(dim=[
            -1, config.JOINT_EMB_SIZE, config.IMG_FEAT_SIZE, 1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2,
                                   permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_resh2 = L.Reshape(
        n.iatt_iq_permute1,
        reshape_param=dict(shape=dict(dim=[
            -1, config.IMG_FEAT_SIZE,
            config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    # Sum-pool across the MFB_FACTOR_NUM axis (kernel_w spans it).
    n.iatt_iq_sumpool = L.Pooling(
        n.iatt_iq_resh2, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool,
                                   permute_param=dict(order=[0, 2, 1, 3]))
    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)

    # Two 1x1 conv layers: -> 512 -> NUM_IMG_GLIMPSE attention maps.
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1,
                                 num_output=512, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1,
                                 num_output=config.NUM_IMG_GLIMPSE, pad=0,
                                 weight_filler=dict(type='xavier'))
    n.iatt_resh = L.Reshape(
        n.iatt_conv2,
        reshape_param=dict(shape=dict(dim=[
            -1, config.NUM_IMG_GLIMPSE, config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(
        n.iatt_softmax,
        reshape_param=dict(shape=dict(dim=[
            -1, config.NUM_IMG_GLIMPSE,
            config.IMG_FEAT_WIDTH, config.IMG_FEAT_WIDTH])))
    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,
                        slice_param={'axis': 1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    single_i_glimpse = config.NUM_IMG_GLIMPSE == 1
    iatt_feature_list = []
    for i in range(config.NUM_IMG_GLIMPSE):
        att_map = iatt_maps if single_i_glimpse else iatt_maps[i]
        feat_name = 'iatt_feat%d' % i
        resh_name = 'iatt_feat%d_resh' % i
        setattr(n, feat_name, L.SoftAttention(n.i_feat_resh, att_map, dummy))
        setattr(n, resh_name, L.Reshape(
            getattr(n, feat_name),
            reshape_param=dict(shape=dict(dim=[0, -1]))))
        iatt_feature_list.append(getattr(n, resh_name))
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(
        n.iatt_feat_concat,
        reshape_param=dict(shape=dict(dim=[0, -1, 1, 1])))

    # ---- Fine-grained Image-Question MFB fusion ----
    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh,
                                  num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(
        n.mfb_q_proj, n.mfb_i_proj,
        eltwise_param=dict(operation=P.Eltwise.PROD))
    n.mfb_iq_drop = L.Dropout(
        n.mfb_iq_eltwise,
        dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(
        n.mfb_iq_drop,
        reshape_param=dict(shape=dict(dim=[
            -1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(
        n.mfb_iq_resh, pool=P.Pooling.SUM,
        pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(
        n.mfb_iq_sumpool,
        reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)

    n.prediction = L.InnerProduct(n.mfb_l2,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
    """Build a NetSpec that pools word embeddings and attends over the image.

    Word embeddings are masked by ``cont``, collapsed across the ``T`` axis by
    a frozen all-ones 1x1 convolution, fused with the image map by compact
    bilinear pooling to produce two attention maps, and fused a second time
    with the attended image features before the 3000-way classifier.

    Args:
        mode: Forwarded to the Python data layer inside ``param_str``.
        batchsize: Forwarded to the data layer; also used in explicit reshapes
            and as the first dimension of the DummyData input.
        T: Second dimension of the reshaped word-embedding blob.
        question_vocab_size: ``input_dim`` of the Embed layer.
        embed_size: ``num_output`` of the Embed layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """

    def reshape(bottom, *dims):
        # Reshape `bottom` to the given dimensions.
        return L.Reshape(bottom,
                         reshape_param=dict(shape=dict(dim=list(dims))))

    def conv1x1(bottom, channels):
        # Xavier-initialised 1x1 convolution with `channels` outputs.
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=dict(type='xavier'))

    def dropout(bottom, ratio):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': ratio})

    def compact_bilinear(first, second):
        return L.CompactBilinear(
            first, second,
            compact_bilinear_param=dict(num_output=16000, sum_pool=False))

    n = caffe.NetSpec()
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=provider_args, ntop=4)

    # Word embedding, scaled by the continuation markers along axis 0.
    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=embed_size,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = reshape(n.embed_scale, batchsize, T, embed_size, 1)

    # A bias-free 1x1 convolution with constant weight 1 and one output adds
    # up the T input channels; lr_mult/decay_mult of 0 keep it frozen.
    n.embed_avg = L.Convolution(
        n.embed_scale_resh,
        convolution_param={
            'kernel_size': 1,
            'num_output': 1,
            'bias_term': False,
            'weight_filler': dict(type='constant', value=1),
        },
        param=dict(lr_mult=0, decay_mult=0))
    n.embed_avg_resh = reshape(n.embed_avg, batchsize, embed_size, 1, 1)

    # Spread the question vector over the 14 x 14 image grid.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.embed_avg_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(
        n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = reshape(n.img_feature, -1, 2048, 14, 14)

    n.blcf = compact_bilinear(n.q_emb_tanh_droped_resh_tiled,
                              n.i_emb_tanh_droped_resh)
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = dropout(n.blcf_sign_sqrt_l2, 0.1)

    # Multi-channel attention: two maps, softmax over the 196 locations.
    n.att_conv1 = conv1x1(n.blcf_droped, 512)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = conv1x1(n.att_conv1_relu, 2)
    n.att_reshaped = reshape(n.att_conv2, -1, 2, 196)
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = reshape(n.att_softmax, -1, 2, 14, 14)
    n.att_map0, n.att_map1 = L.Slice(n.att, ntop=2, slice_param={'axis': 1})

    ones = L.DummyData(shape=dict(dim=[batchsize, 1]),
                       data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh,
                                     n.att_map0, ones)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh,
                                     n.att_map1, ones)
    n.att_feature0_resh = reshape(n.att_feature0, -1, 2048)
    n.att_feature1_resh = reshape(n.att_feature1, -1, 2048)
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # Second fusion: attended image features with the pooled question vector.
    n.att_feature_resh = reshape(n.att_feature, -1, 4096, 1, 1)
    n.bc_att_lstm = compact_bilinear(n.att_feature_resh, n.embed_avg_resh)
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = dropout(n.bc_sign_sqrt_l2, 0.1)
    n.bc_dropped_resh = reshape(n.bc_dropped, -1, 16000)

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def net(split, vocab_size, opts):
    """Build the query-grounding NetSpec (score and bbox heads).

    The query is embedded and encoded by an LSTM whose last time step is
    L2-normalised. That encoding is combined with the processed image
    features through the module-level helpers ``proc_img`` and ``concat``.
    Two InnerProduct heads then predict a score and four bbox values.

    Args:
        split: Forwarded to the Python data layer inside ``param_str``.
        vocab_size: ``input_dim`` of the word Embed layer.
        opts: Not used by this function; kept for interface compatibility.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()
    param_str = json.dumps({'split': split, 'batchsize': cfg.BATCHSIZE})
    (n.qvec, n.cvec, n.img_feat, n.spt_feat, n.query_label,
     n.query_label_mask, n.query_bbox_targets,
     n.query_bbox_inside_weights, n.query_bbox_outside_weights) = L.Python(
        name='data', module='networks.data_layer',
        layer='DataProviderLayer', param_str=param_str, ntop=9)

    n.embed_ba = L.Embed(n.qvec, input_dim=vocab_size,
                         num_output=cfg.WORD_EMB_SIZE,
                         weight_filler=dict(type='xavier'))
    n.embed = L.TanH(n.embed_ba)

    # LSTM1
    n.lstm1 = L.LSTM(
        n.embed, n.cvec,
        recurrent_param=dict(
            num_output=cfg.RNN_DIM,
            weight_filler=dict(type='xavier')))
    # Keep only the last time step; every earlier slice is routed into a
    # Silence layer so it does not become a dangling net output.
    tops1 = L.Slice(n.lstm1, ntop=cfg.QUERY_MAXLEN, slice_param={'axis': 0})
    for i in range(cfg.QUERY_MAXLEN - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[cfg.QUERY_MAXLEN - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, cfg.RNN_DIM])))
    n.lstm1_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': cfg.DROPOUT_RATIO})
    n.lstm_l2norm = L.L2Normalize(n.lstm1_droped)
    n.q_emb = L.Reshape(n.lstm_l2norm,
                        reshape_param=dict(shape=dict(dim=[0, -1])))

    q_layer = n.q_emb  # (N, 1024) per the original annotation
    # proc_img / concat are defined elsewhere in this module.
    v_layer = proc_img(n, n.img_feat, n.spt_feat)  # out: (N, 100, 2053)
    out_layer = concat(n, q_layer, v_layer)

    # predict score
    n.query_score_fc = L.InnerProduct(out_layer, num_output=1,
                                      weight_filler=dict(type='xavier'))
    n.query_score_pred = L.Reshape(
        n.query_score_fc,
        reshape_param=dict(shape=dict(dim=[-1, cfg.RPN_TOPN])))
    # Both loss layers take the same bottoms and arguments; only the layer
    # type depends on cfg.USE_KLD.
    score_loss = L.SoftmaxKLDLoss if cfg.USE_KLD else L.SoftmaxWithLoss
    n.loss_query_score = score_loss(
        n.query_score_pred, n.query_label, n.query_label_mask,
        propagate_down=[1, 0, 0], loss_weight=1.0)

    # predict bbox
    n.query_bbox_pred = L.InnerProduct(out_layer, num_output=4,
                                       weight_filler=dict(type='xavier'))
    if cfg.USE_REG:
        n.loss_query_bbox = L.SmoothL1Loss(
            n.query_bbox_pred, n.query_bbox_targets,
            n.query_bbox_inside_weights, n.query_bbox_outside_weights,
            loss_weight=1.0)
    else:
        # Regression disabled: silence the prediction and its unused inputs.
        for blob in ('query_bbox_pred', 'query_bbox_targets',
                     'query_bbox_inside_weights',
                     'query_bbox_outside_weights'):
            setattr(n, 'silence_' + blob,
                    L.Silence(getattr(n, blob), ntop=0))
    return n.to_proto()
def qlstm(mode, batchsize, max_words_in_question, question_vocab_size):
    """Build a two-layer-LSTM question encoder fused with image features.

    Learned word embeddings are concatenated with the GloVe input and fed to
    two stacked LSTMs. The last time step of each LSTM is concatenated, fused
    with the image feature by compact bilinear pooling, and classified.

    Args:
        mode: Forwarded to the Python data layer inside ``param_str``.
        batchsize: Forwarded to the Python data layer inside ``param_str``.
        max_words_in_question: Number of tops each Slice layer produces, i.e.
            the number of LSTM time steps.
        question_vocab_size: ``input_dim`` of the word Embed layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=5)

    def lstm_1024(bottom):
        # 1024-unit LSTM with the uniform / zero-bias fillers used throughout.
        return L.LSTM(
            bottom, n.cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                bias_filler=dict(type='constant', value=0)))

    def last_timestep(lstm_top, suffix):
        # Slice along the time axis, silence every step except the last so
        # the unused tops do not become net outputs, and return the last one.
        steps = L.Slice(lstm_top, ntop=max_words_in_question,
                        slice_param={'axis': 0})
        for i in range(max_words_in_question - 1):
            setattr(n, 'slice_' + suffix + str(i), steps[i])
            setattr(n, 'silence_data_' + suffix + str(i),
                    L.Silence(steps[i], ntop=0))
        return steps[max_words_in_question - 1]

    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # T x N -> T x N x 300
    n.embed = L.TanH(n.embed_ba)
    n.concat_embed = L.Concat(n.embed, n.glove,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = lstm_1024(n.concat_embed)
    n.lstm1_out = last_timestep(n.lstm1, 'first')
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_drop = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.lstm1_drop = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2, stacked on the dropped-out full sequence of LSTM1.
    n.lstm2 = lstm_1024(n.lstm1_drop)
    n.lstm2_out = last_timestep(n.lstm2, 'second')
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_drop = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})

    n.lstm_12 = L.Concat(n.lstm1_reshaped_drop, n.lstm2_reshaped_drop)

    # Question (2 x 1024) and image (2048) vectors -> compact bilinear pooling.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})
    n.blcf_droped_resh = L.Reshape(
        n.blcf_droped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.blcf_droped_resh,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, T_c, question_c_vocab_size, question_vocab_size):
    """Build a char-CNN + word-CNN question encoder with image attention.

    Character embeddings are sliced into ``T`` per-word pieces, and each piece
    is encoded by a weight-shared convolution followed by max pooling. The
    per-word results are concatenated with word embeddings, encoded by
    convolutions of height 2 to 5, and fused with the image map by compact
    bilinear pooling for two-glimpse attention and final classification.

    The per-word layers are generated in loops over ``T``. The earlier
    copy-pasted version only worked for ``T == 22``; for that value this
    emits the same layers, names and order.

    Args:
        mode: Forwarded to the Python data layer inside ``param_str``.
        batchsize: Forwarded to the data layer; also used in explicit reshapes
            and as the first dimension of the DummyData input.
        T: Number of word slots. The char blob is sliced into ``T`` pieces
            along axis 2.
        T_c: Height and stride of the per-word max pooling; the char blob is
            reshaped to ``T_c * T`` rows before slicing.
        question_c_vocab_size: ``input_dim`` of the character Embed layer.
        question_vocab_size: ``input_dim`` of the word Embed layer.

    Returns:
        Whatever ``caffe.NetSpec.to_proto()`` returns for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.data1, n.cont1, n.img_feature, n.label = L.Python(
        module='vqa_data_provider_layer', layer='VQADataProviderLayer',
        param_str=mode_str, ntop=6)

    # char embedding
    n.embed_c = L.Embed(
        n.data1, input_dim=question_c_vocab_size, num_output=15,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed_c_scale = L.Scale(n.embed_c, n.cont1, scale_param=dict(axis=0))
    n.embed_c_scale_resh = L.Reshape(
        n.embed_c_scale,
        reshape_param=dict(shape=dict(dim=[batchsize, 1, T_c * T, -1])))
    # One slice per word slot, named slice_1 .. slice_T.
    tops = L.Slice(n.embed_c_scale_resh, ntop=T, slice_param={'axis': 2})
    for i in range(T):
        setattr(n, 'slice_' + str(i + 1), tops[i])

    word_slots = range(1, T + 1)

    # char conv: every slot shares the weights named conv_c_w / conv_c_b.
    # The three loops are kept separate so layers enter the NetSpec in the
    # same order as before (all convs, then all pools, then all reshapes).
    for k in word_slots:
        setattr(n, 'c_feature_%d' % k, L.Convolution(
            getattr(n, 'slice_%d' % k),
            convolution_param={
                'kernel_h': 3,
                'kernel_w': 15,
                'stride': 1,
                'num_output': 150,
                'pad_h': 1,
                'pad_w': 0,
                'weight_filler': dict(type='xavier')
            },
            param=[dict(name="conv_c_w"), dict(name="conv_c_b")]))
    for k in word_slots:
        setattr(n, 'c_vec_%d' % k, L.Pooling(
            getattr(n, 'c_feature_%d' % k),
            kernel_h=T_c, kernel_w=1, stride=T_c, pool=P.Pooling.MAX))
    for k in word_slots:
        setattr(n, 'c_embed_%d' % k, L.Reshape(
            getattr(n, 'c_vec_%d' % k),
            reshape_param=dict(shape=dict(dim=[batchsize, 1, 150]))))
    concat_c_embed = [getattr(n, 'c_embed_%d' % k) for k in word_slots]
    n.concat_char_embed = L.Concat(*concat_c_embed,
                                   concat_param={'axis': 1})  # N x T x d_c

    # word embedding
    n.embed_w = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=150,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))  # N x T x d_w

    # combine word and char embedding
    n.concat_embed = L.Concat(
        n.embed_w, n.concat_char_embed,
        concat_param={'axis': 2})  # N x T x (d_c+d_w)
    n.embed_scale = L.Scale(n.concat_embed, n.cont, scale_param=dict(axis=0))
    n.embed_scale_resh = L.Reshape(
        n.embed_scale,
        reshape_param=dict(shape=dict(
            dim=[batchsize, 1, T, -1])))  # N x 1 x T x (d_c+d_w)

    # Word-level convolutions of height 2..5.  kernel_w=300 spans the whole
    # 150 + 150 wide embedding; pad_h = height - 1 on both sides gives
    # T + height - 1 output rows, which the max pooling then collapses.
    ngram_heights = (2, 3, 4, 5)
    for h in ngram_heights:
        setattr(n, 'word_feature_%d' % h, L.Convolution(
            n.embed_scale_resh, kernel_h=h, kernel_w=300, stride=1,
            num_output=512, pad_h=h - 1, pad_w=0,
            weight_filler=dict(type='xavier')))
    for h in ngram_heights:
        setattr(n, 'word_relu_%d' % h,
                L.ReLU(getattr(n, 'word_feature_%d' % h)))
    for h in ngram_heights:
        setattr(n, 'word_vec_%d' % h, L.Pooling(
            getattr(n, 'word_relu_%d' % h),
            kernel_h=T + h - 1, kernel_w=1, stride=T + h - 1,
            pool=P.Pooling.MAX))  # N x C x 1 x 1
    word_vec = [getattr(n, 'word_vec_%d' % h) for h in ngram_heights]
    n.concat_vec = L.Concat(*word_vec,
                            concat_param={'axis': 1})  # N x 4C x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec,
                                     dropout_param={'dropout_ratio': 0.5})

    # Spread the question vector over the 14 x 14 image grid.
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(
        n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh,
                                     n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh,
                                     n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attended image features and the question vector with compact
    # bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.concat_vec_dropped,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def pj_x(mode, batchsize, exp_T, exp_vocab_size):
    """Build the joint activity-prediction / explanation-generation net.

    The net has three parts, created in this order:

    1. a single-map soft attention over ``img_feature`` feeding the activity
       classifier (``prediction``, ``loss``, ``accuracy``);
    2. an embedding of the ground-truth ``label`` that is multiplied with the
       image features to drive a second attention (``exp_att_feature``);
    3. a two-layer LSTM over the explanation tokens, conditioned on the
       attended feature, ending in ``exp_prediction`` / ``exp_loss`` /
       ``exp_accuracy``.

    Args:
        mode: forwarded (JSON-encoded) to the Python data layer.
        batchsize: forwarded to the data layer and used as the first dimension
            of the constant DummyData blobs.
        exp_T: number of tiles along axis 0 when the attended feature is
            repeated to match the LSTM output.
        exp_vocab_size: ``input_dim`` of the explanation embedding and
            ``num_output`` of ``exp_prediction``.

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled net.
    """
    xavier = dict(type='xavier')
    uniform = dict(type='uniform', min=-0.08, max=0.08)

    def conv1x1(bottom, channels):
        # 1x1 convolution, stride 1, no padding, xavier-initialised.
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=xavier)

    def reshape(bottom, dims):
        return L.Reshape(bottom,
                         reshape_param=dict(shape=dict(dim=dims)))

    def dropout(bottom):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': 0.3})

    def product(lhs, rhs):
        return L.Eltwise(lhs, rhs,
                         eltwise_param={'operation': P.Eltwise.PROD})

    def ones():
        # Constant blob of ones passed as third bottom to SoftAttention.
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    def lstm(sequence, cont, hidden):
        return L.LSTM(sequence, cont,
                      recurrent_param=dict(
                          num_output=hidden,
                          weight_filler=uniform,
                          bias_filler=dict(type='constant', value=0)))

    net = caffe.NetSpec()
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.img_feature, net.label, net.exp, net.exp_out,
     net.exp_cont_1, net.exp_cont_2) = L.Python(
        module='activity_data_provider_layer',
        layer='ActivityDataProviderLayer',
        param_str=provider_args,
        ntop=6)

    # Attention
    net.att_conv1 = conv1x1(net.img_feature, 512)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1)
    net.att_reshaped = reshape(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape(net.att_softmax, [-1, 1, 14, 14])
    dummy = ones()
    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, dummy)
    net.att_feature_resh = reshape(net.att_feature, [-1, 2048])

    # Prediction
    net.prediction = L.InnerProduct(net.att_feature_resh,
                                    num_output=config.NUM_OUTPUT_UNITS,
                                    weight_filler=xavier)
    net.loss = L.SoftmaxWithLoss(net.prediction, net.label)
    net.accuracy = L.Accuracy(net.prediction, net.label)

    # Embed Activity GT answer during training
    net.exp_emb_ans = L.Embed(net.label,
                              input_dim=config.NUM_OUTPUT_UNITS,
                              num_output=300,
                              weight_filler=uniform)
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = L.InnerProduct(net.exp_emb_ans_tanh,
                                      num_output=2048,
                                      weight_filler=xavier)

    # Merge activity answer and visual feature
    net.exp_emb_resh = reshape(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)
    net.img_embed = conv1x1(net.img_feature, 2048)
    net.exp_eltwise = product(net.img_embed, net.exp_emb_tiled)
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = dropout(net.exp_eltwise_l2)

    # Attention for Explanation
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape(net.exp_att_softmax, [-1, 1, 14, 14])
    exp_dummy = ones()
    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map, exp_dummy)
    net.exp_att_feature_resh = reshape(net.exp_att_feature_prev, [-1, 2048])
    net.exp_att_feature_embed = L.InnerProduct(net.exp_att_feature_resh,
                                               num_output=2048,
                                               weight_filler=xavier)
    net.exp_att_feature = product(net.exp_emb_ans2,
                                  net.exp_att_feature_embed)

    # Embed explanation
    net.exp_embed_ba = L.Embed(net.exp, input_dim=exp_vocab_size,
                               num_output=300, weight_filler=uniform)
    net.exp_embed = L.TanH(net.exp_embed_ba)

    # LSTM1 for Explanation
    net.exp_lstm1 = lstm(net.exp_embed, net.exp_cont_1, 2048)
    net.exp_lstm1_dropped = dropout(net.exp_lstm1)

    # Merge with LSTM1 for explanation
    net.exp_att_resh = reshape(net.exp_att_feature, [1, -1, 2048])
    net.exp_att_tiled = L.Tile(net.exp_att_resh, axis=0, tiles=exp_T)
    net.exp_eltwise_all = product(net.exp_lstm1_dropped, net.exp_att_tiled)
    net.exp_eltwise_all_l2 = L.L2Normalize(net.exp_eltwise_all)
    net.exp_eltwise_all_drop = dropout(net.exp_eltwise_all_l2)

    # LSTM2 for Explanation
    net.exp_lstm2 = lstm(net.exp_eltwise_all_drop, net.exp_cont_2, 1024)
    net.exp_lstm2_dropped = dropout(net.exp_lstm2)

    net.exp_prediction = L.InnerProduct(net.exp_lstm2_dropped,
                                        num_output=exp_vocab_size,
                                        weight_filler=xavier,
                                        axis=2)
    # Positions labelled -1 are ignored by both the loss and the accuracy.
    net.exp_loss = L.SoftmaxWithLoss(net.exp_prediction, net.exp_out,
                                     loss_param=dict(ignore_label=-1),
                                     softmax_param=dict(axis=2))
    net.exp_accuracy = L.Accuracy(net.exp_prediction, net.exp_out,
                                  axis=2, ignore_label=-1)
    return net.to_proto()
# Script section: writes the HDF5 data net to 'data2.prototxt' and assembles
# the generator net specification in `n`.
gpu_id = 0
batch_size = 100
num_clas = 2
sub_nets = ('generator2', 'discriminator2', 'data2')

############ creating the data net #############################
data = caffe.NetSpec()
# train.txt is a text file containing the path to the training data folder
data.ECAL, data.TAG = L.HDF5Data(batch_size=batch_size, source="train.txt",
                                 ntop=2)
with open('data2.prototxt', 'w') as f:
    f.write(str(data.to_proto()))

############ creating the generator net ########################
# NOTE(review): `latent` is not defined in this section - confirm it is set
# earlier in the file before this code runs.
n = caffe.NetSpec()
n.feat = L.Input(shape=dict(dim=[batch_size, latent]))  # random array
n.clas = L.Input(shape=dict(dim=[batch_size, 1]))  # array with classes
# class dependant embedding (xavier for glorot_normal in keras)
n.embed = L.Embed(n.clas, input_dim=num_clas, num_output=latent,
                  weight_filler=dict(type='xavier'))
n.flat = L.Flatten(n.embed)
n.mult = L.Eltwise(n.flat, n.feat, operation=0)  # 0 = multiplication mode
n.Dense = L.InnerProduct(n.mult, num_output=7*7*8*8,
                         weight_filler=dict(type='msra'))  # 3136
# The leading dimension follows batch_size (it used to be a literal 100), so
# the reshape stays valid when the batch size is changed.
n.resh = L.Reshape(n.Dense,
                   reshape_param={'shape': {'dim': [batch_size, 7, 7, 8, 8]}})
# (not working for nd) weight_filler=dict(type='msra') => keras he_uniform
n.conv5 = L.Convolution(n.resh, num_output=64, kernel_size=[6, 6, 8],
                        pad=[2, 2, 3], engine=1)
n.relu5 = L.ReLU(n.conv5, negative_slope=0.3, engine=1)
n.bn5 = L.BatchNorm(n.relu5, in_place=True)
# f=2, kernel_size:{{2*f- f%2}} stride:{{f}} num_output:{{C}} group:{{C}}
# pad:{{ceil((f-1)/2.)}} (gives error for nd) weight_filler: "bilinear"
n.upsmpl5 = L.Deconvolution(n.bn5,
                            convolution_param=dict(num_output=1, group=1,
                                                   kernel_size=4, stride=2,
                                                   pad=1))
# (not working for nd) weight_filler=dict(type='msra') => keras he_uniform
n.conv4 = L.Convolution(n.upsmpl5, num_output=6, kernel_size=[6, 5, 8],
                        pad=[2, 2, 0], engine=1)
n.relu4 = L.ReLU(n.conv4, negative_slope=0.3, engine=1)
n.bn4 = L.BatchNorm(n.relu4, in_place=True)
# f = [2, 2, 3]
n.upsmpl4 = L.Deconvolution(n.bn4,
                            convolution_param=dict(num_output=1, group=1,
                                                   kernel_size=[4, 4, 5],
                                                   stride=[2, 2, 3], pad=1))
# (not working for nd) weight_filler=dict(type='msra') => keras he_uniform
n.conv3 = L.Convolution(n.upsmpl4, num_output=6, kernel_size=[3, 3, 8],
                        pad=[1, 0, 3], engine=1)
n.relu3 = L.ReLU(n.conv3, negative_slope=0.3, engine=1)
# (not working for nd) weight_filler=dict(type='xavier')
n.conv2 = L.Convolution(n.relu3, num_output=1, kernel_size=[2, 2, 2],
                        pad=[2, 0, 3], engine=1)
def act_proto(mode, batchsize, exp_vocab_size, use_gt=True):
    """Build the activity branch that produces ``exp_att_feature``.

    The net attends over ``img_feature`` to predict the activity, embeds an
    answer id (ground truth or arg-max of the prediction), and uses that
    embedding to compute a second attended feature. ``exp_att_feature`` is
    finally fed to a Silence layer.

    Args:
        mode: forwarded (JSON-encoded) to the Python data layer.
        batchsize: forwarded to the data layer and used as the first dimension
            of the constant DummyData blobs.
        exp_vocab_size: accepted for interface compatibility; not read here.
        use_gt: when true the ``label`` blob is embedded; otherwise an ArgMax
            over ``prediction`` (``vqa_ans``) is embedded instead.

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled net.
    """
    xavier = dict(type='xavier')

    def conv1x1(bottom, channels):
        # 1x1 convolution, stride 1, no padding, xavier-initialised.
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler=xavier)

    def reshape(bottom, dims):
        return L.Reshape(bottom,
                         reshape_param=dict(shape=dict(dim=dims)))

    def ones():
        # Constant blob of ones passed as third bottom to SoftAttention.
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    net = caffe.NetSpec()
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    (net.img_feature, net.label, net.exp, net.exp_out,
     net.exp_cont_1, net.exp_cont_2) = L.Python(
        module='activity_data_provider_layer',
        layer='ActivityDataProviderLayer',
        param_str=provider_args,
        ntop=6)

    # Attention
    net.att_conv1 = conv1x1(net.img_feature, 512)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = conv1x1(net.att_conv1_relu, 1)
    net.att_reshaped = reshape(net.att_conv2, [-1, 1, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att_map = reshape(net.att_softmax, [-1, 1, 14, 14])
    dummy = ones()
    net.att_feature = L.SoftAttention(net.img_feature, net.att_map, dummy)
    net.att_feature_resh = reshape(net.att_feature, [-1, 2048])

    # Prediction
    net.prediction = L.InnerProduct(net.att_feature_resh,
                                    num_output=config.NUM_OUTPUT_UNITS,
                                    weight_filler=xavier,
                                    param=fixed_weights)

    # Take GT answer or take the logits of the VQA model and get the
    # predicted answer to embed.
    if use_gt:
        answer_ids = net.label
    else:
        net.vqa_ans = L.ArgMax(net.prediction, axis=1)
        answer_ids = net.vqa_ans
    net.exp_emb_ans = L.Embed(answer_ids,
                              input_dim=config.NUM_OUTPUT_UNITS,
                              num_output=300,
                              weight_filler=dict(type='uniform',
                                                 min=-0.08, max=0.08))
    net.exp_emb_ans_tanh = L.TanH(net.exp_emb_ans)
    net.exp_emb_ans2 = L.InnerProduct(net.exp_emb_ans_tanh,
                                      num_output=2048,
                                      weight_filler=xavier)

    # Merge activity answer and visual feature
    net.exp_emb_resh = reshape(net.exp_emb_ans2, [-1, 2048, 1, 1])
    net.exp_emb_tiled_1 = L.Tile(net.exp_emb_resh, axis=2, tiles=14)
    net.exp_emb_tiled = L.Tile(net.exp_emb_tiled_1, axis=3, tiles=14)
    net.img_embed = conv1x1(net.img_feature, 2048)
    net.exp_eltwise = L.Eltwise(net.img_embed, net.exp_emb_tiled,
                                eltwise_param={'operation': P.Eltwise.PROD})
    net.exp_eltwise_sqrt = L.SignedSqrt(net.exp_eltwise)
    net.exp_eltwise_l2 = L.L2Normalize(net.exp_eltwise_sqrt)
    net.exp_eltwise_drop = L.Dropout(net.exp_eltwise_l2,
                                     dropout_param={'dropout_ratio': 0.3})

    # Attention for Explanation
    net.exp_att_conv1 = conv1x1(net.exp_eltwise_drop, 512)
    net.exp_att_conv1_relu = L.ReLU(net.exp_att_conv1)
    net.exp_att_conv2 = conv1x1(net.exp_att_conv1_relu, 1)
    net.exp_att_reshaped = reshape(net.exp_att_conv2, [-1, 1, 14 * 14])
    net.exp_att_softmax = L.Softmax(net.exp_att_reshaped, axis=2)
    net.exp_att_map = reshape(net.exp_att_softmax, [-1, 1, 14, 14])
    exp_dummy = ones()
    net.exp_att_feature_prev = L.SoftAttention(net.img_feature,
                                               net.exp_att_map, exp_dummy)
    net.exp_att_feature_resh = reshape(net.exp_att_feature_prev, [-1, 2048])
    net.exp_att_feature_embed = L.InnerProduct(net.exp_att_feature_resh,
                                               num_output=2048,
                                               weight_filler=xavier)
    net.exp_att_feature = L.Eltwise(
        net.exp_emb_ans2, net.exp_att_feature_embed,
        eltwise_param={'operation': P.Eltwise.PROD})
    net.silence_exp_att = L.Silence(net.exp_att_feature, ntop=0)
    return net.to_proto()
def _copy_int_params(dest, params, keys):
    """Copy each non-empty ``params[key]`` into ``dest`` converted to int."""
    for key in keys:
        if params[key] != '':
            # int(float(...)) also accepts values written like "3.0".
            dest[key] = int(float(params[key]))


def _copy_fillers(dest, params):
    """Copy non-empty weight/bias filler names into ``dest`` as filler dicts."""
    for key in ('weight_filler', 'bias_filler'):
        if params[key] != '':
            dest[key] = {'type': params[key]}


def _parse_dims(dim_string):
    """Turn a comma separated string such as '10,3,224,224' into ints."""
    return [int(d) for d in dim_string.split(',')]


def _order_layers(net):
    """Walk ``net`` from its data layers and name the blobs between layers.

    Returns:
        ``(order, blob_names)`` where ``order`` lists layer ids so that every
        layer comes after all of its inputs, and ``blob_names[layer_id]`` is a
        dict with the 'bottom' and 'top' blob-name lists of that layer.
    """
    pending = []
    processed = {}
    order = []
    blob_names = {}
    for layer_id in net:
        processed[layer_id] = False
        blob_names[layer_id] = {'bottom': [], 'top': []}

    def is_ready(layer_id):
        return all(processed[input_id]
                   for input_id in net[layer_id]['connection']['input'])

    # The traversal starts from the data-producing layers.
    for layer_id in net:
        if net[layer_id]['info']['type'] in ('Data', 'Input', 'HDF5Data'):
            pending.append(layer_id)

    blob_id = 0
    while pending:
        # Take the most recently queued layer whose inputs are all done.
        i = len(pending) - 1
        while not is_ready(pending[i]):
            i -= 1
        layer_id = pending.pop(i)

        inputs = net[layer_id]['connection']['input']
        if len(inputs) > 0:
            # Two phase-specific inputs share one blob name so that the
            # consumer sees the same bottom in the train and the test net.
            # NOTE(review): the second check is a truthiness test, so a
            # numeric phase 0 does not satisfy it - confirm this is intended.
            if (len(inputs) == 2 and
                    net[inputs[0]]['info']['phase'] is not None and
                    net[inputs[1]]['info']['phase']):
                shared = blob_names[inputs[0]]['top']
                blob_names[inputs[1]]['top'] = shared
                blob_names[layer_id]['bottom'] = shared
            else:
                names = []
                for input_id in inputs:
                    names.extend(blob_names[input_id]['top'])
                blob_names[layer_id]['bottom'] = names

        blob_names[layer_id]['top'] = ['blob' + str(blob_id)]
        blob_id += 1

        for output_id in net[layer_id]['connection']['output']:
            if output_id not in pending:
                pending.append(output_id)

        processed[layer_id] = True
        order.append(layer_id)
    return order, blob_names


def _merge_train_test(train, test):
    """Merge the test prototxt text into the train prototxt text.

    Layer texts that occur verbatim in both are kept once; layers that only
    exist in ``test`` are inserted in front of the next shared layer, or
    appended at the end when no shared layer follows.
    """
    starts = [m.start() for m in re.finditer('layer', test)]
    last_shared = -1
    for i, start in enumerate(starts):
        if i < len(starts) - 1:
            layer_text = test[start:starts[i + 1]]
        else:
            layer_text = test[start:]
        at = train.find(layer_text)
        if at != -1:
            test_only = test[starts[last_shared + 1]:start]
            train = train[:at] + test_only + train[at:]
            last_shared = i
    if last_shared < len(starts) - 1:
        train = train + test[starts[last_shared + 1]:]
    return train


def jsonToPrototxt(net, net_name):
    """Convert a JSON-style network description into a train/test prototxt.

    Notes carried over from the original implementation (they may be
    outdated): the label blob is hardcoded, layer names have to be unique,
    and the network is traversed with a custom DFS.

    Args:
        net: dict mapping layer id to a dict with the keys 'info' (with 'type'
            and 'phase'), 'params' and 'connection' (with 'input' and
            'output' lists of layer ids). A missing 'dim' entry of a
            Data/Input layer is filled in with a default, i.e. ``net`` is
            modified in that case.
        net_name: written as the ``name:`` of the generated prototxt.

    Returns:
        ``(prototxt, input_dim)``: the merged train/test prototxt text and the
        'dim' string of the last processed Data/Input layer (``None`` when
        there is none). Layers of an unrecognised type are skipped.
    """
    # collections.Iterable was removed in Python 3.10; collections.abc does
    # not exist on Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    input_dim = None
    processOrder, blobNames = _order_layers(net)

    ns_train = caffe.NetSpec()
    ns_test = caffe.NetSpec()

    def get_iterable(x):
        return x if isinstance(x, Iterable) else (x, )

    def spec_for_phase(phase):
        # Phase 0 is stored in the train net, phase 1 in the test net.
        phase = int(phase)
        if phase == 0:
            return ns_train
        if phase == 1:
            return ns_test
        return None

    def store(ns, layerId, produced):
        # Register the produced top(s) under this layer's top blob names.
        for key, value in zip(blobNames[layerId]['top'],
                              get_iterable(produced)):
            ns[key] = value

    def emit(ns, layerId, layer_fn, **kwargs):
        # Create the layer on top of this net's own bottom blobs.
        bottoms = [ns[name] for name in blobNames[layerId]['bottom']]
        store(ns, layerId, layer_fn(*bottoms, **kwargs))

    def emit_both(layerId, layer_fn, **kwargs):
        for ns in (ns_train, ns_test):
            emit(ns, layerId, layer_fn, **kwargs)

    conv_int_keys = ('kernel_h', 'kernel_w', 'stride_h', 'stride_w',
                     'num_output', 'pad_h', 'pad_w')
    pool_int_keys = ('kernel_h', 'kernel_w', 'stride_h', 'stride_w',
                     'pad_h', 'pad_w')
    pool_methods = {'MAX': 0, 'AVE': 1, 'STOCHASTIC': 2}
    eltwise_ops = {'PROD': 0, 'SUM': 1, 'MAX': 2}

    for layerId in processOrder:
        layer = net[layerId]
        layerParams = layer['params']
        layerType = layer['info']['type']
        layerPhase = layer['info']['phase']

        if layerType in ('Data', 'Input'):
            # Temporary behaviour: a Data layer is emitted as an Input layer
            # with default dimensions (real L.Data export is not implemented).
            if 'dim' not in layerParams:
                layerParams['dim'] = '10,3,224,224'
            input_dim = layerParams['dim']
            input_param = {'shape': {'dim': _parse_dims(layerParams['dim'])}}
            if layerPhase is not None:
                target = spec_for_phase(layerPhase)
                if target is not None:
                    store(target, layerId,
                          L.Input(input_param=input_param,
                                  include={'phase': int(layerPhase)}))
            else:
                for ns in (ns_train, ns_test):
                    store(ns, layerId, L.Input(input_param=input_param))

        elif layerType == 'Crop':
            crop_param = {}
            _copy_int_params(crop_param, layerParams, ('axis', 'offset'))
            emit_both(layerId, L.Crop, crop_param=crop_param)

        elif layerType in ('Convolution', 'Deconvolution'):
            convolution_param = {}
            _copy_int_params(convolution_param, layerParams, conv_int_keys)
            _copy_fillers(convolution_param, layerParams)
            if layerType == 'Convolution':
                layer_fn = L.Convolution
            else:
                layer_fn = L.Deconvolution
            emit_both(layerId, layer_fn,
                      convolution_param=convolution_param,
                      param=[{'lr_mult': 1}, {'lr_mult': 2}])

        elif layerType == 'ReLU':
            emit_both(layerId, L.ReLU, in_place=layerParams['inplace'])

        elif layerType == 'Pooling':
            pooling_param = {}
            _copy_int_params(pooling_param, layerParams, pool_int_keys)
            if layerParams['pool'] != '':
                pool = layerParams['pool']
                # Unknown method names are passed through unchanged.
                pooling_param['pool'] = pool_methods.get(pool, pool)
            emit_both(layerId, L.Pooling, pooling_param=pooling_param)

        elif layerType == 'InnerProduct':
            inner_product_param = {}
            _copy_int_params(inner_product_param, layerParams,
                             ('num_output', ))
            _copy_fillers(inner_product_param, layerParams)
            emit_both(layerId, L.InnerProduct,
                      inner_product_param=inner_product_param,
                      param=[{'lr_mult': 1}, {'lr_mult': 2}])

        elif layerType == 'SoftmaxWithLoss':
            emit_both(layerId, L.SoftmaxWithLoss)

        elif layerType == 'Accuracy':
            if layerPhase is not None:
                # Wire the layer to the blobs of the net it is stored in.
                # (It used to read its bottoms from whichever net an earlier
                # layer's loop had left behind.)
                target = spec_for_phase(layerPhase)
                if target is not None:
                    emit(target, layerId, L.Accuracy,
                         include={'phase': int(layerPhase)})
            else:
                emit_both(layerId, L.Accuracy)

        elif layerType == 'Dropout':
            # inplace dropout? caffe-tensorflow do not work
            emit_both(layerId, L.Dropout, in_place=layerParams['inplace'])

        elif layerType == 'LRN':
            emit_both(layerId, L.LRN)

        elif layerType == 'Concat':
            emit_both(layerId, L.Concat,
                      ntop=len(blobNames[layerId]['top']))

        elif layerType == 'Eltwise':
            eltwise_param = {}
            if layerParams['operation'] != '':
                # Default is sum (1) for unrecognised operation names.
                eltwise_param['operation'] = eltwise_ops.get(
                    layerParams['operation'], 1)
            emit_both(layerId, L.Eltwise, eltwise_param=eltwise_param)

        elif layerType == 'Softmax':
            emit_both(layerId, L.Softmax)

        elif layerType == 'Embed':
            emit_both(layerId, L.Embed,
                      param=[{'lr_mult': 1, 'decay_mult': 1},
                             {'lr_mult': 2, 'decay_mult': 0}])

        elif layerType == 'LSTM':
            recurrent_param = {}
            if layerParams['num_output'] != '':
                recurrent_param['num_output'] = int(
                    layerParams['num_output'])
            _copy_fillers(recurrent_param, layerParams)
            emit_both(layerId, L.LSTM, recurrent_param=recurrent_param)

        elif layerType == 'Reshape':
            reshape_param = {
                'shape': {'dim': _parse_dims(layerParams['dim'])}
            }
            emit_both(layerId, L.Reshape, reshape_param=reshape_param)

        elif layerType == 'HDF5Data':
            hdf5_data_param = {}
            if layerParams['source'] != '':
                hdf5_data_param['source'] = layerParams['source']
            if layerParams['batch_size'] != '':
                hdf5_data_param['batch_size'] = layerParams['batch_size']
            if layerPhase is not None:
                emit_both(layerId, L.HDF5Data,
                          hdf5_data_param=hdf5_data_param,
                          include={'phase': int(layerPhase)})
            else:
                emit_both(layerId, L.HDF5Data,
                          hdf5_data_param=hdf5_data_param)

        elif layerType == 'BatchNorm':
            batch_norm_param = {}
            if layerParams['use_global_stats'] != '':
                batch_norm_param['use_global_stats'] = layerParams[
                    'use_global_stats']
            emit_both(layerId, L.BatchNorm,
                      batch_norm_param=batch_norm_param)

        elif layerType == 'Scale':
            scale_param = {}
            if layerParams['bias_term'] != '':
                scale_param['bias_term'] = layerParams['bias_term']
            emit_both(layerId, L.Scale, scale_param=scale_param)

    train = 'name: "' + net_name + '"\n' + str(ns_train.to_proto())
    test = str(ns_test.to_proto())

    # merge the train and test prototxt to get a single train_test prototxt
    prototxt = _merge_train_test(train, test)
    return prototxt, input_dim
def generate_model(split, config):
    """Build the image + language scoring net and return its proto.

    A VGG-16 style convolutional stack over ``image`` is followed by fully
    convolutional fc6-fc8 layers. The LSTM output of the last time step is
    tiled over the feature map, L2-normalised together with the image
    features, concatenated with ``spatial`` and classified by a 1x1-conv MLP
    trained with a sigmoid cross-entropy loss.

    Args:
        split: passed to the Python data layer through its ``param_str``.
        config: object providing the attributes read below (``N``,
            ``data_provider``, ``data_provider_layer``, ``fix_vgg``,
            ``vgg_dropout``, ``vocab_size``, ``embed_dim``, ``lstm_dim``,
            ``T``, ``featmap_H``, ``featmap_W``, ``mlp_hidden_dims``,
            ``mlp_dropout``).

    Returns:
        The result of ``NetSpec.to_proto()`` for the assembled net.
    """
    net = caffe.NetSpec()
    batch_size = config.N
    mode_str = str(dict(split=split, batch_size=batch_size))
    (net.language, net.cont, net.image,
     net.spatial, net.label) = L.Python(module=config.data_provider,
                                        layer=config.data_provider_layer,
                                        param_str=mode_str,
                                        ntop=5)

    # the base net (VGG-16): (output channels, number of conv layers) per
    # stage, each stage closed by a max pooling layer.
    vgg_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))
    vgg_args = dict(fix_param=config.fix_vgg,
                    finetune=(not config.fix_vgg))
    bottom = net.image
    for stage, (channels, depth) in enumerate(vgg_stages, 1):
        for index in range(1, depth + 1):
            conv_top, relu_top = conv_relu(bottom, channels, **vgg_args)
            setattr(net, 'conv%d_%d' % (stage, index), conv_top)
            setattr(net, 'relu%d_%d' % (stage, index), relu_top)
            bottom = relu_top
        pooled = max_pool(bottom)
        setattr(net, 'pool%d' % stage, pooled)
        bottom = pooled

    # fully conv
    net.fcn_fc6, net.fcn_relu6 = conv_relu(net.pool5, 4096, ks=7, pad=3)
    if config.vgg_dropout:
        net.fcn_drop6 = L.Dropout(net.fcn_relu6, dropout_ratio=0.5,
                                  in_place=True)
        net.fcn_fc7, net.fcn_relu7 = conv_relu(net.fcn_drop6, 4096,
                                               ks=1, pad=0)
        net.fcn_drop7 = L.Dropout(net.fcn_relu7, dropout_ratio=0.5,
                                  in_place=True)
        net.fcn_fc8 = conv(net.fcn_drop7, 1000, ks=1, pad=0)
    else:
        net.fcn_fc7, net.fcn_relu7 = conv_relu(net.fcn_relu6, 4096,
                                               ks=1, pad=0)
        net.fcn_fc8 = conv(net.fcn_relu7, 1000, ks=1, pad=0)

    # embedding
    uniform = dict(type='uniform', min=-0.08, max=0.08)
    net.embed = L.Embed(net.language,
                        input_dim=config.vocab_size,
                        num_output=config.embed_dim,
                        weight_filler=uniform)

    # LSTM
    net.lstm = L.LSTM(net.embed, net.cont,
                      recurrent_param=dict(
                          num_output=config.lstm_dim,
                          weight_filler=uniform,
                          bias_filler=dict(type='constant', value=0)))
    # Only the last of the config.T slices is used; the others are named
    # and silenced.
    tops = L.Slice(net.lstm, ntop=config.T, slice_param=dict(axis=0))
    for step in range(config.T - 1):
        setattr(net, 'slice' + str(step), tops[step])
        setattr(net, 'silence' + str(step),
                L.Silence(tops[step], ntop=0))
    net.lstm_out = tops[-1]
    net.lstm_feat = L.Reshape(
        net.lstm_out,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # Tile LSTM feature
    net.lstm_resh = L.Reshape(
        net.lstm_feat,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim, 1, 1])))
    net.lstm_tile_1 = L.Tile(net.lstm_resh, axis=2, tiles=config.featmap_H)
    net.lstm_tile_2 = L.Tile(net.lstm_tile_1, axis=3,
                             tiles=config.featmap_W)

    # L2 Normalize image and language features
    net.img_l2norm = L.L2Normalize(net.fcn_fc8)
    net.lstm_l2norm = L.L2Normalize(net.lstm_tile_2)

    # Concatenate
    net.feat_all = L.Concat(net.lstm_l2norm, net.img_l2norm, net.spatial,
                            concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    net.fcn_l1, net.fcn_relu1 = conv_relu(net.feat_all,
                                          config.mlp_hidden_dims,
                                          ks=1, pad=0)
    if config.mlp_dropout:
        net.fcn_drop1 = L.Dropout(net.fcn_relu1, dropout_ratio=0.5,
                                  in_place=True)
        net.fcn_scores = conv(net.fcn_drop1, 1, ks=1, pad=0)
    else:
        net.fcn_scores = conv(net.fcn_relu1, 1, ks=1, pad=0)

    # Loss Layer
    net.loss = L.SigmoidCrossEntropyLoss(net.fcn_scores, net.label)
    return net.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-LSTM / compact-bilinear attention VQA net as a proto.

    Pipeline, as wired below:
      * a Python data layer with five tops (data, cont, img_feature, label,
        glove);
      * a learned 300-d embedding (after TanH) concatenated with the ``glove``
        top along axis 2;
      * two stacked 1024-d LSTMs; the last time step of each is kept and the
        two are concatenated into ``lstm_12``;
      * compact bilinear pooling of the tiled question vector with the image
        feature, then a two-map soft attention over a 14x14 grid;
      * a second compact bilinear pooling of the attended feature with
        ``lstm_12`` feeding a 3000-way InnerProduct + SoftmaxWithLoss.

    NOTE(review): a later definition in this file is also named ``qlstm`` and
    would shadow this one at import time - confirm which is intended.

    Args:
        mode: Forwarded to the data layer inside the JSON ``param_str``.
        batchsize: Forwarded to the data layer and used as the first
            dimension of the DummyData blob passed to SoftAttention.
        T: Number of tops the LSTM outputs are sliced into along axis 0.
        question_vocab_size: ``input_dim`` of the Embed layer.

    Returns:
        The value of ``NetSpec.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    def _reshape(bottom, dims):
        # Reshape layer with the given target dimensions.
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def _lstm_1024(bottom):
        # 1024-unit LSTM driven by the shared sequence-continuation top.
        return L.LSTM(
            bottom, n.cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                bias_filler=dict(type='constant', value=0)))

    def _last_step(lstm_top, tag):
        # Slice `lstm_top` into T tops along axis 0, register and silence the
        # first T - 1 of them, and return the last one.
        tops = L.Slice(lstm_top, ntop=T, slice_param={'axis': 0})
        if T == 1:
            # With ntop=1 pycaffe's net_spec returns a bare Top instead of a
            # tuple, which would make the indexing below raise TypeError.
            tops = (tops,)
        # `range` (not the Python-2-only `xrange`) keeps this importable on
        # both Python 2 and Python 3.
        for i in range(T - 1):
            setattr(n, 'slice_' + tag + str(i), tops[i])
            setattr(n, 'silence_data_' + tag + str(i),
                    L.Silence(tops[i], ntop=0))
        return tops[T - 1]

    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = _lstm_1024(n.concat_embed)
    n.lstm1_out = _last_step(n.lstm1, 'first')
    n.lstm1_reshaped = _reshape(n.lstm1_out, [-1, 1024])
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = _lstm_1024(n.lstm1_droped)
    n.lstm2_out = _last_step(n.lstm2, 'second')
    n.lstm2_reshaped = _reshape(n.lstm2_out, [-1, 1024])
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question vector over the 14x14 grid and pool it with the image.
    n.q_emb_tanh_droped_resh = _reshape(n.lstm_12, [-1, 2048, 1, 1])
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(
        n.q_emb_tanh_droped_resh, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(
        n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = _reshape(n.img_feature, [-1, 2048, 14, 14])
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(
        n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0,
        weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(
        n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0,
        weight_filler=dict(type='xavier'))
    # Softmax is taken over the flattened 14 * 14 positions of each map.
    n.att_reshaped = _reshape(n.att_conv2, [-1, 2, 14 * 14])
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = _reshape(n.att_softmax, [-1, 2, 14, 14])
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    # Constant blob of ones passed as the third bottom of SoftAttention; it is
    # deliberately not registered on `n`, so it keeps an auto-generated name.
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = _reshape(n.att_feature0, [-1, 2048])
    n.att_feature1_resh = _reshape(n.att_feature1, [-1, 2048])
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = _reshape(n.att_feature, [-1, 4096, 1, 1])
    n.lstm_12_resh = _reshape(n.lstm_12, [-1, 2048, 1, 1])
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = _reshape(n.bc_dropped, [-1, 16000])

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the character-CNN / compact-bilinear attention VQA net as a proto.

    Pipeline, as wired below:
      * a Python data layer with four tops (data, cont, img_feature, label);
        no GloVe top is requested by this variant;
      * a 50-d embedding scaled by ``cont`` along axis 0 and reshaped to
        ``[batchsize, 1, T, 50]``;
      * five convolutions with three max-poolings over the sequence axis,
        reshaped to a 2048-channel question vector and passed through dropout;
      * compact bilinear pooling of the tiled question vector with the image
        feature, then a two-map soft attention over a 14x14 grid;
      * a second compact bilinear pooling of the attended feature with the
        question vector feeding a 3000-way InnerProduct + SoftmaxWithLoss.

    NOTE(review): an earlier definition in this file is also named ``qlstm``;
    this one would replace it at import time - confirm that is intended.

    Args:
        mode: Forwarded to the data layer inside the JSON ``param_str``.
        batchsize: Forwarded to the data layer; also the leading dimension of
            the embedding reshape and of the DummyData blob.
        T: Sequence-length dimension used in the embedding reshape. The shape
            comments below (and the 2048-channel reshape, 256 x 8) assume
            T == 100 - TODO confirm no other value is used.
        question_vocab_size: ``input_dim`` of the Embed layer.

    Returns:
        The value of ``NetSpec.to_proto()`` for the assembled net.
    """
    net = caffe.NetSpec()
    provider_args = json.dumps({'mode': mode, 'batchsize': batchsize})
    net.data, net.cont, net.img_feature, net.label = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=provider_args,
        ntop=4)

    def reshape_to(bottom, dims):
        # Reshape layer with the given target dimensions.
        return L.Reshape(bottom, reshape_param={'shape': {'dim': dims}})

    def conv_256(bottom, height, width=1):
        # Stride-1, 256-output convolution with a gaussian(0.05) filler.
        return L.Convolution(
            bottom,
            kernel_h=height,
            kernel_w=width,
            stride=1,
            num_output=256,
            weight_filler={'type': 'gaussian', 'std': 0.05})

    def halve(bottom):
        # 2x1 max-pooling with stride 2.
        return L.Pooling(bottom, kernel_h=2, kernel_w=1, stride=2,
                         pool=P.Pooling.MAX)

    def xavier_conv_1x1(bottom, channels):
        # 1x1 convolution with xavier-initialised weights.
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=channels, pad=0,
                             weight_filler={'type': 'xavier'})

    def bilinear_16k(first, second):
        # Compact bilinear pooling to 16000 outputs without sum pooling.
        return L.CompactBilinear(
            first, second,
            compact_bilinear_param={'num_output': 16000, 'sum_pool': False})

    # char embedding
    net.embed_ba = L.Embed(
        net.data,
        input_dim=question_vocab_size,
        num_output=50,
        weight_filler={'type': 'uniform', 'min': -0.08, 'max': 0.08})
    net.embed_scale = L.Scale(net.embed_ba, net.cont,
                              scale_param={'axis': 0})
    net.embed_scale_resh = reshape_to(net.embed_scale, [batchsize, 1, T, 50])

    # char deep convolution
    # N x 1 x 100 x 50 -> N x 256 x 96 x 1
    net.char_conv_1 = conv_256(net.embed_scale_resh, 5, 50)
    net.char_relu_1 = L.ReLU(net.char_conv_1)
    # N x 256 x 96 x 1 -> N x 256 x 48 x 1
    net.char_pool_1 = halve(net.char_relu_1)
    # N x 256 x 48 x 1 -> N x 256 x 44 x 1
    net.char_conv_2 = conv_256(net.char_pool_1, 5)
    net.char_relu_2 = L.ReLU(net.char_conv_2)
    # N x 256 x 44 x 1 -> N x 256 x 22 x 1
    net.char_pool_2 = halve(net.char_relu_2)
    # N x 256 x 22 x 1 -> N x 256 x 20 x 1
    net.char_conv_3 = conv_256(net.char_pool_2, 3)
    net.char_relu_3 = L.ReLU(net.char_conv_3)
    # N x 256 x 20 x 1 -> N x 256 x 18 x 1
    net.char_conv_4 = conv_256(net.char_relu_3, 3)
    net.char_relu_4 = L.ReLU(net.char_conv_4)
    # N x 256 x 18 x 1 -> N x 256 x 16 x 1
    net.char_conv_5 = conv_256(net.char_relu_4, 3)
    net.char_relu_5 = L.ReLU(net.char_conv_5)
    # N x 256 x 16 x 1 -> N x 256 x 8 x 1
    net.char_pool_3 = halve(net.char_relu_5)

    net.vec_reshape = reshape_to(net.char_pool_3, [-1, 2048, 1, 1])
    net.concat_vec_dropped = L.Dropout(net.vec_reshape,
                                       dropout_param={'dropout_ratio': 0.5})

    # Tile the question vector over the 14x14 grid and pool it with the image.
    net.q_emb_tanh_droped_resh_tiled_1 = L.Tile(net.concat_vec_dropped,
                                                axis=2, tiles=14)
    net.q_emb_tanh_droped_resh_tiled = L.Tile(
        net.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    net.i_emb_tanh_droped_resh = reshape_to(net.img_feature,
                                            [-1, 2048, 14, 14])
    net.blcf = bilinear_16k(net.q_emb_tanh_droped_resh_tiled,
                            net.i_emb_tanh_droped_resh)
    net.blcf_sign_sqrt = L.SignedSqrt(net.blcf)
    net.blcf_sign_sqrt_l2 = L.L2Normalize(net.blcf_sign_sqrt)
    net.blcf_droped = L.Dropout(net.blcf_sign_sqrt_l2,
                                dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    net.att_conv1 = xavier_conv_1x1(net.blcf_droped, 512)
    net.att_conv1_relu = L.ReLU(net.att_conv1)
    net.att_conv2 = xavier_conv_1x1(net.att_conv1_relu, 2)
    # Softmax is taken over the flattened 14 * 14 positions of each map.
    net.att_reshaped = reshape_to(net.att_conv2, [-1, 2, 14 * 14])
    net.att_softmax = L.Softmax(net.att_reshaped, axis=2)
    net.att = reshape_to(net.att_softmax, [-1, 2, 14, 14])
    net.att_map0, net.att_map1 = L.Slice(net.att, ntop=2,
                                         slice_param={'axis': 1})
    # Constant blob of ones passed as the third bottom of SoftAttention; it is
    # deliberately not registered on the NetSpec, so it keeps an
    # auto-generated name.
    ones = L.DummyData(shape={'dim': [batchsize, 1]},
                       data_filler={'type': 'constant', 'value': 1},
                       ntop=1)
    net.att_feature0 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map0, ones)
    net.att_feature1 = L.SoftAttention(net.i_emb_tanh_droped_resh,
                                       net.att_map1, ones)
    net.att_feature0_resh = reshape_to(net.att_feature0, [-1, 2048])
    net.att_feature1_resh = reshape_to(net.att_feature1, [-1, 2048])
    net.att_feature = L.Concat(net.att_feature0_resh, net.att_feature1_resh)

    # merge attention and the question vector with compact bilinear pooling
    net.att_feature_resh = reshape_to(net.att_feature, [-1, 4096, 1, 1])
    net.bc_att_lstm = bilinear_16k(net.att_feature_resh,
                                   net.concat_vec_dropped)
    net.bc_sign_sqrt = L.SignedSqrt(net.bc_att_lstm)
    net.bc_sign_sqrt_l2 = L.L2Normalize(net.bc_sign_sqrt)

    net.bc_dropped = L.Dropout(net.bc_sign_sqrt_l2,
                               dropout_param={'dropout_ratio': 0.1})
    net.bc_dropped_resh = reshape_to(net.bc_dropped, [-1, 16000])

    net.prediction = L.InnerProduct(net.bc_dropped_resh, num_output=3000,
                                    weight_filler={'type': 'xavier'})

    net.loss = L.SoftmaxWithLoss(net.prediction, net.label)
    return net.to_proto()