예제 #1
0
def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
    if mode == 'val':
        n.data, n.cont, n.img_feature, n.label = L.Python( \
            module='vqa_data_layer', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=4 )
    else:
        n.data, n.cont, n.img_feature, n.label = L.Python(\
            module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=4 ) 
    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
                         weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed) 

    # LSTM
    #n.lstm1 = L.LSTM(\
    #               n.embed_tanh, n.cont,\
    #               recurrent_param=dict(\
    #                   num_output=config.LSTM_UNIT_NUM,\
    #                   weight_filler=dict(type='xavier')))
    #tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    #for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
    #    n.__setattr__('slice_first'+str(i), tops1[int(i)])
    #    n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    #n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
    #n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
    #                      reshape_param=dict(\
    #                          shape=dict(dim=[-1,1024])))
    #n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1 = L.LSTM(\
                   n.embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=config.LSTM_UNIT_NUM,\
                       weight_filler=dict(type='xavier')))
    tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
        n.__setattr__('slice_first'+str(i), tops1[int(i)])
        n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})

    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=config.LSTM_UNIT_NUM, 
                       weight_filler=dict(type='xavier')))
    tops2 = L.Slice(n.lstm2, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
        n.__setattr__('slice_second'+str(i), tops2[int(i)])
        n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0))
    n.lstm2_out = tops2[config.MAX_WORDS_IN_QUESTION-1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.q_feat = L.Concat(*concat_botom) 
    '''
    Coarse Image-Question MFH fusion
    '''

    n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj, eltwise_param=dict(operation=0))
    n.mfb_iq_o2_drop = L.Dropout(n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o2_resh = L.Reshape(n.mfb_iq_o2_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o2_sumpool = L.Pooling(n.mfb_iq_o2_resh, pool=P.Pooling.SUM, \
                                      pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o2_out = L.Reshape(n.mfb_iq_o2_sumpool,\
                                    reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
    n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)

    n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj,n.mfb_iq_o2_drop, eltwise_param=dict(operation=0))
    n.mfb_iq_o3_drop = L.Dropout(n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o3_resh = L.Reshape(n.mfb_iq_o3_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o3_sumpool = L.Pooling(n.mfb_iq_o3_resh, pool=P.Pooling.SUM, \
                                      pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o3_out = L.Reshape(n.mfb_iq_o3_sumpool,\
                                    reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
    n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)

    n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2,n.mfb_o3_l2)

    n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))

    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) 
    return n.to_proto()
예제 #2
0
def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
    if mode == 'val':
        n.data, n.cont, n.img_feature, n.label, n.glove = L.Python( \
            module='vqa_data_layer_hdf5', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=5 )
    else:
        n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
            module='vqa_data_layer_kld_hdf5', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=5 ) 
    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
                         weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed) 
    concat_word_embed = [n.embed_tanh, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600

    # LSTM
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=config.LSTM_UNIT_NUM,\
                       weight_filler=dict(type='xavier')))
    n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1,2,0]))
    n.lstm1_resh2 = L.Reshape(n.lstm1_resh, \
            reshape_param=dict(shape=dict(dim=[0,0,0,1])))

    '''
    Question Attention
    '''
    n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0,
                                           weight_filler=dict(type='xavier'))
    n.qatt_relu = L.ReLU(n.qatt_conv1)
    n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1, num_output=config.NUM_QUESTION_GLIMPSE, pad=0,
                                           weight_filler=dict(type='xavier')) 
    n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_QUESTION_GLIMPSE,config.MAX_WORDS_IN_QUESTION,1]))) # N*NUM_QUESTION_GLIMPSE*15
    n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)

    qatt_maps = L.Slice(n.qatt_softmax,ntop=config.NUM_QUESTION_GLIMPSE,slice_param={'axis':1})
    dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    qatt_feature_list = []
    for i in xrange(config.NUM_QUESTION_GLIMPSE):
        if config.NUM_QUESTION_GLIMPSE == 1:
            n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps, dummy_lstm))
        else:
            n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps[i], dummy_lstm))    
        qatt_feature_list.append(n.__getattr__('qatt_feat%d'%i))
    n.qatt_feat_concat = L.Concat(*qatt_feature_list) 
    '''
    Image Attention with MFB
    '''
    n.q_feat_resh = L.Reshape(n.qatt_feat_concat,reshape_param=dict(shape=dict(dim=[0,-1,1,1])))
    n.i_feat_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[0,-1,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
    
    n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output = config.JOINT_EMB_SIZE, 
                                   weight_filler=dict(type='xavier'))
    n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,1,1])))  
    n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH)
    n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH)


    n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1, num_output=config.JOINT_EMB_SIZE, pad=0,
                                 weight_filler=dict(type='xavier')) 
    n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,
                                                                      config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
    n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0))
    n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,config.IMG_FEAT_SIZE,1])))
    n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2, permute_param=dict(order=[0,2,1,3]))
    n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(dim=[-1,config.IMG_FEAT_SIZE,
                                                                       config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh2, pool=P.Pooling.SUM, \
                              pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0,2,1,3]))
    
    n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
    n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)


    ## 2 conv layers 1000 -> 512 -> 2
    n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0, 
                                weight_filler=dict(type='xavier'))
    n.iatt_relu = L.ReLU(n.iatt_conv1)
    n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1, num_output=config.NUM_IMG_GLIMPSE, pad=0,
                                           weight_filler=dict(type='xavier')) 
    n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_SIZE])))
    n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
    n.iatt_softmax_resh = L.Reshape(n.iatt_softmax,reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
    iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,slice_param={'axis':1})
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    iatt_feature_list = []
    for i in xrange(config.NUM_IMG_GLIMPSE):
        if config.NUM_IMG_GLIMPSE == 1:
            n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps, dummy))
        else:
            n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps[i], dummy))
        n.__setattr__('iatt_feat%d_resh'%i, L.Reshape(n.__getattr__('iatt_feat%d'%i), \
                                reshape_param=dict(shape=dict(dim=[0,-1]))))
        iatt_feature_list.append(n.__getattr__('iatt_feat%d_resh'%i))
    n.iatt_feat_concat = L.Concat(*iatt_feature_list)
    n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1])))
    
    '''
    Fine-grained Image-Question MFB fusion
    '''

    n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE, 
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE, 
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0))
    n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \
                                      pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\
                                    reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
    n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt) 
    
    n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier')) 
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) 
    return n.to_proto()
예제 #3
0
파일: models.py 프로젝트: yuzcccc/DDPN
def net(split, vocab_size, opts):
    n = caffe.NetSpec()
    param_str = json.dumps({'split': split, 'batchsize': cfg.BATCHSIZE})
    n.qvec, n.cvec, n.img_feat, n.spt_feat, n.query_label, n.query_label_mask, n.query_bbox_targets, \
                n.query_bbox_inside_weights, n.query_bbox_outside_weights =  L.Python( \
                                        name='data', module='networks.data_layer', layer='DataProviderLayer', param_str=param_str, ntop=9 )

    n.embed_ba = L.Embed(n.qvec, input_dim=vocab_size, num_output=cfg.WORD_EMB_SIZE, \
                         weight_filler=dict(type='xavier'))
    n.embed = L.TanH(n.embed_ba)
    word_emb = n.embed

    # LSTM1
    n.lstm1 = L.LSTM(\
                   word_emb, n.cvec,\
                   recurrent_param=dict(\
                       num_output=cfg.RNN_DIM,\
                       weight_filler=dict(type='xavier')))

    tops1 = L.Slice(n.lstm1, ntop=cfg.QUERY_MAXLEN, slice_param={'axis': 0})
    for i in xrange(cfg.QUERY_MAXLEN - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[cfg.QUERY_MAXLEN - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, cfg.RNN_DIM])))
    n.lstm1_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': cfg.DROPOUT_RATIO})
    n.lstm_l2norm = L.L2Normalize(n.lstm1_droped)
    n.q_emb = L.Reshape(n.lstm_l2norm,
                        reshape_param=dict(shape=dict(dim=[0, -1])))
    q_layer = n.q_emb  # (N, 1024)

    v_layer = proc_img(n, n.img_feat, n.spt_feat)  #out: (N, 100, 2053)
    out_layer = concat(n, q_layer, v_layer)
    # predict score
    n.query_score_fc = L.InnerProduct(out_layer,
                                      num_output=1,
                                      weight_filler=dict(type='xavier'))
    n.query_score_pred = L.Reshape(
        n.query_score_fc,
        reshape_param=dict(shape=dict(dim=[-1, cfg.RPN_TOPN])))
    if cfg.USE_KLD:
        n.loss_query_score = L.SoftmaxKLDLoss(n.query_score_pred,
                                              n.query_label,
                                              n.query_label_mask,
                                              propagate_down=[1, 0, 0],
                                              loss_weight=1.0)
    else:
        n.loss_query_score = L.SoftmaxWithLoss(n.query_score_pred,
                                               n.query_label,
                                               n.query_label_mask,
                                               propagate_down=[1, 0, 0],
                                               loss_weight=1.0)

    # predict bbox
    n.query_bbox_pred = L.InnerProduct(out_layer,
                                       num_output=4,
                                       weight_filler=dict(type='xavier'))
    if cfg.USE_REG:
        n.loss_query_bbox = L.SmoothL1Loss( n.query_bbox_pred, n.query_bbox_targets, \
                                        n.query_bbox_inside_weights, n.query_bbox_outside_weights, loss_weight=1.0)
    else:
        n.__setattr__('silence_query_bbox_pred',
                      L.Silence(n.query_bbox_pred, ntop=0))
        n.__setattr__('silence_query_bbox_targets',
                      L.Silence(n.query_bbox_targets, ntop=0))
        n.__setattr__('silence_query_bbox_inside_weights',
                      L.Silence(n.query_bbox_inside_weights, ntop=0))
        n.__setattr__('silence_query_bbox_outside_weights',
                      L.Silence(n.query_bbox_outside_weights, ntop=0))
    return n.to_proto()