Example 1
def silent_net():
    n = caffe.NetSpec()
    n.data, n.data2 = L.DummyData(shape=[dict(dim=[3]), dict(dim=[4, 2])],
                                  ntop=2)
    n.silence_data = L.Silence(n.data, ntop=0)
    n.silence_data2 = L.Silence(n.data2, ntop=0)
    return n.to_proto()
Example 2
    def test_type_error(self):
        """Test that a TypeError is raised when a Function input isn't a Top."""
        data = L.DummyData(ntop=2)  # data is a 2-tuple of Tops
        r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$"
        with self.assertRaisesRegexp(TypeError, r):
            L.Silence(data, ntop=0)  # should raise: data is a tuple, not a Top
        L.Silence(*data, ntop=0)  # shouldn't raise: each elt of data is a Top
Example 3
    def silence(self, bottom):
        if isinstance(bottom, list):
            self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
                *bottom, ntop=0)
        else:
            self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
                bottom, ntop=0)
        self.silence_count += 1
Example 4
    def start(args):
        #data_shape = [args.depth, args.width, args.height]
        input_shape = [132, 132, 132]
        output_shape = [44, 44, 44]

        # Start a network
        net = caffe.NetSpec()

        # Data input layer
        #net.data = L.MemoryData(dim=[1, 1], ntop=1)
        net.data, net.datai = L.MemoryData(dim=[1, 1] + input_shape, ntop=2)

        # Label input layer
        net.label, net.labeli = L.MemoryData(dim=[1, 3] + output_shape,
                                             ntop=2,
                                             include=[dict(phase=0)])

        # Components label layer
        net.components, net.componentsi = L.MemoryData(
            dim=[1, 1] + output_shape,
            ntop=2,
            include=[dict(phase=0, stage='malis')])

        # Scale input layer
        net.scale, net.scalei = L.MemoryData(
            dim=[1, 3] + output_shape,
            ntop=2,
            include=[dict(phase=0, stage='euclid')])

        # Neighborhood input layer (malis stage)
        net.nhood, net.nhoodi = L.MemoryData(
            dim=[1, 1, 3, 3], ntop=2, include=[dict(phase=0, stage='malis')])

        # Silence the unneeded index tops of the data and label MemoryData layers
        net.silence1 = L.Silence(net.datai,
                                 net.labeli,
                                 net.scalei,
                                 ntop=0,
                                 include=[dict(phase=0, stage='euclid')])
        net.silence2 = L.Silence(net.datai,
                                 net.labeli,
                                 net.componentsi,
                                 net.nhoodi,
                                 ntop=0,
                                 include=[dict(phase=0, stage='malis')])
        net.silence3 = L.Silence(net.datai, ntop=0, include=[dict(phase=1)])

        return net
Example 5
    def get_phocnet(self, word_image_lmdb_path, phoc_lmdb_path,
                    phoc_size=604, generate_deploy=False):
        '''
        Returns a NetSpec definition of the PHOCNet. The definition can then be transformed
        into a protobuffer message by casting it into a str.
        '''
        n = NetSpec()
        # Data
        self.set_phocnet_data(n=n, generate_deploy=generate_deploy,
                              word_image_lmdb_path=word_image_lmdb_path,
                              phoc_lmdb_path=phoc_lmdb_path)

        # Conv Part
        self.set_phocnet_conv_body(n=n, relu_in_place=True)

        # FC Part
        n.spp5 = L.SPP(n.relu4_3, spp_param=dict(pool=P.SPP.MAX, pyramid_height=3, engine=self.spp_engine))
        n.fc6, n.relu6, n.drop6 = self.fc_relu(bottom=n.spp5, layer_size=4096,
                                               dropout_ratio=0.5, relu_in_place=True)
        n.fc7, n.relu7, n.drop7 = self.fc_relu(bottom=n.drop6, layer_size=4096,
                                               dropout_ratio=0.5, relu_in_place=True)
        n.fc8 = L.InnerProduct(n.drop7, num_output=phoc_size,
                               weight_filler=dict(type=self.initialization),
                               bias_filler=dict(type='constant'))
        n.sigmoid = L.Sigmoid(n.fc8, include=dict(phase=self.phase_test))

        # output part
        if not generate_deploy:
            n.silence = L.Silence(n.sigmoid, ntop=0, include=dict(phase=self.phase_test))
            n.loss = L.SigmoidCrossEntropyLoss(n.fc8, n.phocs)

        return n.to_proto()
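The docstring above notes that a NetSpec definition becomes a protobuf message once cast to a str. A minimal sketch of writing such a definition to disk and loading it back, using silent_net() from Example 1; the file name is illustrative and pycaffe is assumed to be importable:

import caffe

# Write the generated definition to a prototxt file (file name is illustrative).
with open('silent_net.prototxt', 'w') as f:
    f.write(str(silent_net()))

# The prototxt can then be instantiated as a Net for a given phase.
net = caffe.Net('silent_net.prototxt', caffe.TEST)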
Example 6
def minivggnet(data,
               labels=None,
               train=False,
               cudnn=False,
               param=learned_param,
               num_classes=100,
               with_labels=True):
    """
    Returns a protobuf text file specifying a variant of VGG
    """
    n = caffe.NetSpec()
    n.data = data
    conv_kwargs = dict(param=param, train=train, cudnn=cudnn)
    n.conv1, n.relu1 = conv_relu(n.data, 7, 96, stride=2, **conv_kwargs)
    n.norm1 = L.LRN(n.relu1, local_size=5, alpha=0.0005, beta=0.75, k=2)
    n.pool1 = max_pool(n.norm1, 3, stride=3, train=train, cudnn=cudnn)
    n.conv2, n.relu2 = conv_relu(n.pool1,
                                 5,
                                 256,
                                 pad=1,
                                 stride=2,
                                 group=2,
                                 **conv_kwargs)
    n.pool2 = max_pool(n.relu2, 2, stride=2, train=train, cudnn=cudnn)
    n.conv3, n.relu3 = conv_relu(n.pool2, 3, 512, pad=1, **conv_kwargs)
    n.conv4, n.relu4 = conv_relu(n.relu3,
                                 3,
                                 512,
                                 pad=1,
                                 group=2,
                                 **conv_kwargs)
    n.conv5, n.relu5 = conv_relu(n.relu4,
                                 3,
                                 512,
                                 pad=1,
                                 group=2,
                                 **conv_kwargs)
    n.pool5 = max_pool(n.relu5, 3, stride=3, train=train, cudnn=cudnn)
    n.fc6, n.relu6 = fc_relu(n.pool5, 1024, param=param)
    n.drop6 = L.Dropout(n.relu6, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 1024, param=param)
    n.drop7 = L.Dropout(n.relu7, in_place=True)
    preds = n.fc8 = L.InnerProduct(n.drop7,
                                   num_output=num_classes,
                                   param=param)
    if not train:
        # Compute the per-label probabilities at test/inference time.
        preds = n.probs = L.Softmax(n.fc8)
    if with_labels:
        n.label = labels
        n.loss = L.SoftmaxWithLoss(n.fc8, n.label)
        n.accuracy_at_1 = L.Accuracy(preds, n.label)
        n.accuracy_at_5 = L.Accuracy(preds,
                                     n.label,
                                     accuracy_param=dict(top_k=5))
    else:
        n.ignored_label = labels
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    return to_tempfile(str(n.to_proto()))
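minivggnet above (and minialexnet in Example 18) returns to_tempfile(str(n.to_proto())), a helper that is not shown in these examples. A plausible sketch of such a helper, assuming it only needs to persist the prototxt text and return the file path:

import tempfile

def to_tempfile(file_content):
    """Write file_content to a named temporary file and return its path (sketch)."""
    with tempfile.NamedTemporaryFile(mode='w', suffix='.prototxt', delete=False) as f:
        f.write(file_content)
        return f.name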
Example 7
def conv1_autoencoder(split, batch_sz):
    n = caffe.NetSpec()
    n.data, n.label = L.ImageData(image_data_param=dict(source=split,
                                                        batch_size=batch_sz,
                                                        new_height=height,
                                                        new_width=width,
                                                        is_color=False),
                                  ntop=2)
    n.silence = L.Silence(n.label, ntop=0)
    n.flatdata_i = L.Flatten(n.data)

    n.conv1 = conv(n.data, 5, 5, 64, pad=2)
    n.bn1 = L.BatchNorm(n.conv1,
                        use_global_stats=False,
                        in_place=True,
                        param=[{
                            "lr_mult": 0
                        }, {
                            "lr_mult": 0
                        }, {
                            "lr_mult": 0
                        }])
    n.scale1 = L.Scale(n.bn1, bias_term=True, in_place=True)
    n.relu1 = L.ReLU(n.scale1, relu_param=dict(negative_slope=0.1))
    n.pool1 = max_pool(n.relu1, 2, stride=2)

    n.code = conv(n.pool1, 5, 5, 64, pad=2)

    n.upsample1 = L.Deconvolution(n.code,
                                  param=dict(lr_mult=0, decay_mult=0),
                                  convolution_param=dict(
                                      group=64,
                                      num_output=64,
                                      kernel_size=4,
                                      stride=2,
                                      pad=1,
                                      bias_term=False,
                                      weight_filler=dict(type="bilinear")))
    n.deconv1 = conv(n.upsample1, 5, 5, 1, pad=2)
    n.debn1 = L.BatchNorm(n.deconv1,
                          use_global_stats=False,
                          in_place=True,
                          param=[{
                              "lr_mult": 0
                          }, {
                              "lr_mult": 0
                          }, {
                              "lr_mult": 0
                          }])
    n.descale1 = L.Scale(n.debn1, bias_term=True, in_place=True)
    n.derelu1 = L.ReLU(n.descale1, relu_param=dict(negative_slope=0.1))

    n.flatdata_o = L.Flatten(n.derelu1)
    n.loss_s = L.SigmoidCrossEntropyLoss(n.flatdata_o,
                                         n.flatdata_i,
                                         loss_weight=1)
    n.loss_e = L.EuclideanLoss(n.flatdata_o, n.flatdata_i, loss_weight=0)

    return str(n.to_proto())
Example 8
    def build_retrieval_model(self, param_str, save_tag):

        data = L.Python(module="data_processing",
                        layer=self.data_layer,
                        param_str=str(param_str),
                        ntop=self.top_size)
        for key, value in zip(self.params['top_names_dict'].keys(),
                              self.params['top_names_dict'].values()):
            setattr(self.n, key, data[value])

        im_model, lang_model = self.get_models()

        data_bottoms = []

        #bottoms which are always produced
        bottom_positive = data[self.top_name_dict['features_p']]
        query = data[self.top_name_dict['BoG']]
        p_time_stamp = data[self.top_name_dict['features_time_stamp_p']]
        n_time_stamp = data[self.top_name_dict['features_time_stamp_n']]
        if self.inter:
            bottom_inter = data[self.top_name_dict['features_inter']]
        if self.intra:
            bottom_intra = data[self.top_name_dict['features_intra']]

        bottom_positive = im_model(bottom_positive, p_time_stamp)
        if self.inter:
            bottom_inter = im_model(bottom_inter, p_time_stamp)
        if self.intra:
            bottom_intra = im_model(bottom_intra, n_time_stamp)
        if self.inter and not self.intra:
            self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
                n_time_stamp, ntop=0)
            self.silence_count += 1

        cont = data[self.top_name_dict['cont']]
        query = lang_model(query, cont)

        if not args.tall_loss:
            if self.inter:
                self.n.tops['ranking_loss_inter'] = self.ranking_loss(
                    bottom_positive, bottom_inter, query, lw=self.lw_inter)
            if self.intra:
                self.n.tops['ranking_loss_intra'] = self.ranking_loss(
                    bottom_positive, bottom_intra, query, lw=self.lw_intra)
        else:
            if self.inter:
                self.n.tops['tall_loss_inter'] = self.tall_loss(
                    bottom_positive, bottom_inter, query, lw=self.lw_inter)
            if self.intra:
                self.n.tops['tall_loss_intra'] = self.tall_loss(
                    bottom_positive, bottom_intra, query, lw=self.lw_intra)

        self.write_net(save_tag, self.n)
Example 9
def make_net_train(lmdb, preselection, batch_size=8, weights=[0, 0, 0.005, 0.01, 0.02, 0.08, 0.32]):

    net = caffe.NetSpec()

    net.img0, net.img1, net.flow_gt, net.aux = L.CustomData(
        data_param=dict(source=lmdb, preselection_file=preselection, backend=P.Data.LMDB, batch_size=batch_size,
                        preselection_label=1, rand_permute=True, rand_permute_seed=77, slice_point=[3, 6, 8],
                        encoding=[1, 1, 2, 3], verbose=True),
        ntop=4, include=dict(phase=0))

    net.img0_subtract = L.Eltwise(net.img0, eltwise_param=dict(operation=1,coeff=0.00392156862745))  
    net.img1_subtract = L.Eltwise(net.img1, eltwise_param=dict(operation=1,coeff=0.00392156862745))  

    net.img0_aug, net.img0_aug_params = augment_first_image(net.img0_subtract)

    aug_params      = generate_aug_params(net.img0_aug_params, net.img0_subtract, net.img0_aug)    
    net.img1_aug    = augment_second_image(net.img1_subtract, aug_params)

    net.flow_gt_aug     = L.FlowAugmentation(net.flow_gt, net.img0_aug_params, aug_params, augmentation_param=dict(crop_width=448, crop_height=320))
    net.scaled_flow_gt  = L.Eltwise(net.flow_gt_aug, eltwise_param=dict(operation=1,coeff=0.05))  

    net = make_pwc_net_encoder_plus(net, net.img0_aug, net.img1_aug) 
    
    for i in range(1, len(weights)):
        if weights[i] > 0.:
            scaled_flow_name  = 'scaled_flow_gt{}'.format(i)
            predict_flow_name = 'predict_flow{}'.format(i)
            loss_name         = 'loss{}'.format(i)
            setattr(net, scaled_flow_name, L.Downsample(net.scaled_flow_gt, getattr(net, predict_flow_name), propagate_down=[False, False]) )
            setattr(net, loss_name, L.L1Loss(getattr(net, predict_flow_name), getattr(net, scaled_flow_name), loss_weight=weights[i], l1_loss_param=dict(l2_per_location=True)))
    # loss at level 0: don't scale GT
    if weights[0] > 0.:
        net.loss0 = L.L1Loss(net.predict_flow0, net.scaled_flow_gt, loss_weight=weights[0] , l1_loss_param=dict(l2_per_location=True), propagate_down=[True, False])

    net.Silence0 = L.Silence(net.img0, ntop=0)
    net.Silence1 = L.Silence(net.img1, ntop=0)
    net.Silence2 = L.Silence(net.flow_gt, ntop=0)
    net.Silence3 = L.Silence(net.aux, ntop=0)
    # net.Silence4 = L.Silence(net.predict_flow2_scale, ntop=0)

    return net.to_proto()
Example 10
def add_cnn(n, data, act, batch_size, T, K, num_step, mode='train'):
    n.x_flat = L.Flatten(data, axis=1, end_axis=2)
    n.act_flat = L.Flatten(act, axis=1, end_axis=2)
    if mode == 'train':
        x = L.Slice(n.x_flat, axis=1, ntop=T)
        act_slice = L.Slice(n.act_flat, axis=1, ntop=T - 1)
        x_set = ()
        label_set = ()
        x_hat_set = ()
        silence_set = ()
        for i in range(T):
            t = tag(i + 1)
            n.tops['x' + t] = x[i]
            if i < K:
                x_set += (x[i], )
            if i < T - 1:
                n.tops['act' + t] = act_slice[i]
            if i < K - 1:
                silence_set += (n.tops['act' + t], )
            if i >= K:
                label_set += (x[i], )
        n.label = L.Concat(*label_set, axis=0)
        input_list = list(x_set)
        for step in range(0, num_step):
            step_tag = tag(step + 1) if step > 0 else ''
            t = tag(step + K)
            tp = tag(step + K + 1)
            input_tuple = tuple(input_list)
            n.tops['input' + step_tag] = L.Concat(*input_tuple, axis=1)
            top = add_conv_enc(n, n.tops['input' + step_tag], tag=step_tag)
            n.tops['x_hat' + tp] = add_decoder(n,
                                               top,
                                               n.tops['act' + t],
                                               flatten=False,
                                               tag=step_tag)
            input_list.pop(0)
            input_list.append(n.tops['x_hat' + tp])
    else:
        top = add_conv_enc(n, n.x_flat)
        n.tops['x_hat' + tag(K + 1)] = add_decoder(n,
                                                   top,
                                                   n.act_flat,
                                                   flatten=False)
    if mode == 'train':
        x_hat = ()
        for i in range(K, T):
            t = tag(i + 1)
            x_hat += (n.tops['x_hat' + t], )
        n.x_hat = L.Concat(*x_hat, axis=0)
        n.silence = L.Silence(*silence_set, ntop=0)
        n.l2_loss = L.EuclideanLoss(n.x_hat, n.label)
    return n
Example 11
def exp_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.exp_att_feature, n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2 = \
        L.Python(module='exp_data_provider_layer', layer='ExpDataProviderLayer', param_str=mode_str, ntop=5)

    n.exp_embed_ba = L.Embed(n.exp, input_dim=exp_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.exp_embed = L.TanH(n.exp_embed_ba)

    # LSTM1 for Explanation
    n.exp_lstm1 = L.LSTM(\
                   n.exp_embed, n.exp_cont_1,\
                   recurrent_param=dict(\
                       num_output=2048,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))

    n.exp_lstm1_dropped = L.Dropout(n.exp_lstm1,
                                    dropout_param={'dropout_ratio': 0.3})

    # Merge with LSTM1 for explanation
    n.exp_att_resh = L.Reshape(
        n.exp_att_feature, reshape_param=dict(shape=dict(dim=[1, -1, 2048])))
    n.exp_att_tiled = L.Tile(n.exp_att_resh, axis=0, tiles=exp_T)
    n.exp_eltwise_all = L.Eltwise(n.exp_lstm1_dropped,
                                  n.exp_att_tiled,
                                  eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_all_sqrt = L.SignedSqrt(n.exp_eltwise_all)
    n.exp_eltwise_all_l2 = L.L2Normalize(n.exp_eltwise_all_sqrt)
    n.exp_eltwise_all_drop = L.Dropout(n.exp_eltwise_all_l2,
                                       dropout_param={'dropout_ratio': 0.3})

    # LSTM2 for Explanation
    n.exp_lstm2 = L.LSTM(\
                   n.exp_eltwise_all_drop, n.exp_cont_2,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    n.exp_lstm2_dropped = L.Dropout(n.exp_lstm2,
                                    dropout_param={'dropout_ratio': 0.3})

    n.exp_prediction = L.InnerProduct(n.exp_lstm2_dropped,
                                      num_output=exp_vocab_size,
                                      weight_filler=dict(type='xavier'),
                                      axis=2)

    n.silence_exp_prediction = L.Silence(n.exp_prediction, ntop=0)

    return n.to_proto()
Example 12
def datalayer_test(imdb, batch_size=4):

    from caffe import layers as L, params as P, to_proto
    from caffe.proto import caffe_pb2

    w_filler_params = {'weight_filler': {'type': 'xavier'}}
    b_filler_params = {'bias_filler': {'type': 'constant', 'value': 0.01}}

    n = caffe.NetSpec()
    n.image, n.depth = L.Python(name='data_train', ntop=2, include={'phase':0}, \
        python_param={'module':'data_layer', 'layer':'EigenDataLayer', 'param_str': "{'data_type': 'train', 'year': '2012'}"})

    n.image, n.depth = L.Python(name='data_test', ntop=2, include={'phase':1}, \
        python_param={'module':'data_layer', 'layer':'EigenDataLayer', 'param_str': "{'data_type': 'test', 'year': '2012'}"})

    n.image_s = L.Silence(n.image, name='silence_image', ntop=0)
    n.depth_s = L.Silence(n.depth, name='silence_depth', ntop=0)

    # n.conv1_1 = L.Convolution( n.image, name='conv1_1', convolution_param={'num_output': 64, 'kernel_size': 3, 'pad': 0, 'weight_filler': {'type': 'xavier'}, 'bias_filler': {'type': 'constant', 'value': 0.01} } )
    # n.conv1_1 = L.ReLU( n.conv1_1 )

    # n.conv1_2 = L.Convolution( n.conv1_1, name='conv1_2', convolution_param={'num_output': 64, 'kernel_size': 3, 'pad': 0, 'weight_filler': {'type': 'xavier'}, 'bias_filler': {'type': 'constant', 'value': 0.01} } )
    # n.conv1_2 = L.ReLU( n.conv1_1 )
    # n.pool1_2 = L.Pooling

    # n.conv2 = L.Convolution( n.lrn1, name='conv2', convolution_param={'num_output': 64, 'kernel_size': 3, 'pad': 1, 'weight_filler': {'type': 'xavier'}, 'bias_filler': {'type': 'constant', 'value': 0.01} } )
    # n.conv2 = L.ReLU( n.conv2 )
    # n.lrn2 = L.LRN( n.conv2, name='lrn2', lrn_param={'local_size': 5, 'alpha': 0.0001, 'beta': 0.75} )

    # n.pred = L.Convolution( n.lrn2, name='conv3', convolution_param={'num_output': 1, 'kernel_size': 3, 'pad': 1, 'weight_filler': {'type': 'xavier'}, 'bias_filler': {'type': 'constant', 'value': 0.01} } )

    # n.lossSqrSum, n.lossSumSqr, n.lossSmooth = L.Python(n.pred, n.depth, name='loss', ntop=3, loss_weight=[1,1,1], \
    #             python_param={'module':'loss_layer', 'layer':'EigenLossLayer'})
    # n.loss = L.EuclideanLoss(n.pred, n.depth, ntop=1)

    # n.lrn1_2 = L.LRN( n.conv1_1, name='lrn1', lrn_param={'local_size': 5, 'alpha': 0.0001, 'beta': 0.75} )

    return n.to_proto()
Example 13
def define_network(args, imageFile, vidIds, radarFiles, training=False):
  net = caffe.NetSpec()
  
  # Setting up data layer
  transformParam = dict(mirror=training, mean_value = args.mean)
  pydataParams = dict(radar_files = radarFiles, videos = vidIds, batch_size = args.batchSize)
  
  net.data, net.label = L.ImageData(transform_param = transformParam, source=imageFile, shuffle=False, batch_size=args.batchSize, ntop=2)
  if args.expType != 'image':
    net.radar = L.Python(module='radarDataLayer', layer='RadarDataLayer', param_str=str(pydataParams), ntop=1)
 
  if args.expType == "joint" or args.expType == "image":
    net.conv1, net.relu1 = conv_relu(net.data, 11, 96, stride=4)
    net.pool1 = max_pool(net.relu1, 3, stride=2)
    net.norm1 = L.LRN(net.pool1, local_size=5, alpha=1e-4, beta=0.75)

    net.conv2, net.relu2 = conv_relu(net.norm1, 5, 256, pad=2, group=2)
    net.pool2 = max_pool(net.relu2, 3, stride=2)
    net.norm2 = L.LRN(net.pool2, local_size=5, alpha=1e-4, beta=0.75)

    net.conv3, net.relu3 = conv_relu(net.norm2, 3, 384, pad=1)
    net.conv4, net.relu4 = conv_relu(net.relu3, 3, 384, pad=1, group=2)
    net.conv5, net.relu5 = conv_relu(net.relu4, 3, 256, pad=1, group=2)
    net.pool5 = max_pool(net.relu5, 3, stride=2)

    net.fc6_new, net.relu6_new = fc_relu(net.pool5, 4096)
    net.drop6 = L.Dropout(net.relu6_new, in_place=True)

    net.fc7_new = L.InnerProduct(net.drop6, num_output=4096, param=learned_param, weight_filler=fc_filler)
    
    if args.expType == "joint":
      net.concat = L.Concat(net.fc7_new, net.radar)
      net.relu7 = L.ReLU(net.concat, in_place=True)
    else:
      net.relu7 = L.ReLU(net.fc7_new, in_place=True)

    net.drop7 = L.Dropout(net.relu7, in_place=True)
    net.final = L.InnerProduct(net.drop7, num_output=args.num_out, param=learned_param, weight_filler=fc_filler)

  elif args.expType == "radar":
    net.silence = L.Silence(net.data, ntop=0)
    net.fc7_new = L.InnerProduct(net.radar, num_output=1024, param=learned_param, weight_filler=fc_filler)
    net.relu7 = L.ReLU(net.fc7_new, in_place=True)
    net.drop7 = L.Dropout(net.relu7, in_place=True)
    net.final = L.InnerProduct(net.drop7, num_output=args.num_out, param=learned_param, weight_filler=fc_filler)

  net.loss = L.SoftmaxWithLoss(net.final, net.label)
  net.acc = L.Accuracy(net.final, net.label)
  return net.to_proto()
Example 14
def generate_scores(split, config):

    n = caffe.NetSpec()
    batch_size = config.N
    mode_str = str(dict(split=split, batch_size=batch_size))
    n.language, n.cont, n.img_feature, n.spatial, n.label = L.Python(module=config.data_provider,
                                                                     layer='TossLayer',
                                                                     param_str=mode_str,
                                                                     ntop=5)
    # embedding
    n.embed = L.Embed(n.language, input_dim=config.vocab_size,
                      num_output=config.embed_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM
    n.lstm = L.LSTM(n.embed, n.cont,
                    recurrent_param=dict(num_output=config.lstm_dim,
                                         weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                                         bias_filler=dict(type='constant', value=0)))
    tops = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for i in range(config.T - 1):
        n.__setattr__('slice'+str(i), tops[i])
        n.__setattr__('silence'+str(i), L.Silence(tops[i], ntop=0))
    n.lstm_out = tops[-1]
    n.lstm_feat = L.Reshape(n.lstm_out, reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.img_feature)
    n.lstm_l2norm = L.L2Normalize(n.lstm_feat)
    n.img_l2norm_resh = L.Reshape(n.img_l2norm,
                                  reshape_param=dict(shape=dict(dim=[-1, config.D_im])))
    n.lstm_l2norm_resh = L.Reshape(n.lstm_l2norm,
                                  reshape_param=dict(shape=dict(dim=[-1, config.D_text])))

    # Concatenate
    n.feat_all = L.Concat(n.lstm_l2norm_resh, n.img_l2norm_resh, n.spatial, concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    n.mlp_l1, n.mlp_relu1 = fc_relu(n.feat_all, config.mlp_hidden_dims)
    if config.mlp_dropout:
        n.mlp_drop1 = L.Dropout(n.mlp_relu1, dropout_ratio=0.5, in_place=True)
        n.scores = fc(n.mlp_drop1, 1)
    else:
        n.scores = fc(n.mlp_relu1, 1)

    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.scores, n.label)

    return n.to_proto()
Example 15
def build_test_train(n, top, train, with_labels, labels):
    """Take in current netspec and top, and adds final layers."""
    if train:
        preds = top
    else:
        # Compute the per-label probabilities at test/inference time.
        preds = n.probs = layers.Softmax(top)
    if with_labels:
        n.label = labels
        n.loss = layers.SoftmaxWithLoss(top, labels)
        n.accuracy_at_1 = layers.Accuracy(preds, labels)
        n.accuracy_at_5 = layers.Accuracy(preds,
                                          labels,
                                          accuracy_param=dict(top_k=5))
    else:
        n.ignored_label = labels
        n.silence_label = layers.Silence(n.ignored_label, ntop=0)
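A minimal sketch of how build_test_train might be invoked, assuming `layers` is the `caffe.layers` alias used above; the DummyData inputs and the fully connected layer are illustrative stand-ins for a real network body:

import caffe
from caffe import layers

# Illustrative inputs: a 4x10 feature blob and 4 labels from DummyData.
feat, labels = layers.DummyData(shape=[dict(dim=[4, 10]), dict(dim=[4])], ntop=2)

n = caffe.NetSpec()
n.fc = layers.InnerProduct(feat, num_output=10)
# Test-time netspec with labels: adds probs, loss and accuracy layers.
build_test_train(n, n.fc, train=False, with_labels=True, labels=labels)
print(n.to_proto())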
Example 16
    def build_retrieval_model(self, param_str, save_tag):

        # TODO: This would perhaps be cleaner if I did not co-sample inter/intra positives and negatives; I shouldn't have to do that and could then get rid of determining the top size...

        # Get all the tops from the data layer and give them sensible names.
        data = L.Python(module="data_processing",
                        layer=self.data_layer,
                        param_str=str(param_str),
                        ntop=self.top_size)
        for key, value in zip(self.params['top_names_dict'].keys(),
                              self.params['top_names_dict'].values()):
            setattr(self.n, key, data[value])

        im_model, lang_model = self.get_models()

        data_bottoms = []

        #bottoms which are always produced
        bottom_positive = data[self.top_name_dict['features_p']]
        query = data[self.top_name_dict['query']]
        p_time_stamp = data[self.top_name_dict['features_time_stamp_p']]
        n_time_stamp = data[self.top_name_dict['features_time_stamp_n']]
        if self.inter:
            bottom_inter = data[self.top_name_dict['features_inter']]
        if self.intra:
            bottom_intra = data[self.top_name_dict['features_intra']]

        bottom_positive = im_model(bottom_positive, p_time_stamp)
        if self.inter:
            bottom_inter = im_model(bottom_inter, p_time_stamp)
        if self.intra:
            bottom_intra = im_model(bottom_intra, n_time_stamp)
        if self.inter and not self.intra:
            self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
                n_time_stamp, ntop=0)
            self.silence_count += 1

        cont = data[self.top_name_dict['cont']]
        query = lang_model(query, cont)
        if self.inter:
            self.n.tops['ranking_loss_inter'] = self.ranking_loss(
                bottom_positive, bottom_inter, query, lw=self.lw_inter)
        if self.intra:
            self.n.tops['ranking_loss_intra'] = self.ranking_loss(
                bottom_positive, bottom_intra, query, lw=self.lw_intra)
        self.write_net(save_tag, self.n)
Example 17
    def language_model_lstm_no_embed(self,
                                     sent_bottom,
                                     cont_bottom,
                                     text_name='embedding_text',
                                     tag=''):

        lstm_lr = self.args.lstm_lr
        embedding_lr = self.args.language_embedding_lr

        lstm = L.LSTM(
            sent_bottom,
            cont_bottom,
            recurrent_param=dict(num_output=self.language_embedding_dim[0],
                                 weight_filler=self.uniform_weight_filler(
                                     -0.08, 0.08),
                                 bias_filler=self.constant_filler(0)),
            param=self.learning_params(
                [[lstm_lr, lstm_lr], [lstm_lr, lstm_lr], [lstm_lr, lstm_lr]],
                ['lstm1' + tag, 'lstm2' + tag, 'lstm3' + tag]))
        lstm_slices = L.Slice(lstm,
                              slice_point=self.params['sentence_length'] - 1,
                              axis=0,
                              ntop=2)
        self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
            lstm_slices[0], ntop=0)
        self.silence_count += 1
        top_lstm = L.Reshape(
            lstm_slices[1],
            shape=dict(dim=[-1, self.language_embedding_dim[0]]))
        top_text = L.InnerProduct(
            top_lstm,
            num_output=self.language_embedding_dim[1],
            weight_filler=self.uniform_weight_filler(-0.08, .08),
            bias_filler=self.constant_filler(0),
            param=self.learning_params(
                [[embedding_lr, embedding_lr], [embedding_lr * 2, 0]],
                ['lstm_embed1' + tag, 'lstm_embed_1b' + tag]))

        setattr(self.n, text_name, top_text)
        return top_text
Example 18
def minialexnet(data, labels=None, train=False, param=learned_param,
                num_classes=100, with_labels=True):
    """
    Returns a protobuf text file specifying a variant of AlexNet, following the
    original specification (<caffe>/models/bvlc_alexnet/train_val.prototxt).
    The changes with respect to the original AlexNet are:
        - LRN (local response normalization) layers are not included
        - The Fully Connected (FC) layers (fc6 and fc7) have smaller dimensions
          due to the lower resolution of mini-places images (128x128) compared
          with ImageNet images (usually resized to 256x256)
    """
    n = caffe.NetSpec()
    n.data = data
    conv_kwargs = dict(param=param, train=train)
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, **conv_kwargs)
    n.pool1 = max_pool(n.relu1, 3, stride=2, train=train)
    n.conv2, n.relu2 = conv_relu(n.pool1, 5, 256, pad=2, group=2, **conv_kwargs)
    n.pool2 = max_pool(n.relu2, 3, stride=2, train=train)
    n.conv3, n.relu3 = conv_relu(n.pool2, 3, 384, pad=1, **conv_kwargs)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2, **conv_kwargs)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2, **conv_kwargs)
    n.pool5 = max_pool(n.relu5, 3, stride=2, train=train)
    n.fc6, n.relu6 = fc_relu(n.pool5, 1024, param=param)
    n.drop6 = L.Dropout(n.relu6, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 1024, param=param)
    n.drop7 = L.Dropout(n.relu7, in_place=True)
    preds = n.fc8 = L.InnerProduct(n.drop7, num_output=num_classes, param=param)
    if not train:
        # Compute the per-label probabilities at test/inference time.
        preds = n.probs = L.Softmax(n.fc8)
    if with_labels:
        n.label = labels
        n.loss = L.SoftmaxWithLoss(n.fc8, n.label)
        n.accuracy_at_1 = L.Accuracy(preds, n.label)
        n.accuracy_at_5 = L.Accuracy(preds, n.label,
                                     accuracy_param=dict(top_k=5))
    else:
        n.ignored_label = labels
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    return to_tempfile(str(n.to_proto()))
Example 19
def qlstm(mode, batchsize, T, question_vocab_size):

    # Used to build the network directly, without a hand-written prototxt
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Format expected for a custom Python layer:
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe
    # A Layer is created from the given class, and once the returned
    # variables are filled in it runs automatically.
    # The actual data batch loading happens inside the class created here.

    # Glove = Global vectors for word representation
    # https://www.aclweb.org/anthology/D14-1162
    # Pretrained GloVe vectors are used in the Concat below.

    # img_feature is a feature vector that has already been preprocessed:
    # passed through Resnet512 and L2-normalized.

    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    # module = name of the Python file
    # layer = Python class that follows the layer interface
    # param_str = parameters used when loading the data, as JSON; stored
    #             inside the class as self.param_str = mode_str
    # ntop = number of top blobs for setup / forward / backward

    # A textual Embed maps text to numbers:
    # the 3000-entry vocabulary is represented compactly
    # with 300-dimensional vectors.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    # Apply TanH
    n.embed = L.TanH(n.embed_ba)
    # Concat with the GloVe data
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})

    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec
    # give top2[~] the name specified by argument `slice_second`
    # (assigns names to the slice variables)
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))

    # Use the last LSTM output.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # lstm1 output => reshape to 1024, then dropout
    # lstm2 output => reshape to 1024, then dropout
    # then concat

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # L.Tile does not match dimensions automatically, so tile explicitly: 2048,1 (tiles=14) => 2048,14
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)

    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))

    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Dropout added, unlike the figure in the paper
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output dim is 2
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    # Generate the attention maps with a softmax;
    # two 14x14 softmax maps are produced

    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)
    # Multiply by each attention map, then concat the results.

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    # Then reshape to 4096

    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))

    # Unlike the paper, the two input vector sizes differ:
    # paper: 2048 x 2048
    # code:  4096 x 2048
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    #SignedSqrt
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    #L2_Normalize
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    #Dropout
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    #FullyConnected
    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))

    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
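The comments at the top of this example describe the contract of such a Python data layer (module = file name, layer = class name, param_str passed through as JSON, ntop = number of tops). A minimal sketch of a provider following that interface, assuming the standard caffe.Layer base class; the class name and blob shapes are illustrative and not the real VQADataProviderLayer:

import json
import numpy as np
import caffe

class ToyDataProviderLayer(caffe.Layer):
    """Illustrative data provider following the interface described above."""

    def setup(self, bottom, top):
        params = json.loads(self.param_str)  # e.g. {'mode': ..., 'batchsize': ...}
        self.batchsize = params['batchsize']

    def reshape(self, bottom, top):
        # One entry per top requested via ntop in L.Python(...); shape is illustrative.
        top[0].reshape(self.batchsize, 300)

    def forward(self, bottom, top):
        top[0].data[...] = np.random.rand(self.batchsize, 300)

    def backward(self, top, propagate_down, bottom):
        pass  # data layers have nothing to backpropagate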
Example 20
def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.concat_embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_first' + str(i), tops1[int(i)])
        n.__setattr__('silence_data_first' + str(i),
                      L.Silence(tops1[int(i)], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in xrange(T - 1):
        n.__setattr__('slice_second' + str(i), tops2[int(i)])
        n.__setattr__('silence_data_second' + str(i),
                      L.Silence(tops2[int(i)], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2,
                                              tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3,
                                            tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled,
                               n.i_emb_tanh_droped_resh,
                               compact_bilinear_param=dict(num_output=16000,
                                                           sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=2,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax,
                      reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0,
                                     dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1,
                                     dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh,
                                      n.lstm_12_resh,
                                      compact_bilinear_param=dict(
                                          num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh,
                                  num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
Example 21
def caffenet(netmode):
    # Start Caffe proto net
    net = caffe.NetSpec()
    # Specify input data structures

    if netmode == caffe_pb2.TEST:
        if netconf.loss_function == 'malis':
            fmaps_end = 11

        if netconf.loss_function == 'euclid':
            fmaps_end = 11

        if netconf.loss_function == 'softmax':
            fmaps_end = 2

        net.data, net.datai = data_layer([1, 1, 44, 132, 132])
        net.silence = L.Silence(net.datai, ntop=0)

        # Shape specs (one entry per element of run_shape):
        # 0.    Convolution buffer size
        # 1.    Weight memory size
        # 2.    Num. channels
        # 3.    [d] parameter running value
        # 4.    [w] parameter running value
        run_shape_in = [[0, 0, 1, [1, 1, 1], [44, 132, 132]]]
        run_shape_out = run_shape_in

        last_blob = implement_usknet(net, run_shape_out, 64, fmaps_end)

        # Implement the prediction layer
        if netconf.loss_function == 'malis':
            net.prob = L.Sigmoid(last_blob, ntop=1)

        if netconf.loss_function == 'euclid':
            net.prob = L.Sigmoid(last_blob, ntop=1)

        if netconf.loss_function == 'softmax':
            net.prob = L.Softmax(last_blob, ntop=1)

        for i in range(0, len(run_shape_out)):
            print(run_shape_out[i])

        print("Max. memory requirements: %s B" %
              (compute_memory_buffers(run_shape_out) +
               compute_memory_weights(run_shape_out) +
               compute_memory_blobs(run_shape_out)))
        print("Weight memory: %s B" % compute_memory_weights(run_shape_out))
        print("Max. conv buffer: %s B" % compute_memory_buffers(run_shape_out))

    else:
        if netconf.loss_function == 'malis':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            net.label, net.labeli = data_layer([1, 1, 16, 44, 44])
            net.label_affinity, net.label_affinityi = data_layer(
                [1, 11, 16, 44, 44])
            net.affinity_edges, net.affinity_edgesi = data_layer([1, 1, 11, 3])
            net.silence = L.Silence(net.datai,
                                    net.labeli,
                                    net.label_affinityi,
                                    net.affinity_edgesi,
                                    ntop=0)
            fmaps_end = 11

        if netconf.loss_function == 'euclid':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            net.label, net.labeli = data_layer([1, 11, 16, 44, 44])
            net.scale, net.scalei = data_layer([1, 11, 16, 44, 44])
            net.silence = L.Silence(net.datai, net.labeli, net.scalei, ntop=0)
            fmaps_end = 11

        if netconf.loss_function == 'softmax':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            # Currently only supports binary classification
            net.label, net.labeli = data_layer([1, 1, 16, 44, 44])
            net.silence = L.Silence(net.datai, net.labeli, ntop=0)
            fmaps_end = 2

        run_shape_in = [[0, 1, 1, [1, 1, 1], [44, 132, 132]]]
        run_shape_out = run_shape_in

        # Start the actual network
        last_blob = implement_usknet(net, run_shape_out, 64, fmaps_end)

        for i in range(0, len(run_shape_out)):
            print(run_shape_out[i])

        print("Max. memory requirements: %s B" %
              (compute_memory_buffers(run_shape_out) +
               compute_memory_weights(run_shape_out) +
               2 * compute_memory_blobs(run_shape_out)))
        print("Weight memory: %s B" % compute_memory_weights(run_shape_out))
        print("Max. conv buffer: %s B" % compute_memory_buffers(run_shape_out))

        # Implement the loss
        if netconf.loss_function == 'malis':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.MalisLoss(last_blob,
                                   net.label_affinity,
                                   net.label,
                                   net.affinity_edges,
                                   ntop=0)

        if netconf.loss_function == 'euclid':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.EuclideanLoss(last_blob, net.label, net.scale, ntop=0)

        if netconf.loss_function == 'softmax':
            net.loss = L.SoftmaxWithLoss(last_blob, net.label, ntop=0)

    # Return the protocol buffer of the generated network
    return net.to_proto()
Example 22
    def resnet_mask_rcnn_mask_rcnn(self, stage=1):
        channals = self.channals
        if not self.deploy:
            data, rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights, mask_rois, masks = \
                self.data_layer_train_with_ins(with_rpn=False)
            im_info = None
        else:
            data, im_info = self.data_layer_test()
        gt_boxes = None
        if stage == 1:
            pre_traned_fixed = False
        else:
            pre_traned_fixed = True
        conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3, bias_term=True, fixed=pre_traned_fixed)
        pool1 = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
        index = 1
        out = pool1
        if self.module == "normal":
            residual_block = self.residual_block
        else:
            residual_block = self.residual_block_basic

        for i in self.stages[:-1]:
            index += 1
            for j in range(i):
                if j == 0:
                    if index == 2:
                        stride = 1
                    else:
                        stride = 2
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride, fixed=pre_traned_fixed)
                else:
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, fixed=pre_traned_fixed)
            channals *= 2

        if not self.deploy:
            rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data, fixed=True)
            self.net["silence_rpn_cls_score_reshape"] = L.Silence(rpn_cls_score_reshape, ntop=0)
            self.net["silence_rpn_bbox_pred"] = L.Silence(rpn_bbox_pred, ntop=0)
        else:
            rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data)
            rois, scores = self.roi_proposals(rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)

        feat_out = out

        if not self.deploy:
            self.net["rois_cat"] = L.Concat(rois, mask_rois, name="rois_cat", axis=0)
            rois=self.net["rois_cat"]

        feat_aligned = self.roi_align("det_mask", feat_out, rois)
        # if not self.deploy:
        #     self.net["silence_mask_rois"] = L.Silence(mask_rois, ntop=0)
        # if not self.deploy:
        #     mask_feat_aligned = self.roi_align("mask", feat_out, mask_rois)
        # else:
        #     mask_feat_aligned = self.roi_align("mask", feat_out, rois)
        out = feat_aligned

        index += 1
        for j in range(self.stages[-1]):
            if j == 0:
                stride = 1
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride)
            else:
                out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals)

        if not self.deploy:
            self.net["det_feat"], self.net["mask_feat"] = L.Slice(out, ntop=2, name='slice', slice_param=dict(slice_dim=0, slice_point=self.rois_num))
            feat_mask = self.net["mask_feat"]
            out = self.net["det_feat"]

        # for bbox detection
        pool5 = self.ave_pool(7, 1, "pool5",  out)
        cls_score, bbox_pred = self.final_cls_bbox(pool5)

        if not self.deploy:
            self.net["loss_cls"] = L.SoftmaxWithLoss(cls_score, labels, loss_weight=1, propagate_down=[1, 0])
            self.net["loss_bbox"] = L.SmoothL1Loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, \
                                                   loss_weight=1)
        else:
            self.net["cls_prob"] = L.Softmax(cls_score)


        # # for mask prediction
        if not self.deploy:
            mask_feat_aligned = feat_mask
        else:
            mask_feat_aligned = out
        # out = mask_feat_aligned
        out = L.Deconvolution(mask_feat_aligned, name="mask_deconv1",
                              convolution_param=dict(kernel_size=2, stride=2,
                                                     num_output=256, pad=0, bias_term=False,
                                                     weight_filler=dict(type='msra'),
                                                     bias_filler=dict(type='constant')))
        out = L.BatchNorm(out, name="bn_mask_deconv1", in_place=True, batch_norm_param=dict(use_global_stats=self.deploy))
        out = L.Scale(out, name="scale_mask_deconv1", in_place=True, scale_param=dict(bias_term=True))
        out = L.ReLU(out, name="mask_deconv1_relu", in_place=True)
        mask_out = self.conv_factory("mask_out", out, 1, self.classes-1, 1, 0, bias_term=True)
        # for i in range(4):
        #     out = self.conv_factory("mask_conv"+str(i), out, 3, 256, 1, 1, bias_term=False)
        # mask_out = self.conv_factory("mask_out", out, 1, 1, 1, 0, bias_term=False)

        if not self.deploy:
            self.net["loss_mask"] = L.SigmoidCrossEntropyLoss(mask_out, masks, loss_weight=1, propagate_down=[1, 0],
                                                      loss_param=dict(
                                                          normalization=1,
                                                          ignore_label = -1
                                                      ))
        else:
            self.net["mask_prob"] = L.Sigmoid(mask_out)

        return self.net.to_proto()
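The training branch above concatenates the detection RoIs and mask RoIs along the batch axis so a single shared head processes both, then splits the features back apart with a Slice at rois_num. A minimal standalone sketch of that concat/slice round trip (the DummyData shapes, the single conv stub and the rois_num=64 split point are illustrative, not taken from the model above):

import caffe
from caffe import layers as L

def concat_then_slice(rois_num=64):
    n = caffe.NetSpec()
    # stand-ins for the per-RoI detection and mask features
    n.det_in, n.mask_in = L.DummyData(
        shape=[dict(dim=[rois_num, 256, 7, 7]), dict(dim=[32, 256, 7, 7])], ntop=2)
    # stack along axis 0 so one shared head sees both sets of RoIs
    n.rois_cat = L.Concat(n.det_in, n.mask_in, axis=0)
    n.shared_head = L.Convolution(n.rois_cat, kernel_size=3, pad=1, num_output=256,
                                  weight_filler=dict(type='msra'))
    # split back: the first rois_num rows are detection features, the rest mask features
    n.det_feat, n.mask_feat = L.Slice(n.shared_head, ntop=2,
                                      slice_param=dict(axis=0, slice_point=[rois_num]))
    return n.to_proto()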
Example n. 23
0
    def resnet_mask_rcnn_rpn(self, stage=1):
        channals = self.channals
        if not self.deploy:
            data, im_info, gt_boxes = self.data_layer_train()
        else:
            data, im_info = self.data_layer_test()
            gt_boxes = None
        if stage == 1:
            pre_traned_fixed = True
        else:
            pre_traned_fixed = False
        conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3, bias_term=True, fixed=pre_traned_fixed)
        pool1 = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
        index = 1
        out = pool1
        if self.module == "normal":
            residual_block = self.residual_block
        else:
            residual_block = self.residual_block_basic

        for i in self.stages[:-1]:
            index += 1
            for j in range(i):
                if j == 0:
                    if index == 2:
                        stride = 1
                    else:
                        stride = 2
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride, fixed=pre_traned_fixed)
                else:
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, fixed=pre_traned_fixed)
            channals *= 2

        if not self.deploy:
            rpn_cls_loss, rpn_loss_bbox, rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data)
        else:
            rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(out, gt_boxes, im_info, data)
            rois, scores = self.roi_proposals(rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)

        if not self.deploy:
            self.net["dummy_roi_pool_conv5"] = L.DummyData(name = "dummy_roi_pool_conv5", shape=[dict(dim=[1,channals*2,14,14])])
            out = self.net["dummy_roi_pool_conv5"]
            index += 1
            for j in range(self.stages[-1]):
                if j == 0:
                    stride = 1
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals, stride)
                else:
                    out = residual_block("res" + str(index) + ascii_lowercase[j], out, channals)
            if stage==1:
                self.net["silence_res"] = L.Silence(out, ntop=0)

            if stage==2:
                # for bbox detection
                pool5 = self.ave_pool(7, 1, "pool5", out)
                cls_score, bbox_pred = self.final_cls_bbox(pool5)
                self.net["silence_cls_score"] = L.Silence(cls_score, ntop=0)
                self.net["silence_bbox_pred"] = L.Silence(bbox_pred, ntop=0)

                # for mask prediction
                mask_conv1 = self.conv_factory("mask_conv1", out, 3, 256, 1, 1, bias_term=True)
                mask_out = self.conv_factory("mask_out", mask_conv1, 1, self.classes, 1, 0, bias_term=True)
                self.net["silence_mask_out"] = L.Silence(mask_out, ntop=0)
        return self.net.to_proto()
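In this stage-wise RPN network the detection/mask branch is kept alive at training time by feeding it a DummyData placeholder and attaching Silence to every head output, so the layers and their parameters still appear in the prototxt even though no real RoIs flow through them. A stripped-down sketch of that placeholder-plus-Silence pattern (the single conv stub and its shapes are illustrative):

import caffe
from caffe import layers as L

def placeholder_branch():
    n = caffe.NetSpec()
    # placeholder blob with the shape the RoI-pooled features would have
    n.dummy_roi_pool_conv5 = L.DummyData(shape=[dict(dim=[1, 1024, 14, 14])])
    # the branch still contributes parameters to the prototxt ...
    n.res5_stub = L.Convolution(n.dummy_roi_pool_conv5, kernel_size=3, pad=1,
                                num_output=2048, weight_filler=dict(type='msra'))
    # ... but its output is consumed by Silence, so no loss is attached to it
    n.silence_res = L.Silence(n.res5_stub, ntop=0)
    return n.to_proto()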
Example n. 24
0
def compute_valid_io_shapes(netconf,
                            netmode,
                            min_output_shape,
                            max_output_shape,
                            fmaps_in=1,
                            fmaps_out=1,
                            constraints=None):

    valid_in_shapes = []
    valid_out_shapes = []

    dims = len(min_output_shape)

    for current_dim in range(0, dims):
        filtered_in_shapes = copy.deepcopy(valid_in_shapes)

        if not (constraints is
                None) and len(constraints) > current_dim and not (
                    constraints[current_dim] is None):
            in_shape = [(constraints[i](filtered_in_shapes[0])
                         if i >= current_dim else filtered_in_shapes[0][i])
                        for i in range(0, current_dim + 1)]
        else:
            in_shape = [(min_output_shape[i]
                         if i >= current_dim else filtered_in_shapes[0][i])
                        for i in range(0, current_dim + 1)]

        in_index = 0
        valid_in_shapes = []
        valid_out_shapes = []

        while (True):
            net = caffe.NetSpec()

            run_shape = RunShape(None, None)
            run_shape.shape = in_shape[0:current_dim + 1]
            run_shape.dilation = [1 for i in range(0, dims)]
            run_shape.fmaps = fmaps_in

            run_shape_in = [run_shape]
            run_shape_out = run_shape_in

            netgen = NetworkGenerator(netconf, netmode)

            limit_reached = False
            valid_io_shape = True

            try:
                net.data, net.datai = netgen.data_layer(
                    [1] + [fmaps_in] + in_shape[0:current_dim + 1])
                net.silence = L.Silence(net.datai, ntop=0)
                # Chained blob list to construct the network (forward direction)
                blobs = []
                # All networks start with data
                blobs = blobs + [net.data]
                netgen.implement_usknet(netconf, net, run_shape_out, blobs, 1,
                                        fmaps_out)
            except MemoryLimitException:
                limit_reached = True
                valid_io_shape = True
            except ConvolutionBufferException:
                limit_reached = True
                valid_io_shape = True
            except ShapeException:
                limit_reached = False
                valid_io_shape = False
            except LayerException:
                limit_reached = False
                valid_io_shape = False

            if (valid_io_shape and not limit_reached
                    and not reduce(lambda a, b: a and b, [
                        run_shape_out[-1].shape[i] >= max_output_shape[i]
                        for i in range(0, current_dim + 1)
                    ], True)):
                print("++++ Valid: %s => %s" %
                      (run_shape_out[0].shape, run_shape_out[-1].shape))
                valid_in_shapes += [run_shape_out[0].shape]
                valid_out_shapes += [run_shape_out[-1].shape]
            else:
                print("-- Invalid: %s => []" % (run_shape_out[0].shape))

            incremented = False

            if not incremented and ((valid_io_shape or limit_reached)
                                    and len(filtered_in_shapes) > 0):
                if in_index >= len(filtered_in_shapes) - 1:
                    in_index = 0
                    in_shape[0:current_dim] = filtered_in_shapes[in_index]
                else:
                    in_index += 1
                    in_shape[0:current_dim] = filtered_in_shapes[in_index]
                    incremented = True

            if not (constraints is
                    None) and len(constraints) > current_dim and not (
                        constraints[current_dim] is None):
                in_shape[current_dim] = constraints[current_dim](in_shape)
                if in_index > 0:
                    incremented = True
            else:
                if not incremented:
                    if in_shape[current_dim] >= max_output_shape[current_dim]:
                        in_shape[current_dim] = min_output_shape[current_dim]
                    else:
                        in_shape[current_dim] += 1
                        incremented = True

            if not incremented:
                break

        if (len(valid_in_shapes) == 0):
            break

    max_fmap_counts = []
    for shape_idx in range(0, len(valid_in_shapes)):

        incexp = True
        bisect = False

        fmaps_start = 1
        lower_limit = 1
        upper_limit = 1

        while (True):
            net = caffe.NetSpec()

            run_shape = RunShape(None, None)
            run_shape.shape = valid_in_shapes[shape_idx]
            run_shape.dilation = [1 for i in range(0, dims)]
            run_shape.fmaps = 1

            run_shape_in = [run_shape]
            run_shape_out = run_shape_in

            netgen = NetworkGenerator(netconf, netmode)

            limit_reached = False
            valid_io_shape = True

            try:
                net.data, net.datai = netgen.data_layer(
                    [1] + [1] + valid_in_shapes[shape_idx])
                net.silence = L.Silence(net.datai, ntop=0)
                # Chained blob list to construct the network (forward direction)
                blobs = []
                # All networks start with data
                blobs = blobs + [net.data]
                netgen.implement_usknet(netconf, net, run_shape_out, blobs,
                                        fmaps_start, fmaps_out)
            except (MemoryLimitException, ConvolutionBufferException,
                    ShapeException, LayerException):
                limit_reached = True

            if (not limit_reached and incexp):
                fmaps_start *= 2
            elif (limit_reached and incexp):
                incexp = False
                bisect = True
                lower_limit = fmaps_start // 2
                upper_limit = fmaps_start
            elif (not limit_reached and bisect):
                if (lower_limit >= upper_limit):
                    break
                lower_limit = fmaps_start + 1
            elif (limit_reached and bisect):
                upper_limit = fmaps_start - 1

            if bisect:
                fmaps_start = (upper_limit + lower_limit) // 2

            print("%s in [%s, %s]" % (fmaps_start, lower_limit, upper_limit))

        max_fmap_counts += [upper_limit]
        print("Current shape: %s, %s, %s" %
              (shape_idx, valid_in_shapes[shape_idx], upper_limit))

    return valid_in_shapes, valid_out_shapes, max_fmap_counts
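The feature-map search in the second half of compute_valid_io_shapes first doubles fmaps_start until one of the resource exceptions fires, then bisects between the last count that fit and the first that did not. The same strategy in isolation, written against an arbitrary fits(fmaps) predicate (the predicate is a stand-in for building the network and catching MemoryLimitException / ConvolutionBufferException):

def max_feasible_fmaps(fits):
    """Return the largest fmap count for which fits(fmaps) is True (fits assumed monotone)."""
    fmaps = 1
    # exponential phase: double until the limit is exceeded
    while fits(fmaps):
        fmaps *= 2
    lower, upper = fmaps // 2, fmaps - 1   # last good value, candidate upper bound
    # bisection phase: tighten the bracket down to the boundary
    while lower < upper:
        mid = (lower + upper + 1) // 2
        if fits(mid):
            lower = mid
        else:
            upper = mid - 1
    return lower

# toy check with a pretend memory limit of 37 feature maps
assert max_feasible_fmaps(lambda f: f <= 37) == 37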
Example n. 25
0
def caffenet(netconf, netmode):
    # Start Caffe proto net
    net = caffe.NetSpec()
    # Specify input data structures

    dims = len(netconf.input_shape)

    run_shape = RunShape(None, None)
    run_shape.shape = netconf.input_shape
    run_shape.dilation = [1 for i in range(0, dims)]
    run_shape.fmaps = 1

    run_shape_in = [run_shape]
    run_shape_out = run_shape_in

    offsets = [0, 0, 0]
    offsets[0] = (netconf.input_shape3d[-3] -
                  netconf.output_shape3d[-3]) // 2 - 1
    offsets[1] = (netconf.input_shape3d[-2] -
                  netconf.output_shape3d[-2]) // 2 - 1
    offsets[2] = (netconf.input_shape3d[-1] -
                  netconf.output_shape3d[-1]) // 2 - 1
    sizes = netconf.output_shape3d
    param = {"offsets": offsets, "sizes": sizes}
    param_json = json.dumps(param)

    if netmode == caffe_pb2.TEST:
        netgen = NetworkGenerator(netconf, netmode)

        net.data, net.datai = netgen.data_layer([1] + [netconf.fmap_input] +
                                                netconf.input_shape3d)
        net.silence = L.Silence(net.datai, ntop=0)
        # Chained blob list to construct the network (forward direction)
        blobs = []
        # All networks start with data
        #blobs = blobs + [net.data]

        #blobs, run_shape_out = netgen.implement_usknet(netconf, net, run_shape_out, blobs, netconf.fmap_start, netconf.fmap_output)
        net_blobs, loss_flag = implement_parallel_unets(
            netconf, netgen, net, netmode)
        blobs = blobs + net_blobs
        last_blob = blobs[-1]

        # Implement the prediction layer
        if netconf.loss_function == 'malis':
            net.prob = L.Sigmoid(last_blob, ntop=1)

        if netconf.loss_function == 'euclid':
            net.prob = L.Sigmoid(last_blob, ntop=1)

        if netconf.loss_function == 'softmax':
            net.prob = L.Softmax(last_blob, ntop=1)

    else:
        netgen = NetworkGenerator(netconf, netmode)

        net.data, net.datai = netgen.data_layer([1] + [netconf.fmap_input] +
                                                netconf.input_shape3d)

        if netconf.loss_function == 'malis':
            net.label, net.labeli = netgen.data_layer([1] +
                                                      [netconf.fmap_output] +
                                                      netconf.output_shape)
            net.components, net.componentsi = netgen.data_layer(
                [1, 1] + netconf.output_shape)
            net.nhood, net.nhoodi = netgen.data_layer([1, 1] +
                                                      [netconf.fmap_output] +
                                                      [3])
            net.silence = L.Silence(net.datai,
                                    net.labeli,
                                    net.componentsi,
                                    net.nhoodi,
                                    ntop=0)

        if netconf.loss_function == 'euclid':
            net.label, net.labeli = netgen.data_layer([1] +
                                                      [netconf.fmap_output3d] +
                                                      netconf.output_shape3d)
            net.scale, net.scalei = netgen.data_layer([1] +
                                                      [netconf.fmap_output3d] +
                                                      netconf.output_shape3d)
            net.silence = L.Silence(net.datai, net.labeli, net.scalei, ntop=0)

        if netconf.loss_function == 'softmax':
            #            net.label, net.labeli = netgen.data_layer([1]+[netconf.fmap_output]+netconf.output_shape)
            net.label, net.labeli = netgen.data_layer([1] + [1] +
                                                      netconf.output_shape)
            net.silence = L.Silence(net.datai, net.labeli, ntop=0)

        # Start the actual network
        # Chained blob list to construct the network (forward direction)
        blobs = []
        # All networks start with data
        #blobs = blobs + [net.data]

        net_blobs, loss_flag = implement_parallel_unets(
            netconf, netgen, net, netmode)
        blobs = blobs + net_blobs
        last_blob = blobs[-1]

        # Implement the loss
        if netconf.loss_function == 'malis':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.MalisLoss(last_blob,
                                   net.label,
                                   net.components,
                                   net.nhood,
                                   ntop=0)

        if netconf.loss_function == 'euclid':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            #net.loss = L.EuclideanLoss(last_blob, net.label, net.scale, ntop=0)
            net.loss = L.GatedEuclideanLoss(last_blob,
                                            net.label,
                                            net.scale,
                                            loss_flag,
                                            ntop=0)

        if netconf.loss_function == 'softmax':
            net.loss = L.SoftmaxWithLoss(last_blob, net.label, ntop=0)

    # Return the protocol buffer of the generated network
    return net.to_proto()
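The offsets computed at the top of caffenet centre the output window inside the larger input volume: half the margin on each axis, shifted by one voxel, packed into the same JSON parameter dict. A small worked example of that arithmetic with made-up shapes:

import json

# hypothetical shapes, only to make the crop-offset arithmetic concrete
input_shape3d = [204, 204, 204]
output_shape3d = [116, 116, 116]

offsets = [(i - o) // 2 - 1 for i, o in zip(input_shape3d, output_shape3d)]
sizes = output_shape3d
param_json = json.dumps({"offsets": offsets, "sizes": sizes})
print(offsets)      # [43, 43, 43]
print(param_json)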
Example n. 26
0
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    n.data, n.cont, n.img_feature, n.label, n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2 = \
        L.Python(module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=8)

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08), param=fixed_weights)
    n.embed = L.TanH(n.embed_ba) 

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)),
                   param=fixed_weights_lstm)
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_first'+str(i), tops1[int(i)])
        n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    n.lstm1_out = tops1[T-1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':0.3})
    n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)),
                   param=fixed_weights_lstm)
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_second'+str(i), tops2[int(i)])
        n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0))
    n.lstm2_out = tops2[T-1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile question feature
    n.q_emb_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.q_emb_tiled_1 = L.Tile(n.q_emb_resh, axis=2, tiles=14)
    n.q_emb_resh_tiled = L.Tile(n.q_emb_tiled_1, axis=3, tiles=14)

    # Embed image feature
    n.i_emb = L.Convolution(n.img_feature, kernel_size=1, stride=1,
                            num_output=2048, pad=0, weight_filler=dict(type='xavier'),
                            param=fixed_weights)

    # Eltwise product and normalization
    n.eltwise = L.Eltwise(n.q_emb_resh_tiled, n.i_emb, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise_sqrt = L.SignedSqrt(n.eltwise)
    n.eltwise_l2 = L.L2Normalize(n.eltwise_sqrt)
    n.eltwise_drop = L.Dropout(n.eltwise_l2, dropout_param={'dropout_ratio': 0.3})

    # Attention for VQA
    n.att_conv1 = L.Convolution(n.eltwise_drop, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=1, pad=0, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att_map = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature  = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,2048])))

    # eltwise product + normalization again for VQA
    n.i_emb2 = L.InnerProduct(n.att_feature_resh, num_output=2048, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.eltwise2 = L.Eltwise(n.lstm_12, n.i_emb2, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise2_sqrt = L.SignedSqrt(n.eltwise2)
    n.eltwise2_l2 = L.L2Normalize(n.eltwise2_sqrt)
    n.eltwise2_drop = L.Dropout(n.eltwise2_l2, dropout_param={'dropout_ratio': 0.3})

    n.prediction = L.InnerProduct(n.eltwise2_drop, num_output=3000, weight_filler=dict(type='xavier'), param=fixed_weights)

    # Take GT answer or Take the logits of the VQA model and get predicted answer to embed
    if use_gt:
        n.exp_emb_ans = L.Embed(n.label, input_dim=3000, num_output=300,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        n.exp_emb_ans = L.Embed(n.vqa_ans, input_dim=3000, num_output=300,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh, num_output=2048, weight_filler=dict(type='xavier'))

    # Merge VQA answer and visual+textual feature
    n.exp_emb_resh = L.Reshape(n.exp_emb_ans2, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    #n.exp_eltwise = L.Eltwise(n.eltwise_drop,  n.exp_emb_tiled, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise_emb = L.Convolution(n.eltwise, kernel_size=1, stride=1, num_output=2048, pad=0, weight_filler=dict(type='xavier'))
    n.exp_eltwise = L.Eltwise(n.eltwise_emb,  n.exp_emb_tiled, eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = L.Dropout(n.exp_eltwise_l2, dropout_param={'dropout_ratio': 0.3})

    # Attention for Explanation
    n.exp_att_conv1 = L.Convolution(n.exp_eltwise_drop, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
    n.exp_att_conv1_relu = L.ReLU(n.exp_att_conv1)
    n.exp_att_conv2 = L.Convolution(n.exp_att_conv1_relu, kernel_size=1, stride=1, num_output=1, pad=0, weight_filler=dict(type='xavier'))
    n.exp_att_reshaped = L.Reshape(n.exp_att_conv2,reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.exp_att_softmax = L.Softmax(n.exp_att_reshaped, axis=2)
    n.exp_att_map = L.Reshape(n.exp_att_softmax,reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    
    exp_dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.exp_att_feature_prev  = L.SoftAttention(n.img_feature, n.exp_att_map, exp_dummy)
    n.exp_att_feature_resh = L.Reshape(n.exp_att_feature_prev, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.exp_att_feature_embed = L.InnerProduct(n.exp_att_feature_resh, num_output=2048, weight_filler=dict(type='xavier'))
    n.exp_lstm12_embed = L.InnerProduct(n.lstm_12, num_output=2048, weight_filler=dict(type='xavier'))
    n.exp_eltwise2 = L.Eltwise(n.exp_lstm12_embed, n.exp_att_feature_embed, eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_att_feature = L.Eltwise(n.exp_emb_ans2, n.exp_eltwise2, eltwise_param={'operation': P.Eltwise.PROD})

    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
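Both LSTMs above use the same idiom around L.Silence: the T x N x H recurrent output is sliced along the time axis, only the last timestep is kept for the downstream fusion, and every other slice is fed to a Silence layer so Caffe does not complain about unconsumed tops. The pattern on its own, with dummy inputs standing in for the embedded question and the continuation markers (T, batch size and hidden size are illustrative):

import caffe
from caffe import layers as L

T, N, E, H = 5, 2, 300, 64   # timesteps, batch size, embedding dim, hidden dim

n = caffe.NetSpec()
n.embed, n.cont = L.DummyData(shape=[dict(dim=[T, N, E]), dict(dim=[T, N])], ntop=2)
n.lstm = L.LSTM(n.embed, n.cont,
                recurrent_param=dict(num_output=H,
                                     weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                                     bias_filler=dict(type='constant', value=0)))
# one top per timestep; only the last one is used downstream
tops = L.Slice(n.lstm, ntop=T, slice_param=dict(axis=0))
for i in range(T - 1):
    n.__setattr__('slice_step' + str(i), tops[i])
    n.__setattr__('silence_step' + str(i), L.Silence(tops[i], ntop=0))
n.lstm_out = tops[T - 1]
n.lstm_feat = L.Reshape(n.lstm_out, reshape_param=dict(shape=dict(dim=[-1, H])))
proto = n.to_proto()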
Example n. 27
0
def SsdDetector(net, train=True, data_layer="data", gt_label="label", \
                net_width=300, net_height=300, basenet="VGG", \
                visualize=False, extra_data="data", eval_enable=True, **ssdparam):
    """
    创建SSD检测器。
    train: TRAIN /TEST
    data_layer/gt_label: 数据输入和label输入。
    net_width/net_height: 网络的输入尺寸
    num_classes: 估计分类的数量。
    basenet: "vgg"/"res101",特征网络
    ssdparam: ssd检测器使用的参数列表。

    返回:整个SSD检测器网络。
    """
    # BaseNetWork
    if basenet == "VGG":
        net = VGG16Net(net, from_layer=data_layer, fully_conv=True, reduced=True, \
                dilated=True, dropout=False)
        base_feature_layers = ['conv4_3', 'fc7']
        add_layers = 3
        first_channels = 256
        second_channels = 512
    elif basenet == "Res101":
        net = ResNet101Net(net, from_layer=data_layer, use_pool5=False)
        # 1/8, 1/16, 1/32
        base_feature_layers = ['res3b3', 'res4b22', 'res5c']
        add_layers = 2
        first_channels = 256
        second_channels = 512
    elif basenet == "Res50":
        net = ResNet50Net(net, from_layer=data_layer, use_pool5=False)
        base_feature_layers = ['res3d', 'res4f', 'res5c']
        add_layers = 2
        first_channels = 256
        second_channels = 512
    elif basenet == "PVA":
        net = PvaNet(net, from_layer=data_layer)
        # 1/8, 1/16, 1/32
        base_feature_layers = [
            'conv4_1/incep/pre', 'conv5_1/incep/pre', 'conv5_4'
        ]
        add_layers = 2
        first_channels = 256
        second_channels = 512
    elif basenet == "Yolo":
        net = YoloNet(net, from_layer=data_layer)
        base_feature_layers = ssdparam.get("multilayers_feature_map", [])
        # add_layers = 2
        # first_channels = 256
        # second_channels = 512
        feature_layers = base_feature_layers

    else:
        raise ValueError(
            "only VGG16, Res50/101, PVANet and Yolo are supported in the current version."
        )

    # NOTE: only the "Yolo" branch above defines `feature_layers`; the other
    # backbones expect it to come from the extra-layers helper (commented out below).
    result = []
    for item in feature_layers:
        if len(item) == 1:
            result.append(item[0])
            continue
        name = ""
        for layers in item:
            name += layers
        tags = ["Down", "Ref"]
        down_methods = [["Reorg"]]
        UnifiedMultiScaleLayers(net,layers=item, tags=tags, \
                              unifiedlayer=name, dnsampleMethod=down_methods)
        result.append(name)
    feature_layers = result

    # Add extra layers
    # extralayers_use_batchnorm=True, extralayers_lr_mult=1, \
    # net, feature_layers = AddSsdExtraConvLayers(net, \
    #     use_batchnorm=ssdparam.get("extralayers_use_batchnorm",False), \
    #     feature_layers=base_feature_layers, add_layers=add_layers, \
    #     first_channels=first_channels, second_channels=second_channels)
    # create SSD detector headers
    mbox_layers = SsdDetectorHeaders(net, \
         min_ratio=ssdparam.get("multilayers_min_ratio",15), \
         max_ratio=ssdparam.get("multilayers_max_ratio",90), \
         boxsizes=ssdparam.get("multilayers_boxsizes", []), \
         net_width=net_width, \
         net_height=net_height, \
         data_layer=data_layer, \
         num_classes=ssdparam.get("num_classes",2), \
         from_layers=feature_layers, \
         use_batchnorm=ssdparam.get("multilayers_use_batchnorm",True), \
         prior_variance = ssdparam.get("multilayers_prior_variance",[0.1,0.1,0.2,0.2]), \
         normalizations=ssdparam.get("multilayers_normalizations",[]), \
         aspect_ratios=ssdparam.get("multilayers_aspect_ratios",[]), \
         flip=ssdparam.get("multilayers_flip",True), \
         clip=ssdparam.get("multilayers_clip",False), \
         inter_layer_channels=ssdparam.get("multilayers_inter_layer_channels",[]), \
         kernel_size=ssdparam.get("multilayers_kernel_size",3), \
         pad=ssdparam.get("multilayers_pad",1))
    if train == True:
        loss_param = get_loss_param(normalization=ssdparam.get(
            "multiloss_normalization", P.Loss.VALID))
        mbox_layers.append(net[gt_label])
        # create loss
        if not ssdparam["combine_yolo_ssd"]:
            multiboxloss_param = get_multiboxloss_param( \
               loc_loss_type=ssdparam.get("multiloss_loc_loss_type",P.MultiBoxLoss.SMOOTH_L1), \
               conf_loss_type=ssdparam.get("multiloss_conf_loss_type",P.MultiBoxLoss.SOFTMAX), \
               loc_weight=ssdparam.get("multiloss_loc_weight",1), \
               conf_weight=ssdparam.get("multiloss_conf_weight",1), \
               num_classes=ssdparam.get("num_classes",2), \
               share_location=ssdparam.get("multiloss_share_location",True), \
               match_type=ssdparam.get("multiloss_match_type",P.MultiBoxLoss.PER_PREDICTION), \
               overlap_threshold=ssdparam.get("multiloss_overlap_threshold",0.5), \
               use_prior_for_matching=ssdparam.get("multiloss_use_prior_for_matching",True), \
               background_label_id=ssdparam.get("multiloss_background_label_id",0), \
               use_difficult_gt=ssdparam.get("multiloss_use_difficult_gt",False), \
               do_neg_mining=ssdparam.get("multiloss_do_neg_mining",True), \
               neg_pos_ratio=ssdparam.get("multiloss_neg_pos_ratio",3), \
               neg_overlap=ssdparam.get("multiloss_neg_overlap",0.5), \
               code_type=ssdparam.get("multiloss_code_type",P.PriorBox.CENTER_SIZE), \
               encode_variance_in_target=ssdparam.get("multiloss_encode_variance_in_target",False), \
               map_object_to_agnostic=ssdparam.get("multiloss_map_object_to_agnostic",False), \
               name_to_label_file=ssdparam.get("multiloss_name_to_label_file",""))

            net["mbox_loss"] = L.MultiBoxLoss(*mbox_layers, \
                                              multibox_loss_param=multiboxloss_param, \
                                              loss_param=loss_param, \
                                              include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), \
                                              propagate_down=[True, True, False, False])
        else:
            multimcboxloss_param = get_multimcboxloss_param( \
               loc_loss_type=ssdparam.get("multiloss_loc_loss_type",P.MultiBoxLoss.SMOOTH_L1), \
               loc_weight=ssdparam.get("multiloss_loc_weight",1), \
               conf_weight=ssdparam.get("multiloss_conf_weight",1), \
               num_classes=ssdparam.get("num_classes",2), \
               share_location=ssdparam.get("multiloss_share_location",True), \
               match_type=ssdparam.get("multiloss_match_type",P.MultiBoxLoss.PER_PREDICTION), \
               overlap_threshold=ssdparam.get("multiloss_overlap_threshold",0.5), \
               use_prior_for_matching=ssdparam.get("multiloss_use_prior_for_matching",True), \
               background_label_id=ssdparam.get("multiloss_background_label_id",0), \
               use_difficult_gt=ssdparam.get("multiloss_use_difficult_gt",False), \
               do_neg_mining=ssdparam.get("multiloss_do_neg_mining",True), \
               neg_pos_ratio=ssdparam.get("multiloss_neg_pos_ratio",3), \
               neg_overlap=ssdparam.get("multiloss_neg_overlap",0.5), \
               code_type=ssdparam.get("multiloss_code_type",P.PriorBox.CENTER_SIZE), \
               encode_variance_in_target=ssdparam.get("multiloss_encode_variance_in_target",False), \
               map_object_to_agnostic=ssdparam.get("multiloss_map_object_to_agnostic",False), \
               name_to_label_file=ssdparam.get("multiloss_name_to_label_file",""),\
               rescore=ssdparam.get("multiloss_rescore",True),\
               object_scale=ssdparam.get("multiloss_object_scale",1),\
               noobject_scale=ssdparam.get("multiloss_noobject_scale",1),\
               class_scale=ssdparam.get("multiloss_class_scale",1),\
               loc_scale=ssdparam.get("multiloss_loc_scale",1))
            net["mbox_loss"] = L.MultiMcBoxLoss(*mbox_layers, \
                                              multimcbox_loss_param=multimcboxloss_param, \
                                              loss_param=loss_param, \
                                              include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), \
                                              propagate_down=[True, True, False, False])

        return net
    else:
        # create conf softmax layer
        # mbox_layers[1]
        if not ssdparam["combine_yolo_ssd"]:
            if ssdparam.get("multiloss_conf_loss_type",
                            P.MultiBoxLoss.SOFTMAX) == P.MultiBoxLoss.SOFTMAX:
                reshape_name = "mbox_conf_reshape"
                net[reshape_name] = L.Reshape(mbox_layers[1], \
                        shape=dict(dim=[0, -1, ssdparam.get("num_classes",2)]))
                softmax_name = "mbox_conf_softmax"
                net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
                flatten_name = "mbox_conf_flatten"
                net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
                mbox_layers[1] = net[flatten_name]
            elif ssdparam.get(
                    "multiloss_conf_loss_type",
                    P.MultiBoxLoss.SOFTMAX) == P.MultiBoxLoss.LOGISTIC:
                sigmoid_name = "mbox_conf_sigmoid"
                net[sigmoid_name] = L.Sigmoid(mbox_layers[1])
                mbox_layers[1] = net[sigmoid_name]
            else:
                raise ValueError("Unknown conf loss type.")
        det_out_param = get_detection_out_param( \
            num_classes=ssdparam.get("num_classes",2), \
            share_location=ssdparam.get("multiloss_share_location",True), \
            background_label_id=ssdparam.get("multiloss_background_label_id",0), \
            code_type=ssdparam.get("multiloss_code_type",P.PriorBox.CENTER_SIZE), \
            variance_encoded_in_target=ssdparam.get("multiloss_encode_variance_in_target",False), \
            conf_threshold=ssdparam.get("detectionout_conf_threshold",0.01), \
            nms_threshold=ssdparam.get("detectionout_nms_threshold",0.45), \
            boxsize_threshold=ssdparam.get("detectionout_boxsize_threshold",0.001), \
            top_k=ssdparam.get("detectionout_top_k",30), \
            visualize=ssdparam.get("detectionout_visualize",False), \
            visual_conf_threshold=ssdparam.get("detectionout_visualize_conf_threshold", 0.5), \
            visual_size_threshold=ssdparam.get("detectionout_visualize_size_threshold", 0), \
            display_maxsize=ssdparam.get("detectionout_display_maxsize",1000), \
            line_width=ssdparam.get("detectionout_line_width",4), \
            color=ssdparam.get("detectionout_color",[[0,255,0],]))
        if visualize:
            mbox_layers.append(net[extra_data])
        if not ssdparam["combine_yolo_ssd"]:
            net.detection_out = L.DetectionOutput(*mbox_layers, \
         detection_output_param=det_out_param, \
         include=dict(phase=caffe_pb2.Phase.Value('TEST')))
        else:
            net.detection_out = L.DetectionMultiMcOutput(*mbox_layers, \
                detection_output_param=det_out_param, \
                include=dict(phase=caffe_pb2.Phase.Value('TEST')))
        if not visualize and eval_enable:
            # create eval layer
            det_eval_param = get_detection_eval_param( \
                 num_classes=ssdparam.get("num_classes",2), \
                 background_label_id=ssdparam.get("multiloss_background_label_id",0), \
                 evaluate_difficult_gt=ssdparam.get("detectioneval_evaluate_difficult_gt",False), \
                 boxsize_threshold=ssdparam.get("detectioneval_boxsize_threshold",[0,0.01,0.05,0.1,0.15,0.2,0.25]), \
                 iou_threshold=ssdparam.get("detectioneval_iou_threshold",[0.9,0.75,0.5]), \
                 name_size_file=ssdparam.get("detectioneval_name_size_file",""))
            net.detection_eval = L.DetectionEvaluate(net.detection_out, net[gt_label], \
               detection_evaluate_param=det_eval_param, \
               include=dict(phase=caffe_pb2.Phase.Value('TEST')))
        if not eval_enable:
            net.silence = L.Silence(net.detection_out, ntop=0, \
                include=dict(phase=caffe_pb2.Phase.Value('TEST')))
        return net
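In the TEST branch with a SOFTMAX confidence loss, the concatenated confidence predictions are reshaped to [N, num_priors, num_classes], pushed through a Softmax over axis 2, and flattened back before DetectionOutput. That reshape/softmax/flatten chain in isolation (the DummyData shape and the 100-prior count are illustrative):

import caffe
from caffe import layers as L

num_classes = 21
net = caffe.NetSpec()
# stands in for the concatenated mbox confidence blob (N x num_priors*num_classes)
net.mbox_conf = L.DummyData(shape=[dict(dim=[1, 100 * num_classes])])
net.mbox_conf_reshape = L.Reshape(net.mbox_conf, shape=dict(dim=[0, -1, num_classes]))
net.mbox_conf_softmax = L.Softmax(net.mbox_conf_reshape, axis=2)
net.mbox_conf_flatten = L.Flatten(net.mbox_conf_softmax, axis=1)
print(net.to_proto())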
Example n. 28
0
def generate_model(split, config):
    n = caffe.NetSpec()
    batch_size = config.N
    mode_str = str(dict(split=split, batch_size=batch_size))
    n.language, n.cont, n.image, n.spatial, n.label = L.Python(module=config.data_provider,
                                                               layer=config.data_provider_layer,
                                                               param_str=mode_str,
                                                               ntop=5)

    # the base net (VGG-16)
    n.conv1_1, n.relu1_1 = conv_relu(n.image, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 64,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool1 = max_pool(n.relu1_2)

    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 128,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool2 = max_pool(n.relu2_2)

    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv3_3, n.relu3_3 = conv_relu(n.relu3_2, 256,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool3 = max_pool(n.relu3_3)

    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv4_3, n.relu4_3 = conv_relu(n.relu4_2, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool4 = max_pool(n.relu4_3)

    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.conv5_3, n.relu5_3 = conv_relu(n.relu5_2, 512,
                                     fix_param=config.fix_vgg,
                                     finetune=(not config.fix_vgg))
    n.pool5 = max_pool(n.relu5_3)

    # fully conv
    n.fcn_fc6, n.fcn_relu6 = conv_relu(n.pool5, 4096, ks=7, pad=3)
    if config.vgg_dropout:
        n.fcn_drop6 = L.Dropout(n.fcn_relu6, dropout_ratio=0.5, in_place=True)
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_drop6, 4096, ks=1, pad=0)
        n.fcn_drop7 = L.Dropout(n.fcn_relu7, dropout_ratio=0.5, in_place=True)
        n.fcn_fc8 = conv(n.fcn_drop7, 1000, ks=1, pad=0)
    else:
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_relu6, 4096, ks=1, pad=0)
        n.fcn_fc8 = conv(n.fcn_relu7, 1000, ks=1, pad=0)

    
    # embedding
    n.embed = L.Embed(n.language, input_dim=config.vocab_size,
                      num_output=config.embed_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM
    n.lstm = L.LSTM(n.embed, n.cont,
                    recurrent_param=dict(num_output=config.lstm_dim,
                                         weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                                         bias_filler=dict(type='constant', value=0)))
    tops = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for i in range(config.T - 1):
        n.__setattr__('slice'+str(i), tops[i])
        n.__setattr__('silence'+str(i), L.Silence(tops[i], ntop=0))
    n.lstm_out = tops[-1]
    n.lstm_feat = L.Reshape(n.lstm_out, reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # Tile LSTM feature
    n.lstm_resh = L.Reshape(n.lstm_feat, reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim, 1, 1])))
    n.lstm_tile_1 = L.Tile(n.lstm_resh, axis=2, tiles=config.featmap_H)
    n.lstm_tile_2 = L.Tile(n.lstm_tile_1, axis=3, tiles=config.featmap_W)

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.fcn_fc8)
    n.lstm_l2norm = L.L2Normalize(n.lstm_tile_2)

    # Concatenate
    n.feat_all = L.Concat(n.lstm_l2norm, n.img_l2norm, n.spatial, concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    n.fcn_l1, n.fcn_relu1 = conv_relu(n.feat_all, config.mlp_hidden_dims, ks=1, pad=0)
    if config.mlp_dropout:
        n.fcn_drop1 = L.Dropout(n.fcn_relu1, dropout_ratio=0.5, in_place=True)
        n.fcn_scores = conv(n.fcn_drop1, 1, ks=1, pad=0)
    else:
        n.fcn_scores = conv(n.fcn_relu1, 1, ks=1, pad=0)
    
    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.fcn_scores, n.label)

    return n.to_proto()
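The fusion step above broadcasts the sentence feature over the feature map: the N x D LSTM feature is reshaped to N x D x 1 x 1, tiled to the feature-map height and width, and concatenated channel-wise with the image features and the 8-channel spatial coordinates (the repo-specific L2Normalize layers are left out of this sketch). A minimal tile-and-concat fusion with dummy blobs and illustrative dimensions:

import caffe
from caffe import layers as L

D, H, W = 1000, 16, 16   # lstm_dim and feature-map size, illustrative
n = caffe.NetSpec()
n.lstm_feat, n.img_feat, n.spatial = L.DummyData(
    shape=[dict(dim=[1, D]), dict(dim=[1, 1000, H, W]), dict(dim=[1, 8, H, W])], ntop=3)
# broadcast the sentence vector to every spatial location
n.lstm_resh = L.Reshape(n.lstm_feat, reshape_param=dict(shape=dict(dim=[-1, D, 1, 1])))
n.lstm_tile_h = L.Tile(n.lstm_resh, axis=2, tiles=H)
n.lstm_tile_hw = L.Tile(n.lstm_tile_h, axis=3, tiles=W)
# channel-wise concatenation of language, image and spatial features
n.feat_all = L.Concat(n.lstm_tile_hw, n.img_feat, n.spatial, concat_param=dict(axis=1))
print(n.to_proto())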
Example n. 29
0
def setLayers_twoBranches(data_source,
                          batch_size,
                          layername,
                          kernel,
                          stride,
                          outCH,
                          label_name,
                          transform_param_in,
                          deploy=False,
                          batchnorm=0,
                          lr_mult_distro=[1, 1, 1]):
    # It is tricky to produce the deploy prototxt file, as the data input does not come from a layer, so we have to create a workaround;
    # producing the training and testing prototxt files is straightforward.
    n = caffe.NetSpec()
    assert len(layername) == len(kernel)
    assert len(layername) == len(stride)
    assert len(layername) == len(outCH)
    num_parts = transform_param_in['num_parts']

    if deploy == False and "lmdb" not in data_source:
        if (len(label_name) == 1):
            n.data, n.tops[label_name[0]] = L.HDF5Data(hdf5_data_param=dict(
                batch_size=batch_size, source=data_source),
                                                       ntop=2)
        elif (len(label_name) == 2):
            n.data, n.tops[label_name[0]], n.tops[label_name[1]] = L.HDF5Data(
                hdf5_data_param=dict(batch_size=batch_size,
                                     source=data_source),
                ntop=3)
    # produce data definition for the LMDB-backed training/testing net
    elif deploy == False:
        n.data, n.tops['label'] = L.CPMData(
            data_param=dict(backend=1,
                            source=data_source,
                            batch_size=batch_size),
            cpm_transform_param=transform_param_in,
            ntop=2)
        n.tops[label_name[2]], n.tops[label_name[3]], n.tops[
            label_name[4]], n.tops[label_name[5]] = L.Slice(
                n.label,
                slice_param=dict(
                    axis=1, slice_point=[38, num_parts + 1, num_parts + 39]),
                ntop=4)
        n.tops[label_name[0]] = L.Eltwise(n.tops[label_name[2]],
                                          n.tops[label_name[4]],
                                          operation=P.Eltwise.PROD)
        n.tops[label_name[1]] = L.Eltwise(n.tops[label_name[3]],
                                          n.tops[label_name[5]],
                                          operation=P.Eltwise.PROD)

    else:
        # produce data definition for the deploy net
        input = "data"
        dim1 = 1
        dim2 = 4
        dim3 = 368
        dim4 = 368
        # make an empty "data" layer so the next layer accepting input will be able to take the correct blob name "data",
        # we will later have to remove this layer from the serialization string, since this is just a placeholder
        n.data = L.Layer()

    # something special before everything
    n.image, n.center_map = L.Slice(n.data,
                                    slice_param=dict(axis=1, slice_point=3),
                                    ntop=2)
    n.silence2 = L.Silence(n.center_map, ntop=0)
    #n.pool_center_lower = L.Pooling(n.center_map, kernel_size=9, stride=8, pool=P.Pooling.AVE)

    # just follow arrays..CPCPCPCPCCCC....
    last_layer = ['image', 'image']
    stage = 1
    conv_counter = 1
    pool_counter = 1
    drop_counter = 1
    local_counter = 1
    state = 'image'  # can be image or fuse
    share_point = 0

    for l in range(0, len(layername)):
        if layername[l] == 'V':  #pretrained VGG layers
            conv_name = 'conv%d_%d' % (pool_counter, local_counter)
            lr_m = lr_mult_distro[0]
            n.tops[conv_name] = L.Convolution(
                n.tops[last_layer[0]],
                kernel_size=kernel[l],
                num_output=outCH[l],
                pad=int(math.floor(kernel[l] / 2)),
                param=[
                    dict(lr_mult=lr_m, decay_mult=1),
                    dict(lr_mult=lr_m * 2, decay_mult=0)
                ],
                weight_filler=dict(type='gaussian', std=0.01),
                bias_filler=dict(type='constant'))
            last_layer[0] = conv_name
            last_layer[1] = conv_name
            print('%s\tch=%d\t%.1f' % (last_layer[0], outCH[l], lr_m))
            ReLUname = 'relu%d_%d' % (pool_counter, local_counter)
            n.tops[ReLUname] = L.ReLU(n.tops[last_layer[0]], in_place=True)
            local_counter += 1
            print(ReLUname)
        if layername[l] == 'B':
            pool_counter += 1
            local_counter = 1
        if layername[l] == 'C':
            if state == 'image':
                #conv_name = 'conv%d_stage%d' % (conv_counter, stage)
                conv_name = 'conv%d_%d_CPM' % (
                    pool_counter, local_counter
                )  # no image state in subsequent stages
                if stage == 1:
                    lr_m = lr_mult_distro[1]
                else:
                    lr_m = lr_mult_distro[1]
            else:  # fuse
                conv_name = 'Mconv%d_stage%d' % (conv_counter, stage)
                lr_m = lr_mult_distro[2]
                conv_counter += 1
            #if stage == 1:
            #    lr_m = 1
            #else:
            #    lr_m = lr_sub
            n.tops[conv_name] = L.Convolution(
                n.tops[last_layer[0]],
                kernel_size=kernel[l],
                num_output=outCH[l],
                pad=int(math.floor(kernel[l] / 2)),
                param=[
                    dict(lr_mult=lr_m, decay_mult=1),
                    dict(lr_mult=lr_m * 2, decay_mult=0)
                ],
                weight_filler=dict(type='gaussian', std=0.01),
                bias_filler=dict(type='constant'))
            last_layer[0] = conv_name
            last_layer[1] = conv_name
            print('%s\tch=%d\t%.1f' % (last_layer[0], outCH[l], lr_m))

            if layername[l + 1] != 'L':
                if (state == 'image'):
                    if (batchnorm == 1):
                        batchnorm_name = 'bn%d_stage%d' % (conv_counter, stage)
                        n.tops[batchnorm_name] = L.BatchNorm(
                            n.tops[last_layer[0]],
                            param=[
                                dict(lr_mult=0),
                                dict(lr_mult=0),
                                dict(lr_mult=0)
                            ])
                        #scale_filler=dict(type='constant', value=1), shift_filler=dict(type='constant', value=0.001))
                        last_layer[0] = batchnorm_name
                    #ReLUname = 'relu%d_stage%d' % (conv_counter, stage)
                    ReLUname = 'relu%d_%d_CPM' % (pool_counter, local_counter)
                    n.tops[ReLUname] = L.ReLU(n.tops[last_layer[0]],
                                              in_place=True)
                else:
                    if (batchnorm == 1):
                        batchnorm_name = 'Mbn%d_stage%d' % (conv_counter,
                                                            stage)
                        n.tops[batchnorm_name] = L.BatchNorm(
                            n.tops[last_layer[0]],
                            param=[
                                dict(lr_mult=0),
                                dict(lr_mult=0),
                                dict(lr_mult=0)
                            ])
                        #scale_filler=dict(type='constant', value=1), shift_filler=dict(type='constant', value=0.001))
                        last_layer[0] = batchnorm_name
                    ReLUname = 'Mrelu%d_stage%d' % (conv_counter, stage)
                    n.tops[ReLUname] = L.ReLU(n.tops[last_layer[0]],
                                              in_place=True)
                #last_layer = ReLUname
                print(ReLUname)

            #conv_counter += 1
            local_counter += 1

        elif layername[l] == 'C2':
            for level in range(0, 2):
                if state == 'image':
                    #conv_name = 'conv%d_stage%d' % (conv_counter, stage)
                    conv_name = 'conv%d_%d_CPM_L%d' % (
                        pool_counter, local_counter, level + 1
                    )  # no image state in subsequent stages
                    if stage == 1:
                        lr_m = lr_mult_distro[1]
                    else:
                        lr_m = lr_mult_distro[1]
                else:  # fuse
                    conv_name = 'Mconv%d_stage%d_L%d' % (conv_counter, stage,
                                                         level + 1)
                    lr_m = lr_mult_distro[2]
                    #conv_counter += 1
                #if stage == 1:
                #    lr_m = 1
                #else:
                #    lr_m = lr_sub
                if layername[l + 1] == 'L2' or layername[l + 1] == 'L3':
                    if level == 0:
                        outCH[l] = 38
                    else:
                        outCH[l] = 19

                n.tops[conv_name] = L.Convolution(
                    n.tops[last_layer[level]],
                    kernel_size=kernel[l],
                    num_output=outCH[l],
                    pad=int(math.floor(kernel[l] / 2)),
                    param=[
                        dict(lr_mult=lr_m, decay_mult=1),
                        dict(lr_mult=lr_m * 2, decay_mult=0)
                    ],
                    weight_filler=dict(type='gaussian', std=0.01),
                    bias_filler=dict(type='constant'))
                last_layer[level] = conv_name
                print('%s\tch=%d\t%.1f' % (last_layer[level], outCH[l], lr_m))

                if layername[l + 1] != 'L2' and layername[l + 1] != 'L3':
                    if (state == 'image'):
                        if (batchnorm == 1):
                            batchnorm_name = 'bn%d_stage%d_L%d' % (
                                conv_counter, stage, level + 1)
                            n.tops[batchnorm_name] = L.BatchNorm(
                                n.tops[last_layer[level]],
                                param=[
                                    dict(lr_mult=0),
                                    dict(lr_mult=0),
                                    dict(lr_mult=0)
                                ])
                            #scale_filler=dict(type='constant', value=1), shift_filler=dict(type='constant', value=0.001))
                            last_layer[level] = batchnorm_name
                        #ReLUname = 'relu%d_stage%d' % (conv_counter, stage)
                        ReLUname = 'relu%d_%d_CPM_L%d' % (
                            pool_counter, local_counter, level + 1)
                        n.tops[ReLUname] = L.ReLU(n.tops[last_layer[level]],
                                                  in_place=True)
                    else:
                        if (batchnorm == 1):
                            batchnorm_name = 'Mbn%d_stage%d_L%d' % (
                                conv_counter, stage, level + 1)
                            n.tops[batchnorm_name] = L.BatchNorm(
                                n.tops[last_layer[level]],
                                param=[
                                    dict(lr_mult=0),
                                    dict(lr_mult=0),
                                    dict(lr_mult=0)
                                ])
                            #scale_filler=dict(type='constant', value=1), shift_filler=dict(type='constant', value=0.001))
                            last_layer[level] = batchnorm_name
                        ReLUname = 'Mrelu%d_stage%d_L%d' % (conv_counter,
                                                            stage, level + 1)
                        n.tops[ReLUname] = L.ReLU(n.tops[last_layer[level]],
                                                  in_place=True)
                    print(ReLUname)

            conv_counter += 1
            local_counter += 1

        elif layername[l] == 'P':  # Pooling
            n.tops['pool%d_stage%d' % (pool_counter, stage)] = L.Pooling(
                n.tops[last_layer[0]],
                kernel_size=kernel[l],
                stride=stride[l],
                pool=P.Pooling.MAX)
            last_layer[0] = 'pool%d_stage%d' % (pool_counter, stage)
            pool_counter += 1
            local_counter = 1
            conv_counter += 1
            print(last_layer[0])

        elif layername[l] == 'L':
            # Loss: n.loss layer is only in training and testing nets, but not in deploy net.
            if deploy == False and "lmdb" not in data_source:
                n.tops['map_vec_stage%d' % stage] = L.Flatten(
                    n.tops[last_layer[0]])
                n.tops['loss_stage%d' % stage] = L.EuclideanLoss(
                    n.tops['map_vec_stage%d' % stage], n.tops[label_name[1]])
            elif deploy == False:
                level = 1
                name = 'weight_stage%d' % stage
                n.tops[name] = L.Eltwise(n.tops[last_layer[level]],
                                         n.tops[label_name[(level + 2)]],
                                         operation=P.Eltwise.PROD)
                n.tops['loss_stage%d' % stage] = L.EuclideanLoss(
                    n.tops[name], n.tops[label_name[level]])

            print('loss %d' % stage)
            stage += 1
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif layername[l] == 'L2':
            # Loss: n.loss layer is only in training and testing nets, but not in deploy net.
            weight = [lr_mult_distro[3], 1]
            # print lr_mult_distro[3]
            for level in range(0, 2):
                if deploy == False and "lmdb" not in data_source:
                    n.tops['map_vec_stage%d_L%d' %
                           (stage, level + 1)] = L.Flatten(
                               n.tops[last_layer[level]])
                    n.tops['loss_stage%d_L%d' %
                           (stage, level + 1)] = L.EuclideanLoss(
                               n.tops['map_vec_stage%d_L%d' % (stage, level + 1)],
                               n.tops[label_name[level]],
                               loss_weight=weight[level])
                elif deploy == False:
                    name = 'weight_stage%d_L%d' % (stage, level + 1)
                    n.tops[name] = L.Eltwise(n.tops[last_layer[level]],
                                             n.tops[label_name[(level + 2)]],
                                             operation=P.Eltwise.PROD)
                    n.tops['loss_stage%d_L%d' %
                           (stage, level + 1)] = L.EuclideanLoss(
                               n.tops[name],
                               n.tops[label_name[level]],
                               loss_weight=weight[level])

                print('loss %d level %d' % (stage, level + 1))

            stage += 1
            #last_connect = last_layer
            #last_layer = 'image'
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif layername[l] == 'L3':
            # Loss: n.loss layer is only in training and testing nets, but not in deploy net.
            weight = [lr_mult_distro[3], 1]
            # print lr_mult_distro[3]
            if deploy == False:
                level = 0
                n.tops['loss_stage%d_L%d' %
                       (stage, level + 1)] = L.Euclidean2Loss(
                           n.tops[last_layer[level]],
                           n.tops[label_name[level]],
                           n.tops[label_name[2]],
                           loss_weight=weight[level])
                print('loss %d level %d' % (stage, level + 1))
                level = 1
                n.tops['loss_stage%d_L%d' %
                       (stage, level + 1)] = L.EuclideanLoss(
                           n.tops[last_layer[level]],
                           n.tops[label_name[level]],
                           loss_weight=weight[level])
                print 'loss %d level %d' % (stage, level + 1)

            stage += 1
            #last_connect = last_layer
            #last_layer = 'image'
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif layername[l] == 'D':
            if deploy == False:
                n.tops['drop%d_stage%d' % (drop_counter, stage)] = L.Dropout(
                    n.tops[last_layer[0]],
                    in_place=True,
                    dropout_param=dict(dropout_ratio=0.5))
                drop_counter += 1
        elif layername[l] == '@':
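            # '@': fuse the two branch outputs of the current stage with the shared
            # feature map (share_point) by concatenating along the channel axis.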
            #if not share_point:
            #    share_point = last_layer
            n.tops['concat_stage%d' % stage] = L.Concat(
                n.tops[last_layer[0]],
                n.tops[last_layer[1]],
                n.tops[share_point],
                concat_param=dict(axis=1))

            local_counter = 1
            state = 'fuse'
            last_layer[0] = 'concat_stage%d' % stage
            last_layer[1] = 'concat_stage%d' % stage
            print last_layer
        elif layername[l] == '$':
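            # '$': remember the current first-branch top as the shared feature map
            # that later stages concatenate via '@'.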
            share_point = last_layer[0]
            pool_counter += 1
            local_counter = 1
            print 'share'

    # final process
    stage -= 1
    #if stage == 1:
    #    n.silence = L.Silence(n.pool_center_lower, ntop=0)

    if deploy == False:
        return str(n.to_proto())
    else:
        # For generating the deploy net: build the input information header string.
        deploy_str = 'input: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}'.format(
            '"' + input + '"', dim1, dim2, dim3, dim4)
        # Assemble the input header with the net layer string, dropping the first (placeholder) data layer.
        return deploy_str + '\n' + 'layer {' + 'layer {'.join(
            str(n.to_proto()).split('layer {')[2:])
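
A note on the deploy branch above: the serialized prototxt is split on the literal
string 'layer {'; the preamble and the first (placeholder data) layer are dropped,
and a hand-written input header is prepended. Below is a minimal standalone sketch
of the same string manipulation; the input name and dimensions are illustrative
assumptions, not values from this example:

def make_deploy_str(net_spec, input_name, dims):
    # net_spec: a caffe.NetSpec whose first layer is a placeholder data layer.
    # dims: illustrative [batch, channels, height, width] for the deploy input.
    header = 'input: "%s"\n' % input_name
    header += '\n'.join('input_dim: %d' % d for d in dims)
    chunks = str(net_spec.to_proto()).split('layer {')
    # chunks[0] is the preamble, chunks[1] is the placeholder data layer;
    # keep everything from the second layer onwards and re-join.
    body = 'layer {' + 'layer {'.join(chunks[2:])
    return header + '\n' + body
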
Esempio n. 30
0
def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
    if mode == 'val':
        n.data, n.cont, n.img_feature, n.label = L.Python( \
            module='vqa_data_layer', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=4 )
    else:
        n.data, n.cont, n.img_feature, n.label = L.Python(\
            module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
            param_str=mode_str, ntop=4 ) 
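    # In 'val' mode the plain VQA data provider is used; otherwise the KLD variant
    # is used, matching the SoftmaxKLDLoss applied at the end of the net.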
    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
                         weight_filler=dict(type='xavier'))
    n.embed_tanh = L.TanH(n.embed) 

    # LSTM
    #n.lstm1 = L.LSTM(\
    #               n.embed_tanh, n.cont,\
    #               recurrent_param=dict(\
    #                   num_output=config.LSTM_UNIT_NUM,\
    #                   weight_filler=dict(type='xavier')))
    #tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    #for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
    #    n.__setattr__('slice_first'+str(i), tops1[int(i)])
    #    n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    #n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
    #n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
    #                      reshape_param=dict(\
    #                          shape=dict(dim=[-1,1024])))
    #n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1 = L.LSTM(\
                   n.embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=config.LSTM_UNIT_NUM,\
                       weight_filler=dict(type='xavier')))
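    # Slice the LSTM output along the time axis (axis 0); only the last time step is
    # kept as the question feature, while the earlier slices are fed to Silence
    # layers so they are consumed without producing any output.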
    tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
        n.__setattr__('slice_first'+str(i), tops1[int(i)])
        n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})

    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=config.LSTM_UNIT_NUM, 
                       weight_filler=dict(type='xavier')))
    tops2 = L.Slice(n.lstm2, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
    for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
        n.__setattr__('slice_second'+str(i), tops2[int(i)])
        n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0))
    n.lstm2_out = tops2[config.MAX_WORDS_IN_QUESTION-1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.q_feat = L.Concat(*concat_bottom)
    '''
    Coarse Image-Question MFH fusion
    '''

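    # Order-2 MFB fusion: project the question and image features to JOINT_EMB_SIZE,
    # multiply element-wise, drop out, sum-pool over groups of MFB_FACTOR_NUM, then
    # apply signed square root and L2 normalisation.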
    n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj, eltwise_param=dict(operation=P.Eltwise.PROD))
    n.mfb_iq_o2_drop = L.Dropout(n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o2_resh = L.Reshape(n.mfb_iq_o2_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o2_sumpool = L.Pooling(n.mfb_iq_o2_resh, pool=P.Pooling.SUM, \
                                      pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o2_out = L.Reshape(n.mfb_iq_o2_sumpool,\
                                    reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
    n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)

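    # Order-3 MFH fusion: the order-2 dropout output is reused as an extra factor in
    # the element-wise product, then pooled and normalised the same way as order 2.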
    n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                  weight_filler=dict(type='xavier'))
    n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj, n.mfb_iq_o2_drop, eltwise_param=dict(operation=P.Eltwise.PROD))
    n.mfb_iq_o3_drop = L.Dropout(n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
    n.mfb_iq_o3_resh = L.Reshape(n.mfb_iq_o3_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
    n.mfb_iq_o3_sumpool = L.Pooling(n.mfb_iq_o3_resh, pool=P.Pooling.SUM, \
                                      pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
    n.mfb_o3_out = L.Reshape(n.mfb_iq_o3_sumpool,\
                                    reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
    n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
    n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)

    n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2,n.mfb_o3_l2)

    n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))

    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) 
    return n.to_proto()
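
A hedged usage sketch for the generator above; the batch size, vocabulary size,
output file name and folder path are placeholders, not values from this example
(only config.MAX_WORDS_IN_QUESTION is referenced as in the code):

proto = mfh_baseline(mode='train', batchsize=64,
                     T=config.MAX_WORDS_IN_QUESTION,
                     question_vocab_size=20000, folder='./data')
with open('mfh_baseline_train.prototxt', 'w') as f:
    f.write(str(proto))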