import json

import caffe
from caffe import layers as L
from caffe import params as P


def create(self, dataLayerParams, phase="train"):
        n = caffe.NetSpec()

        n.data, n.label = L.Python(module="NpyDataLayer",
                                   layer=self.dataLayer,
                                   ntop=2,
                                   param_str=str(dataLayerParams))

        n.input_conv = L.Convolution(n.data,
                                     num_output=16,
                                     kernel_size=1,
                                     stride=1,
                                     pad=1,
                                     bias_term=False,
                                     param=[dict(lr_mult=1, decay_mult=1)],
                                     weight_filler=dict(type="xavier"))
        n.input_relu = L.ReLU(n.input_conv, in_place=False)

        for i in range(len(self.stages)):
            for j in range(self.stages[i]):
                stageString = self.resnetString

                bottomString = 'n.input_relu'
                if (i != 0) or (j != 0):
                    bottomString = 'n.res{}_add'.format(
                        str(sum(self.stages[:i]) + j))

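                # Instantiate the residual-block template string: substitute
                # the bottom blob, output width, and block index, then exec
                # the resulting layer-construction code against `n`.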
                exec(
                    stageString.replace('(bottom)', bottomString).replace(
                        '(output)', str(2**i * 64)).replace(
                            '(n)', str(sum(self.stages[:i]) + j + 1)))

        exec(
            'n.pool_ave = L.Pooling(n.res{}_add, pool=P.Pooling.AVE, global_pooling=True)'
            .format(str(sum(self.stages))))
        n.classifier = L.InnerProduct(n.pool_ave,
                                      num_output=self.classCount,
                                      param=[
                                          dict(lr_mult=1, decay_mult=1),
                                          dict(lr_mult=2, decay_mult=0)
                                      ],
                                      weight_filler=dict(type='xavier'),
                                      bias_filler=dict(type='constant',
                                                       value=0))

        if phase == "train":
            n.loss = L.SoftmaxWithLoss(n.classifier, n.label)
        elif phase == "test":
            n.softmax_out = L.Softmax(n.classifier)
            n.accuracy_top1 = L.Accuracy(n.softmax_out,
                                         n.label,
                                         accuracy_param=dict(top_k=1, axis=1))
            n.accuracy_top5 = L.Accuracy(n.softmax_out,
                                         n.label,
                                         accuracy_param=dict(top_k=5, axis=1))
        else:  # deploy
            n.softmax_out = L.Softmax(n.classifier)
            n.result = L.ArgMax(n.softmax_out, argmax_param=dict(axis=1))

        return n.to_proto()
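
# The builders in this collection all end with NetSpec.to_proto(); a minimal,
# self-contained sketch of serializing such a spec to a prototxt file
# (assumes pycaffe is installed; the tiny net and file name are illustrative):
def write_tiny_argmax_net(path='tiny_argmax.prototxt'):
    n = caffe.NetSpec()
    n.data = L.Input(input_param=dict(shape=dict(dim=[1, 3, 8, 8])))
    n.pred = L.ArgMax(n.data, argmax_param=dict(axis=1))
    with open(path, 'w') as f:
        f.write(str(n.to_proto()))
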
def test_argmax2():
    # type: () -> caffe.NetSpec

    n = caffe.NetSpec()
    n.input1 = L.Input(shape=make_shape([6, 4, 64, 64]))
    n.argmax1 = L.ArgMax(n.input1, axis=-1)
    return n
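
# pycaffe routes the bare `axis` kwarg into argmax_param, and Caffe
# canonicalizes negative axes, so axis=-1 above selects the last blob axis.
# An equivalent net without the make_shape helper (shapes illustrative):
def argmax_last_axis_net():
    n = caffe.NetSpec()
    n.input1 = L.Input(input_param=dict(shape=dict(dim=[6, 4, 64, 64])))
    n.argmax1 = L.ArgMax(n.input1, argmax_param=dict(axis=-1))  # axis=3 here
    return n
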
def net():
    n = caffe.NetSpec()
    n.data = L.Input(input_param=dict(shape=dict(dim=data_shape)))
    n.dataout = L.ArgMax(n.data,
                         out_max_val=_out_max_val,
                         top_k=_top_k,
                         axis=_axis)
    return n.to_proto()
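
# net() above reads data_shape, _out_max_val, _top_k and _axis from its
# enclosing scope; a self-contained variant with those bound as parameters
# (default values are illustrative assumptions):
def make_argmax_net(data_shape=(1, 3, 16, 16), out_max_val=False, top_k=1,
                    axis=1):
    n = caffe.NetSpec()
    n.data = L.Input(input_param=dict(shape=dict(dim=list(data_shape))))
    n.dataout = L.ArgMax(n.data, out_max_val=out_max_val, top_k=top_k,
                         axis=axis)
    return n.to_proto()
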
def create_UNet():
    n = caffe.NetSpec()
    n.data = L.Input(include={'phase': caffe.TEST}, input_param={'shape': {'dim': [1, 1, SEG_MASK_WIDTH, SEG_MASK_HEIGHT]}})

    # encoder => level 1
    n.conv1_1, n.relu1_1 = conv_relu(n.data, 3, 1, 64, 0, True)  # conv_relu(bottom, kernel_size, stride, nout, pad, ...) -- sketched after this function
    n.conv1_2, n.relu1_2 = conv_relu(n.relu1_1, 3, 1, 64, 0, True)
    n.pool1 = L.Pooling(n.relu1_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)

    # encoder => level 2
    n.conv2_1, n.relu2_1 = conv_relu(n.pool1, 3, 1, 128, 0, True)
    n.conv2_2, n.relu2_2 = conv_relu(n.relu2_1, 3, 1, 128, 0, True)
    n.pool2 = L.Pooling(n.relu2_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)

    # encoder => level 3
    n.conv3_1, n.relu3_1 = conv_relu(n.pool2, 3, 1, 256, 0, True)
    n.conv3_2, n.relu3_2 = conv_relu(n.relu3_1, 3, 1, 256, 0, True)
    n.pool3 = L.Pooling(n.relu3_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)

    # encoder => level 4
    n.conv4_1, n.relu4_1 = conv_relu(n.pool3, 3, 1, 512, 0, True)
    n.conv4_2, n.relu4_2 = conv_relu(n.relu4_1, 3, 1, 512, 0, True)
    n.pool4 = L.Pooling(n.relu4_2, pool=P.Pooling.MAX, kernel_size=2, stride=2)

    # encoder => level 5
    n.conv5_1, n.relu5_1 = conv_relu(n.pool4, 3, 1, 1024, 0, True)
    n.conv5_2, n.relu5_2 = conv_relu(n.relu5_1, 3, 1, 1024, 0, True)

    # (bottom, ks, stride, nout, pad, crop_offset, cat)
    n.upconv1, n.concat1 = upconv_concat(n.relu5_2, 2, 2, 512, 0, 4, n.relu4_2)

    # decoder => level 1
    n.conv6_1, n.relu6_1 = conv_relu(n.concat1, 3, 1, 512, 0, True)
    n.conv6_2, n.relu6_2 = conv_relu(n.relu6_1, 3, 1, 512, 0, True)
    n.upconv2, n.concat2 = upconv_concat(n.relu6_2, 2, 2, 256, 0, 16, n.relu3_2)

    # decoder => level 2
    n.conv7_1, n.relu7_1 = conv_relu(n.concat2, 3, 1, 256, 0, True)
    n.conv7_2, n.relu7_2 = conv_relu(n.relu7_1, 3, 1, 256, 0, True)
    n.upconv3, n.concat3 = upconv_concat(n.relu7_2, 2, 2, 128, 0, 40, n.relu2_2)

    # decoder => level 3
    n.conv8_1, n.relu8_1 = conv_relu(n.concat3, 3, 1, 128, 0, True)
    n.conv8_2, n.relu8_2 = conv_relu(n.relu8_1, 3, 1, 128, 0, True)
    n.upconv4, n.concat4 = upconv_concat(n.relu8_2, 2, 2, 64, 0, 88, n.relu1_2)

    # decoder => level 4
    n.conv9_1, n.relu9_1 = conv_relu(n.concat4, 3, 1, 64, 0, True)
    n.conv9_2, n.relu9_2 = conv_relu(n.relu9_1, 3, 1, 64, 0, True)
    n.score = L.Convolution(n.relu9_2, kernel_size=1, num_output=3, pad=0)

    # n.labelcrop = L.Crop(n.label, n.score, crop_param={'axis': 2, 'offset': 92})
    # n.loss = L.SoftmaxWithLoss(n.score, n.labelcrop, loss_param={'ignore_label': 3}, propagate_down=[True, False])
    n.argmax = L.ArgMax(n.score, argmax_param={'axis': 1}, include=dict(phase=caffe.TEST))
    # n.acc, accuracy_by_class = L.Accuracy(n.score, n.labelcrop, accuracy_param={'axis': 1}, include=dict(phase=caffe.TEST), ntop=2)
    # n.confmat = L.Python(n.argmax, n.labelcrop, python_param={'module': 'python_confmat', 'layer': 'PythonConfMat', 'param_str': '{"test_iter":3780}'}, include=dict(phase=caffe.TEST))

    return n.to_proto()
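
# conv_relu and upconv_concat are not defined in this snippet; plausible
# sketches matching the call sites above (assumptions, not the original
# helpers -- in particular the trailing bool is read here as bias_term):
def conv_relu(bottom, ks, stride, nout, pad, bias):
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, bias_term=bias)
    return conv, L.ReLU(conv, in_place=True)


def upconv_concat(bottom, ks, stride, nout, pad, crop_offset, cat):
    up = L.Deconvolution(bottom, convolution_param=dict(
        kernel_size=ks, stride=stride, num_output=nout, pad=pad))
    crop = L.Crop(cat, up, crop_param=dict(axis=2, offset=crop_offset))
    return up, L.Concat(up, crop, axis=1)
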
    def create(self, dataLayerParams, phase="train"):
        n = caffe.NetSpec()

        n.data, n.label = L.Python(module="VDataLayer",
                                   layer=self.dataLayer,
                                   ntop=2,
                                   param_str=str(dataLayerParams))

        n.input_conv = L.Convolution(n.data,
                                     num_output=16,
                                     kernel_size=1,
                                     stride=1,
                                     pad=1,
                                     bias_term=False,
                                     param=[dict(lr_mult=1, decay_mult=1)],
                                     weight_filler=dict(type="xavier"))
        n.input_relu = L.ReLU(n.input_conv, in_place=False)

        for i in range(len(self.stages)):
            for j in range(self.stages[i]):
                stageString = self.resnetString

                bottomString = 'n.input_relu'
                if (i != 0) or (j != 0):
                    bottomString = 'n.res{}_add'.format(
                        str(sum(self.stages[:i]) + j))

                exec(
                    stageString.replace('(bottom)', bottomString).replace(
                        '(output)', str(2**i * 64)).replace(
                            '(n)', str(sum(self.stages[:i]) + j + 1)))

        exec(
            'n.conv_output = L.Convolution(n.res{}_add, num_output=2, kernel_size=1, stride=1, pad=1, bias_term=False, param=[dict(lr_mult=1, decay_mult=1)], weight_filler=dict(type="xavier"))'
            .format(str(sum(self.stages))))

        # reshape result and label
        n.flat_output = L.Reshape(n.conv_output,
                                  reshape_param={"shape": {
                                      "dim": [0, 2, -1]
                                  }})
        n.flat_label = L.Reshape(n.label,
                                 reshape_param={"shape": {
                                     "dim": [0, 1, -1]
                                 }})

        if phase == "train":
            n.softmax_out = L.Softmax(n.flat_output)
            n.loss = L.DiceLoss(n.softmax_out, n.flat_label)
        elif phase == "test":
            n.softmax_out = L.Softmax(n.flat_output)
            n.accu = L.DiceLoss(n.softmax_out, n.flat_label)
        else:  # deploy
            n.softmax_out = L.Softmax(n.flat_output)
            n.result = L.ArgMax(n.softmax_out, argmax_param=dict(axis=1))

        return n.to_proto()
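
# Note: L.DiceLoss is not part of stock Caffe; this snippet appears to rely
# on a fork that registers a native Dice layer (likely the V-Net codebase,
# judging by the VDataLayer module). With stock Caffe the same loss is
# usually mounted as a Python layer, as later examples here do:
# n.loss = L.Python(n.softmax_out, n.flat_label, loss_weight=1,
#                   python_param=dict(module='DiceLoss', layer='DiceLossLayer'))
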
def buildExecutableNet(lmdb_images, lmdb_labels, batch_size, phase):
    n = caffe.NetSpec()

    pref = common.layer_prefix

    # LOAD DATA (batch size 1 at test time, as in the original Test branch)
    data_batch_size = 1 if phase == "Test" else batch_size
    n.data = L.Data(batch_size=data_batch_size,
                    backend=P.Data.LMDB,
                    source=lmdb_images,
                    transform_param=dict(mean_file=common.mean_file))
    n.label = L.Data(batch_size=data_batch_size,
                     backend=P.Data.LMDB,
                     source=lmdb_labels)

    # Embed net....
    output = multinet(n.data, n, phase, pref)

    # Upsample to full size (fixed weights upsampling)
    setattr(
        n, pref + "score_temp3",
        L.Deconvolution(output,
                        param=[
                            dict(lr_mult=common.lw, decay_mult=common.lw),
                            dict(lr_mult=common.lb, decay_mult=0)
                        ],
                        convolution_param=dict(
                            kernel_size=16,
                            stride=8,
                            num_output=common.c,
                            pad=4,
                            weight_filler=dict(type='bilinear'))))
    score_temp3 = getattr(n, pref + "score_temp3")
    batch_norm(pref + "score_temp3", n, "temp3", pref, "", phase)

    if phase == "Train":
        n.loss = L.SoftmaxWithLoss(score_temp3,
                                   n.label,
                                   loss_param=dict(normalize=False,
                                                   ignore_label=common.ig_lbl))
    if phase == "Test":
        n.score_argmax = L.ArgMax(score_temp3, argmax_param=dict(axis=1))
        n.class_iou = L.IntersectionOverUnion(
            n.score_argmax,
            n.label,
            parse_iou_param=dict(num_labels=common.c,
                                 ignore_label=common.ig_lbl,
                                 total_im_num=common.testset_size))

    return n.to_proto()
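
# NetSpec supports dynamically named tops via setattr/getattr, which is what
# the pref + "score_temp3" pattern above relies on; a minimal demonstration
# (layer choice and names illustrative):
def prefixed_argmax(pref='branchA_'):
    n = caffe.NetSpec()
    n.data = L.Input(input_param=dict(shape=dict(dim=[1, 3, 4, 4])))
    setattr(n, pref + 'argmax', L.ArgMax(n.data, argmax_param=dict(axis=1)))
    return getattr(n, pref + 'argmax')
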
def compute_final_spixel_labels(pixel_spixel_assoc,
                                spixel_init,
                                num_spixels_h, num_spixels_w):

    # Compute the new superpixel indices
    rel_label = L.ArgMax(pixel_spixel_assoc, argmax_param=dict(axis=1),
                         propagate_down=False)
    new_spix_indices = L.RelToAbsIndex(rel_label, spixel_init,
                                       rel_to_abs_index_param=dict(num_spixels_h=int(num_spixels_h),
                                                                   num_spixels_w=int(num_spixels_w)),
                                       propagate_down=[False, False])

    return new_spix_indices
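
# The tops returned above are not attached to a NetSpec yet; a usage sketch
# (RelToAbsIndex is a custom layer, apparently from the SSN superpixel
# codebase, and the shapes here are illustrative):
def spixel_net(num_spixels_h=10, num_spixels_w=10):
    n = caffe.NetSpec()
    n.assoc = L.Input(input_param=dict(shape=dict(dim=[1, 9, 100, 100])))
    n.spixel_init = L.Input(input_param=dict(shape=dict(dim=[1, 1, 100, 100])))
    n.spix_indices = compute_final_spixel_labels(n.assoc, n.spixel_init,
                                                 num_spixels_h, num_spixels_w)
    return n.to_proto()
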
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size, use_gt=True):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    n.data, n.cont, n.img_feature, n.label, n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2 = \
        L.Python(module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=8)

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform',min=-0.08,max=0.08), param=fixed_weights)
    n.embed = L.TanH(n.embed_ba) 

    # LSTM1
    n.lstm1 = L.LSTM(\
                   n.embed, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)),
                   param=fixed_weights_lstm)
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_first'+str(i), tops1[int(i)])
        n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
    n.lstm1_out = tops1[T-1]
    n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':0.3})
    n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':0.3})
    # LSTM2
    n.lstm2 = L.LSTM(\
                   n.lstm1_droped, n.cont,\
                   recurrent_param=dict(\
                       num_output=1024,\
                       weight_filler=dict(type='uniform',min=-0.08,max=0.08),\
                       bias_filler=dict(type='constant',value=0)),
                   param=fixed_weights_lstm)
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis':0})
    for i in range(T-1):
        n.__setattr__('slice_second'+str(i), tops2[int(i)])
        n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0))
    n.lstm2_out = tops2[T-1]
    n.lstm2_reshaped = L.Reshape(n.lstm2_out,\
                          reshape_param=dict(\
                              shape=dict(dim=[-1,1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':0.3})
    concat_bottom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_bottom)

    # Tile question feature
    n.q_emb_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.q_emb_tiled_1 = L.Tile(n.q_emb_resh, axis=2, tiles=14)
    n.q_emb_resh_tiled = L.Tile(n.q_emb_tiled_1, axis=3, tiles=14)

    # Embed image feature
    n.i_emb = L.Convolution(n.img_feature, kernel_size=1, stride=1,
                            num_output=2048, pad=0, weight_filler=dict(type='xavier'),
                            param=fixed_weights)

    # Eltwise product and normalization
    n.eltwise = L.Eltwise(n.q_emb_resh_tiled, n.i_emb, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise_sqrt = L.SignedSqrt(n.eltwise)
    n.eltwise_l2 = L.L2Normalize(n.eltwise_sqrt)
    n.eltwise_drop = L.Dropout(n.eltwise_l2, dropout_param={'dropout_ratio': 0.3})

    # Attention for VQA
    n.att_conv1 = L.Convolution(n.eltwise_drop, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=1, pad=0, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att_map = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature  = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,2048])))

    # eltwise product + normalization again for VQA
    n.i_emb2 = L.InnerProduct(n.att_feature_resh, num_output=2048, weight_filler=dict(type='xavier'), param=fixed_weights)
    n.eltwise2 = L.Eltwise(n.lstm_12, n.i_emb2, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise2_sqrt = L.SignedSqrt(n.eltwise2)
    n.eltwise2_l2 = L.L2Normalize(n.eltwise2_sqrt)
    n.eltwise2_drop = L.Dropout(n.eltwise2_l2, dropout_param={'dropout_ratio': 0.3})

    n.prediction = L.InnerProduct(n.eltwise2_drop, num_output=3000, weight_filler=dict(type='xavier'), param=fixed_weights)

    # Take GT answer or Take the logits of the VQA model and get predicted answer to embed
    if use_gt:
        n.exp_emb_ans = L.Embed(n.label, input_dim=3000, num_output=300,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        n.exp_emb_ans = L.Embed(n.vqa_ans, input_dim=3000, num_output=300,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh, num_output=2048, weight_filler=dict(type='xavier'))

    # Merge VQA answer and visual+textual feature
    n.exp_emb_resh = L.Reshape(n.exp_emb_ans2, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    #n.exp_eltwise = L.Eltwise(n.eltwise_drop,  n.exp_emb_tiled, eltwise_param={'operation': P.Eltwise.PROD})
    n.eltwise_emb = L.Convolution(n.eltwise, kernel_size=1, stride=1, num_output=2048, pad=0, weight_filler=dict(type='xavier'))
    n.exp_eltwise = L.Eltwise(n.eltwise_emb,  n.exp_emb_tiled, eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = L.Dropout(n.exp_eltwise_l2, dropout_param={'dropout_ratio': 0.3})

    # Attention for Explanation
    n.exp_att_conv1 = L.Convolution(n.exp_eltwise_drop, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
    n.exp_att_conv1_relu = L.ReLU(n.exp_att_conv1)
    n.exp_att_conv2 = L.Convolution(n.exp_att_conv1_relu, kernel_size=1, stride=1, num_output=1, pad=0, weight_filler=dict(type='xavier'))
    n.exp_att_reshaped = L.Reshape(n.exp_att_conv2,reshape_param=dict(shape=dict(dim=[-1,1,14*14])))
    n.exp_att_softmax = L.Softmax(n.exp_att_reshaped, axis=2)
    n.exp_att_map = L.Reshape(n.exp_att_softmax,reshape_param=dict(shape=dict(dim=[-1,1,14,14])))
    
    exp_dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.exp_att_feature_prev  = L.SoftAttention(n.img_feature, n.exp_att_map, exp_dummy)
    n.exp_att_feature_resh = L.Reshape(n.exp_att_feature_prev, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.exp_att_feature_embed = L.InnerProduct(n.exp_att_feature_resh, num_output=2048, weight_filler=dict(type='xavier'))
    n.exp_lstm12_embed = L.InnerProduct(n.lstm_12, num_output=2048, weight_filler=dict(type='xavier'))
    n.exp_eltwise2 = L.Eltwise(n.exp_lstm12_embed, n.exp_att_feature_embed, eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_att_feature = L.Eltwise(n.exp_emb_ans2, n.exp_eltwise2, eltwise_param={'operation': P.Eltwise.PROD})

    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
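
# The Slice + Silence idiom in generator_proto keeps only the last LSTM
# timestep while silencing the rest; a minimal self-contained sketch of the
# pattern (T and shapes illustrative):
def last_timestep_net(T=4):
    n = caffe.NetSpec()
    n.seq = L.Input(input_param=dict(shape=dict(dim=[T, 2, 8])))
    tops = L.Slice(n.seq, ntop=T, slice_param=dict(axis=0))
    for i in range(T - 1):
        setattr(n, 'silence_' + str(i), L.Silence(tops[i], ntop=0))
    n.last = tops[T - 1]
    return n.to_proto()
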
def act_proto(mode, batchsize, exp_vocab_size, use_gt=True):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.img_feature, n.label, n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2 = \
        L.Python(module='activity_data_provider_layer', layer='ActivityDataProviderLayer', param_str=mode_str, ntop=6)

    # Attention
    n.att_conv1 = L.Convolution(n.img_feature,
                                kernel_size=1,
                                stride=1,
                                num_output=512,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu,
                                kernel_size=1,
                                stride=1,
                                num_output=1,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 1, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att_map = L.Reshape(n.att_softmax,
                          reshape_param=dict(shape=dict(dim=[-1, 1, 14, 14])))

    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = L.Reshape(
        n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 2048])))

    # Prediction
    n.prediction = L.InnerProduct(n.att_feature_resh,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'),
                                  param=fixed_weights)

    # Take GT answer or Take the logits of the VQA model and get predicted answer to embed
    if use_gt:
        n.exp_emb_ans = L.Embed(n.label,
                                input_dim=config.NUM_OUTPUT_UNITS,
                                num_output=300,
                                weight_filler=dict(type='uniform',
                                                   min=-0.08,
                                                   max=0.08))
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        n.exp_emb_ans = L.Embed(n.vqa_ans,
                                input_dim=config.NUM_OUTPUT_UNITS,
                                num_output=300,
                                weight_filler=dict(type='uniform',
                                                   min=-0.08,
                                                   max=0.08))
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = L.InnerProduct(n.exp_emb_ans_tanh,
                                    num_output=2048,
                                    weight_filler=dict(type='xavier'))

    # Merge activity answer and visual feature
    n.exp_emb_resh = L.Reshape(
        n.exp_emb_ans2, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)

    n.img_embed = L.Convolution(n.img_feature,
                                kernel_size=1,
                                stride=1,
                                num_output=2048,
                                pad=0,
                                weight_filler=dict(type='xavier'))
    n.exp_eltwise = L.Eltwise(n.img_embed,
                              n.exp_emb_tiled,
                              eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = L.Dropout(n.exp_eltwise_l2,
                                   dropout_param={'dropout_ratio': 0.3})

    # Attention for Explanation
    n.exp_att_conv1 = L.Convolution(n.exp_eltwise_drop,
                                    kernel_size=1,
                                    stride=1,
                                    num_output=512,
                                    pad=0,
                                    weight_filler=dict(type='xavier'))
    n.exp_att_conv1_relu = L.ReLU(n.exp_att_conv1)
    n.exp_att_conv2 = L.Convolution(n.exp_att_conv1_relu,
                                    kernel_size=1,
                                    stride=1,
                                    num_output=1,
                                    pad=0,
                                    weight_filler=dict(type='xavier'))
    n.exp_att_reshaped = L.Reshape(
        n.exp_att_conv2, reshape_param=dict(shape=dict(dim=[-1, 1, 14 * 14])))
    n.exp_att_softmax = L.Softmax(n.exp_att_reshaped, axis=2)
    n.exp_att_map = L.Reshape(
        n.exp_att_softmax, reshape_param=dict(shape=dict(dim=[-1, 1, 14, 14])))

    exp_dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                            data_filler=dict(type='constant', value=1),
                            ntop=1)
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, n.exp_att_map,
                                             exp_dummy)
    n.exp_att_feature_resh = L.Reshape(
        n.exp_att_feature_prev, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.exp_att_feature_embed = L.InnerProduct(n.exp_att_feature_resh,
                                             num_output=2048,
                                             weight_filler=dict(type='xavier'))
    n.exp_att_feature = L.Eltwise(n.exp_emb_ans2,
                                  n.exp_att_feature_embed,
                                  eltwise_param={'operation': P.Eltwise.PROD})

    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
    def test_argmax2(self):
        n = caffe.NetSpec()
        n.input1 = L.Input(shape=make_shape([6, 4, 64, 64]))
        n.argmax1 = L.ArgMax(n.input1, axis=-1)
        self._test_model(*self._netspec_to_model(n, 'argmax2'))
    def create_network_structure(self):

        self.pad = (self.kernel_size - 1) // 2  # integer division: pad must be an int
        self.net_spec = caffe.NetSpec()

        self.net_spec.data = L.Input(ntop=1,
                                     input_param={
                                         'shape': {
                                             'dim': [
                                                 self.batch_size,
                                                 self.data_channels,
                                                 self.input_size,
                                                 self.input_size,
                                                 self.input_size
                                             ]
                                         }
                                     })
        self.net_spec.target = L.Input(ntop=1,
                                       input_param={
                                           'shape': {
                                               'dim': [
                                                   self.batch_size,
                                                   self.label_channels,
                                                   self.input_size,
                                                   self.input_size,
                                                   self.input_size
                                               ]
                                           }
                                       },
                                       exclude={'stage': 'deploy'})

        last_layer = self.net_spec.data
        for i in range(1, self.num_blocks + 1):
            last_layer = self.add_contraction_block(last_layer, i)

        if self.do_dropout:
            last_layer = L.Dropout(last_layer,
                                   dropout_ratio=0.4,
                                   in_place=True)

        if self.use_batchnorm:
            last_layer = self.add_batchnormscale(
                name='encode_1',
                input=L.ReLU(L.Convolution(last_layer,
                                           pad=self.pad,
                                           kernel_size=self.kernel_size,
                                           num_output=self.base_n_filters *
                                           pow(2, self.num_blocks),
                                           weight_filler=self.weight_filler),
                             in_place=True))
            last_layer = self.add_batchnormscale(
                name='encode_2',
                input=L.ReLU(L.Convolution(last_layer,
                                           pad=self.pad,
                                           kernel_size=self.kernel_size,
                                           num_output=self.base_n_filters *
                                           pow(2, self.num_blocks),
                                           weight_filler=self.weight_filler),
                             in_place=True))
        else:
            last_layer = self.add_conv(last_layer,
                                       name='encode_1',
                                       filter_mult=self.num_blocks)
            last_layer = self.add_conv(last_layer,
                                       name='encode_2',
                                       filter_mult=self.num_blocks)

        for i in range(1, self.num_blocks + 1)[::-1]:
            last_layer = self.add_expansion_block(last_layer, i)

        self.net_spec.seg = L.Convolution(last_layer,
                                          pad=0,
                                          kernel_size=1,
                                          num_output=self.num_classes,
                                          weight_filler=self.weight_filler)

        self.net_spec.softmax = L.Softmax(self.net_spec.seg)
        self.net_spec.argmax = L.ArgMax(self.net_spec.softmax, axis=1)
        self.net_spec.silence = L.Silence(self.net_spec.argmax,
                                          ntop=0,
                                          include={'phase': caffe.TRAIN})
        self.net_spec.target_argmax = L.ArgMax(self.net_spec.target,
                                               axis=1,
                                               exclude={'stage': 'deploy'})

        if self.loss_func == 'xent':

            if self.ignore_label is None:
                self.net_spec.loss = L.SoftmaxWithLoss(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'})
            else:
                self.net_spec.loss = L.SoftmaxWithLoss(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    loss_param={'ignore_label': self.ignore_label})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    accuracy_param={'ignore_label': self.ignore_label})

        elif self.loss_func == 'dice':

            if self.ignore_label is None:
                self.net_spec.loss = L.Python(self.net_spec.softmax,
                                              self.net_spec.target,
                                              loss_weight=1,
                                              python_param=dict(
                                                  module='DiceLoss',
                                                  layer='DiceLossLayer'),
                                              exclude={'stage': 'deploy'})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'})
            else:
                self.net_spec.loss = L.Python(
                    self.net_spec.softmax,
                    self.net_spec.target,
                    loss_weight=1,
                    python_param=dict(module='DiceLoss',
                                      layer='DiceLossLayer',
                                      param_str="{'param1': " +
                                      str(self.ignore_label) + "}"),
                    exclude={'stage': 'deploy'})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    accuracy_param={'ignore_label': self.ignore_label})

        elif self.loss_func == 'both':

            if self.ignore_label is None:
                self.net_spec.xent_loss = L.SoftmaxWithLoss(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    loss_weight=10)
                self.net_spec.loss = L.Python(self.net_spec.softmax,
                                              self.net_spec.target,
                                              loss_weight=1,
                                              python_param=dict(
                                                  module='DiceLoss',
                                                  layer='DiceLossLayer'),
                                              exclude={'stage': 'deploy'})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'})
            else:
                self.net_spec.xent_loss = L.SoftmaxWithLoss(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    loss_weight=10,
                    loss_param={'ignore_label': self.ignore_label})
                self.net_spec.loss = L.Python(
                    self.net_spec.softmax,
                    self.net_spec.target,
                    loss_weight=1,
                    python_param=dict(module='DiceLoss',
                                      layer='DiceLossLayer',
                                      param_str="{'param1': " +
                                      str(self.ignore_label) + "}"),
                    exclude={'stage': 'deploy'})
                self.net_spec.accuracy = L.Accuracy(
                    self.net_spec.seg,
                    self.net_spec.target_argmax,
                    exclude={'stage': 'deploy'},
                    accuracy_param={'ignore_label': self.ignore_label})

        self.net_spec.dice = L.Python(self.net_spec.softmax,
                                      self.net_spec.target,
                                      loss_weight=1,
                                      python_param=dict(
                                          module='DiceIndex',
                                          layer='DiceIndexLayer'),
                                      exclude={'stage': 'deploy'})
def create_unet_model(batch_size, num_classes, input_size, base_n_filters, output_file):

    kernel_size = 3
    pad = (kernel_size - 1) // 2  # integer division: pad must be an int
    do_dropout = True
    weight_filler = dict(type='msra')

    n = caffe.NetSpec()

    n.data = L.Input(ntop=1, input_param =  { 'shape' : { 'dim': [batch_size, 1, input_size, input_size] } })
    n.target = L.Input(ntop=1, input_param =  { 'shape' : { 'dim': [batch_size, 1, input_size, input_size] } }, exclude={'stage' : 'deploy'})

    n.contr_1_1 = L.BatchNorm(L.ReLU(L.Convolution(n.data, pad=pad, kernel_size=kernel_size, num_output=base_n_filters, weight_filler=weight_filler), in_place=True), in_place=True)
    n.contr_1_2 = L.BatchNorm(L.ReLU(L.Convolution(n.contr_1_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters, weight_filler=weight_filler), in_place=True), in_place=True)
    n.pool_1 = L.Pooling(n.contr_1_2, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    n.contr_2_1 = L.BatchNorm(L.ReLU(L.Convolution(n.pool_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 2, weight_filler=weight_filler), in_place=True), in_place=True)
    n.contr_2_2 = L.BatchNorm(L.ReLU(L.Convolution(n.contr_2_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 2, weight_filler=weight_filler), in_place=True), in_place=True)
    n.pool_2 = L.Pooling(n.contr_2_2, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    n.contr_3_1 = L.BatchNorm(L.ReLU(L.Convolution(n.pool_2, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 4, weight_filler=weight_filler), in_place=True), in_place=True)
    n.contr_3_2 = L.BatchNorm(L.ReLU(L.Convolution(n.contr_3_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 4, weight_filler=weight_filler), in_place=True), in_place=True)
    n.pool_3 = L.Pooling(n.contr_3_2, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    n.contr_4_1 = L.BatchNorm(L.ReLU(L.Convolution(n.pool_3, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 8, weight_filler=weight_filler), in_place=True), in_place=True)
    n.contr_4_2 = L.BatchNorm(L.ReLU(L.Convolution(n.contr_4_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 8, weight_filler=weight_filler), in_place=True), in_place=True)
    n.pool_4 = L.Pooling(n.contr_4_2, kernel_size=2, stride=2, pool=P.Pooling.MAX)

    if do_dropout:
        n.pool_4 = L.Dropout(n.pool_4, dropout_ratio=0.4, in_place=True)

    n.encode_1 = L.BatchNorm(L.ReLU(L.Convolution(n.pool_4, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 16, weight_filler=weight_filler), in_place=True), in_place=True)
    n.encode_2 = L.BatchNorm(L.ReLU(L.Convolution(n.encode_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 16, weight_filler=weight_filler), in_place=True), in_place=True)
    n.upscale_1 = L.Deconvolution(n.encode_2, convolution_param=dict(num_output=base_n_filters * 16, kernel_size=2, stride=2))

    n.concat_1 = L.Concat(n.upscale_1, n.contr_4_2, axis=1)
    n.expand_1_1 = L.BatchNorm(L.ReLU(L.Convolution(n.concat_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 8, weight_filler=weight_filler), in_place=True), in_place=True)
    n.expand_1_2 = L.BatchNorm(L.ReLU(L.Convolution(n.expand_1_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 8, weight_filler=weight_filler), in_place=True), in_place=True)
    n.upscale_2 = L.Deconvolution(n.expand_1_2, convolution_param=dict(num_output=base_n_filters * 8, kernel_size=2, stride=2))

    n.concat_2 = L.Concat(n.upscale_2, n.contr_3_2, axis=1)
    n.expand_2_1 = L.BatchNorm(L.ReLU(L.Convolution(n.concat_2, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 4, weight_filler=weight_filler), in_place=True), in_place=True)
    n.expand_2_2 = L.BatchNorm(L.ReLU(L.Convolution(n.expand_2_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 4, weight_filler=weight_filler), in_place=True), in_place=True)
    n.upscale_3 = L.Deconvolution(n.expand_2_2, convolution_param=dict(num_output=base_n_filters * 4, kernel_size=2, stride=2))

    n.concat_3 = L.Concat(n.upscale_3, n.contr_2_2, axis=1)
    n.expand_3_1 = L.BatchNorm(L.ReLU(L.Convolution(n.concat_3, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 2, weight_filler=weight_filler), in_place=True), in_place=True)
    n.expand_3_2 = L.BatchNorm(L.ReLU(L.Convolution(n.expand_3_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters * 2, weight_filler=weight_filler), in_place=True), in_place=True)
    n.upscale_4 = L.Deconvolution(n.expand_3_2, convolution_param=dict(num_output=base_n_filters * 2, kernel_size=2, stride=2))

    n.concat_4 = L.Concat(n.upscale_4, n.contr_1_2, axis=1)
    n.expand_4_1 = L.BatchNorm(L.ReLU(L.Convolution(n.concat_4, pad=pad, kernel_size=kernel_size, num_output=base_n_filters, weight_filler=weight_filler), in_place=True), in_place=True)
    n.expand_4_2 = L.BatchNorm(L.ReLU(L.Convolution(n.expand_4_1, pad=pad, kernel_size=kernel_size, num_output=base_n_filters, weight_filler=weight_filler), in_place=True), in_place=True)

    n.seg = L.Convolution(n.expand_4_2, pad=0, kernel_size=1, num_output=num_classes, weight_filler=weight_filler)

    n.softmax = L.Softmax(n.seg, include={'phase':caffe.TEST})
    n.argmax = L.ArgMax(n.softmax, axis=1, include={'phase':caffe.TEST})
    n.loss = L.SoftmaxWithLoss(n.seg, n.target, include={'phase':caffe.TRAIN})
    n.accuracy = L.Accuracy(n.seg, n.target, exclude={'stage' : 'deploy'})

    if output_file is not None:
        with open(output_file, 'w') as f:
            f.write(str(n.to_proto()))

    return n
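
# Usage sketch (hyperparameters and file name illustrative, not from the
# source):
if __name__ == '__main__':
    create_unet_model(batch_size=4, num_classes=3, input_size=256,
                      base_n_filters=32, output_file='unet.prototxt')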