Example #1
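    # NOTE: these are methods excerpted from a larger class; the snippet
    # assumes module-level imports of subprocess, numpy as np, and
    # NetParameter / LayerParameter from the Caffe protobuf bindings.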
    def __init__(self, model_file, weightdir):
        netparam = NetParameter()
        layerparam = LayerParameter()
        with open(model_file, 'rb') as f:
            netparam.ParseFromString(f.read())

        cmd = 'mkdir %s/epoch0' % weightdir
        res = subprocess.call(cmd, shell=True)

        print(len(netparam.layers))
        curweights = 0
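        # Walk every layer; a layer with two blobs carries weights (blob 0)
        # and biases (blob 1), which are dumped as raw float32 .dat files.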
        for i in range(len(netparam.layers)):
            #print '%d %d' % (i, curweights)
            if hasattr(netparam.layers[i], 'blobs') and len(netparam.layers[i].blobs) == 2:
                filename = '%s/epoch0/weights_%d.dat' % (weightdir, curweights)
                #print filename
                if netparam.layers[i].type == layerparam.LayerType.Value('CONVOLUTION'):
                    num_output = netparam.layers[i].convolution_param.num_output
                    kernelsize = netparam.layers[i].convolution_param.kernel_size
                    orifilters = np.array(netparam.layers[i].blobs[0].data, dtype=np.float32)
                    channels = np.shape(orifilters)[0] // num_output // kernelsize // kernelsize
                    orifilters = orifilters.reshape([num_output, channels, kernelsize, kernelsize])
                    newfilters = np.zeros(np.shape(orifilters), dtype=np.float32)
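                    # Rotate each 2-D kernel by 180 degrees (rot90 applied
                    # twice), converting between correlation and convolution.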
                    for outidx in range(num_output):
                        for chaidx in range(channels):
                            newfilters[outidx, chaidx, :, :] = np.rot90(orifilters[outidx, chaidx, :,:],2)
                    newfilters.reshape(-1).tofile(filename)
                else:
                    num_output = netparam.layers[i].inner_product_param.num_output
                    theweight = np.array(netparam.layers[i].blobs[0].data, dtype=np.float32)
                    input_dim = theweight.shape[0] // num_output
                    theweight = np.transpose(theweight.reshape([num_output, input_dim]))
                    theweight.tofile(filename)
                #np.array(netparam.layers[i].blobs[0].data, dtype=np.float32).tofile(filename)
                
                filename = '%s/epoch0/bias_%d.dat' % (weightdir, curweights)
                #print filename
                np.array(netparam.layers[i].blobs[1].data, dtype=np.float32).tofile(filename)
                curweights += 1
            elif netparam.layers[i].type == layerparam.LayerType.Value('POOLING'):
                curweights += 1

    def get_net(self,
                param,
                batch_size,
                loss_weight=None,
                net=None,
                mode="train"):
        ''' Mode can be 'train', 'test', 'scst_decode' or 'scst_train'. '''

        if not net:
            net = NetParameter()

        if mode == "test":
            dummy_layer = net.layer.add()
            dummy_layer.name = "input"
            dummy_layer.top.append(dummy_layer.name)
            dummy_layer.type = "DummyData"
            filler = dummy_layer.dummy_data_param.data_filler.add()
            # Note that DataLayer uses the end_of_sequence to start sequences,
            # so we will do the same
            filler.value = param['end_of_sequence']
            blob_shape = dummy_layer.dummy_data_param.shape.add()
            blob_shape.dim.append(batch_size)
            blob_shape.dim.append(1)
        if mode == "train" or mode == "scst_train":
            input_slice_layer = net.layer.add()
            input_slice_layer.name = "input_slice"
            input_slice_layer.type = "Slice"
            input_slice_layer.slice_param.slice_dim = 1
            input_slice_layer.bottom.append('input_sentence')
            for i in range(param['max_length']):
                input_slice_layer.top.append("input" if i == 0
                                             else "input_%d" % i)
                if i != 0:
                    input_slice_layer.slice_param.slice_point.append(i)

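        # Unroll the decoder over time: one copy of each layer per word
        # position, with weights shared across steps via named param specs.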
        for i in range(param['max_length']):
            if i == 0:

                for j in range(param['num_lstm_stacks']):
                    dummy_layer = net.layer.add()
                    dummy_layer.name = 'lstm%d_hidden_prev' % j
                    dummy_layer.top.append(dummy_layer.name)
                    dummy_layer.type = "DummyData"
                    blob_shape = dummy_layer.dummy_data_param.shape.add()
                    blob_shape.dim.append(batch_size)
                    blob_shape.dim.append(param['lstm_num_cells'])

                    dummy_mem_cell = net.layer.add()
                    dummy_mem_cell.name = 'lstm%d_mem_cell_prev' % j
                    dummy_mem_cell.top.append(dummy_mem_cell.name)
                    dummy_mem_cell.type = "DummyData"
                    blob_shape = dummy_mem_cell.dummy_data_param.shape.add()
                    blob_shape.dim.append(batch_size)
                    blob_shape.dim.append(param['lstm_num_cells'])

            for j in range(param['num_lstm_stacks']):

                if j == 0:
                    embed_layer = net.layer.add()
                    embed_layer.name = "embedding" if i == 0 else "embedding_%d" % i
                    embed_layer.type = "Embed"
                    embed_layer.bottom.append("input" if i == 0
                                              else "input_%d" % i)
                    embed_layer.propagate_down.append(False)
                    embed_layer.top.append(embed_layer.name)
                    embed_layer.embed_param.bias_term = False
                    embed_layer.embed_param.input_dim = param['vocab_size']
                    embed_layer.embed_param.num_output = param[
                        'lstm_num_cells']
                    add_weight_filler(embed_layer.embed_param.weight_filler)
                    p = embed_layer.param.add()
                    p.name = 'embed_param'  # Share weights

                if j != 0:
                    # Set up attention mechanism

                    inner_product_layer = net.layer.add()
                    inner_product_layer.name = "hidden_att_%d" % i
                    inner_product_layer.bottom.append(
                        lstm_output_blob)  # batch_size x lstm_num_cells
                    inner_product_layer.top.append(
                        inner_product_layer.name
                    )  # batch_size x att_hidden_units
                    inner_product_layer.type = "InnerProduct"
                    inner_product_layer.inner_product_param.num_output = param[
                        'att_hidden_units']
                    inner_product_layer.inner_product_param.bias_term = False
                    add_weight_filler(
                        inner_product_layer.inner_product_param.weight_filler)
                    p = inner_product_layer.param.add()
                    p.name = 'hidden_att_param_0'  # Share weights

                    tile_layer = net.layer.add()
                    tile_layer.name = "tile_hidden_att_%d" % i
                    tile_layer.top.append(
                        tile_layer.name
                    )  # batch_size x (att_features x att_hidden_units)
                    tile_layer.bottom.append(inner_product_layer.name)
                    tile_layer.type = "Tile"
                    tile_layer.tile_param.axis = 1
                    tile_layer.tile_param.tiles = param['max_att_features']

                    reshape_layer = net.layer.add()
                    reshape_layer.name = "tile_hidden_reshape_%d" % i
                    reshape_layer.type = "Reshape"
                    reshape_layer.bottom.append(tile_layer.name)
                    reshape_layer.top.append(reshape_layer.name)
                    reshape_layer.reshape_param.shape.dim.append(
                        0)  # Batch size
                    reshape_layer.reshape_param.shape.dim.append(-1)
                    reshape_layer.reshape_param.shape.dim.append(
                        param['att_hidden_units'])

                    sum_layer = net.layer.add()
                    sum_layer.name = "sum_hidden_att_%d" % i
                    sum_layer.top.append(
                        sum_layer.name
                    )  # batch_size x att_features x att_hidden_units
                    if mode == 'test' or mode == 'scst_train':
                        sum_layer.bottom.append(
                            "beam_fc"
                        )  # batch_size x att_features x att_hidden_units
                    else:
                        sum_layer.bottom.append(
                            "fc"
                        )  # batch_size x att_features x att_hidden_units
                    sum_layer.bottom.append(
                        reshape_layer.name
                    )  # batch_size x att_features x att_hidden_units
                    sum_layer.type = "Eltwise"
                    sum_layer.eltwise_param.operation = sum_layer.eltwise_param.SUM

                    tanh_layer = net.layer.add()
                    tanh_layer.name = "hidden_tanh_%d" % i
                    tanh_layer.top.append(sum_layer.name)
                    tanh_layer.bottom.append(sum_layer.name)
                    tanh_layer.type = "TanH"

                    proj_layer = net.layer.add()
                    proj_layer.name = "predict_att_%d" % i
                    proj_layer.type = "InnerProduct"
                    proj_layer.bottom.append(
                        sum_layer.name
                    )  # batch_size x att_features x att_hidden_units
                    proj_layer.top.append(
                        proj_layer.name)  # batch_size x att_features x 1
                    proj_layer.inner_product_param.num_output = 1
                    proj_layer.inner_product_param.axis = 2
                    proj_layer.inner_product_param.bias_term = False
                    add_weight_filler(
                        proj_layer.inner_product_param.weight_filler)
                    p = proj_layer.param.add()
                    p.name = 'predict_att_param_0'  # Share weights

                    reshape_layer = net.layer.add()
                    reshape_layer.name = "reshape_predict_att_%d" % i
                    reshape_layer.type = "Reshape"
                    reshape_layer.bottom.append(proj_layer.name)
                    reshape_layer.top.append(
                        reshape_layer.name)  # batch_size x att_features
                    reshape_layer.reshape_param.shape.dim.append(
                        0)  # Batch size
                    reshape_layer.reshape_param.shape.dim.append(-1)

                    softmax_layer = net.layer.add()
                    softmax_layer.name = "att_weight_%d" % i
                    softmax_layer.type = "Softmax"
                    softmax_layer.bottom.append(
                        reshape_layer.name)  # batch_size x att_features
                    if mode == 'test' or mode == 'scst_train':
                        softmax_layer.bottom.append(
                            "beam_num_boxes")  # batch_size x 1
                    else:
                        softmax_layer.bottom.append(
                            "num_boxes")  # batch_size x 1
                    softmax_layer.top.append(
                        softmax_layer.name)  # batch_size x att_features
                    softmax_layer.softmax_param.axis = 1
                    softmax_layer.softmax_param.engine = softmax_layer.softmax_param.CAFFE  # Not implemented in CUDNN

                    scale_layer = net.layer.add()
                    scale_layer.name = "att_product_%d" % i
                    scale_layer.top.append(
                        scale_layer.name
                    )  # batch_size x att_features x input_feature_size
                    if mode == 'test' or mode == 'scst_train':
                        scale_layer.bottom.append(
                            "beam_spatial_features"
                        )  # batch_size x att_features x input_feature_size
                    else:
                        scale_layer.bottom.append(
                            "spatial_features"
                        )  # batch_size x att_features x input_feature_size
                    scale_layer.bottom.append(
                        softmax_layer.name)  # batch_size x att_features
                    scale_layer.type = "Scale"
                    scale_layer.scale_param.axis = 0

                    permute_layer = net.layer.add()
                    permute_layer.name = "permute_att_%d" % i
                    permute_layer.bottom.append(
                        scale_layer.name
                    )  # batch_size x att_features x input_feature_size
                    permute_layer.top.append(
                        permute_layer.name
                    )  # batch_size x input_feature_size x att_features
                    permute_layer.type = "Permute"
                    permute_layer.permute_param.order.append(0)
                    permute_layer.permute_param.order.append(2)
                    permute_layer.permute_param.order.append(1)

                    reduction_layer = net.layer.add()
                    reduction_layer.name = "fc8_%d" % i
                    reduction_layer.type = "Reduction"
                    reduction_layer.bottom.append(
                        permute_layer.name
                    )  # batch_size x input_feature_size x att_features
                    reduction_layer.top.append(
                        reduction_layer.name
                    )  # batch_size x input_feature_size
                    reduction_layer.reduction_param.axis = 2

                concat_layer = net.layer.add()
                concat_layer.name = 'concat%d_t%d' % (j, i)
                concat_layer.top.append(concat_layer.name)
                concat_layer.type = "Concat"
                # Data input is either the last word or the layer below output
                if j == 0:
                    concat_layer.bottom.append(embed_layer.name)
                    if mode == 'test' or mode == 'scst_train':
                        concat_layer.bottom.append(
                            "beam_context")  # Add CNN context
                    else:
                        concat_layer.bottom.append(
                            "context")  # Add CNN context
                    # Add copy down from lstm above
                    if i == 0:
                        concat_layer.bottom.append('lstm%d_hidden_prev' %
                                                   (j + 1))
                    else:
                        if mode == 'test' or mode == 'scst_decode':
                            concat_layer.bottom.append('lstm%d_hidden_prev%d' %
                                                       (j + 1, i))
                        else:
                            concat_layer.bottom.append('lstm%d_hidden%d' %
                                                       (j + 1, i - 1))
                else:
                    concat_layer.bottom.append('lstm%d_hidden%d' % (j - 1, i))
                    concat_layer.bottom.append(
                        reduction_layer.name
                    )  # Include attended cnn feature as next LSTM input
                # Plus either dummy or the diagonal connection from before
                if i == 0:
                    concat_layer.bottom.append('lstm%d_hidden_prev' % (j))
                else:
                    if mode == 'test' or mode == 'scst_decode':
                        concat_layer.bottom.append('lstm%d_hidden_prev%d' %
                                                   (j, i))
                    else:
                        concat_layer.bottom.append('lstm%d_hidden%d' %
                                                   (j, i - 1))

                lstm_layer = net.layer.add()
                lstm_layer.name = ('lstm%d' % (j + 1) if i == 0
                                   else 'lstm%d_t%d' % (j + 1, i))
                lstm_layer.type = "LSTMNode"
                lstm_layer.lstm_param.num_cells = param['lstm_num_cells']

                add_weight_filler(lstm_layer.lstm_param.input_weight_filler)
                add_weight_filler(
                    lstm_layer.lstm_param.input_gate_weight_filler)
                add_weight_filler(
                    lstm_layer.lstm_param.forget_gate_weight_filler)
                add_weight_filler(
                    lstm_layer.lstm_param.output_gate_weight_filler)

                add_bias_filler(lstm_layer.lstm_param.input_bias_filler)
                add_bias_filler(lstm_layer.lstm_param.input_gate_bias_filler)
                add_bias_filler(lstm_layer.lstm_param.forget_gate_bias_filler,
                                1)
                add_bias_filler(lstm_layer.lstm_param.output_gate_bias_filler)

                for k in range(2):
                    param_spec = lstm_layer.param.add()
                    param_spec.name = 'lstm%d_param_%d' % (j, k)
                lstm_output_blob = 'lstm%d_hidden%d' % (j, i)
                lstm_layer.top.append(lstm_output_blob)
                lstm_layer.top.append('lstm%d_mem_cell%d' % (j, i))
                lstm_layer.bottom.append('concat%d_t%d' % (j, i))
                lstm_layer.propagate_down.append(True)
                if i == 0:
                    lstm_layer.bottom.append('lstm%d_mem_cell_prev' % j)
                    lstm_layer.propagate_down.append(False)
                else:
                    if mode == 'test' or mode == 'scst_decode':
                        lstm_layer.bottom.append('lstm%d_mem_cell_prev%d' %
                                                 (j, i))
                    else:
                        lstm_layer.bottom.append('lstm%d_mem_cell%d' %
                                                 (j, i - 1))
                    lstm_layer.propagate_down.append(True)

                if j == param['num_lstm_stacks'] - 1:

                    inner_product_layer = net.layer.add()
                    inner_product_layer.name = "predict" if i == 0 else "predict_%d" % i
                    inner_product_layer.top.append(inner_product_layer.name)
                    inner_product_layer.bottom.append(lstm_output_blob)
                    inner_product_layer.type = "InnerProduct"
                    inner_product_layer.inner_product_param.num_output = param[
                        'vocab_size']
                    inner_product_layer.inner_product_param.axis = 1
                    p = inner_product_layer.param.add()
                    p.lr_mult = 1
                    p.decay_mult = 1
                    p.name = 'predict_param_0'
                    p = inner_product_layer.param.add()
                    p.lr_mult = 2
                    p.decay_mult = 0
                    p.name = 'predict_param_1'
                    add_weight_filler(
                        inner_product_layer.inner_product_param.weight_filler)
                    add_bias_filler(
                        inner_product_layer.inner_product_param.bias_filler)

                    if mode == 'test' or mode == 'scst_decode':
                        softmax_layer = net.layer.add()
                        softmax_layer.name = "probs_%d" % i
                        softmax_layer.type = "Softmax"
                        softmax_layer.bottom.append(inner_product_layer.name)
                        softmax_layer.top.append(softmax_layer.name)
                        softmax_layer.softmax_param.axis = 1

                        log_layer = net.layer.add()
                        log_layer.name = "logp_%d" % i
                        log_layer.type = "Log"
                        log_layer.bottom.append(softmax_layer.name)
                        log_layer.top.append(log_layer.name)

                    if mode == 'test':
                        # Beam search test
                        bs_layer = net.layer.add()
                        bs_layer.name = "beam_search_%d" % i
                        bs_layer.type = "BeamSearchNode"
                        if i > 0:
                            bs_layer.bottom.append("bs_scores_%d" % (i - 1))
                            bs_layer.bottom.append("bs_sentence_%d" % (i - 1))
                        bs_layer.bottom.append(log_layer.name)
                        for k in range(param['num_lstm_stacks']):
                            bs_layer.bottom.append("lstm%d_hidden%d" % (k, i))
                            bs_layer.bottom.append("lstm%d_mem_cell%d" %
                                                   (k, i))

                        bs_layer.top.append("log_prob" if i + 1 == param['max_length']
                                            else "bs_scores_%d" % i)
                        bs_layer.top.append("caption" if i + 1 == param['max_length']
                                            else "bs_sentence_%d" % i)
                        bs_layer.top.append("input_%d" % (i + 1))
                        for k in range(param['num_lstm_stacks']):
                            bs_layer.top.append("lstm%d_hidden_prev%d" %
                                                (k, i + 1))
                            bs_layer.top.append("lstm%d_mem_cell_prev%d" %
                                                (k, i + 1))

                        bs = bs_layer.beam_search_param
                        bs.beam_size = param['test_beam_size']
                        bs.end_of_sequence = param['end_of_sequence']
                        bs.ignore_label = param['ignore_label']
                        for word in param['allowed_multiple']:
                            bs.allowed_multiple.append(word)

        if mode == 'train' or mode == "scst_train":
            hidden_concat_layer = net.layer.add()
            hidden_concat_layer.type = "Concat"
            hidden_concat_layer.name = 'predict_concat'
            hidden_concat_layer.top.append(hidden_concat_layer.name)
            hidden_concat_layer.concat_param.concat_dim = 1
            for i in range(param['max_length']):
                hidden_concat_layer.bottom.append("predict" if i == 0
                                                  else "predict_%d" % i)

            reshape_layer = net.layer.add()
            reshape_layer.name = 'predict_reshape'
            reshape_layer.type = "Reshape"
            reshape_layer.bottom.append('predict_concat')
            reshape_layer.top.append(reshape_layer.name)
            reshape_layer.reshape_param.shape.dim.append(0)  # Batch size
            reshape_layer.reshape_param.shape.dim.append(param['max_length'])
            reshape_layer.reshape_param.shape.dim.append(param['vocab_size'])

            word_loss_layer = net.layer.add()
            word_loss_layer.name = "cross_entropy_loss"
            word_loss_layer.type = "SoftmaxWithLoss"
            word_loss_layer.bottom.append("predict_reshape")
            word_loss_layer.bottom.append("target_sentence")
            word_loss_layer.propagate_down.append(True)
            word_loss_layer.propagate_down.append(False)
            if mode == "scst_train":
                word_loss_layer.bottom.append("score_weights")
                word_loss_layer.propagate_down.append(False)
            word_loss_layer.top.append(word_loss_layer.name)
            if loss_weight is None:
                word_loss_layer.loss_weight.append(param['max_length'])
            else:
                word_loss_layer.loss_weight.append(loss_weight)
            word_loss_layer.loss_param.ignore_label = param['ignore_label']
            word_loss_layer.softmax_param.axis = 2

            accuracy_layer = net.layer.add()
            accuracy_layer.name = "accuracy"
            accuracy_layer.type = "Accuracy"
            accuracy_layer.bottom.append("predict_reshape")
            accuracy_layer.bottom.append("target_sentence")
            accuracy_layer.top.append(accuracy_layer.name)
            accuracy_layer.accuracy_param.ignore_label = param['ignore_label']
            accuracy_layer.accuracy_param.axis = 2

        if mode != 'scst_decode':
            silence_layer = net.layer.add()
            silence_layer.name = "silence"
            silence_layer.type = "Silence"
            for j in range(param['num_lstm_stacks']):
                silence_layer.bottom.append("lstm%d_mem_cell%d" %
                                            (j, param['max_length'] - 1))
            silence_layer.bottom.append("image_id")
            silence_layer.bottom.append("boxes")
            if mode == 'test':
                silence_layer.bottom.append("input_%d" % param['max_length'])
                for j in range(param['num_lstm_stacks']):
                    silence_layer.bottom.append("lstm%d_hidden_prev%d" %
                                                (j, param['max_length']))
                    silence_layer.bottom.append("lstm%d_mem_cell_prev%d" %
                                                (j, param['max_length']))

        return net
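
    # A minimal call sketch (hypothetical values; the real param dict comes
    # from the surrounding project's configuration):
    #
    #   net = builder.get_net(param, batch_size=100, mode='train')
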
    def get_data_layers(self, param, mode='train'):
        ''' Mode can be 'train', 'test' or 'scst' for self-critical sequence training. '''
        net = NetParameter()

        if mode == 'test':
            data = net.layer.add()
            data.type = "Python"
            data.name = "test_data"
            data.top.append("image_id")
            data.top.append(
                "num_boxes")  # (b, 1) - excludes the pooled box in count
            data.top.append("boxes")  # (b, max_boxes, 4)
            data.top.append("features")  # (b, max_boxes, 2048)
            data.python_param.module = "rcnn_layers"
            data.python_param.layer = "RCNNTestDataLayer"
        data.python_param.param_str = str({
            'batch_size': param['test_batch_size'],
            'feature_sources': param['test_feature_sources'],
            'pool': True,
            'max_boxes': param['max_att_features']
        })

        elif mode == 'train':
            data = net.layer.add()
            data.type = "Python"
            data.name = "sequence_data"
            data.top.append("image_id")
            data.top.append("input_sentence")
            data.top.append("target_sentence")
            data.top.append(
                "num_boxes")  # (b, 1) - excludes the pooled box in count
            data.top.append("boxes")  # (b, max_boxes, 4)
            data.top.append("features")  # (b, max_boxes, 2048)
            data.python_param.module = "rcnn_layers"
            data.python_param.layer = "RCNNCaptionTrainDataLayer"
        data.python_param.param_str = str({
            'end_of_sequence': param['end_of_sequence'],
            'ignore_label': param['ignore_label'],
            'sequence_length': param['max_length'],
            'batch_size': param['train_batch_size'],
            'caption_sources': param['train_caption_sources'],
            'feature_sources': param['train_feature_sources'],
            'pool': True,
            'max_boxes': param['max_att_features']
        })

        elif mode == 'scst':
            data = net.layer.add()
            data.type = "Python"
            data.name = "train_data"
            data.top.append("image_id")
            data.top.append(
                "num_boxes")  # (b, 1) - excludes the pooled box in count
            data.top.append("boxes")  # (b, max_boxes, 4)
            data.top.append("features")  # (b, max_boxes, 2048)
            data.python_param.module = "rcnn_layers"
            data.python_param.layer = "RCNNTestDataLayer"
        data.python_param.param_str = str({
            'batch_size': param['train_batch_size'],
            'feature_sources': param['train_feature_sources'],
            'pool': True,
            'max_boxes': param['max_att_features']
        })

        else:
            raise ValueError('unknown data layer mode')

        slice_layer = net.layer.add()
        slice_layer.name = "feature_slice_layer"
        slice_layer.type = "Slice"
        slice_layer.slice_param.slice_dim = 1
        slice_layer.slice_param.slice_point.append(1)
        slice_layer.bottom.append("features")
        slice_layer.top.append("avg_pool")  # batch_size x 1 x cnn_filters
        slice_layer.top.append(
            "spatial_features")  # batch_size x att_features x cnn_filters

        flatten_layer = net.layer.add()
        flatten_layer.name = "context"
        flatten_layer.type = "Flatten"
        flatten_layer.bottom.append("avg_pool")  # batch_size x 1 x 2048
        flatten_layer.top.append(flatten_layer.name)  # batch_size x 2048
        flatten_layer.flatten_param.axis = 1

        fc_layer = net.layer.add()
        fc_layer.name = "fc"
        fc_layer.type = "InnerProduct"
        fc_layer.bottom.append(
            "spatial_features")  # batch_size x att_features x cnn_filters
        fc_layer.top.append(
            fc_layer.name)  # batch_size x att_features x att_hidden_units
        fc_layer.inner_product_param.num_output = param['att_hidden_units']
        fc_layer.inner_product_param.axis = 2
        fc_layer.inner_product_param.bias_term = False
        add_weight_filler(fc_layer.inner_product_param.weight_filler)

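        # For beam search decoding, tile each image-level blob beam_size
        # times and fold the beam dimension into the batch dimension.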
        if mode == 'test' or mode == 'scst':
            # Tile inputs for the number of beams and reshape into batchsize
            tile_layer = net.layer.add()
            tile_layer.name = "tile_context"
            tile_layer.top.append(tile_layer.name)
            tile_layer.bottom.append("context")
            tile_layer.type = "Tile"
            tile_layer.tile_param.tiles = param['test_beam_size']
            tile_layer.tile_param.axis = 1

            reshape_layer = net.layer.add()
            reshape_layer.name = "beam_context"
            reshape_layer.type = "Reshape"
            reshape_layer.bottom.append("tile_context")
            reshape_layer.top.append(reshape_layer.name)
            reshape_layer.reshape_param.shape.dim.append(-1)
            reshape_layer.reshape_param.shape.dim.append(param['cnn_filters'])

            tile_layer = net.layer.add()
            tile_layer.name = "tile_spatial_features"
            tile_layer.top.append(tile_layer.name)
            tile_layer.bottom.append("spatial_features")
            tile_layer.type = "Tile"
            tile_layer.tile_param.tiles = param['test_beam_size']
            tile_layer.tile_param.axis = 1

            reshape_layer = net.layer.add()
            reshape_layer.name = "beam_spatial_features"
            reshape_layer.type = "Reshape"
            reshape_layer.bottom.append(tile_layer.name)
            reshape_layer.top.append(reshape_layer.name)
            reshape_layer.reshape_param.shape.dim.append(-1)
            reshape_layer.reshape_param.shape.dim.append(
                param['max_att_features'])
            reshape_layer.reshape_param.shape.dim.append(param['cnn_filters'])

            tile_layer = net.layer.add()
            tile_layer.name = "tile_fc"
            tile_layer.top.append(tile_layer.name)
            tile_layer.bottom.append("fc")
            tile_layer.type = "Tile"
            tile_layer.tile_param.tiles = param['test_beam_size']
            tile_layer.tile_param.axis = 1

            reshape_layer = net.layer.add()
            reshape_layer.name = "beam_fc"
            reshape_layer.type = "Reshape"
            reshape_layer.bottom.append(tile_layer.name)
            reshape_layer.top.append(reshape_layer.name)
            reshape_layer.reshape_param.shape.dim.append(-1)
            reshape_layer.reshape_param.shape.dim.append(
                param['max_att_features'])
            reshape_layer.reshape_param.shape.dim.append(
                param['att_hidden_units'])

            tile_layer = net.layer.add()
            tile_layer.name = "tile_num_boxes"
            tile_layer.top.append(tile_layer.name)
            tile_layer.bottom.append("num_boxes")
            tile_layer.type = "Tile"
            tile_layer.tile_param.tiles = param['test_beam_size']
            tile_layer.tile_param.axis = 1

            reshape_layer = net.layer.add()
            reshape_layer.name = "beam_num_boxes"
            reshape_layer.type = "Reshape"
            reshape_layer.bottom.append(tile_layer.name)
            reshape_layer.top.append(reshape_layer.name)
            reshape_layer.reshape_param.shape.dim.append(-1)
            reshape_layer.reshape_param.shape.dim.append(1)
        return net

    def get_scst_net(self, param):
        ''' Build the self-critical sequence training (SCST) net: a beam
        search decode pass whose sampled captions are scored against the
        ground truth to weight the loss of a following training pass. '''
        net = NetParameter()

        beam_layer = net.layer.add()
        beam_layer.name = "beam"
        beam_layer.type = "BeamSearch"
        beam_layer.bottom.append("num_boxes")
        beam_layer.bottom.append("spatial_features")
        beam_layer.bottom.append("fc")
        beam_layer.bottom.append("context")
        beam_layer.top.append("caption")
        beam_layer.top.append("log_prob")
        beam_layer.top.append("log_prob_sequence")
        bs = beam_layer.beam_search_param
        bs.beam_size = param['test_beam_size']
        bs.sequence_length = param['max_length']
        bs.end_of_sequence = param['end_of_sequence']
        for word in param['allowed_multiple']:
            bs.allowed_multiple.append(word)
        for i in range(param['num_lstm_stacks']):
            # Previous hidden state
            rc = bs.recurrent_connection.add()
            rc.src = 'lstm%d_hidden0' % i
            rc.dest = 'lstm%d_hidden_prev' % i
            # Previous mem cell
            rc = bs.recurrent_connection.add()
            rc.src = 'lstm%d_mem_cell0' % i
            rc.dest = 'lstm%d_mem_cell_prev' % i
        bs.beam_search_connection.src = 'logp_0'
        bs.beam_search_connection.dest = 'input'
        for pname in [
                "embed_param", "lstm0_param_0", "lstm0_param_1",
                "hidden_att_param_0", "predict_att_param_0", "lstm1_param_0",
                "lstm1_param_1", "predict_param_0", "predict_param_1"
        ]:
            p = beam_layer.param.add()
            p.name = pname  # Share weights

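        # The single-step decode net runs inside the BeamSearch layer;
        # declare its input blobs and their shapes here.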
        inner_net = bs.net_param
        input_layer = inner_net.layer.add()
        input_layer.name = "input"
        input_layer.type = "Input"
        input_layer.top.append("num_boxes")
        input_layer.top.append("spatial_features")
        input_layer.top.append("fc")
        input_layer.top.append("context")
        input_layer.top.append(input_layer.name)
        blob_shape = input_layer.input_param.shape.add()
        blob_shape.dim.append(param['train_batch_size'])
        blob_shape.dim.append(1)
        blob_shape = input_layer.input_param.shape.add()
        blob_shape.dim.append(param['train_batch_size'])
        blob_shape.dim.append(param['max_att_features'])
        blob_shape.dim.append(param['cnn_filters'])
        blob_shape = input_layer.input_param.shape.add()
        blob_shape.dim.append(param['train_batch_size'])
        blob_shape.dim.append(param['max_att_features'])
        blob_shape.dim.append(param['att_hidden_units'])
        blob_shape = input_layer.input_param.shape.add()
        blob_shape.dim.append(param['train_batch_size'])
        blob_shape.dim.append(param['cnn_filters'])
        blob_shape = input_layer.input_param.shape.add()
        blob_shape.dim.append(param['train_batch_size'])
        blob_shape.dim.append(1)

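        # Unroll the inner decode net for one time step only; the BeamSearch
        # layer's recurrent_connections drive it across the full sequence.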
        max_length = param['max_length']
        param['max_length'] = 1
        inner_net = self.get_net(param,
                                 param['test_batch_size'],
                                 net=inner_net,
                                 mode="scst_decode")

        silence_layer = net.layer.add()
        silence_layer.name = "silence_bs"
        silence_layer.type = "Silence"
        silence_layer.bottom.append("log_prob")
        silence_layer.bottom.append("log_prob_sequence")

        scst_layer = net.layer.add()
        scst_layer.type = "Python"
        scst_layer.name = "scst"
        scst_layer.bottom.append("image_id")
        scst_layer.bottom.append("caption")
        scst_layer.propagate_down.append(False)
        scst_layer.propagate_down.append(False)
        scst_layer.top.append("score_weights")
        scst_layer.top.append("input_sentence")
        scst_layer.top.append("target_sentence")
        scst_layer.top.append("mean_score")
        scst_layer.python_param.module = "scst_layers"
        scst_layer.python_param.layer = "SCSTLayer"
        scst_layer.python_param.param_str = str({
            'vocab_path':
            param['data_dir'] + param['vocab_file'],
            'gt_caption_paths':
            param['gt_caption_paths'],
            'end_of_sequence':
            param['end_of_sequence'],
            'ignore_label':
            param['ignore_label']
        })

        # Add rest of training network
        param['max_length'] = max_length
        net = self.get_net(param,
                           param['test_batch_size'] * param['test_beam_size'],
                           net=net,
                           mode="scst_train")

        return net
Example #5
#!/usr/bin/env python2

import argparse
import sys
sys.path.insert(
    0, '/home/hongyang/project/faster_rcnn/external/caffe/python/caffe/proto')
#import caffe
from caffe_pb2 import NetParameter

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Copy and rename certain layers into another model.')
    parser.add_argument('model', type=str, help='Source model.')
    args = parser.parse_args()

    with open(args.model, 'rb') as f:
        source_model = NetParameter.FromString(f.read())

    for layer in source_model.layer:
        print(layer.name)
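
A hypothetical invocation (the script name is assumed for illustration); it prints the name of every layer in a binary-format model:

    python2 print_layers.py model.caffemodel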
Example #6
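# NOTE: this snippet assumes module-level imports of caffe_pb2 and of
# NetParameter / DataParameter from the Caffe protobuf bindings.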
def get_net(param, deploy, batch_size):
    net = NetParameter()

    # Rename the filler argument so it does not shadow the outer `param` dict
    # (the default for max_value is still evaluated against that dict).
    def add_weight_filler(filler, max_value=param['init_range']):
        filler.type = 'uniform'
        filler.min = -max_value
        filler.max = max_value

    if not deploy:
        train_data = net.layer.add()
        train_data.type = "Data"
        train_data.name = "data"
        train_data.top.append(train_data.name)
        train_data.data_param.source = 'examples/language_model/lm_train_db'
        train_data.data_param.backend = DataParameter.LMDB
        train_data.data_param.batch_size = batch_size

        test_data = net.layer.add()
        test_data.type = "Data"
        test_data.name = "data"
        test_data.top.append(test_data.name)
        test_data.data_param.source = 'examples/language_model/lm_valid_db'
        test_data.data_param.backend = DataParameter.LMDB
        test_data.data_param.batch_size = batch_size

        test_data_rule = test_data.include.add()
        test_data_rule.phase = caffe_pb2.TEST
        train_data_rule = train_data.include.add()
        train_data_rule.phase = caffe_pb2.TRAIN

    data_slice_layer = net.layer.add()
    data_slice_layer.name = "data_slice_layer"
    data_slice_layer.type = "Slice"
    data_slice_layer.slice_param.slice_dim = 1
    data_slice_layer.bottom.append('data')
    data_slice_layer.top.append('input_words')
    data_slice_layer.top.append('target_words')
    data_slice_layer.slice_param.slice_point.append(param['maximum_length'])

    label_slice_layer = net.layer.add()
    label_slice_layer.name = "label_slice_layer"
    label_slice_layer.type = "Slice"
    label_slice_layer.slice_param.slice_dim = 1
    label_slice_layer.bottom.append('target_words')
    for i in range(param['maximum_length']):
        label_slice_layer.top.append('label%d' % i)
        if i != 0:
            label_slice_layer.slice_param.slice_point.append(i)

    wordvec_layer = net.layer.add()
    wordvec_layer.name = "wordvec_layer"
    wordvec_layer.type = "Wordvec"
    wordvec_layer.bottom.append('input_words')
    wordvec_layer.top.append(wordvec_layer.name)
    wordvec_layer.wordvec_param.dimension = param['wordvec_length']
    wordvec_layer.wordvec_param.vocab_size = param['vocab_size']
    add_weight_filler(wordvec_layer.wordvec_param.weight_filler)

    wordvec_slice_layer = net.layer.add()
    wordvec_slice_layer.name = "wordvec_slice_layer"
    wordvec_slice_layer.type = "Slice"
    wordvec_slice_layer.slice_param.slice_dim = 2
    wordvec_slice_layer.slice_param.fast_wordvec_slice = True
    wordvec_slice_layer.bottom.append('wordvec_layer')
    for i in range(param['maximum_length']):
        wordvec_slice_layer.top.append('target_wordvec%d' % i)
        if i != 0:
            wordvec_slice_layer.slice_param.slice_point.append(i)

    for i in range(param['maximum_length']):
        if i == 0:
            dummy_layer = net.layer.add()
            dummy_layer.name = 'dummy_layer'
            dummy_layer.top.append(dummy_layer.name)
            dummy_layer.type = "DummyData"
            dummy_layer.dummy_data_param.num.append(batch_size)
            dummy_layer.dummy_data_param.channels.append(
                param['lstm_num_cells'])
            dummy_layer.dummy_data_param.height.append(1)
            dummy_layer.dummy_data_param.width.append(1)

            dummy_mem_cell = net.layer.add()
            dummy_mem_cell.name = 'dummy_mem_cell'
            dummy_mem_cell.top.append(dummy_mem_cell.name)
            dummy_mem_cell.type = "DummyData"
            dummy_mem_cell.dummy_data_param.num.append(batch_size)
            dummy_mem_cell.dummy_data_param.channels.append(
                param['lstm_num_cells'])
            dummy_mem_cell.dummy_data_param.height.append(1)
            dummy_mem_cell.dummy_data_param.width.append(1)

        for j in range(param['num_lstm_stacks']):
            concat_layer = net.layer.add()
            concat_layer.name = 'concat%d_layer%d' % (j, i)

            concat_layer.top.append(concat_layer.name)
            concat_layer.type = "Concat"
            concat_layer.concat_param.fast_lstm_concat = True
            if j == 0:
                concat_layer.bottom.append('target_wordvec%d' % i)
            if j >= 1:
                concat_layer.bottom.append('dropout%d_%d' % (j - 1, i))
            if i == 0:
                concat_layer.bottom.append(dummy_layer.name)
            else:
                concat_layer.bottom.append('lstm%d_hidden%d' % (j, i - 1))

            lstm_layer = net.layer.add()
            lstm_layer.name = 'lstm%d_layer%d' % (j, i)
            lstm_layer.type = "Lstm"
            lstm_layer.lstm_param.num_cells = param['lstm_num_cells']

            add_weight_filler(lstm_layer.lstm_param.input_weight_filler)
            add_weight_filler(lstm_layer.lstm_param.input_gate_weight_filler)
            add_weight_filler(lstm_layer.lstm_param.forget_gate_weight_filler)
            add_weight_filler(lstm_layer.lstm_param.output_gate_weight_filler)

            for k in range(4):
                param_spec = lstm_layer.param.add()
                param_spec.name = 'lstm%d_param_%d' % (j, k)
            lstm_layer.top.append('lstm%d_hidden%d' % (j, i))
            lstm_layer.top.append('lstm%d_mem_cell%d' % (j, i))
            lstm_layer.bottom.append('concat%d_layer%d' % (j, i))
            if i == 0:
                lstm_layer.bottom.append('dummy_mem_cell')
            else:
                lstm_layer.bottom.append('lstm%d_mem_cell%d' % (j, i - 1))

            dropout_layer = net.layer.add()
            dropout_layer.name = 'dropout%d_%d' % (j, i)
            dropout_layer.type = "Dropout"
            dropout_layer.top.append(dropout_layer.name)
            dropout_layer.bottom.append('lstm%d_hidden%d' % (j, i))
            dropout_layer.dropout_param.dropout_ratio = param['dropout_ratio']

    hidden_concat_layer = net.layer.add()
    hidden_concat_layer.type = "Concat"
    hidden_concat_layer.name = 'hidden_concat'
    hidden_concat_layer.top.append(hidden_concat_layer.name)
    hidden_concat_layer.concat_param.concat_dim = 0
    for i in range(param['maximum_length']):
        hidden_concat_layer.bottom.append('dropout%d_%d' %
                                          (param['num_lstm_stacks'] - 1, i))

    inner_product_layer = net.layer.add()
    inner_product_layer.name = "inner_product"
    inner_product_layer.top.append(inner_product_layer.name)
    inner_product_layer.bottom.append('hidden_concat')
    inner_product_layer.type = "InnerProduct"
    inner_product_layer.inner_product_param.bias_term = False
    inner_product_layer.inner_product_param.num_output = param['vocab_size']
    add_weight_filler(inner_product_layer.inner_product_param.weight_filler)

    label_concat_layer = net.layer.add()
    label_concat_layer.name = "label_concat"
    label_concat_layer.type = "Concat"
    label_concat_layer.concat_param.concat_dim = 0
    label_concat_layer.top.append(label_concat_layer.name)
    for i in range(param['maximum_length']):
        label_concat_layer.bottom.append('label%d' % i)

    if deploy:
        word_prob_layer = net.layer.add()
        word_prob_layer.name = "word_probs"
        word_prob_layer.top.append(word_prob_layer.name)
        word_prob_layer.type = "Softmax"
        word_prob_layer.bottom.append("inner_product")

    else:
        word_loss_layer = net.layer.add()
        word_loss_layer.name = "word_loss"
        word_loss_layer.type = "SoftmaxWithLoss"
        word_loss_layer.bottom.append("inner_product")
        word_loss_layer.bottom.append("label_concat")
        word_loss_layer.top.append(word_loss_layer.name)
        word_loss_layer.loss_param.ignore_label = param['zero_symbol']

    silence_layer = net.layer.add()
    silence_layer.name = "silence"
    silence_layer.type = "Silence"
    for j in range(param['num_lstm_stacks']):
        silence_layer.bottom.append("lstm%d_mem_cell%d" %
                                    (j, param['maximum_length'] - 1))
    for j in range(param['num_lstm_stacks'] - 1):
        silence_layer.bottom.append("dropout%d_%d" %
                                    (j, param['maximum_length'] - 1))

    return net
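
A hypothetical call sketch with the keys this builder reads (values made up for illustration):

    param = {'init_range': 0.08, 'maximum_length': 30, 'vocab_size': 10000,
             'wordvec_length': 256, 'lstm_num_cells': 256,
             'num_lstm_stacks': 2, 'dropout_ratio': 0.5, 'zero_symbol': 0}
    net = get_net(param, deploy=False, batch_size=20)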
Example #7
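    # NOTE: assumes `from google.protobuf import text_format` and
    # NetParameter from the Caffe protobuf bindings at module level.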
    def __init__(self, configfile):
        # Parse a text-format prototxt into a NetParameter message.
        self.netconfig = NetParameter()
        with open(configfile, "r") as f:
            text_format.Merge(f.read(), self.netconfig)